summaryrefslogtreecommitdiff
path: root/rts
diff options
context:
space:
mode:
Diffstat (limited to 'rts')
-rw-r--r--rts/Adjustor.c1110
-rw-r--r--rts/AdjustorAsm.S189
-rw-r--r--rts/Apply.cmm268
-rw-r--r--rts/Apply.h29
-rw-r--r--rts/Arena.c120
-rw-r--r--rts/Arena.h25
-rw-r--r--rts/AutoApply.h80
-rw-r--r--rts/AwaitEvent.h24
-rw-r--r--rts/BlockAlloc.c391
-rw-r--r--rts/BlockAlloc.h19
-rw-r--r--rts/Capability.c668
-rw-r--r--rts/Capability.h250
-rw-r--r--rts/ClosureFlags.c107
-rw-r--r--rts/Disassembler.c281
-rw-r--r--rts/Disassembler.h19
-rw-r--r--rts/Exception.cmm446
-rw-r--r--rts/Exception.h40
-rw-r--r--rts/FrontPanel.c802
-rw-r--r--rts/FrontPanel.h35
-rw-r--r--rts/GC.c4719
-rw-r--r--rts/GCCompact.c949
-rw-r--r--rts/GCCompact.h44
-rw-r--r--rts/GetTime.h26
-rw-r--r--rts/HSprel.def28
-rw-r--r--rts/Hash.c376
-rw-r--r--rts/Hash.h40
-rw-r--r--rts/HeapStackCheck.cmm964
-rw-r--r--rts/HsFFI.c40
-rw-r--r--rts/Interpreter.c1261
-rw-r--r--rts/Interpreter.h14
-rw-r--r--rts/LdvProfile.c342
-rw-r--r--rts/LdvProfile.h42
-rw-r--r--rts/Linker.c4315
-rw-r--r--rts/LinkerInternals.h110
-rw-r--r--rts/MBlock.c453
-rw-r--r--rts/MBlock.h90
-rw-r--r--rts/Main.c138
-rw-r--r--rts/Makefile370
-rw-r--r--rts/PosixSource.h18
-rw-r--r--rts/Prelude.h129
-rw-r--r--rts/PrimOps.cmm2106
-rw-r--r--rts/Printer.c1127
-rw-r--r--rts/Printer.h31
-rw-r--r--rts/ProfHeap.c1156
-rw-r--r--rts/ProfHeap.h19
-rw-r--r--rts/Profiling.c941
-rw-r--r--rts/Profiling.h39
-rw-r--r--rts/Proftimer.c85
-rw-r--r--rts/Proftimer.h22
-rw-r--r--rts/RetainerProfile.c2338
-rw-r--r--rts/RetainerProfile.h47
-rw-r--r--rts/RetainerSet.c498
-rw-r--r--rts/RetainerSet.h201
-rw-r--r--rts/RtsAPI.c597
-rw-r--r--rts/RtsDllMain.c39
-rw-r--r--rts/RtsFlags.c2281
-rw-r--r--rts/RtsMessages.c201
-rw-r--r--rts/RtsSignals.h78
-rw-r--r--rts/RtsStartup.c457
-rw-r--r--rts/RtsUtils.c367
-rw-r--r--rts/RtsUtils.h54
-rw-r--r--rts/STM.c1261
-rw-r--r--rts/Sanity.c948
-rw-r--r--rts/Sanity.h56
-rw-r--r--rts/Schedule.c4589
-rw-r--r--rts/Schedule.h332
-rw-r--r--rts/Sparks.c881
-rw-r--r--rts/Sparks.h104
-rw-r--r--rts/Stable.c460
-rw-r--r--rts/Stats.c632
-rw-r--r--rts/Stats.h56
-rw-r--r--rts/StgCRun.c897
-rw-r--r--rts/StgMiscClosures.cmm953
-rw-r--r--rts/StgPrimFloat.c491
-rw-r--r--rts/StgRun.h16
-rw-r--r--rts/StgStartup.cmm218
-rw-r--r--rts/StgStdThunks.cmm274
-rw-r--r--rts/Storage.c1137
-rw-r--r--rts/Task.c315
-rw-r--r--rts/Task.h271
-rw-r--r--rts/ThreadLabels.c50
-rw-r--r--rts/ThreadLabels.h27
-rw-r--r--rts/Ticker.h15
-rw-r--r--rts/Ticky.c628
-rw-r--r--rts/Ticky.h9
-rw-r--r--rts/Timer.c102
-rw-r--r--rts/Timer.h24
-rw-r--r--rts/Updates.cmm153
-rw-r--r--rts/Updates.h361
-rw-r--r--rts/VisCallbacks.c75
-rw-r--r--rts/VisCallbacks.h30
-rw-r--r--rts/VisSupport.c144
-rw-r--r--rts/VisSupport.h44
-rw-r--r--rts/VisWindow.c747
-rw-r--r--rts/VisWindow.h5
-rw-r--r--rts/Weak.c97
-rw-r--r--rts/Weak.h17
-rw-r--r--rts/dotnet/Invoke.c1081
-rw-r--r--rts/dotnet/Invoker.cpp338
-rw-r--r--rts/dotnet/Invoker.h197
-rw-r--r--rts/dotnet/InvokerClient.h180
-rw-r--r--rts/dotnet/Makefile53
-rw-r--r--rts/dotnet/invoker.snkbin0 -> 596 bytes
-rw-r--r--rts/ghc-frontpanel.glade1622
-rw-r--r--rts/gmp/.gdbinit34
-rw-r--r--rts/gmp/AUTHORS12
-rw-r--r--rts/gmp/COPYING336
-rw-r--r--rts/gmp/COPYING.LIB515
-rw-r--r--rts/gmp/INSTALL146
-rw-r--r--rts/gmp/Makefile.am197
-rw-r--r--rts/gmp/Makefile.in932
-rw-r--r--rts/gmp/NEWS136
-rw-r--r--rts/gmp/README84
-rw-r--r--rts/gmp/acconfig.h92
-rw-r--r--rts/gmp/acinclude.m4835
-rw-r--r--rts/gmp/aclocal.m41963
-rw-r--r--rts/gmp/ansi2knr.136
-rw-r--r--rts/gmp/ansi2knr.c677
-rw-r--r--rts/gmp/assert.c52
-rw-r--r--rts/gmp/compat.c46
-rw-r--r--rts/gmp/config.guess1373
-rw-r--r--rts/gmp/config.in162
-rw-r--r--rts/gmp/config.sub1273
-rw-r--r--rts/gmp/configure5216
-rw-r--r--rts/gmp/configure.in950
-rw-r--r--rts/gmp/depcomp269
-rw-r--r--rts/gmp/errno.c26
-rw-r--r--rts/gmp/extract-dbl.c187
-rw-r--r--rts/gmp/gmp-impl.h1072
-rw-r--r--rts/gmp/gmp.h1083
-rw-r--r--rts/gmp/insert-dbl.c98
-rw-r--r--rts/gmp/install-sh251
-rw-r--r--rts/gmp/longlong.h1347
-rw-r--r--rts/gmp/ltconfig3109
-rw-r--r--rts/gmp/ltmain.sh4692
-rw-r--r--rts/gmp/mdate-sh92
-rw-r--r--rts/gmp/memory.c160
-rw-r--r--rts/gmp/missing244
-rw-r--r--rts/gmp/mkinstalldirs38
-rw-r--r--rts/gmp/mp.h124
-rw-r--r--rts/gmp/mp_bpl.c27
-rw-r--r--rts/gmp/mp_clz_tab.c36
-rw-r--r--rts/gmp/mp_minv_tab.c50
-rw-r--r--rts/gmp/mp_set_fns.c48
-rw-r--r--rts/gmp/mpn/Makefile.am94
-rw-r--r--rts/gmp/mpn/Makefile.in472
-rw-r--r--rts/gmp/mpn/README13
-rw-r--r--rts/gmp/mpn/a29k/add_n.s120
-rw-r--r--rts/gmp/mpn/a29k/addmul_1.s113
-rw-r--r--rts/gmp/mpn/a29k/lshift.s93
-rw-r--r--rts/gmp/mpn/a29k/mul_1.s97
-rw-r--r--rts/gmp/mpn/a29k/rshift.s89
-rw-r--r--rts/gmp/mpn/a29k/sub_n.s120
-rw-r--r--rts/gmp/mpn/a29k/submul_1.s116
-rw-r--r--rts/gmp/mpn/a29k/udiv.s30
-rw-r--r--rts/gmp/mpn/a29k/umul.s29
-rw-r--r--rts/gmp/mpn/alpha/README224
-rw-r--r--rts/gmp/mpn/alpha/add_n.asm114
-rw-r--r--rts/gmp/mpn/alpha/addmul_1.asm87
-rw-r--r--rts/gmp/mpn/alpha/cntlz.asm68
-rw-r--r--rts/gmp/mpn/alpha/default.m477
-rw-r--r--rts/gmp/mpn/alpha/ev5/add_n.asm143
-rw-r--r--rts/gmp/mpn/alpha/ev5/lshift.asm169
-rw-r--r--rts/gmp/mpn/alpha/ev5/rshift.asm167
-rw-r--r--rts/gmp/mpn/alpha/ev5/sub_n.asm143
-rw-r--r--rts/gmp/mpn/alpha/ev6/addmul_1.asm474
-rw-r--r--rts/gmp/mpn/alpha/ev6/gmp-mparam.h62
-rw-r--r--rts/gmp/mpn/alpha/gmp-mparam.h64
-rw-r--r--rts/gmp/mpn/alpha/invert_limb.asm345
-rw-r--r--rts/gmp/mpn/alpha/lshift.asm104
-rw-r--r--rts/gmp/mpn/alpha/mul_1.asm71
-rw-r--r--rts/gmp/mpn/alpha/rshift.asm102
-rw-r--r--rts/gmp/mpn/alpha/sub_n.asm114
-rw-r--r--rts/gmp/mpn/alpha/submul_1.asm87
-rw-r--r--rts/gmp/mpn/alpha/udiv_qrnnd.S151
-rw-r--r--rts/gmp/mpn/alpha/umul.asm39
-rw-r--r--rts/gmp/mpn/alpha/unicos.m463
-rw-r--r--rts/gmp/mpn/arm/add_n.S77
-rw-r--r--rts/gmp/mpn/arm/addmul_1.S89
-rw-r--r--rts/gmp/mpn/arm/gmp-mparam.h34
-rw-r--r--rts/gmp/mpn/arm/mul_1.S81
-rw-r--r--rts/gmp/mpn/arm/sub_n.S79
-rw-r--r--rts/gmp/mpn/asm-defs.m41182
-rw-r--r--rts/gmp/mpn/clipper/add_n.s48
-rw-r--r--rts/gmp/mpn/clipper/mul_1.s47
-rw-r--r--rts/gmp/mpn/clipper/sub_n.s48
-rw-r--r--rts/gmp/mpn/cray/README14
-rw-r--r--rts/gmp/mpn/cray/add_n.c96
-rw-r--r--rts/gmp/mpn/cray/addmul_1.c46
-rw-r--r--rts/gmp/mpn/cray/gmp-mparam.h27
-rw-r--r--rts/gmp/mpn/cray/mul_1.c44
-rw-r--r--rts/gmp/mpn/cray/mulww.f54
-rw-r--r--rts/gmp/mpn/cray/mulww.s245
-rw-r--r--rts/gmp/mpn/cray/sub_n.c97
-rw-r--r--rts/gmp/mpn/cray/submul_1.c46
-rw-r--r--rts/gmp/mpn/generic/add_n.c62
-rw-r--r--rts/gmp/mpn/generic/addmul_1.c65
-rw-r--r--rts/gmp/mpn/generic/addsub_n.c167
-rw-r--r--rts/gmp/mpn/generic/bdivmod.c120
-rw-r--r--rts/gmp/mpn/generic/bz_divrem_n.c153
-rw-r--r--rts/gmp/mpn/generic/cmp.c56
-rw-r--r--rts/gmp/mpn/generic/diveby3.c77
-rw-r--r--rts/gmp/mpn/generic/divrem.c101
-rw-r--r--rts/gmp/mpn/generic/divrem_1.c248
-rw-r--r--rts/gmp/mpn/generic/divrem_2.c151
-rw-r--r--rts/gmp/mpn/generic/dump.c76
-rw-r--r--rts/gmp/mpn/generic/gcd.c414
-rw-r--r--rts/gmp/mpn/generic/gcd_1.c77
-rw-r--r--rts/gmp/mpn/generic/gcdext.c700
-rw-r--r--rts/gmp/mpn/generic/get_str.c216
-rw-r--r--rts/gmp/mpn/generic/gmp-mparam.h27
-rw-r--r--rts/gmp/mpn/generic/hamdist.c94
-rw-r--r--rts/gmp/mpn/generic/inlines.c24
-rw-r--r--rts/gmp/mpn/generic/jacbase.c136
-rw-r--r--rts/gmp/mpn/generic/lshift.c87
-rw-r--r--rts/gmp/mpn/generic/mod_1.c175
-rw-r--r--rts/gmp/mpn/generic/mod_1_rs.c111
-rw-r--r--rts/gmp/mpn/generic/mul.c190
-rw-r--r--rts/gmp/mpn/generic/mul_1.c59
-rw-r--r--rts/gmp/mpn/generic/mul_basecase.c87
-rw-r--r--rts/gmp/mpn/generic/mul_fft.c772
-rw-r--r--rts/gmp/mpn/generic/mul_n.c1343
-rw-r--r--rts/gmp/mpn/generic/perfsqr.c123
-rw-r--r--rts/gmp/mpn/generic/popcount.c93
-rw-r--r--rts/gmp/mpn/generic/pre_mod_1.c69
-rw-r--r--rts/gmp/mpn/generic/random.c43
-rw-r--r--rts/gmp/mpn/generic/random2.c105
-rw-r--r--rts/gmp/mpn/generic/rshift.c88
-rw-r--r--rts/gmp/mpn/generic/sb_divrem_mn.c201
-rw-r--r--rts/gmp/mpn/generic/scan0.c62
-rw-r--r--rts/gmp/mpn/generic/scan1.c62
-rw-r--r--rts/gmp/mpn/generic/set_str.c159
-rw-r--r--rts/gmp/mpn/generic/sqr_basecase.c83
-rw-r--r--rts/gmp/mpn/generic/sqrtrem.c509
-rw-r--r--rts/gmp/mpn/generic/sub_n.c62
-rw-r--r--rts/gmp/mpn/generic/submul_1.c65
-rw-r--r--rts/gmp/mpn/generic/tdiv_qr.c401
-rw-r--r--rts/gmp/mpn/generic/udiv_w_sdiv.c131
-rw-r--r--rts/gmp/mpn/hppa/README91
-rw-r--r--rts/gmp/mpn/hppa/add_n.s58
-rw-r--r--rts/gmp/mpn/hppa/gmp-mparam.h63
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/addmul_1.s102
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/mul_1.s98
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s75
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S189
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s83
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s80
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s76
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S195
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/submul_1.s111
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S80
-rw-r--r--rts/gmp/mpn/hppa/hppa1_1/umul.s42
-rw-r--r--rts/gmp/mpn/hppa/hppa2_0/add_n.s88
-rw-r--r--rts/gmp/mpn/hppa/hppa2_0/sub_n.s88
-rw-r--r--rts/gmp/mpn/hppa/lshift.s66
-rw-r--r--rts/gmp/mpn/hppa/rshift.s63
-rw-r--r--rts/gmp/mpn/hppa/sub_n.s59
-rw-r--r--rts/gmp/mpn/hppa/udiv_qrnnd.s286
-rw-r--r--rts/gmp/mpn/i960/README9
-rw-r--r--rts/gmp/mpn/i960/add_n.s43
-rw-r--r--rts/gmp/mpn/i960/addmul_1.s48
-rw-r--r--rts/gmp/mpn/i960/mul_1.s45
-rw-r--r--rts/gmp/mpn/i960/sub_n.s43
-rw-r--r--rts/gmp/mpn/lisp/gmpasm-mode.el351
-rw-r--r--rts/gmp/mpn/m68k/add_n.S79
-rw-r--r--rts/gmp/mpn/m68k/lshift.S150
-rw-r--r--rts/gmp/mpn/m68k/mc68020/addmul_1.S83
-rw-r--r--rts/gmp/mpn/m68k/mc68020/mul_1.S90
-rw-r--r--rts/gmp/mpn/m68k/mc68020/submul_1.S83
-rw-r--r--rts/gmp/mpn/m68k/mc68020/udiv.S31
-rw-r--r--rts/gmp/mpn/m68k/mc68020/umul.S31
-rw-r--r--rts/gmp/mpn/m68k/rshift.S149
-rw-r--r--rts/gmp/mpn/m68k/sub_n.S79
-rw-r--r--rts/gmp/mpn/m68k/syntax.h177
-rw-r--r--rts/gmp/mpn/m88k/add_n.s104
-rw-r--r--rts/gmp/mpn/m88k/mc88110/add_n.S200
-rw-r--r--rts/gmp/mpn/m88k/mc88110/addmul_1.s61
-rw-r--r--rts/gmp/mpn/m88k/mc88110/mul_1.s59
-rw-r--r--rts/gmp/mpn/m88k/mc88110/sub_n.S276
-rw-r--r--rts/gmp/mpn/m88k/mul_1.s127
-rw-r--r--rts/gmp/mpn/m88k/sub_n.s106
-rw-r--r--rts/gmp/mpn/mips2/add_n.s120
-rw-r--r--rts/gmp/mpn/mips2/addmul_1.s97
-rw-r--r--rts/gmp/mpn/mips2/lshift.s95
-rw-r--r--rts/gmp/mpn/mips2/mul_1.s85
-rw-r--r--rts/gmp/mpn/mips2/rshift.s92
-rw-r--r--rts/gmp/mpn/mips2/sub_n.s120
-rw-r--r--rts/gmp/mpn/mips2/submul_1.s97
-rw-r--r--rts/gmp/mpn/mips2/umul.s30
-rw-r--r--rts/gmp/mpn/mips3/README23
-rw-r--r--rts/gmp/mpn/mips3/add_n.s120
-rw-r--r--rts/gmp/mpn/mips3/addmul_1.s97
-rw-r--r--rts/gmp/mpn/mips3/gmp-mparam.h58
-rw-r--r--rts/gmp/mpn/mips3/lshift.s95
-rw-r--r--rts/gmp/mpn/mips3/mul_1.s85
-rw-r--r--rts/gmp/mpn/mips3/rshift.s92
-rw-r--r--rts/gmp/mpn/mips3/sub_n.s120
-rw-r--r--rts/gmp/mpn/mips3/submul_1.s97
-rw-r--r--rts/gmp/mpn/mp_bases.c550
-rw-r--r--rts/gmp/mpn/ns32k/add_n.s46
-rw-r--r--rts/gmp/mpn/ns32k/addmul_1.s48
-rw-r--r--rts/gmp/mpn/ns32k/mul_1.s47
-rw-r--r--rts/gmp/mpn/ns32k/sub_n.s46
-rw-r--r--rts/gmp/mpn/ns32k/submul_1.s48
-rw-r--r--rts/gmp/mpn/pa64/README38
-rw-r--r--rts/gmp/mpn/pa64/add_n.s90
-rw-r--r--rts/gmp/mpn/pa64/addmul_1.S167
-rw-r--r--rts/gmp/mpn/pa64/gmp-mparam.h65
-rw-r--r--rts/gmp/mpn/pa64/lshift.s103
-rw-r--r--rts/gmp/mpn/pa64/mul_1.S158
-rw-r--r--rts/gmp/mpn/pa64/rshift.s100
-rw-r--r--rts/gmp/mpn/pa64/sub_n.s90
-rw-r--r--rts/gmp/mpn/pa64/submul_1.S170
-rw-r--r--rts/gmp/mpn/pa64/udiv_qrnnd.c111
-rw-r--r--rts/gmp/mpn/pa64/umul_ppmm.S74
-rw-r--r--rts/gmp/mpn/pa64w/README2
-rw-r--r--rts/gmp/mpn/pa64w/add_n.s90
-rw-r--r--rts/gmp/mpn/pa64w/addmul_1.S168
-rw-r--r--rts/gmp/mpn/pa64w/gmp-mparam.h65
-rw-r--r--rts/gmp/mpn/pa64w/lshift.s103
-rw-r--r--rts/gmp/mpn/pa64w/mul_1.S159
-rw-r--r--rts/gmp/mpn/pa64w/rshift.s100
-rw-r--r--rts/gmp/mpn/pa64w/sub_n.s90
-rw-r--r--rts/gmp/mpn/pa64w/submul_1.S171
-rw-r--r--rts/gmp/mpn/pa64w/udiv_qrnnd.c117
-rw-r--r--rts/gmp/mpn/pa64w/umul_ppmm.S72
-rw-r--r--rts/gmp/mpn/power/add_n.s79
-rw-r--r--rts/gmp/mpn/power/addmul_1.s122
-rw-r--r--rts/gmp/mpn/power/lshift.s56
-rw-r--r--rts/gmp/mpn/power/mul_1.s109
-rw-r--r--rts/gmp/mpn/power/rshift.s54
-rw-r--r--rts/gmp/mpn/power/sdiv.s34
-rw-r--r--rts/gmp/mpn/power/sub_n.s80
-rw-r--r--rts/gmp/mpn/power/submul_1.s127
-rw-r--r--rts/gmp/mpn/power/umul.s38
-rw-r--r--rts/gmp/mpn/powerpc32/add_n.asm61
-rw-r--r--rts/gmp/mpn/powerpc32/addmul_1.asm124
-rw-r--r--rts/gmp/mpn/powerpc32/aix.m439
-rw-r--r--rts/gmp/mpn/powerpc32/gmp-mparam.h66
-rw-r--r--rts/gmp/mpn/powerpc32/lshift.asm145
-rw-r--r--rts/gmp/mpn/powerpc32/mul_1.asm86
-rw-r--r--rts/gmp/mpn/powerpc32/regmap.m434
-rw-r--r--rts/gmp/mpn/powerpc32/rshift.asm60
-rw-r--r--rts/gmp/mpn/powerpc32/sub_n.asm61
-rw-r--r--rts/gmp/mpn/powerpc32/submul_1.asm130
-rw-r--r--rts/gmp/mpn/powerpc32/umul.asm32
-rw-r--r--rts/gmp/mpn/powerpc64/README36
-rw-r--r--rts/gmp/mpn/powerpc64/add_n.asm61
-rw-r--r--rts/gmp/mpn/powerpc64/addmul_1.asm52
-rw-r--r--rts/gmp/mpn/powerpc64/addsub_n.asm107
-rw-r--r--rts/gmp/mpn/powerpc64/aix.m440
-rw-r--r--rts/gmp/mpn/powerpc64/copyd.asm45
-rw-r--r--rts/gmp/mpn/powerpc64/copyi.asm44
-rw-r--r--rts/gmp/mpn/powerpc64/gmp-mparam.h62
-rw-r--r--rts/gmp/mpn/powerpc64/lshift.asm159
-rw-r--r--rts/gmp/mpn/powerpc64/mul_1.asm49
-rw-r--r--rts/gmp/mpn/powerpc64/rshift.asm60
-rw-r--r--rts/gmp/mpn/powerpc64/sub_n.asm61
-rw-r--r--rts/gmp/mpn/powerpc64/submul_1.asm54
-rw-r--r--rts/gmp/mpn/pyr/add_n.s76
-rw-r--r--rts/gmp/mpn/pyr/addmul_1.s45
-rw-r--r--rts/gmp/mpn/pyr/mul_1.s42
-rw-r--r--rts/gmp/mpn/pyr/sub_n.s76
-rw-r--r--rts/gmp/mpn/sh/add_n.s47
-rw-r--r--rts/gmp/mpn/sh/sh2/addmul_1.s53
-rw-r--r--rts/gmp/mpn/sh/sh2/mul_1.s50
-rw-r--r--rts/gmp/mpn/sh/sh2/submul_1.s53
-rw-r--r--rts/gmp/mpn/sh/sub_n.s47
-rw-r--r--rts/gmp/mpn/sparc32/README36
-rw-r--r--rts/gmp/mpn/sparc32/add_n.asm236
-rw-r--r--rts/gmp/mpn/sparc32/addmul_1.asm146
-rw-r--r--rts/gmp/mpn/sparc32/lshift.asm97
-rw-r--r--rts/gmp/mpn/sparc32/mul_1.asm137
-rw-r--r--rts/gmp/mpn/sparc32/rshift.asm93
-rw-r--r--rts/gmp/mpn/sparc32/sub_n.asm326
-rw-r--r--rts/gmp/mpn/sparc32/submul_1.asm146
-rw-r--r--rts/gmp/mpn/sparc32/udiv_fp.asm158
-rw-r--r--rts/gmp/mpn/sparc32/udiv_nfp.asm193
-rw-r--r--rts/gmp/mpn/sparc32/umul.asm68
-rw-r--r--rts/gmp/mpn/sparc32/v8/addmul_1.asm122
-rw-r--r--rts/gmp/mpn/sparc32/v8/mul_1.asm103
-rw-r--r--rts/gmp/mpn/sparc32/v8/submul_1.asm58
-rw-r--r--rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm122
-rw-r--r--rts/gmp/mpn/sparc32/v8/umul.asm31
-rw-r--r--rts/gmp/mpn/sparc32/v9/README4
-rw-r--r--rts/gmp/mpn/sparc32/v9/addmul_1.asm288
-rw-r--r--rts/gmp/mpn/sparc32/v9/gmp-mparam.h69
-rw-r--r--rts/gmp/mpn/sparc32/v9/mul_1.asm267
-rw-r--r--rts/gmp/mpn/sparc32/v9/submul_1.asm291
-rw-r--r--rts/gmp/mpn/sparc64/README48
-rw-r--r--rts/gmp/mpn/sparc64/add_n.asm172
-rw-r--r--rts/gmp/mpn/sparc64/addmul1h.asm203
-rw-r--r--rts/gmp/mpn/sparc64/addmul_1.asm114
-rw-r--r--rts/gmp/mpn/sparc64/copyi.asm79
-rw-r--r--rts/gmp/mpn/sparc64/gmp-mparam.h88
-rw-r--r--rts/gmp/mpn/sparc64/lshift.asm97
-rw-r--r--rts/gmp/mpn/sparc64/mul_1.asm113
-rw-r--r--rts/gmp/mpn/sparc64/mul_1h.asm183
-rw-r--r--rts/gmp/mpn/sparc64/rshift.asm94
-rw-r--r--rts/gmp/mpn/sparc64/sub_n.asm172
-rw-r--r--rts/gmp/mpn/sparc64/submul1h.asm204
-rw-r--r--rts/gmp/mpn/sparc64/submul_1.asm114
-rw-r--r--rts/gmp/mpn/thumb/add_n.s50
-rw-r--r--rts/gmp/mpn/thumb/sub_n.s50
-rw-r--r--rts/gmp/mpn/underscore.h26
-rw-r--r--rts/gmp/mpn/vax/add_n.s61
-rw-r--r--rts/gmp/mpn/vax/addmul_1.s126
-rw-r--r--rts/gmp/mpn/vax/lshift.s58
-rw-r--r--rts/gmp/mpn/vax/mul_1.s123
-rw-r--r--rts/gmp/mpn/vax/rshift.s56
-rw-r--r--rts/gmp/mpn/vax/sub_n.s61
-rw-r--r--rts/gmp/mpn/vax/submul_1.s126
-rw-r--r--rts/gmp/mpn/x86/README40
-rw-r--r--rts/gmp/mpn/x86/README.family333
-rw-r--r--rts/gmp/mpn/x86/addsub_n.S174
-rw-r--r--rts/gmp/mpn/x86/aors_n.asm187
-rw-r--r--rts/gmp/mpn/x86/aorsmul_1.asm134
-rw-r--r--rts/gmp/mpn/x86/copyd.asm80
-rw-r--r--rts/gmp/mpn/x86/copyi.asm79
-rw-r--r--rts/gmp/mpn/x86/diveby3.asm115
-rw-r--r--rts/gmp/mpn/x86/divrem_1.asm232
-rw-r--r--rts/gmp/mpn/x86/k6/README237
-rw-r--r--rts/gmp/mpn/x86/k6/aors_n.asm329
-rw-r--r--rts/gmp/mpn/x86/k6/aorsmul_1.asm372
-rw-r--r--rts/gmp/mpn/x86/k6/cross.pl141
-rw-r--r--rts/gmp/mpn/x86/k6/diveby3.asm110
-rw-r--r--rts/gmp/mpn/x86/k6/gmp-mparam.h97
-rw-r--r--rts/gmp/mpn/x86/k6/k62mmx/copyd.asm179
-rw-r--r--rts/gmp/mpn/x86/k6/k62mmx/copyi.asm196
-rw-r--r--rts/gmp/mpn/x86/k6/k62mmx/lshift.asm286
-rw-r--r--rts/gmp/mpn/x86/k6/k62mmx/rshift.asm285
-rw-r--r--rts/gmp/mpn/x86/k6/mmx/com_n.asm91
-rw-r--r--rts/gmp/mpn/x86/k6/mmx/logops_n.asm212
-rw-r--r--rts/gmp/mpn/x86/k6/mmx/lshift.asm122
-rw-r--r--rts/gmp/mpn/x86/k6/mmx/popham.asm238
-rw-r--r--rts/gmp/mpn/x86/k6/mmx/rshift.asm122
-rw-r--r--rts/gmp/mpn/x86/k6/mul_1.asm272
-rw-r--r--rts/gmp/mpn/x86/k6/mul_basecase.asm600
-rw-r--r--rts/gmp/mpn/x86/k6/sqr_basecase.asm672
-rw-r--r--rts/gmp/mpn/x86/k7/README145
-rw-r--r--rts/gmp/mpn/x86/k7/aors_n.asm250
-rw-r--r--rts/gmp/mpn/x86/k7/aorsmul_1.asm364
-rw-r--r--rts/gmp/mpn/x86/k7/diveby3.asm131
-rw-r--r--rts/gmp/mpn/x86/k7/gmp-mparam.h100
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/copyd.asm136
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/copyi.asm147
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/divrem_1.asm718
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/lshift.asm472
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/mod_1.asm457
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/popham.asm239
-rw-r--r--rts/gmp/mpn/x86/k7/mmx/rshift.asm471
-rw-r--r--rts/gmp/mpn/x86/k7/mul_1.asm265
-rw-r--r--rts/gmp/mpn/x86/k7/mul_basecase.asm593
-rw-r--r--rts/gmp/mpn/x86/k7/sqr_basecase.asm627
-rw-r--r--rts/gmp/mpn/x86/lshift.asm90
-rw-r--r--rts/gmp/mpn/x86/mod_1.asm141
-rw-r--r--rts/gmp/mpn/x86/mul_1.asm130
-rw-r--r--rts/gmp/mpn/x86/mul_basecase.asm209
-rw-r--r--rts/gmp/mpn/x86/p6/README95
-rw-r--r--rts/gmp/mpn/x86/p6/aorsmul_1.asm300
-rw-r--r--rts/gmp/mpn/x86/p6/diveby3.asm37
-rw-r--r--rts/gmp/mpn/x86/p6/gmp-mparam.h96
-rw-r--r--rts/gmp/mpn/x86/p6/mmx/divrem_1.asm677
-rw-r--r--rts/gmp/mpn/x86/p6/mmx/mod_1.asm444
-rw-r--r--rts/gmp/mpn/x86/p6/mmx/popham.asm31
-rw-r--r--rts/gmp/mpn/x86/p6/p3mmx/popham.asm30
-rw-r--r--rts/gmp/mpn/x86/p6/sqr_basecase.asm641
-rw-r--r--rts/gmp/mpn/x86/pentium/README77
-rw-r--r--rts/gmp/mpn/x86/pentium/aors_n.asm196
-rw-r--r--rts/gmp/mpn/x86/pentium/aorsmul_1.asm99
-rw-r--r--rts/gmp/mpn/x86/pentium/diveby3.asm183
-rw-r--r--rts/gmp/mpn/x86/pentium/gmp-mparam.h97
-rw-r--r--rts/gmp/mpn/x86/pentium/lshift.asm236
-rw-r--r--rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h97
-rw-r--r--rts/gmp/mpn/x86/pentium/mmx/lshift.asm455
-rw-r--r--rts/gmp/mpn/x86/pentium/mmx/popham.asm30
-rw-r--r--rts/gmp/mpn/x86/pentium/mmx/rshift.asm460
-rw-r--r--rts/gmp/mpn/x86/pentium/mul_1.asm79
-rw-r--r--rts/gmp/mpn/x86/pentium/mul_basecase.asm135
-rw-r--r--rts/gmp/mpn/x86/pentium/rshift.asm236
-rw-r--r--rts/gmp/mpn/x86/pentium/sqr_basecase.asm520
-rw-r--r--rts/gmp/mpn/x86/rshift.asm92
-rw-r--r--rts/gmp/mpn/x86/udiv.asm44
-rw-r--r--rts/gmp/mpn/x86/umul.asm43
-rw-r--r--rts/gmp/mpn/x86/x86-defs.m4713
-rw-r--r--rts/gmp/mpn/z8000/add_n.s53
-rw-r--r--rts/gmp/mpn/z8000/gmp-mparam.h27
-rw-r--r--rts/gmp/mpn/z8000/mul_1.s68
-rw-r--r--rts/gmp/mpn/z8000/sub_n.s54
-rw-r--r--rts/gmp/mpn/z8000x/add_n.s56
-rw-r--r--rts/gmp/mpn/z8000x/sub_n.s56
-rw-r--r--rts/gmp/mpz/Makefile.am58
-rw-r--r--rts/gmp/mpz/Makefile.in457
-rw-r--r--rts/gmp/mpz/README23
-rw-r--r--rts/gmp/mpz/abs.c51
-rw-r--r--rts/gmp/mpz/add.c123
-rw-r--r--rts/gmp/mpz/add_ui.c84
-rw-r--r--rts/gmp/mpz/addmul_ui.c214
-rw-r--r--rts/gmp/mpz/and.c278
-rw-r--r--rts/gmp/mpz/array_init.c48
-rw-r--r--rts/gmp/mpz/bin_ui.c141
-rw-r--r--rts/gmp/mpz/bin_uiui.c120
-rw-r--r--rts/gmp/mpz/cdiv_q.c51
-rw-r--r--rts/gmp/mpz/cdiv_q_ui.c67
-rw-r--r--rts/gmp/mpz/cdiv_qr.c64
-rw-r--r--rts/gmp/mpz/cdiv_qr_ui.c71
-rw-r--r--rts/gmp/mpz/cdiv_r.c59
-rw-r--r--rts/gmp/mpz/cdiv_r_ui.c57
-rw-r--r--rts/gmp/mpz/cdiv_ui.c50
-rw-r--r--rts/gmp/mpz/clear.c35
-rw-r--r--rts/gmp/mpz/clrbit.c114
-rw-r--r--rts/gmp/mpz/cmp.c75
-rw-r--r--rts/gmp/mpz/cmp_si.c64
-rw-r--r--rts/gmp/mpz/cmp_ui.c53
-rw-r--r--rts/gmp/mpz/cmpabs.c57
-rw-r--r--rts/gmp/mpz/cmpabs_ui.c56
-rw-r--r--rts/gmp/mpz/com.c93
-rw-r--r--rts/gmp/mpz/divexact.c125
-rw-r--r--rts/gmp/mpz/dump.c44
-rw-r--r--rts/gmp/mpz/fac_ui.c157
-rw-r--r--rts/gmp/mpz/fdiv_q.c51
-rw-r--r--rts/gmp/mpz/fdiv_q_2exp.c104
-rw-r--r--rts/gmp/mpz/fdiv_q_ui.c65
-rw-r--r--rts/gmp/mpz/fdiv_qr.c64
-rw-r--r--rts/gmp/mpz/fdiv_qr_ui.c69
-rw-r--r--rts/gmp/mpz/fdiv_r.c58
-rw-r--r--rts/gmp/mpz/fdiv_r_2exp.c156
-rw-r--r--rts/gmp/mpz/fdiv_r_ui.c55
-rw-r--r--rts/gmp/mpz/fdiv_ui.c48
-rw-r--r--rts/gmp/mpz/fib_ui.c165
-rw-r--r--rts/gmp/mpz/fits_sint_p.c50
-rw-r--r--rts/gmp/mpz/fits_slong_p.c50
-rw-r--r--rts/gmp/mpz/fits_sshort_p.c50
-rw-r--r--rts/gmp/mpz/fits_uint_p.c41
-rw-r--r--rts/gmp/mpz/fits_ulong_p.c41
-rw-r--r--rts/gmp/mpz/fits_ushort_p.c41
-rw-r--r--rts/gmp/mpz/gcd.c180
-rw-r--r--rts/gmp/mpz/gcd_ui.c65
-rw-r--r--rts/gmp/mpz/gcdext.c137
-rw-r--r--rts/gmp/mpz/get_d.c128
-rw-r--r--rts/gmp/mpz/get_si.c43
-rw-r--r--rts/gmp/mpz/get_str.c118
-rw-r--r--rts/gmp/mpz/get_ui.c37
-rw-r--r--rts/gmp/mpz/getlimbn.c38
-rw-r--r--rts/gmp/mpz/hamdist.c62
-rw-r--r--rts/gmp/mpz/init.c36
-rw-r--r--rts/gmp/mpz/inp_raw.c101
-rw-r--r--rts/gmp/mpz/inp_str.c167
-rw-r--r--rts/gmp/mpz/invert.c77
-rw-r--r--rts/gmp/mpz/ior.c244
-rw-r--r--rts/gmp/mpz/iset.c49
-rw-r--r--rts/gmp/mpz/iset_d.c39
-rw-r--r--rts/gmp/mpz/iset_si.c49
-rw-r--r--rts/gmp/mpz/iset_str.c47
-rw-r--r--rts/gmp/mpz/iset_ui.c39
-rw-r--r--rts/gmp/mpz/jacobi.c53
-rw-r--r--rts/gmp/mpz/kronsz.c126
-rw-r--r--rts/gmp/mpz/kronuz.c115
-rw-r--r--rts/gmp/mpz/kronzs.c74
-rw-r--r--rts/gmp/mpz/kronzu.c66
-rw-r--r--rts/gmp/mpz/lcm.c61
-rw-r--r--rts/gmp/mpz/legendre.c184
-rw-r--r--rts/gmp/mpz/mod.c63
-rw-r--r--rts/gmp/mpz/mul.c131
-rw-r--r--rts/gmp/mpz/mul_2exp.c76
-rw-r--r--rts/gmp/mpz/mul_siui.c81
-rw-r--r--rts/gmp/mpz/neg.c53
-rw-r--r--rts/gmp/mpz/nextprime.c120
-rw-r--r--rts/gmp/mpz/out_raw.c89
-rw-r--r--rts/gmp/mpz/out_str.c108
-rw-r--r--rts/gmp/mpz/perfpow.c272
-rw-r--r--rts/gmp/mpz/perfsqr.c45
-rw-r--r--rts/gmp/mpz/popcount.c42
-rw-r--r--rts/gmp/mpz/pow_ui.c129
-rw-r--r--rts/gmp/mpz/powm.c364
-rw-r--r--rts/gmp/mpz/powm_ui.c248
-rw-r--r--rts/gmp/mpz/pprime_p.c242
-rw-r--r--rts/gmp/mpz/random.c56
-rw-r--r--rts/gmp/mpz/random2.c48
-rw-r--r--rts/gmp/mpz/realloc.c52
-rw-r--r--rts/gmp/mpz/remove.c93
-rw-r--r--rts/gmp/mpz/root.c183
-rw-r--r--rts/gmp/mpz/rrandomb.c117
-rw-r--r--rts/gmp/mpz/scan0.c35
-rw-r--r--rts/gmp/mpz/scan1.c35
-rw-r--r--rts/gmp/mpz/set.c48
-rw-r--r--rts/gmp/mpz/set_d.c96
-rw-r--r--rts/gmp/mpz/set_f.c64
-rw-r--r--rts/gmp/mpz/set_q.c36
-rw-r--r--rts/gmp/mpz/set_si.c48
-rw-r--r--rts/gmp/mpz/set_str.c157
-rw-r--r--rts/gmp/mpz/set_ui.c43
-rw-r--r--rts/gmp/mpz/setbit.c119
-rw-r--r--rts/gmp/mpz/size.c35
-rw-r--r--rts/gmp/mpz/sizeinbase.c60
-rw-r--r--rts/gmp/mpz/sqrt.c86
-rw-r--r--rts/gmp/mpz/sqrtrem.c111
-rw-r--r--rts/gmp/mpz/sub.c123
-rw-r--r--rts/gmp/mpz/sub_ui.c84
-rw-r--r--rts/gmp/mpz/swap.c52
-rw-r--r--rts/gmp/mpz/tdiv_q.c91
-rw-r--r--rts/gmp/mpz/tdiv_q_2exp.c68
-rw-r--r--rts/gmp/mpz/tdiv_q_ui.c64
-rw-r--r--rts/gmp/mpz/tdiv_qr.c130
-rw-r--r--rts/gmp/mpz/tdiv_qr_ui.c76
-rw-r--r--rts/gmp/mpz/tdiv_r.c98
-rw-r--r--rts/gmp/mpz/tdiv_r_2exp.c79
-rw-r--r--rts/gmp/mpz/tdiv_r_ui.c63
-rw-r--r--rts/gmp/mpz/tdiv_ui.c53
-rw-r--r--rts/gmp/mpz/tstbit.c70
-rw-r--r--rts/gmp/mpz/ui_pow_ui.c139
-rw-r--r--rts/gmp/mpz/urandomb.c49
-rw-r--r--rts/gmp/mpz/urandomm.c78
-rw-r--r--rts/gmp/mpz/xor.c217
-rw-r--r--rts/gmp/rand.c171
-rw-r--r--rts/gmp/randclr.c54
-rw-r--r--rts/gmp/randlc.c56
-rw-r--r--rts/gmp/randlc2x.c59
-rw-r--r--rts/gmp/randraw.c360
-rw-r--r--rts/gmp/randsd.c37
-rw-r--r--rts/gmp/randsdui.c37
-rw-r--r--rts/gmp/stack-alloc.c136
-rw-r--r--rts/gmp/stack-alloc.h64
-rw-r--r--rts/gmp/stamp-h.in1
-rw-r--r--rts/gmp/stamp-vti3
-rw-r--r--rts/gmp/urandom.h86
-rw-r--r--rts/gmp/version.c26
-rw-r--r--rts/gmp/version.texi3
-rw-r--r--rts/hooks/FlagDefaults.c20
-rw-r--r--rts/hooks/InitEachPE.c23
-rw-r--r--rts/hooks/MallocFail.c16
-rw-r--r--rts/hooks/OnExit.c19
-rw-r--r--rts/hooks/OutOfHeap.c19
-rw-r--r--rts/hooks/RtsOpts.c13
-rw-r--r--rts/hooks/ShutdownEachPEHook.c19
-rw-r--r--rts/hooks/StackOverflow.c16
-rw-r--r--rts/package.conf.in152
-rw-r--r--rts/parallel/0Hash.c320
-rw-r--r--rts/parallel/0Parallel.h414
-rw-r--r--rts/parallel/0Unpack.c440
-rw-r--r--rts/parallel/Dist.c117
-rw-r--r--rts/parallel/Dist.h20
-rw-r--r--rts/parallel/FetchMe.h24
-rw-r--r--rts/parallel/FetchMe.hc180
-rw-r--r--rts/parallel/Global.c1090
-rw-r--r--rts/parallel/GranSim.c3015
-rw-r--r--rts/parallel/GranSimRts.h268
-rw-r--r--rts/parallel/HLC.h63
-rw-r--r--rts/parallel/HLComms.c1810
-rw-r--r--rts/parallel/LLC.h130
-rw-r--r--rts/parallel/LLComms.c489
-rw-r--r--rts/parallel/PEOpCodes.h58
-rw-r--r--rts/parallel/Pack.c4293
-rw-r--r--rts/parallel/ParInit.c322
-rw-r--r--rts/parallel/ParInit.h19
-rw-r--r--rts/parallel/ParTicky.c450
-rw-r--r--rts/parallel/ParTicky.h60
-rw-r--r--rts/parallel/ParTypes.h38
-rw-r--r--rts/parallel/Parallel.c1140
-rw-r--r--rts/parallel/ParallelDebug.c1955
-rw-r--r--rts/parallel/ParallelDebug.h79
-rw-r--r--rts/parallel/ParallelRts.h253
-rw-r--r--rts/parallel/RBH.c337
-rw-r--r--rts/parallel/SysMan.c650
-rw-r--r--rts/posix/GetTime.c141
-rw-r--r--rts/posix/Itimer.c226
-rw-r--r--rts/posix/Itimer.h19
-rw-r--r--rts/posix/OSThreads.c166
-rw-r--r--rts/posix/Select.c279
-rw-r--r--rts/posix/Select.h26
-rw-r--r--rts/posix/Signals.c510
-rw-r--r--rts/posix/Signals.h26
-rw-r--r--rts/win32/AsyncIO.c345
-rw-r--r--rts/win32/AsyncIO.h25
-rw-r--r--rts/win32/AwaitEvent.c51
-rw-r--r--rts/win32/ConsoleHandler.c313
-rw-r--r--rts/win32/ConsoleHandler.h63
-rw-r--r--rts/win32/GetTime.c101
-rw-r--r--rts/win32/IOManager.c510
-rw-r--r--rts/win32/IOManager.h110
-rw-r--r--rts/win32/OSThreads.c199
-rw-r--r--rts/win32/Ticker.c124
-rw-r--r--rts/win32/WorkQueue.c215
-rw-r--r--rts/win32/WorkQueue.h37
684 files changed, 164348 insertions, 0 deletions
diff --git a/rts/Adjustor.c b/rts/Adjustor.c
new file mode 100644
index 0000000000..f3e5bfe6aa
--- /dev/null
+++ b/rts/Adjustor.c
@@ -0,0 +1,1110 @@
+/* -----------------------------------------------------------------------------
+ * Foreign export adjustor thunks
+ *
+ * Copyright (c) 1998.
+ *
+ * ---------------------------------------------------------------------------*/
+
+/* A little bit of background...
+
+An adjustor thunk is a dynamically allocated code snippet that allows
+Haskell closures to be viewed as C function pointers.
+
+Stable pointers provide a way for the outside world to get access to,
+and evaluate, Haskell heap objects, with the RTS providing a small
+range of ops for doing so. So, assuming we've got a stable pointer in
+our hand in C, we can jump into the Haskell world and evaluate a callback
+procedure, say. This works OK in some cases where callbacks are used, but
+does require the external code to know about stable pointers and how to deal
+with them. We'd like to hide the Haskell-nature of a callback and have it
+be invoked just like any other C function pointer.
+
+Enter adjustor thunks. An adjustor thunk is a little piece of code
+that's generated on-the-fly (one per Haskell closure being exported)
+that, when entered using some 'universal' calling convention (e.g., the
+C calling convention on platform X), pushes an implicit stable pointer
+(to the Haskell callback) before calling another (static) C function stub
+which takes care of entering the Haskell code via its stable pointer.
+
+An adjustor thunk is allocated on the C heap, and is called from within
+Haskell just before handing out the function pointer to the Haskell (IO)
+action. User code should never have to invoke it explicitly.
+
+An adjustor thunk differs from a C function pointer in one respect: when
+the code is through with it, it has to be freed in order to release Haskell
+and C resources. Failure to do so results in memory leaks on both the C and
+Haskell side.
+*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsExternal.h"
+#include "RtsUtils.h"
+#include <stdlib.h>
+
+#if defined(_WIN32)
+#include <windows.h>
+#endif
+
+#if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
+#include <string.h>
+#endif
+
+#ifdef LEADING_UNDERSCORE
+#define UNDERSCORE "_"
+#else
+#define UNDERSCORE ""
+#endif
+#if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
+/*
+ Now here's something obscure for you:
+
+ When generating an adjustor thunk that uses the C calling
+ convention, we have to make sure that the thunk kicks off
+ the process of jumping into Haskell with a tail jump. Why?
+ Because as a result of jumping in into Haskell we may end
+ up freeing the very adjustor thunk we came from using
+ freeHaskellFunctionPtr(). Hence, we better not return to
+ the adjustor code on our way out, since it could by then
+ point to junk.
+
+ The fix is readily at hand, just include the opcodes
+ for the C stack fixup code that we need to perform when
+ returning in some static piece of memory and arrange
+ to return to it before tail jumping from the adjustor thunk.
+*/
+static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
+{
+ __asm__ (
+ ".globl " UNDERSCORE "obscure_ccall_ret_code\n"
+ UNDERSCORE "obscure_ccall_ret_code:\n\t"
+ "addl $0x4, %esp\n\t"
+ "ret"
+ );
+}
+extern void obscure_ccall_ret_code(void);
+
+#if defined(openbsd_HOST_OS)
+static unsigned char *obscure_ccall_ret_code_dyn;
+#endif
+
+#endif
+
+#if defined(x86_64_HOST_ARCH)
+static void GNUC3_ATTRIBUTE(used) obscure_ccall_wrapper(void)
+{
+ __asm__ (
+ ".globl " UNDERSCORE "obscure_ccall_ret_code\n"
+ UNDERSCORE "obscure_ccall_ret_code:\n\t"
+ "addq $0x8, %rsp\n\t"
+ "ret"
+ );
+}
+extern void obscure_ccall_ret_code(void);
+#endif
+
+#if defined(alpha_HOST_ARCH)
+/* To get the definition of PAL_imb: */
+# if defined(linux_HOST_OS)
+# include <asm/pal.h>
+# else
+# include <machine/pal.h>
+# endif
+#endif
+
+#if defined(ia64_HOST_ARCH)
+#include "Storage.h"
+
+/* Layout of a function descriptor */
+typedef struct _IA64FunDesc {
+ StgWord64 ip;
+ StgWord64 gp;
+} IA64FunDesc;
+
+static void *
+stgAllocStable(size_t size_in_bytes, StgStablePtr *stable)
+{
+ StgArrWords* arr;
+ nat data_size_in_words, total_size_in_words;
+
+ /* round up to a whole number of words */
+ data_size_in_words = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
+ total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
+
+ /* allocate and fill it in */
+ arr = (StgArrWords *)allocate(total_size_in_words);
+ SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
+
+ /* obtain a stable ptr */
+ *stable = getStablePtr((StgPtr)arr);
+
+ /* and return a ptr to the goods inside the array */
+ return(&(arr->payload));
+}
+#endif
+
+#if defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
+__asm__("obscure_ccall_ret_code:\n\t"
+ "lwz 1,0(1)\n\t"
+ "lwz 0,4(1)\n\t"
+ "mtlr 0\n\t"
+ "blr");
+extern void obscure_ccall_ret_code(void);
+#endif
+
+#if defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
+#if !(defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS))
+
+/* !!! !!! WARNING: !!! !!!
+ * This structure is accessed from AdjustorAsm.s
+ * Any changes here have to be mirrored in the offsets there.
+ */
+
+typedef struct AdjustorStub {
+#if defined(powerpc_HOST_ARCH) && defined(darwin_HOST_OS)
+ unsigned lis;
+ unsigned ori;
+ unsigned lwz;
+ unsigned mtctr;
+ unsigned bctr;
+ StgFunPtr code;
+#elif defined(powerpc64_HOST_ARCH) && defined(darwin_HOST_OS)
+ /* powerpc64-darwin: just guessing that it won't use fundescs. */
+ unsigned lis;
+ unsigned ori;
+ unsigned rldimi;
+ unsigned oris;
+ unsigned ori2;
+ unsigned lwz;
+ unsigned mtctr;
+ unsigned bctr;
+ StgFunPtr code;
+#else
+ /* fundesc-based ABIs */
+#define FUNDESCS
+ StgFunPtr code;
+ struct AdjustorStub
+ *toc;
+ void *env;
+#endif
+ StgStablePtr hptr;
+ StgFunPtr wptr;
+ StgInt negative_framesize;
+ StgInt extrawords_plus_one;
+} AdjustorStub;
+
+#endif
+#endif
+
+#if defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
+
+/* !!! !!! WARNING: !!! !!!
+ * This structure is accessed from AdjustorAsm.s
+ * Any changes here have to be mirrored in the offsets there.
+ */
+
+typedef struct AdjustorStub {
+ unsigned char call[8];
+ StgStablePtr hptr;
+ StgFunPtr wptr;
+ StgInt frame_size;
+ StgInt argument_size;
+} AdjustorStub;
+#endif
+
+#if defined(darwin_HOST_OS) || defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
+static int totalArgumentSize(char *typeString)
+{
+ int sz = 0;
+ while(*typeString)
+ {
+ char t = *typeString++;
+
+ switch(t)
+ {
+ // on 32-bit platforms, Double and Int64 occupy two words.
+ case 'd':
+ case 'l':
+ if(sizeof(void*) == 4)
+ {
+ sz += 2;
+ break;
+ }
+ // everything else is one word.
+ default:
+ sz += 1;
+ }
+ }
+ return sz;
+}
+#endif
+
+void*
+createAdjustor(int cconv, StgStablePtr hptr,
+ StgFunPtr wptr,
+ char *typeString
+#if !defined(powerpc_HOST_ARCH) && !defined(powerpc64_HOST_ARCH) && !defined(x86_64_HOST_ARCH)
+ STG_UNUSED
+#endif
+ )
+{
+ void *adjustor = NULL;
+
+ switch (cconv)
+ {
+ case 0: /* _stdcall */
+#if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
+ /* Magic constant computed by inspecting the code length of
+ the following assembly language snippet
+ (offset and machine code prefixed):
+
+ <0>: 58 popl %eax # temp. remove ret addr..
+ <1>: 68 fd fc fe fa pushl 0xfafefcfd # constant is large enough to
+ # hold a StgStablePtr
+ <6>: 50 pushl %eax # put back ret. addr
+ <7>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
+ <c>: ff e0 jmp %eax # and jump to it.
+ # the callee cleans up the stack
+ */
+ adjustor = stgMallocBytesRWX(14);
+ {
+ unsigned char *const adj_code = (unsigned char *)adjustor;
+ adj_code[0x00] = (unsigned char)0x58; /* popl %eax */
+
+ adj_code[0x01] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
+ *((StgStablePtr*)(adj_code + 0x02)) = (StgStablePtr)hptr;
+
+ adj_code[0x06] = (unsigned char)0x50; /* pushl %eax */
+
+ adj_code[0x07] = (unsigned char)0xb8; /* movl $wptr, %eax */
+ *((StgFunPtr*)(adj_code + 0x08)) = (StgFunPtr)wptr;
+
+ adj_code[0x0c] = (unsigned char)0xff; /* jmp %eax */
+ adj_code[0x0d] = (unsigned char)0xe0;
+ }
+#endif
+ break;
+
+ case 1: /* _ccall */
+#if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
+ /* Magic constant computed by inspecting the code length of
+ the following assembly language snippet
+ (offset and machine code prefixed):
+
+ <00>: 68 ef be ad de pushl $0xdeadbeef # constant is large enough to
+ # hold a StgStablePtr
+ <05>: b8 fa ef ff 00 movl $0x00ffeffa, %eax # load up wptr
+ <0a>: 68 ef be ad de pushl $obscure_ccall_ret_code # push the return address
+ <0f>: ff e0 jmp *%eax # jump to wptr
+
+ The ccall'ing version is a tad different, passing in the return
+ address of the caller to the auto-generated C stub (which enters
+ via the stable pointer.) (The auto-generated C stub is in on this
+ game, don't worry :-)
+
+ See the comment next to obscure_ccall_ret_code why we need to
+ perform a tail jump instead of a call, followed by some C stack
+ fixup.
+
+ Note: The adjustor makes the assumption that any return value
+ coming back from the C stub is not stored on the stack.
+ That's (thankfully) the case here with the restricted set of
+ return types that we support.
+ */
+ adjustor = stgMallocBytesRWX(17);
+ {
+ unsigned char *const adj_code = (unsigned char *)adjustor;
+
+ adj_code[0x00] = (unsigned char)0x68; /* pushl hptr (which is a dword immediate ) */
+ *((StgStablePtr*)(adj_code+0x01)) = (StgStablePtr)hptr;
+
+ adj_code[0x05] = (unsigned char)0xb8; /* movl $wptr, %eax */
+ *((StgFunPtr*)(adj_code + 0x06)) = (StgFunPtr)wptr;
+
+ adj_code[0x0a] = (unsigned char)0x68; /* pushl obscure_ccall_ret_code */
+ *((StgFunPtr*)(adj_code + 0x0b)) =
+#if !defined(openbsd_HOST_OS)
+ (StgFunPtr)obscure_ccall_ret_code;
+#else
+ (StgFunPtr)obscure_ccall_ret_code_dyn;
+#endif
+
+ adj_code[0x0f] = (unsigned char)0xff; /* jmp *%eax */
+ adj_code[0x10] = (unsigned char)0xe0;
+ }
+#elif defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
+ {
+ /*
+ What's special about Darwin/Mac OS X on i386?
+ It wants the stack to stay 16-byte aligned.
+
+ We offload most of the work to AdjustorAsm.S.
+ */
+ AdjustorStub *adjustorStub = stgMallocBytesRWX(sizeof(AdjustorStub));
+ adjustor = adjustorStub;
+
+ extern void adjustorCode(void);
+ int sz = totalArgumentSize(typeString);
+
+ adjustorStub->call[0] = 0xe8;
+ *(long*)&adjustorStub->call[1] = ((char*)&adjustorCode) - ((char*)adjustorStub + 5);
+ adjustorStub->hptr = hptr;
+ adjustorStub->wptr = wptr;
+
+ // The adjustor puts the following things on the stack:
+ // 1.) %ebp link
+ // 2.) padding and (a copy of) the arguments
+ // 3.) a dummy argument
+ // 4.) hptr
+ // 5.) return address (for returning to the adjustor)
+ // All these have to add up to a multiple of 16.
+
+ // first, include everything in frame_size
+ adjustorStub->frame_size = sz * 4 + 16;
+ // align to 16 bytes
+ adjustorStub->frame_size = (adjustorStub->frame_size + 15) & ~15;
+ // only count 2.) and 3.) as part of frame_size
+ adjustorStub->frame_size -= 12;
+ adjustorStub->argument_size = sz;
+ }
+
+#elif defined(x86_64_HOST_ARCH)
+ /*
+ stack at call:
+ argn
+ ...
+ arg7
+ return address
+ %rdi,%rsi,%rdx,%rcx,%r8,%r9 = arg0..arg6
+
+ if there are <6 integer args, then we can just push the
+ StablePtr into %edi and shuffle the other args up.
+
+ If there are >=6 integer args, then we have to flush one arg
+ to the stack, and arrange to adjust the stack ptr on return.
+ The stack will be rearranged to this:
+
+ argn
+ ...
+ arg7
+ return address *** <-- dummy arg in stub fn.
+ arg6
+ obscure_ccall_ret_code
+
+ This unfortunately means that the type of the stub function
+ must have a dummy argument for the original return address
+ pointer inserted just after the 6th integer argument.
+
+ Code for the simple case:
+
+ 0: 4d 89 c1 mov %r8,%r9
+ 3: 49 89 c8 mov %rcx,%r8
+ 6: 48 89 d1 mov %rdx,%rcx
+ 9: 48 89 f2 mov %rsi,%rdx
+ c: 48 89 fe mov %rdi,%rsi
+ f: 48 8b 3d 0a 00 00 00 mov 10(%rip),%rdi
+ 16: ff 25 0c 00 00 00 jmpq *12(%rip)
+ ...
+ 20: .quad 0 # aligned on 8-byte boundary
+ 28: .quad 0 # aligned on 8-byte boundary
+
+
+ And the version for >=6 integer arguments:
+
+ 0: 41 51 push %r9
+ 2: ff 35 20 00 00 00 pushq 32(%rip) # 28 <ccall_adjustor+0x28>
+ 8: 4d 89 c1 mov %r8,%r9
+ b: 49 89 c8 mov %rcx,%r8
+ e: 48 89 d1 mov %rdx,%rcx
+ 11: 48 89 f2 mov %rsi,%rdx
+ 14: 48 89 fe mov %rdi,%rsi
+ 17: 48 8b 3d 12 00 00 00 mov 18(%rip),%rdi # 30 <ccall_adjustor+0x30>
+ 1e: ff 25 14 00 00 00 jmpq *20(%rip) # 38 <ccall_adjustor+0x38>
+ ...
+ 28: .quad 0 # aligned on 8-byte boundary
+ 30: .quad 0 # aligned on 8-byte boundary
+ 38: .quad 0 # aligned on 8-byte boundary
+ */
+
+    /* we assume the small code model (gcc -mcmodel=small) where
+     * all symbols are <2^32, so wptr should fit into 32 bits.
+ */
+ ASSERT(((long)wptr >> 32) == 0);
+
+ {
+ int i = 0;
+ char *c;
+
+ // determine whether we have 6 or more integer arguments,
+ // and therefore need to flush one to the stack.
+ for (c = typeString; *c != '\0'; c++) {
+ if (*c == 'i' || *c == 'l') i++;
+ if (i == 6) break;
+ }
+
+ if (i < 6) {
+ adjustor = stgMallocBytesRWX(0x30);
+
+ *(StgInt32 *)adjustor = 0x49c1894d;
+ *(StgInt32 *)(adjustor+0x4) = 0x8948c889;
+ *(StgInt32 *)(adjustor+0x8) = 0xf28948d1;
+ *(StgInt32 *)(adjustor+0xc) = 0x48fe8948;
+ *(StgInt32 *)(adjustor+0x10) = 0x000a3d8b;
+ *(StgInt32 *)(adjustor+0x14) = 0x25ff0000;
+ *(StgInt32 *)(adjustor+0x18) = 0x0000000c;
+ *(StgInt64 *)(adjustor+0x20) = (StgInt64)hptr;
+ *(StgInt64 *)(adjustor+0x28) = (StgInt64)wptr;
+ }
+ else
+ {
+ adjustor = stgMallocBytesRWX(0x40);
+
+ *(StgInt32 *)adjustor = 0x35ff5141;
+ *(StgInt32 *)(adjustor+0x4) = 0x00000020;
+ *(StgInt32 *)(adjustor+0x8) = 0x49c1894d;
+ *(StgInt32 *)(adjustor+0xc) = 0x8948c889;
+ *(StgInt32 *)(adjustor+0x10) = 0xf28948d1;
+ *(StgInt32 *)(adjustor+0x14) = 0x48fe8948;
+ *(StgInt32 *)(adjustor+0x18) = 0x00123d8b;
+ *(StgInt32 *)(adjustor+0x1c) = 0x25ff0000;
+ *(StgInt32 *)(adjustor+0x20) = 0x00000014;
+
+ *(StgInt64 *)(adjustor+0x28) = (StgInt64)obscure_ccall_ret_code;
+ *(StgInt64 *)(adjustor+0x30) = (StgInt64)hptr;
+ *(StgInt64 *)(adjustor+0x38) = (StgInt64)wptr;
+ }
+ }
+#elif defined(sparc_HOST_ARCH)
+ /* Magic constant computed by inspecting the code length of the following
+ assembly language snippet (offset and machine code prefixed):
+
+ <00>: 9C23A008 sub %sp, 8, %sp ! make room for %o4/%o5 in caller's frame
+ <04>: DA23A060 st %o5, [%sp + 96] ! shift registers by 2 positions
+ <08>: D823A05C st %o4, [%sp + 92]
+ <0C>: 9A10000B mov %o3, %o5
+ <10>: 9810000A mov %o2, %o4
+ <14>: 96100009 mov %o1, %o3
+ <18>: 94100008 mov %o0, %o2
+ <1C>: 13000000 sethi %hi(wptr), %o1 ! load up wptr (1 of 2)
+ <20>: 11000000 sethi %hi(hptr), %o0 ! load up hptr (1 of 2)
+ <24>: 81C26000 jmp %o1 + %lo(wptr) ! jump to wptr (load 2 of 2)
+ <28>: 90122000 or %o0, %lo(hptr), %o0 ! load up hptr (2 of 2, delay slot)
+ <2C> 00000000 ! place for getting hptr back easily
+
+ ccall'ing on SPARC is easy, because we are quite lucky to push a
+ multiple of 8 bytes (1 word hptr + 1 word dummy arg) in front of the
+ existing arguments (note that %sp must stay double-word aligned at
+ all times, see ABI spec at http://www.sparc.org/standards/psABI3rd.pdf).
+ To do this, we extend the *caller's* stack frame by 2 words and shift
+ the output registers used for argument passing (%o0 - %o5, we are a *leaf*
+ procedure because of the tail-jump) by 2 positions. This makes room in
+     %o0 and %o1 for the additional arguments, namely hptr and a dummy (used
+ for destination addr of jump on SPARC, return address on x86, ...). This
+ shouldn't cause any problems for a C-like caller: alloca is implemented
+ similarly, and local variables should be accessed via %fp, not %sp. In a
+ nutshell: This should work! (Famous last words! :-)
+ */
+ adjustor = stgMallocBytesRWX(4*(11+1));
+ {
+ unsigned long *const adj_code = (unsigned long *)adjustor;
+
+ adj_code[ 0] = 0x9C23A008UL; /* sub %sp, 8, %sp */
+ adj_code[ 1] = 0xDA23A060UL; /* st %o5, [%sp + 96] */
+ adj_code[ 2] = 0xD823A05CUL; /* st %o4, [%sp + 92] */
+ adj_code[ 3] = 0x9A10000BUL; /* mov %o3, %o5 */
+ adj_code[ 4] = 0x9810000AUL; /* mov %o2, %o4 */
+ adj_code[ 5] = 0x96100009UL; /* mov %o1, %o3 */
+ adj_code[ 6] = 0x94100008UL; /* mov %o0, %o2 */
+ adj_code[ 7] = 0x13000000UL; /* sethi %hi(wptr), %o1 */
+ adj_code[ 7] |= ((unsigned long)wptr) >> 10;
+ adj_code[ 8] = 0x11000000UL; /* sethi %hi(hptr), %o0 */
+ adj_code[ 8] |= ((unsigned long)hptr) >> 10;
+ adj_code[ 9] = 0x81C26000UL; /* jmp %o1 + %lo(wptr) */
+ adj_code[ 9] |= ((unsigned long)wptr) & 0x000003FFUL;
+ adj_code[10] = 0x90122000UL; /* or %o0, %lo(hptr), %o0 */
+ adj_code[10] |= ((unsigned long)hptr) & 0x000003FFUL;
+
+ adj_code[11] = (unsigned long)hptr;
+
+ /* flush cache */
+ asm("flush %0" : : "r" (adj_code ));
+ asm("flush %0" : : "r" (adj_code + 2));
+ asm("flush %0" : : "r" (adj_code + 4));
+ asm("flush %0" : : "r" (adj_code + 6));
+ asm("flush %0" : : "r" (adj_code + 10));
+
+    /* max. 5 instructions latency, and we need >= 1 for returning */
+ asm("nop");
+ asm("nop");
+ asm("nop");
+ asm("nop");
+ }
+#elif defined(alpha_HOST_ARCH)
+ /* Magic constant computed by inspecting the code length of
+ the following assembly language snippet
+ (offset and machine code prefixed; note that the machine code
+ shown is longwords stored in little-endian order):
+
+ <00>: 46520414 mov a2, a4
+ <04>: 46100412 mov a0, a2
+ <08>: a61b0020 ldq a0, 0x20(pv) # load up hptr
+ <0c>: 46730415 mov a3, a5
+ <10>: a77b0028 ldq pv, 0x28(pv) # load up wptr
+ <14>: 46310413 mov a1, a3
+ <18>: 6bfb---- jmp (pv), <hint> # jump to wptr (with hint)
+ <1c>: 00000000 # padding for alignment
+ <20>: [8 bytes for hptr quadword]
+ <28>: [8 bytes for wptr quadword]
+
+ The "computed" jump at <08> above is really a jump to a fixed
+ location. Accordingly, we place an always-correct hint in the
+ jump instruction, namely the address offset from <0c> to wptr,
+ divided by 4, taking the lowest 14 bits.
+
+ We only support passing 4 or fewer argument words, for the same
+ reason described under sparc_HOST_ARCH above by JRS, 21 Aug 01.
+ On the Alpha the first 6 integer arguments are in a0 through a5,
+ and the rest on the stack. Hence we want to shuffle the original
+ caller's arguments by two.
+
+ On the Alpha the calling convention is so complex and dependent
+ on the callee's signature -- for example, the stack pointer has
+ to be a multiple of 16 -- that it seems impossible to me [ccshan]
+ to handle the general case correctly without changing how the
+ adjustor is called from C. For now, our solution of shuffling
+ registers only and ignoring the stack only works if the original
+ caller passed 4 or fewer argument words.
+
+TODO: Depending on how much allocation overhead stgMallocBytes uses for
+ header information (more precisely, if the overhead is no more than
+ 4 bytes), we should move the first three instructions above down by
+ 4 bytes (getting rid of the nop), hence saving memory. [ccshan]
+ */
+ ASSERT(((StgWord64)wptr & 3) == 0);
+ adjustor = stgMallocBytesRWX(48);
+ {
+ StgWord64 *const code = (StgWord64 *)adjustor;
+
+ code[0] = 0x4610041246520414L;
+ code[1] = 0x46730415a61b0020L;
+ code[2] = 0x46310413a77b0028L;
+ code[3] = 0x000000006bfb0000L
+ | (((StgWord32*)(wptr) - (StgWord32*)(code) - 3) & 0x3fff);
+
+ code[4] = (StgWord64)hptr;
+ code[5] = (StgWord64)wptr;
+
+ /* Ensure that instruction cache is consistent with our new code */
+ __asm__ volatile("call_pal %0" : : "i" (PAL_imb));
+ }
+#elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
+
+#define OP_LO(op,lo) ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
+#define OP_HI(op,hi) ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
+ {
+ /* The PowerPC Linux (32-bit) calling convention is annoyingly complex.
+ We need to calculate all the details of the stack frame layout,
+ taking into account the types of all the arguments, and then
+ generate code on the fly. */
+
+ int src_gpr = 3, dst_gpr = 5;
+ int fpr = 3;
+ int src_offset = 0, dst_offset = 0;
+ int n = strlen(typeString),i;
+ int src_locs[n], dst_locs[n];
+ int frameSize;
+ unsigned *code;
+
+ /* Step 1:
+ Calculate where the arguments should go.
+ src_locs[] will contain the locations of the arguments in the
+ original stack frame passed to the adjustor.
+ dst_locs[] will contain the locations of the arguments after the
+ adjustor runs, on entry to the wrapper proc pointed to by wptr.
+
+ This algorithm is based on the one described on page 3-19 of the
+ System V ABI PowerPC Processor Supplement.
+ */
+ for(i=0;typeString[i];i++)
+ {
+ char t = typeString[i];
+ if((t == 'f' || t == 'd') && fpr <= 8)
+ src_locs[i] = dst_locs[i] = -32-(fpr++);
+ else
+ {
+ if(t == 'l' && src_gpr <= 9)
+ {
+ if((src_gpr & 1) == 0)
+ src_gpr++;
+ src_locs[i] = -src_gpr;
+ src_gpr += 2;
+ }
+ else if(t == 'i' && src_gpr <= 10)
+ {
+ src_locs[i] = -(src_gpr++);
+ }
+ else
+ {
+ if(t == 'l' || t == 'd')
+ {
+ if(src_offset % 8)
+ src_offset += 4;
+ }
+ src_locs[i] = src_offset;
+ src_offset += (t == 'l' || t == 'd') ? 8 : 4;
+ }
+
+ if(t == 'l' && dst_gpr <= 9)
+ {
+ if((dst_gpr & 1) == 0)
+ dst_gpr++;
+ dst_locs[i] = -dst_gpr;
+ dst_gpr += 2;
+ }
+ else if(t == 'i' && dst_gpr <= 10)
+ {
+ dst_locs[i] = -(dst_gpr++);
+ }
+ else
+ {
+ if(t == 'l' || t == 'd')
+ {
+ if(dst_offset % 8)
+ dst_offset += 4;
+ }
+ dst_locs[i] = dst_offset;
+ dst_offset += (t == 'l' || t == 'd') ? 8 : 4;
+ }
+ }
+ }
+
+ frameSize = dst_offset + 8;
+ frameSize = (frameSize+15) & ~0xF;
+
+ /* Step 2:
+ Build the adjustor.
+ */
+ // allocate space for at most 4 insns per parameter
+ // plus 14 more instructions.
+ adjustor = stgMallocBytesRWX(4 * (4*n + 14));
+ code = (unsigned*)adjustor;
+
+ *code++ = 0x48000008; // b *+8
+ // * Put the hptr in a place where freeHaskellFunctionPtr
+ // can get at it.
+ *code++ = (unsigned) hptr;
+
+ // * save the link register
+ *code++ = 0x7c0802a6; // mflr r0;
+ *code++ = 0x90010004; // stw r0, 4(r1);
+ // * and build a new stack frame
+ *code++ = OP_LO(0x9421, -frameSize); // stwu r1, -frameSize(r1)
+
+ // * now generate instructions to copy arguments
+ // from the old stack frame into the new stack frame.
+ for(i=n-1;i>=0;i--)
+ {
+ if(src_locs[i] < -32)
+ ASSERT(dst_locs[i] == src_locs[i]);
+ else if(src_locs[i] < 0)
+ {
+ // source in GPR.
+ ASSERT(typeString[i] != 'f' && typeString[i] != 'd');
+ if(dst_locs[i] < 0)
+ {
+ ASSERT(dst_locs[i] > -32);
+ // dst is in GPR, too.
+
+ if(typeString[i] == 'l')
+ {
+ // mr dst+1, src+1
+ *code++ = 0x7c000378
+ | ((-dst_locs[i]+1) << 16)
+ | ((-src_locs[i]+1) << 11)
+ | ((-src_locs[i]+1) << 21);
+ }
+ // mr dst, src
+ *code++ = 0x7c000378
+ | ((-dst_locs[i]) << 16)
+ | ((-src_locs[i]) << 11)
+ | ((-src_locs[i]) << 21);
+ }
+ else
+ {
+ if(typeString[i] == 'l')
+ {
+ // stw src+1, dst_offset+4(r1)
+ *code++ = 0x90010000
+ | ((-src_locs[i]+1) << 21)
+ | (dst_locs[i] + 4);
+ }
+
+ // stw src, dst_offset(r1)
+ *code++ = 0x90010000
+ | ((-src_locs[i]) << 21)
+ | (dst_locs[i] + 8);
+ }
+ }
+ else
+ {
+ ASSERT(dst_locs[i] >= 0);
+ ASSERT(typeString[i] != 'f' && typeString[i] != 'd');
+
+ if(typeString[i] == 'l')
+ {
+ // lwz r0, src_offset(r1)
+ *code++ = 0x80010000
+ | (src_locs[i] + frameSize + 8 + 4);
+ // stw r0, dst_offset(r1)
+ *code++ = 0x90010000
+ | (dst_locs[i] + 8 + 4);
+ }
+ // lwz r0, src_offset(r1)
+ *code++ = 0x80010000
+ | (src_locs[i] + frameSize + 8);
+ // stw r0, dst_offset(r1)
+ *code++ = 0x90010000
+ | (dst_locs[i] + 8);
+ }
+ }
+
+ // * hptr will be the new first argument.
+ // lis r3, hi(hptr)
+ *code++ = OP_HI(0x3c60, hptr);
+ // ori r3,r3,lo(hptr)
+ *code++ = OP_LO(0x6063, hptr);
+
+ // * we need to return to a piece of code
+ // which will tear down the stack frame.
+ // lis r11,hi(obscure_ccall_ret_code)
+ *code++ = OP_HI(0x3d60, obscure_ccall_ret_code);
+ // ori r11,r11,lo(obscure_ccall_ret_code)
+ *code++ = OP_LO(0x616b, obscure_ccall_ret_code);
+ // mtlr r11
+ *code++ = 0x7d6803a6;
+
+ // * jump to wptr
+ // lis r11,hi(wptr)
+ *code++ = OP_HI(0x3d60, wptr);
+ // ori r11,r11,lo(wptr)
+ *code++ = OP_LO(0x616b, wptr);
+ // mtctr r11
+ *code++ = 0x7d6903a6;
+ // bctr
+ *code++ = 0x4e800420;
+
+ // Flush the Instruction cache:
+ {
+ unsigned *p = adjustor;
+ while(p < code)
+ {
+ __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
+ : : "r" (p));
+ p++;
+ }
+ __asm__ volatile ("sync\n\tisync");
+ }
+ }
+
+#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
+
+#define OP_LO(op,lo) ((((unsigned)(op)) << 16) | (((unsigned)(lo)) & 0xFFFF))
+#define OP_HI(op,hi) ((((unsigned)(op)) << 16) | (((unsigned)(hi)) >> 16))
+ {
+ /* The following code applies to all PowerPC and PowerPC64 platforms
+ whose stack layout is based on the AIX ABI.
+
+ Besides (obviously) AIX, this includes
+ Mac OS 9 and BeOS/PPC (may they rest in peace),
+ which use the 32-bit AIX ABI
+ powerpc64-linux,
+ which uses the 64-bit AIX ABI
+ and Darwin (Mac OS X),
+ which uses the same stack layout as AIX,
+ but no function descriptors.
+
+ The actual stack-frame shuffling is implemented out-of-line
+ in the function adjustorCode, in AdjustorAsm.S.
+ Here, we set up an AdjustorStub structure, which
+ is a function descriptor (on platforms that have function
+ descriptors) or a short piece of stub code (on Darwin) to call
+ adjustorCode with a pointer to the AdjustorStub struct loaded
+ into register r2.
+
+ One nice thing about this is that there is _no_ code generated at
+ runtime on the platforms that have function descriptors.
+ */
+ AdjustorStub *adjustorStub;
+ int sz = 0, extra_sz, total_sz;
+
+ // from AdjustorAsm.s
+ // not declared as a function so that AIX-style
+ // fundescs can never get in the way.
+ extern void *adjustorCode;
+
+#ifdef FUNDESCS
+ adjustorStub = stgMallocBytes(sizeof(AdjustorStub), "createAdjustor");
+#else
+ adjustorStub = stgMallocBytesRWX(sizeof(AdjustorStub));
+#endif
+ adjustor = adjustorStub;
+
+ adjustorStub->code = (void*) &adjustorCode;
+
+#ifdef FUNDESCS
+ // function descriptors are a cool idea.
+ // We don't need to generate any code at runtime.
+ adjustorStub->toc = adjustorStub;
+#else
+
+ // no function descriptors :-(
+ // We need to do things "by hand".
+#if defined(powerpc_HOST_ARCH)
+ // lis r2, hi(adjustorStub)
+ adjustorStub->lis = OP_HI(0x3c40, adjustorStub);
+ // ori r2, r2, lo(adjustorStub)
+ adjustorStub->ori = OP_LO(0x6042, adjustorStub);
+ // lwz r0, code(r2)
+ adjustorStub->lwz = OP_LO(0x8002, (char*)(&adjustorStub->code)
+ - (char*)adjustorStub);
+ // mtctr r0
+ adjustorStub->mtctr = 0x7c0903a6;
+ // bctr
+ adjustorStub->bctr = 0x4e800420;
+#else
+ barf("adjustor creation not supported on this platform");
+#endif
+
+ // Flush the Instruction cache:
+ {
+ int n = sizeof(AdjustorStub)/sizeof(unsigned);
+ unsigned *p = (unsigned*)adjustor;
+ while(n--)
+ {
+ __asm__ volatile ("dcbf 0,%0\n\tsync\n\ticbi 0,%0"
+ : : "r" (p));
+ p++;
+ }
+ __asm__ volatile ("sync\n\tisync");
+ }
+#endif
+
+ // Calculate the size of the stack frame, in words.
+ sz = totalArgumentSize(typeString);
+
+ // The first eight words of the parameter area
+ // are just "backing store" for the parameters passed in
+ // the GPRs. extra_sz is the number of words beyond those first
+ // 8 words.
+ extra_sz = sz - 8;
+ if(extra_sz < 0)
+ extra_sz = 0;
+
+ // Calculate the total size of the stack frame.
+ total_sz = (6 /* linkage area */
+ + 8 /* minimum parameter area */
+ + 2 /* two extra arguments */
+ + extra_sz)*sizeof(StgWord);
+
+ // align to 16 bytes.
+ // AIX only requires 8 bytes, but who cares?
+ total_sz = (total_sz+15) & ~0xF;
+
+ // Fill in the information that adjustorCode in AdjustorAsm.S
+ // will use to create a new stack frame with the additional args.
+ adjustorStub->hptr = hptr;
+ adjustorStub->wptr = wptr;
+ adjustorStub->negative_framesize = -total_sz;
+ adjustorStub->extrawords_plus_one = extra_sz + 1;
+ }
+
+#elif defined(ia64_HOST_ARCH)
+/*
+ Up to 8 inputs are passed in registers. We flush the last two inputs to
+ the stack, initially into the 16-byte scratch region left by the caller.
+ We then shuffle the others along by 4 (taking 2 registers for ourselves
+ to save return address and previous function state - we need to come back
+ here on the way out to restore the stack, so this is a real function
+ rather than just a trampoline).
+
+ The function descriptor we create contains the gp of the target function
+ so gp is already loaded correctly.
+
+ [MLX] alloc r16=ar.pfs,10,2,0
+ movl r17=wptr
+ [MII] st8.spill [r12]=r38,8 // spill in6 (out4)
+ mov r41=r37 // out7 = in5 (out3)
+ mov r40=r36;; // out6 = in4 (out2)
+ [MII] st8.spill [r12]=r39 // spill in7 (out5)
+ mov.sptk b6=r17,50
+ mov r38=r34;; // out4 = in2 (out0)
+ [MII] mov r39=r35 // out5 = in3 (out1)
+ mov r37=r33 // out3 = in1 (loc1)
+ mov r36=r32 // out2 = in0 (loc0)
+ [MLX] adds r12=-24,r12 // update sp
+ movl r34=hptr;; // out0 = hptr
+ [MIB] mov r33=r16 // loc1 = ar.pfs
+ mov r32=b0 // loc0 = retaddr
+ br.call.sptk.many b0=b6;;
+
+ [MII] adds r12=-16,r12
+ mov b0=r32
+ mov.i ar.pfs=r33
+ [MFB] nop.m 0x0
+ nop.f 0x0
+ br.ret.sptk.many b0;;
+*/
+
+/* These macros distribute a long constant into the two words of an MLX bundle */
+#define BITS(val,start,count) (((val) >> (start)) & ((1 << (count))-1))
+#define MOVL_LOWORD(val) (BITS(val,22,18) << 46)
+#define MOVL_HIWORD(val) (BITS(val,40,23) | (BITS(val,0,7) << 36) | (BITS(val,7,9) << 50) \
+ | (BITS(val,16,5) << 55) | (BITS(val,21,1) << 44) | BITS(val,63,1) << 59)
+
+ {
+ StgStablePtr stable;
+ IA64FunDesc *wdesc = (IA64FunDesc *)wptr;
+ StgWord64 wcode = wdesc->ip;
+ IA64FunDesc *fdesc;
+ StgWord64 *code;
+
+ /* we allocate on the Haskell heap since malloc'd memory isn't executable - argh */
+ adjustor = stgAllocStable(sizeof(IA64FunDesc)+18*8, &stable);
+
+ fdesc = (IA64FunDesc *)adjustor;
+ code = (StgWord64 *)(fdesc + 1);
+ fdesc->ip = (StgWord64)code;
+ fdesc->gp = wdesc->gp;
+
+ code[0] = 0x0000058004288004 | MOVL_LOWORD(wcode);
+ code[1] = 0x6000000220000000 | MOVL_HIWORD(wcode);
+ code[2] = 0x029015d818984001;
+ code[3] = 0x8401200500420094;
+ code[4] = 0x886011d8189c0001;
+ code[5] = 0x84011004c00380c0;
+ code[6] = 0x0250210046013800;
+ code[7] = 0x8401000480420084;
+ code[8] = 0x0000233f19a06005 | MOVL_LOWORD((StgWord64)hptr);
+ code[9] = 0x6000000440000000 | MOVL_HIWORD((StgWord64)hptr);
+ code[10] = 0x0200210020010811;
+ code[11] = 0x1080006800006200;
+ code[12] = 0x0000210018406000;
+ code[13] = 0x00aa021000038005;
+ code[14] = 0x000000010000001d;
+ code[15] = 0x0084000880000200;
+
+ /* save stable pointers in convenient form */
+ code[16] = (StgWord64)hptr;
+ code[17] = (StgWord64)stable;
+ }
+#else
+ barf("adjustor creation not supported on this platform");
+#endif
+ break;
+
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ /* Have fun! */
+ return adjustor;
+}
+
+
+void
+freeHaskellFunctionPtr(void* ptr)
+{
+#if defined(i386_HOST_ARCH) && !defined(darwin_HOST_OS)
+ if ( *(unsigned char*)ptr != 0x68 &&
+ *(unsigned char*)ptr != 0x58 ) {
+ errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
+ return;
+ }
+
+ /* Free the stable pointer first..*/
+ if (*(unsigned char*)ptr == 0x68) { /* Aha, a ccall adjustor! */
+ freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x01)));
+ } else {
+ freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x02)));
+ }
+#elif defined(x86_TARGET_ARCH) && defined(darwin_HOST_OS)
+if ( *(unsigned char*)ptr != 0xe8 ) {
+ errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
+ return;
+ }
+ freeStablePtr(((AdjustorStub*)ptr)->hptr);
+#elif defined(x86_64_HOST_ARCH)
+ if ( *(StgWord16 *)ptr == 0x894d ) {
+ freeStablePtr(*(StgStablePtr*)(ptr+0x20));
+ } else if ( *(StgWord16 *)ptr == 0x5141 ) {
+ freeStablePtr(*(StgStablePtr*)(ptr+0x30));
+ } else {
+ errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
+ return;
+ }
+#elif defined(sparc_HOST_ARCH)
+ if ( *(unsigned long*)ptr != 0x9C23A008UL ) {
+ errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
+ return;
+ }
+
+ /* Free the stable pointer first..*/
+ freeStablePtr(*((StgStablePtr*)((unsigned long*)ptr + 11)));
+#elif defined(alpha_HOST_ARCH)
+ if ( *(StgWord64*)ptr != 0xa77b0018a61b0010L ) {
+ errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
+ return;
+ }
+
+ /* Free the stable pointer first..*/
+ freeStablePtr(*((StgStablePtr*)((unsigned char*)ptr + 0x10)));
+#elif defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS)
+ if ( *(StgWord*)ptr != 0x48000008 ) {
+ errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
+ return;
+ }
+ freeStablePtr(((StgStablePtr*)ptr)[1]);
+#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
+ extern void* adjustorCode;
+ if ( ((AdjustorStub*)ptr)->code != (StgFunPtr) &adjustorCode ) {
+ errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
+ return;
+ }
+ freeStablePtr(((AdjustorStub*)ptr)->hptr);
+#elif defined(ia64_HOST_ARCH)
+ IA64FunDesc *fdesc = (IA64FunDesc *)ptr;
+ StgWord64 *code = (StgWord64 *)(fdesc+1);
+
+ if (fdesc->ip != (StgWord64)code) {
+ errorBelch("freeHaskellFunctionPtr: not for me, guv! %p\n", ptr);
+ return;
+ }
+ freeStablePtr((StgStablePtr)code[16]);
+ freeStablePtr((StgStablePtr)code[17]);
+ return;
+#else
+ ASSERT(0);
+#endif
+ *((unsigned char*)ptr) = '\0';
+
+ stgFree(ptr);
+}
+
+
+/*
+ * Function: initAdjustor()
+ *
+ * Perform initialisation of adjustor thunk layer (if needed.)
+ */
+void
+initAdjustor(void)
+{
+#if defined(i386_HOST_ARCH) && defined(openbsd_HOST_OS)
+ obscure_ccall_ret_code_dyn = stgMallocBytesRWX(4);
+ obscure_ccall_ret_code_dyn[0] = ((unsigned char *)obscure_ccall_ret_code)[0];
+ obscure_ccall_ret_code_dyn[1] = ((unsigned char *)obscure_ccall_ret_code)[1];
+ obscure_ccall_ret_code_dyn[2] = ((unsigned char *)obscure_ccall_ret_code)[2];
+ obscure_ccall_ret_code_dyn[3] = ((unsigned char *)obscure_ccall_ret_code)[3];
+#endif
+}
diff --git a/rts/AdjustorAsm.S b/rts/AdjustorAsm.S
new file mode 100644
index 0000000000..cfdef68349
--- /dev/null
+++ b/rts/AdjustorAsm.S
@@ -0,0 +1,189 @@
+#include "../includes/ghcconfig.h"
+
+/* ******************************** PowerPC ******************************** */
+
+#if defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH)
+#if !(defined(powerpc_HOST_ARCH) && defined(linux_HOST_OS))
+ /* The following code applies, with some differences,
+ to all powerpc platforms except for powerpc32-linux,
+ whose calling convention is annoyingly complex.
+ */
+
+
+ /* The code is "almost" the same for
+ 32-bit and for 64-bit
+ */
+#if defined(powerpc64_HOST_ARCH)
+#define WS 8
+#define LOAD ld
+#define STORE std
+#else
+#define WS 4
+#define LOAD lwz
+#define STORE stw
+#endif
+
+ /* Some info about stack frame layout */
+#define LINK_SLOT (2*WS)
+#define LINKAGE_AREA_SIZE (6*WS)
+
+ /* The following defines mirror struct AdjustorStub
+ from Adjustor.c. Make sure to keep these in sync.
+ */
+#if defined(powerpc_HOST_ARCH) && defined(darwin_HOST_OS)
+#define HEADER_WORDS 6
+#elif defined(powerpc64_HOST_ARCH) && defined(darwin_HOST_OS)
+#else
+#define HEADER_WORDS 3
+#endif
+
+#define HPTR_OFF ((HEADER_WORDS )*WS)
+#define WPTR_OFF ((HEADER_WORDS + 1)*WS)
+#define FRAMESIZE_OFF ((HEADER_WORDS + 2)*WS)
+#define EXTRA_WORDS_OFF ((HEADER_WORDS + 3)*WS)
+
+ /* Darwin insists on register names, everyone else prefers
+ to use numbers. */
+#if !defined(darwin_HOST_OS)
+#define r0 0
+#define r1 1
+#define r2 2
+#define r3 3
+#define r4 4
+#define r5 5
+#define r6 6
+#define r7 7
+#define r8 8
+#define r9 9
+#define r10 10
+#define r11 11
+#define r12 12
+
+#define r30 30
+#define r31 31
+#endif
+
+
+.text
+#if LEADING_UNDERSCORE
+ .globl _adjustorCode
+_adjustorCode:
+#else
+ .globl adjustorCode
+ /* Note that we don't build a function descriptor
+ for AIX-derived ABIs here. This will happen at runtime
+ in createAdjustor().
+ */
+adjustorCode:
+#endif
+ /* On entry, r2 will point to the AdjustorStub data structure. */
+
+ /* save the link */
+ mflr r0
+ STORE r0, LINK_SLOT(r1)
+
+ /* set up stack frame */
+ LOAD r12, FRAMESIZE_OFF(r2)
+#ifdef powerpc64_HOST_ARCH
+ stdux r1, r1, r12
+#else
+ stwux r1, r1, r12
+#endif
+
+ /* Save some regs so that we can use them.
+ Note that we use the "Red Zone" below the stack pointer.
+ */
+ STORE r31, -WS(r1)
+ STORE r30, -2*WS(r1)
+
+ mr r31, r1
+ subf r30, r12, r31
+
+ LOAD r12, EXTRA_WORDS_OFF(r2)
+ mtctr r12
+ b 2f
+1:
+ LOAD r0, LINKAGE_AREA_SIZE + 8*WS(r30)
+ STORE r0, LINKAGE_AREA_SIZE + 10*WS(r31)
+ addi r30, r30, WS
+ addi r31, r31, WS
+2:
+ bdnz 1b
+
+ /* Restore r30 and r31 now.
+ */
+ LOAD r31, -WS(r1)
+ LOAD r30, -2*WS(r1)
+
+ STORE r10, LINKAGE_AREA_SIZE + 9*WS(r1)
+ STORE r9, LINKAGE_AREA_SIZE + 8*WS(r1)
+ mr r10, r8
+ mr r9, r7
+ mr r8, r6
+ mr r7, r5
+ mr r6, r4
+ mr r5, r3
+
+ LOAD r3, HPTR_OFF(r2)
+
+ LOAD r12, WPTR_OFF(r2)
+#if defined(darwin_HOST_OS)
+ mtctr r12
+#else
+ LOAD r0, 0(r12)
+ /* The function we're calling will never be a nested function,
+ so we don't load r11.
+ */
+ mtctr r0
+ LOAD r2, WS(r12)
+#endif
+ bctrl
+
+ LOAD r1, 0(r1)
+ LOAD r0, LINK_SLOT(r1)
+ mtlr r0
+ blr
+#endif
+
+/* ********************************* i386 ********************************** */
+
+#elif defined(i386_HOST_ARCH) && defined(darwin_HOST_OS)
+
+#define WS 4
+#define RETVAL_OFF 5
+#define HEADER_BYTES 8
+
+#define HPTR_OFF HEADER_BYTES
+#define WPTR_OFF (HEADER_BYTES + 1*WS)
+#define FRAMESIZE_OFF (HEADER_BYTES + 2*WS)
+#define ARGWORDS_OFF (HEADER_BYTES + 3*WS)
+
+ .globl _adjustorCode
+_adjustorCode:
+ popl %eax
+ subl $RETVAL_OFF, %eax
+
+ pushl %ebp
+ movl %esp, %ebp
+
+ subl FRAMESIZE_OFF(%eax), %esp
+
+ pushl %esi
+ pushl %edi
+
+ leal 8(%ebp), %esi
+ leal 12(%esp), %edi
+ movl ARGWORDS_OFF(%eax), %ecx
+ rep
+ movsl
+
+ popl %edi
+ popl %esi
+
+ pushl HPTR_OFF(%eax)
+ call *WPTR_OFF(%eax)
+
+ leave
+ ret
+#endif
+
diff --git a/rts/Apply.cmm b/rts/Apply.cmm
new file mode 100644
index 0000000000..e0ca03944c
--- /dev/null
+++ b/rts/Apply.cmm
@@ -0,0 +1,268 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The University of Glasgow 2004
+ *
+ * Application-related bits.
+ *
+ * This file is written in a subset of C--, extended with various
+ * features specific to GHC. It is compiled by GHC directly. For the
+ * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
+ *
+ * -------------------------------------------------------------------------- */
+
+#include "Cmm.h"
+
+/* ----------------------------------------------------------------------------
+ * Evaluate a closure and return it.
+ *
+ * There isn't an info table / return address version of stg_ap_0, because
+ * everything being returned is guaranteed evaluated, so it would be a no-op.
+ */
+
+STRING(stg_ap_0_ret_str,"stg_ap_0_ret... ")
+
+stg_ap_0_fast
+{
+ // fn is in R1, no args on the stack
+
+ IF_DEBUG(apply,
+ foreign "C" debugBelch(stg_ap_0_ret_str) [R1];
+ foreign "C" printClosure(R1 "ptr") [R1]);
+
+ IF_DEBUG(sanity,
+ foreign "C" checkStackChunk(Sp "ptr",
+ CurrentTSO + TSO_OFFSET_StgTSO_stack +
+ WDS(TO_W_(StgTSO_stack_size(CurrentTSO))) "ptr") [R1]);
+
+ ENTER();
+}
+
+/* -----------------------------------------------------------------------------
+ Entry Code for a PAP.
+
+ This entry code is *only* called by one of the stg_ap functions.
+ On entry: Sp points to the remaining arguments on the stack. If
+ the stack check fails, we can just push the PAP on the stack and
+ return to the scheduler.
+
+ On entry: R1 points to the PAP. The rest of the function's
+ arguments (apart from those that are already in the PAP) are on the
+ stack, starting at Sp(0). R2 contains an info table which
+ describes these arguments, which is used in the event that the
+ stack check in the entry code below fails. The info table is
+ currently one of the stg_ap_*_ret family, as this code is always
+ entered from those functions.
+
+ The idea is to copy the chunk of stack from the PAP object onto the
+ stack / into registers, and enter the function.
+ -------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_PAP,/*special layout*/0,0,PAP,"PAP","PAP")
+{ foreign "C" barf("PAP object entered!"); }
+
+stg_PAP_apply
+{
+ W_ Words;
+ W_ pap;
+
+ pap = R1;
+
+ Words = TO_W_(StgPAP_n_args(pap));
+
+ //
+ // Check for stack overflow and bump the stack pointer.
+ // We have a hand-rolled stack check fragment here, because none of
+ // the canned ones suit this situation.
+ //
+ if ((Sp - WDS(Words)) < SpLim) {
+ // there is a return address in R2 in the event of a
+ // stack check failure. The various stg_apply functions arrange
+ // this before calling stg_PAP_entry.
+ Sp_adj(-1);
+ Sp(0) = R2;
+ jump stg_gc_unpt_r1;
+ }
+ Sp_adj(-Words);
+
+ // profiling
+ TICK_ENT_PAP();
+ LDV_ENTER(pap);
+ // Enter PAP cost centre
+ ENTER_CCS_PAP_CL(pap);
+
+ R1 = StgPAP_fun(pap);
+
+ // Reload the stack
+ W_ i;
+ W_ p;
+ p = pap + SIZEOF_StgHeader + OFFSET_StgPAP_payload;
+ i = 0;
+for:
+ if (i < Words) {
+ Sp(i) = W_[p];
+ p = p + WDS(1);
+ i = i + 1;
+ goto for;
+ }
+
+ // Off we go!
+ TICK_ENT_VIA_NODE();
+
+#ifdef NO_ARG_REGS
+ jump %GET_ENTRY(R1);
+#else
+ W_ info;
+ info = %GET_FUN_INFO(R1);
+ W_ type;
+ type = TO_W_(StgFunInfoExtra_fun_type(info));
+ if (type == ARG_GEN) {
+ jump StgFunInfoExtra_slow_apply(info);
+ }
+ if (type == ARG_GEN_BIG) {
+ jump StgFunInfoExtra_slow_apply(info);
+ }
+ if (type == ARG_BCO) {
+ Sp_adj(-2);
+ Sp(1) = R1;
+ Sp(0) = stg_apply_interp_info;
+ jump stg_yield_to_interpreter;
+ }
+ jump W_[stg_ap_stack_entries +
+ WDS(TO_W_(StgFunInfoExtra_fun_type(info)))];
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ Entry Code for an AP (a PAP with arity zero).
+
+ The entry code is very similar to a PAP, except there are no
+ further arguments on the stack to worry about, so the stack check
+ is simpler. We must also push an update frame on the stack before
+ applying the function.
+ -------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_AP,/*special layout*/0,0,AP,"AP","AP")
+{
+ W_ Words;
+ W_ ap;
+
+ ap = R1;
+
+ Words = TO_W_(StgAP_n_args(ap));
+
+ /*
+ * Check for stack overflow. IMPORTANT: use a _NP check here,
+ * because if the check fails, we might end up blackholing this very
+ * closure, in which case we must enter the blackhole on return rather
+ * than continuing to evaluate the now-defunct closure.
+ */
+ STK_CHK_NP(WDS(Words) + SIZEOF_StgUpdateFrame);
+
+ PUSH_UPD_FRAME(Sp - SIZEOF_StgUpdateFrame, R1);
+ Sp = Sp - SIZEOF_StgUpdateFrame - WDS(Words);
+
+ TICK_ENT_AP();
+ LDV_ENTER(ap);
+
+ // Enter PAP cost centre
+ ENTER_CCS_PAP_CL(ap); // ToDo: ENTER_CC_AP_CL
+
+ R1 = StgAP_fun(ap);
+
+ // Reload the stack
+ W_ i;
+ W_ p;
+ p = ap + SIZEOF_StgHeader + OFFSET_StgAP_payload;
+ i = 0;
+for:
+ if (i < Words) {
+ Sp(i) = W_[p];
+ p = p + WDS(1);
+ i = i + 1;
+ goto for;
+ }
+
+ // Off we go!
+ TICK_ENT_VIA_NODE();
+
+#ifdef NO_ARG_REGS
+ jump %GET_ENTRY(R1);
+#else
+ W_ info;
+ info = %GET_FUN_INFO(R1);
+ W_ type;
+ type = TO_W_(StgFunInfoExtra_fun_type(info));
+ if (type == ARG_GEN) {
+ jump StgFunInfoExtra_slow_apply(info);
+ }
+ if (type == ARG_GEN_BIG) {
+ jump StgFunInfoExtra_slow_apply(info);
+ }
+ if (type == ARG_BCO) {
+ Sp_adj(-2);
+ Sp(1) = R1;
+ Sp(0) = stg_apply_interp_info;
+ jump stg_yield_to_interpreter;
+ }
+ jump W_[stg_ap_stack_entries +
+ WDS(TO_W_(StgFunInfoExtra_fun_type(info)))];
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ Entry Code for an AP_STACK.
+
+ Very similar to a PAP and AP. The layout is the same as PAP
+ and AP, except that the payload is a chunk of stack instead of
+ being described by the function's info table. Like an AP,
+ there are no further arguments on the stack to worry about.
+ However, the function closure (ap->fun) does not necessarily point
+ directly to a function, so we have to enter it using stg_ap_0.
+ -------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_AP_STACK,/*special layout*/0,0,AP_STACK,"AP_STACK","AP_STACK")
+{
+ W_ Words;
+ W_ ap;
+
+ ap = R1;
+
+ Words = StgAP_STACK_size(ap);
+
+ /*
+ * Check for stack overflow. IMPORTANT: use a _NP check here,
+ * because if the check fails, we might end up blackholing this very
+ * closure, in which case we must enter the blackhole on return rather
+ * than continuing to evaluate the now-defunct closure.
+ */
+ STK_CHK_NP(WDS(Words) + SIZEOF_StgUpdateFrame);
+
+ PUSH_UPD_FRAME(Sp - SIZEOF_StgUpdateFrame, R1);
+ Sp = Sp - SIZEOF_StgUpdateFrame - WDS(Words);
+
+ TICK_ENT_AP();
+ LDV_ENTER(ap);
+
+ // Enter PAP cost centre
+ ENTER_CCS_PAP_CL(ap); // ToDo: ENTER_CC_AP_CL
+
+ R1 = StgAP_STACK_fun(ap);
+
+ // Reload the stack
+ W_ i;
+ W_ p;
+ p = ap + SIZEOF_StgHeader + OFFSET_StgAP_STACK_payload;
+ i = 0;
+for:
+ if (i < Words) {
+ Sp(i) = W_[p];
+ p = p + WDS(1);
+ i = i + 1;
+ goto for;
+ }
+
+ // Off we go!
+ TICK_ENT_VIA_NODE();
+
+ ENTER();
+}
diff --git a/rts/Apply.h b/rts/Apply.h
new file mode 100644
index 0000000000..76e36cb9fb
--- /dev/null
+++ b/rts/Apply.h
@@ -0,0 +1,29 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The University of Glasgow 2002-2004
+ *
+ * Declarations for things defined in AutoApply.cmm
+ *
+ * -------------------------------------------------------------------------- */
+
+#ifndef APPLY_H
+#define APPLY_H
+
+// canned slow entry points, indexed by arg type (ARG_P, ARG_PP, etc.)
+#ifdef IN_STG_CODE
+extern StgWord stg_ap_stack_entries[];
+#else
+extern StgFun *stg_ap_stack_entries[];
+#endif
+
+// canned register save code for heap check failure in a function
+#ifdef IN_STG_CODE
+extern StgWord stg_stack_save_entries[];
+#else
+extern StgFun *stg_stack_save_entries[];
+#endif
+
+// canned bitmap for each arg type
+extern StgWord stg_arg_bitmaps[];
+
+#endif /* APPLY_H */
diff --git a/rts/Arena.c b/rts/Arena.c
new file mode 100644
index 0000000000..76ac23cf88
--- /dev/null
+++ b/rts/Arena.c
@@ -0,0 +1,120 @@
+/* -----------------------------------------------------------------------------
+ (c) The University of Glasgow 2001
+
+ Arena allocation. Arenas provide fast memory allocation at the
+ expense of fine-grained recycling of storage: memory may be
+ only be returned to the system by freeing the entire arena, it
+ isn't possible to return individual objects within an arena.
+
+ Do not assume that sequentially allocated objects will be adjacent
+ in memory.
+
+ Quirks: this allocator makes use of the RTS block allocator. If
+ the current block doesn't have enough room for the requested
+ object, then a new block is allocated. This means that allocating
+ large objects will tend to result in wasted space at the end of
+ each block. In the worst case, half of the allocated space is
+ wasted. This allocator is therefore best suited to situations in
+ which most allocations are small.
+ -------------------------------------------------------------------------- */
+
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "BlockAlloc.h"
+#include "Arena.h"
+
+#include <stdlib.h>
+
+// Each arena struct is allocated using malloc().
+struct _Arena {
+ bdescr *current;
+ StgWord *free; // ptr to next free byte in current block
+ StgWord *lim; // limit (== last free byte + 1)
+};
+
+// We like to keep track of how many blocks we've allocated for
+// Storage.c:memInventory().
+static long arena_blocks = 0;
+
+// Begin a new arena
+Arena *
+newArena( void )
+{
+ Arena *arena;
+
+ arena = stgMallocBytes(sizeof(Arena), "newArena");
+ arena->current = allocBlock();
+ arena->current->link = NULL;
+ arena->free = arena->current->start;
+ arena->lim = arena->current->start + BLOCK_SIZE_W;
+ arena_blocks++;
+
+ return arena;
+}
+
+// The minimum alignment of an allocated block.
+#define MIN_ALIGN 8
+
+/* 'n' is assumed to be a power of 2 */
+#define ROUNDUP(x,n) (((x)+((n)-1))&(~((n)-1)))
+#define B_TO_W(x) ((x) / sizeof(W_))
+
+// Allocate some memory in an arena
+void *
+arenaAlloc( Arena *arena, size_t size )
+{
+ void *p;
+ nat size_w;
+ nat req_blocks;
+ bdescr *bd;
+
+ // round up to nearest alignment chunk.
+ size = ROUNDUP(size,MIN_ALIGN);
+
+ // size of allocated block in words.
+ size_w = B_TO_W(size);
+
+ if ( arena->free + size_w < arena->lim ) {
+ // enough room in the current block...
+ p = arena->free;
+ arena->free += size_w;
+ return p;
+ } else {
+ // allocate a fresh block...
+ req_blocks = (lnat)BLOCK_ROUND_UP(size) / BLOCK_SIZE;
+ bd = allocGroup(req_blocks);
+ arena_blocks += req_blocks;
+
+ bd->gen_no = 0;
+ bd->step = NULL;
+ bd->flags = 0;
+ bd->free = bd->start;
+ bd->link = arena->current;
+ arena->current = bd;
+ arena->free = bd->free + size_w;
+ arena->lim = bd->free + bd->blocks * BLOCK_SIZE_W;
+ return bd->start;
+ }
+}
+
+// Free an entire arena
+void
+arenaFree( Arena *arena )
+{
+ bdescr *bd, *next;
+
+ for (bd = arena->current; bd != NULL; bd = next) {
+ next = bd->link;
+ arena_blocks -= bd->blocks;
+ ASSERT(arena_blocks >= 0);
+ freeGroup(bd);
+ }
+ stgFree(arena);
+}
+
+unsigned long
+arenaBlocks( void )
+{
+ return arena_blocks;
+}
+
diff --git a/rts/Arena.h b/rts/Arena.h
new file mode 100644
index 0000000000..7a2989e543
--- /dev/null
+++ b/rts/Arena.h
@@ -0,0 +1,25 @@
+/* -----------------------------------------------------------------------------
+ (c) The University of Glasgow 2001
+
+ Arena allocation interface.
+ -------------------------------------------------------------------------- */
+
+#ifndef ARENA_H
+#define ARENA_H
+
+// Abstract type of arenas
+typedef struct _Arena Arena;
+
+// Start a new arena
+extern Arena * newArena ( void );
+
+// Allocate memory in an arena
+extern void * arenaAlloc ( Arena *, size_t );
+
+// Free an entire arena
+extern void arenaFree ( Arena * );
+
+// For internal use only:
+extern unsigned long arenaBlocks( void );
+
+#endif /* ARENA_H */
diff --git a/rts/AutoApply.h b/rts/AutoApply.h
new file mode 100644
index 0000000000..bbec1224ff
--- /dev/null
+++ b/rts/AutoApply.h
@@ -0,0 +1,80 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The University of Glasgow 2002-2004
+ *
+ * Helper bits for the generic apply code (AutoApply.hc)
+ *
+ * -------------------------------------------------------------------------- */
+
+#ifndef AUTOAPPLY_H
+#define AUTOAPPLY_H
+
+// Build a new PAP: function is in R1
+// ret addr and m arguments taking up n words are on the stack.
+// NB. x is a dummy argument attached to the 'for' label so that
+// BUILD_PAP can be used multiple times in the same function.
+#define BUILD_PAP(m,n,f,x) \
+ W_ pap; \
+ W_ size; \
+ W_ i; \
+ size = SIZEOF_StgPAP + WDS(n); \
+ HP_CHK_NP_ASSIGN_SP0(size,f); \
+ TICK_ALLOC_HEAP_NOCTR(BYTES_TO_WDS(size)); \
+ TICK_ALLOC_PAP(n+1 /* +1 for the FUN */, 0); \
+ pap = Hp + WDS(1) - size; \
+ SET_HDR(pap, stg_PAP_info, W_[CCCS]); \
+ StgPAP_arity(pap) = HALF_W_(arity - m); \
+ StgPAP_fun(pap) = R1; \
+ StgPAP_n_args(pap) = HALF_W_(n); \
+ i = 0; \
+ for##x: \
+ if (i < n) { \
+ StgPAP_payload(pap,i) = Sp(1+i); \
+ i = i + 1; \
+ goto for##x; \
+ } \
+ R1 = pap; \
+ Sp_adj(1 + n); \
+ jump %ENTRY_CODE(Sp(0));
+
+// Copy the old PAP, build a new one with the extra arg(s)
+// ret addr and m arguments taking up n words are on the stack.
+// NB. x is a dummy argument attached to the 'for' label so that
+// BUILD_PAP can be used multiple times in the same function.
+#define NEW_PAP(m,n,f,x) \
+ W_ pap; \
+ W_ new_pap; \
+ W_ size; \
+ W_ i; \
+ pap = R1; \
+ size = SIZEOF_StgPAP + WDS(TO_W_(StgPAP_n_args(pap))) + WDS(n); \
+ HP_CHK_NP_ASSIGN_SP0(size,f); \
+ TICK_ALLOC_HEAP_NOCTR(BYTES_TO_WDS(size)); \
+ TICK_ALLOC_PAP(n+1 /* +1 for the FUN */, 0); \
+ new_pap = Hp + WDS(1) - size; \
+ SET_HDR(new_pap, stg_PAP_info, W_[CCCS]); \
+ StgPAP_arity(new_pap) = HALF_W_(arity - m); \
+ W_ n_args; \
+ n_args = TO_W_(StgPAP_n_args(pap)); \
+ StgPAP_n_args(new_pap) = HALF_W_(n_args + n); \
+ StgPAP_fun(new_pap) = StgPAP_fun(pap); \
+ i = 0; \
+ for1##x: \
+ if (i < n_args) { \
+ StgPAP_payload(new_pap,i) = StgPAP_payload(pap,i); \
+ i = i + 1; \
+ goto for1##x; \
+ } \
+ i = 0; \
+ for2##x: \
+ if (i < n) { \
+ StgPAP_payload(new_pap,n_args+i) = Sp(1+i); \
+ i = i + 1; \
+ goto for2##x; \
+ } \
+ R1 = new_pap; \
+ Sp_adj(n+1); \
+ jump %ENTRY_CODE(Sp(0));
+
+#endif /* APPLY_H */
+
diff --git a/rts/AwaitEvent.h b/rts/AwaitEvent.h
new file mode 100644
index 0000000000..e03cb4444e
--- /dev/null
+++ b/rts/AwaitEvent.h
@@ -0,0 +1,24 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-2005
+ *
+ * The awaitEvent() interface, for the non-threaded RTS
+ *
+ * -------------------------------------------------------------------------*/
+
+#ifndef AWAITEVENT_H
+#define AWAITEVENT_H
+
+#if !defined(THREADED_RTS)
+/* awaitEvent(rtsBool wait)
+ *
+ * Checks for blocked threads that need to be woken.
+ *
+ * Called from STG : NO
+ * Locks assumed : sched_mutex
+ */
+void awaitEvent(rtsBool wait); /* In posix/Select.c or
+ * win32/AwaitEvent.c */
+#endif
+
+#endif /* SELECT_H */
diff --git a/rts/BlockAlloc.c b/rts/BlockAlloc.c
new file mode 100644
index 0000000000..5e0e321947
--- /dev/null
+++ b/rts/BlockAlloc.c
@@ -0,0 +1,391 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-2006
+ *
+ * The block allocator and free list manager.
+ *
+ * This is the architecture independent part of the block allocator.
+ * It requires only the following support from the operating system:
+ *
+ * void *getMBlock();
+ *
+ * returns the address of an MBLOCK_SIZE region of memory, aligned on
+ * an MBLOCK_SIZE boundary. There is no requirement for successive
+ * calls to getMBlock to return strictly increasing addresses.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "BlockAlloc.h"
+#include "MBlock.h"
+#include "Storage.h"
+
+#include <string.h>
+
+static void initMBlock(void *mblock);
+static bdescr *allocMegaGroup(nat mblocks);
+static void freeMegaGroup(bdescr *bd);
+
+// In THREADED_RTS mode, the free list is protected by sm_mutex.
+static bdescr *free_list = NULL;
+
+/* -----------------------------------------------------------------------------
+ Initialisation
+ -------------------------------------------------------------------------- */
+
+void initBlockAllocator(void)
+{
+ // The free list starts off NULL
+}
+
+/* -----------------------------------------------------------------------------
+ Allocation
+ -------------------------------------------------------------------------- */
+
+STATIC_INLINE void
+initGroup(nat n, bdescr *head)
+{
+ bdescr *bd;
+ nat i;
+
+ if (n != 0) {
+ head->blocks = n;
+ head->free = head->start;
+ head->link = NULL;
+ for (i=1, bd = head+1; i < n; i++, bd++) {
+ bd->free = 0;
+ bd->blocks = 0;
+ bd->link = head;
+ }
+ }
+}
+
+bdescr *
+allocGroup(nat n)
+{
+ void *mblock;
+ bdescr *bd, **last;
+
+ ASSERT_SM_LOCK();
+ ASSERT(n != 0);
+
+ if (n > BLOCKS_PER_MBLOCK) {
+ return allocMegaGroup(BLOCKS_TO_MBLOCKS(n));
+ }
+
+ last = &free_list;
+ for (bd = free_list; bd != NULL; bd = bd->link) {
+ if (bd->blocks == n) { /* exactly the right size! */
+ *last = bd->link;
+ /* no initialisation necessary - this is already a
+ * self-contained block group. */
+ bd->free = bd->start; /* block isn't free now */
+ bd->link = NULL;
+ return bd;
+ }
+ if (bd->blocks > n) { /* block too big... */
+ bd->blocks -= n; /* take a chunk off the *end* */
+ bd += bd->blocks;
+ initGroup(n, bd); /* initialise it */
+ return bd;
+ }
+ last = &bd->link;
+ }
+
+ mblock = getMBlock(); /* get a new megablock */
+ initMBlock(mblock); /* initialise the start fields */
+ bd = FIRST_BDESCR(mblock);
+ initGroup(n,bd); /* we know the group will fit */
+ if (n < BLOCKS_PER_MBLOCK) {
+ initGroup(BLOCKS_PER_MBLOCK-n, bd+n);
+ freeGroup(bd+n); /* add the rest on to the free list */
+ }
+ return bd;
+}
+
+bdescr *
+allocGroup_lock(nat n)
+{
+ bdescr *bd;
+ ACQUIRE_SM_LOCK;
+ bd = allocGroup(n);
+ RELEASE_SM_LOCK;
+ return bd;
+}
+
+bdescr *
+allocBlock(void)
+{
+ return allocGroup(1);
+}
+
+bdescr *
+allocBlock_lock(void)
+{
+ bdescr *bd;
+ ACQUIRE_SM_LOCK;
+ bd = allocBlock();
+ RELEASE_SM_LOCK;
+ return bd;
+}
+
+/* -----------------------------------------------------------------------------
+ Any request larger than BLOCKS_PER_MBLOCK needs a megablock group.
+ First, search the free list for enough contiguous megablocks to
+ fulfill the request - if we don't have enough, we need to
+ allocate some new ones.
+
+ A megablock group looks just like a normal block group, except that
+ the blocks field in the head will be larger than BLOCKS_PER_MBLOCK.
+
+ Note that any objects placed in this group must start in the first
+ megablock, since the other blocks don't have block descriptors.
+ -------------------------------------------------------------------------- */
+
+static bdescr *
+allocMegaGroup(nat n)
+{
+ nat mbs_found;
+ bdescr *bd, *last, *grp_start, *grp_prev;
+
+ mbs_found = 0;
+ grp_start = NULL;
+ grp_prev = NULL;
+ last = NULL;
+ for (bd = free_list; bd != NULL; bd = bd->link) {
+
+ if (bd->blocks == BLOCKS_PER_MBLOCK) { /* whole megablock found */
+
+ /* is it the first one we've found or a non-contiguous megablock? */
+ if (grp_start == NULL ||
+ bd->start != last->start + MBLOCK_SIZE/sizeof(W_)) {
+ grp_start = bd;
+ grp_prev = last;
+ mbs_found = 1;
+ } else {
+ mbs_found++;
+ }
+
+ if (mbs_found == n) { /* found enough contig megablocks? */
+ break;
+ }
+ }
+
+ else { /* only a partial megablock, start again */
+ grp_start = NULL;
+ }
+
+ last = bd;
+ }
+
+ /* found all the megablocks we need on the free list
+ */
+ if (mbs_found == n) {
+ /* remove the megablocks from the free list */
+ if (grp_prev == NULL) { /* bd now points to the last mblock */
+ free_list = bd->link;
+ } else {
+ grp_prev->link = bd->link;
+ }
+ }
+
+ /* the free list wasn't sufficient, allocate all new mblocks.
+ */
+ else {
+ void *mblock = getMBlocks(n);
+ initMBlock(mblock); /* only need to init the 1st one */
+ grp_start = FIRST_BDESCR(mblock);
+ }
+
+ /* set up the megablock group */
+ initGroup(BLOCKS_PER_MBLOCK, grp_start);
+ grp_start->blocks = MBLOCK_GROUP_BLOCKS(n);
+ return grp_start;
+}
+
+/* -----------------------------------------------------------------------------
+ De-Allocation
+ -------------------------------------------------------------------------- */
+
+/* coalesce the group p with p->link if possible.
+ *
+ * Returns p->link if no coalescing was done, otherwise returns a
+ * pointer to the newly enlarged group p.
+ */
+
+STATIC_INLINE bdescr *
+coalesce(bdescr *p)
+{
+ bdescr *bd, *q;
+ nat i, blocks;
+
+ q = p->link;
+ if (q != NULL && p->start + p->blocks * BLOCK_SIZE_W == q->start) {
+ /* can coalesce */
+ p->blocks += q->blocks;
+ p->link = q->link;
+ blocks = q->blocks;
+ for (i = 0, bd = q; i < blocks; bd++, i++) {
+ bd->free = 0;
+ bd->blocks = 0;
+ bd->link = p;
+ }
+ return p;
+ }
+ return q;
+}
+
+void
+freeGroup(bdescr *p)
+{
+ bdescr *bd, *last;
+
+ ASSERT_SM_LOCK();
+
+ /* are we dealing with a megablock group? */
+ if (p->blocks > BLOCKS_PER_MBLOCK) {
+ freeMegaGroup(p);
+ return;
+ }
+
+
+ p->free = (void *)-1; /* indicates that this block is free */
+ p->step = NULL;
+ p->gen_no = 0;
+ /* fill the block group with garbage if sanity checking is on */
+ IF_DEBUG(sanity,memset(p->start, 0xaa, p->blocks * BLOCK_SIZE));
+
+ /* find correct place in free list to place new group */
+ last = NULL;
+ for (bd = free_list; bd != NULL && bd->start < p->start;
+ bd = bd->link) {
+ last = bd;
+ }
+
+ /* now, last = previous group (or NULL) */
+ if (last == NULL) {
+ p->link = free_list;
+ free_list = p;
+ } else {
+ /* coalesce with previous group if possible */
+ p->link = last->link;
+ last->link = p;
+ p = coalesce(last);
+ }
+
+ /* coalesce with next group if possible */
+ coalesce(p);
+ IF_DEBUG(sanity, checkFreeListSanity());
+}
+
+void
+freeGroup_lock(bdescr *p)
+{
+ ACQUIRE_SM_LOCK;
+ freeGroup(p);
+ RELEASE_SM_LOCK;
+}
+
+static void
+freeMegaGroup(bdescr *p)
+{
+ nat n;
+ void *q = p;
+
+ n = ((bdescr *)q)->blocks * BLOCK_SIZE / MBLOCK_SIZE + 1;
+ for (; n > 0; q += MBLOCK_SIZE, n--) {
+ initMBlock(MBLOCK_ROUND_DOWN(q));
+ initGroup(BLOCKS_PER_MBLOCK, (bdescr *)q);
+ freeGroup((bdescr *)q);
+ }
+}
+
+void
+freeChain(bdescr *bd)
+{
+ bdescr *next_bd;
+ while (bd != NULL) {
+ next_bd = bd->link;
+ freeGroup(bd);
+ bd = next_bd;
+ }
+}
+
+void
+freeChain_lock(bdescr *bd)
+{
+ ACQUIRE_SM_LOCK;
+ freeChain(bd);
+ RELEASE_SM_LOCK;
+}
+
+static void
+initMBlock(void *mblock)
+{
+ bdescr *bd;
+ void *block;
+
+ /* the first few Bdescr's in a block are unused, so we don't want to
+ * put them all on the free list.
+ */
+ block = FIRST_BLOCK(mblock);
+ bd = FIRST_BDESCR(mblock);
+
+ /* Initialise the start field of each block descriptor
+ */
+ for (; block <= LAST_BLOCK(mblock); bd += 1, block += BLOCK_SIZE) {
+ bd->start = block;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Debugging
+ -------------------------------------------------------------------------- */
+
+#ifdef DEBUG
+static void
+checkWellFormedGroup( bdescr *bd )
+{
+ nat i;
+
+ for (i = 1; i < bd->blocks; i++) {
+ ASSERT(bd[i].blocks == 0);
+ ASSERT(bd[i].free == 0);
+ ASSERT(bd[i].link == bd);
+ }
+}
+
+void
+checkFreeListSanity(void)
+{
+ bdescr *bd;
+
+ for (bd = free_list; bd != NULL; bd = bd->link) {
+ IF_DEBUG(block_alloc,
+ debugBelch("group at 0x%p, length %d blocks\n",
+ bd->start, bd->blocks));
+ ASSERT(bd->blocks > 0);
+ checkWellFormedGroup(bd);
+ if (bd->link != NULL) {
+ /* make sure we're fully coalesced */
+ ASSERT(bd->start + bd->blocks * BLOCK_SIZE_W != bd->link->start);
+ ASSERT(bd->start < bd->link->start);
+ }
+ }
+}
+
+nat /* BLOCKS */
+countFreeList(void)
+{
+ bdescr *bd;
+ lnat total_blocks = 0;
+
+ for (bd = free_list; bd != NULL; bd = bd->link) {
+ total_blocks += bd->blocks;
+ }
+ return total_blocks;
+}
+#endif
diff --git a/rts/BlockAlloc.h b/rts/BlockAlloc.h
new file mode 100644
index 0000000000..1472ac6f76
--- /dev/null
+++ b/rts/BlockAlloc.h
@@ -0,0 +1,19 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-1999
+ *
+ * Block Allocator Interface
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef BLOCK_ALLOC_H
+#define BLOCK_ALLOC_H
+
+/* Debugging -------------------------------------------------------------- */
+
+#ifdef DEBUG
+extern void checkFreeListSanity(void);
+nat countFreeList(void);
+#endif
+
+#endif /* BLOCK_ALLOC_H */
diff --git a/rts/Capability.c b/rts/Capability.c
new file mode 100644
index 0000000000..51a42ef468
--- /dev/null
+++ b/rts/Capability.c
@@ -0,0 +1,668 @@
+/* ---------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2003-2006
+ *
+ * Capabilities
+ *
+ * A Capability represents the token required to execute STG code,
+ * and all the state an OS thread/task needs to run Haskell code:
+ * its STG registers, a pointer to its TSO, a nursery etc. During
+ * STG execution, a pointer to the capability is kept in a
+ * register (BaseReg; actually it is a pointer to cap->r).
+ *
+ * Only in a THREADED_RTS build will there be multiple capabilities,
+ * for non-threaded builds there is only one global capability, namely
+ * MainCapability.
+ *
+ * --------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "STM.h"
+#include "OSThreads.h"
+#include "Capability.h"
+#include "Schedule.h"
+#include "Sparks.h"
+
+// one global capability, this is the Capability for non-threaded
+// builds, and for +RTS -N1
+Capability MainCapability;
+
+nat n_capabilities;
+Capability *capabilities = NULL;
+
+// Holds the Capability which last became free. This is used so that
+// an in-call has a chance of quickly finding a free Capability.
+// Maintaining a global free list of Capabilities would require global
+// locking, so we don't do that.
+Capability *last_free_capability;
+
+#if defined(THREADED_RTS)
+STATIC_INLINE rtsBool
+globalWorkToDo (void)
+{
+ return blackholes_need_checking
+ || sched_state >= SCHED_INTERRUPTING
+ ;
+}
+#endif
+
+#if defined(THREADED_RTS)
+STATIC_INLINE rtsBool
+anyWorkForMe( Capability *cap, Task *task )
+{
+ if (task->tso != NULL) {
+ // A bound task only runs if its thread is on the run queue of
+ // the capability on which it was woken up. Otherwise, we
+ // can't be sure that we have the right capability: the thread
+ // might be woken up on some other capability, and task->cap
+ // could change under our feet.
+ return !emptyRunQueue(cap) && cap->run_queue_hd->bound == task;
+ } else {
+ // A vanilla worker task runs if either there is a lightweight
+ // thread at the head of the run queue, or the run queue is
+ // empty and (there are sparks to execute, or there is some
+ // other global condition to check, such as threads blocked on
+ // blackholes).
+ if (emptyRunQueue(cap)) {
+ return !emptySparkPoolCap(cap)
+ || !emptyWakeupQueue(cap)
+ || globalWorkToDo();
+ } else
+ return cap->run_queue_hd->bound == NULL;
+ }
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Manage the returning_tasks lists.
+ *
+ * These functions require cap->lock
+ * -------------------------------------------------------------------------- */
+
+#if defined(THREADED_RTS)
+STATIC_INLINE void
+newReturningTask (Capability *cap, Task *task)
+{
+ ASSERT_LOCK_HELD(&cap->lock);
+ ASSERT(task->return_link == NULL);
+ if (cap->returning_tasks_hd) {
+ ASSERT(cap->returning_tasks_tl->return_link == NULL);
+ cap->returning_tasks_tl->return_link = task;
+ } else {
+ cap->returning_tasks_hd = task;
+ }
+ cap->returning_tasks_tl = task;
+}
+
+STATIC_INLINE Task *
+popReturningTask (Capability *cap)
+{
+ ASSERT_LOCK_HELD(&cap->lock);
+ Task *task;
+ task = cap->returning_tasks_hd;
+ ASSERT(task);
+ cap->returning_tasks_hd = task->return_link;
+ if (!cap->returning_tasks_hd) {
+ cap->returning_tasks_tl = NULL;
+ }
+ task->return_link = NULL;
+ return task;
+}
+#endif
+
+/* ----------------------------------------------------------------------------
+ * Initialisation
+ *
+ * The Capability is initially marked not free.
+ * ------------------------------------------------------------------------- */
+
+static void
+initCapability( Capability *cap, nat i )
+{
+ nat g;
+
+ cap->no = i;
+ cap->in_haskell = rtsFalse;
+
+ cap->run_queue_hd = END_TSO_QUEUE;
+ cap->run_queue_tl = END_TSO_QUEUE;
+
+#if defined(THREADED_RTS)
+ initMutex(&cap->lock);
+ cap->running_task = NULL; // indicates cap is free
+ cap->spare_workers = NULL;
+ cap->suspended_ccalling_tasks = NULL;
+ cap->returning_tasks_hd = NULL;
+ cap->returning_tasks_tl = NULL;
+ cap->wakeup_queue_hd = END_TSO_QUEUE;
+ cap->wakeup_queue_tl = END_TSO_QUEUE;
+#endif
+
+ cap->f.stgGCEnter1 = (F_)__stg_gc_enter_1;
+ cap->f.stgGCFun = (F_)__stg_gc_fun;
+
+ cap->mut_lists = stgMallocBytes(sizeof(bdescr *) *
+ RtsFlags.GcFlags.generations,
+ "initCapability");
+
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ cap->mut_lists[g] = NULL;
+ }
+
+ cap->free_tvar_wait_queues = END_STM_WAIT_QUEUE;
+ cap->free_trec_chunks = END_STM_CHUNK_LIST;
+ cap->free_trec_headers = NO_TREC;
+ cap->transaction_tokens = 0;
+}
+
+/* ---------------------------------------------------------------------------
+ * Function: initCapabilities()
+ *
+ * Purpose: set up the Capability handling. For the THREADED_RTS build,
+ * we keep a table of them, the size of which is
+ * controlled by the user via the RTS flag -N.
+ *
+ * ------------------------------------------------------------------------- */
+void
+initCapabilities( void )
+{
+#if defined(THREADED_RTS)
+ nat i;
+
+#ifndef REG_Base
+ // We can't support multiple CPUs if BaseReg is not a register
+ if (RtsFlags.ParFlags.nNodes > 1) {
+ errorBelch("warning: multiple CPUs not supported in this build, reverting to 1");
+ RtsFlags.ParFlags.nNodes = 1;
+ }
+#endif
+
+ n_capabilities = RtsFlags.ParFlags.nNodes;
+
+ if (n_capabilities == 1) {
+ capabilities = &MainCapability;
+ // THREADED_RTS must work on builds that don't have a mutable
+ // BaseReg (eg. unregisterised), so in this case
+ // capabilities[0] must coincide with &MainCapability.
+ } else {
+ capabilities = stgMallocBytes(n_capabilities * sizeof(Capability),
+ "initCapabilities");
+ }
+
+ for (i = 0; i < n_capabilities; i++) {
+ initCapability(&capabilities[i], i);
+ }
+
+ IF_DEBUG(scheduler, sched_belch("allocated %d capabilities",
+ n_capabilities));
+
+#else /* !THREADED_RTS */
+
+ n_capabilities = 1;
+ capabilities = &MainCapability;
+ initCapability(&MainCapability, 0);
+
+#endif
+
+ // There are no free capabilities to begin with. We will start
+ // a worker Task to each Capability, which will quickly put the
+ // Capability on the free list when it finds nothing to do.
+ last_free_capability = &capabilities[0];
+}
+
+/* ----------------------------------------------------------------------------
+ * Give a Capability to a Task. The task must currently be sleeping
+ * on its condition variable.
+ *
+ * Requires cap->lock (modifies cap->running_task).
+ *
+ * When migrating a Task, the migrator must take task->lock before
+ * modifying task->cap, to synchronise with the waking up Task.
+ * Additionally, the migrator should own the Capability (when
+ * migrating the run queue), or cap->lock (when migrating
+ * returning_workers).
+ *
+ * ------------------------------------------------------------------------- */
+
+#if defined(THREADED_RTS)
+STATIC_INLINE void
+giveCapabilityToTask (Capability *cap USED_IF_DEBUG, Task *task)
+{
+ ASSERT_LOCK_HELD(&cap->lock);
+ ASSERT(task->cap == cap);
+ IF_DEBUG(scheduler,
+ sched_belch("passing capability %d to %s %p",
+ cap->no, task->tso ? "bound task" : "worker",
+ (void *)task->id));
+ ACQUIRE_LOCK(&task->lock);
+ task->wakeup = rtsTrue;
+ // the wakeup flag is needed because signalCondition() doesn't
+ // flag the condition if the thread is already runniing, but we want
+ // it to be sticky.
+ signalCondition(&task->cond);
+ RELEASE_LOCK(&task->lock);
+}
+#endif
+
+/* ----------------------------------------------------------------------------
+ * Function: releaseCapability(Capability*)
+ *
+ * Purpose: Letting go of a capability. Causes a
+ * 'returning worker' thread or a 'waiting worker'
+ * to wake up, in that order.
+ * ------------------------------------------------------------------------- */
+
+#if defined(THREADED_RTS)
+void
+releaseCapability_ (Capability* cap)
+{
+ Task *task;
+
+ task = cap->running_task;
+
+ ASSERT_PARTIAL_CAPABILITY_INVARIANTS(cap,task);
+
+ cap->running_task = NULL;
+
+ // Check to see whether a worker thread can be given
+ // the go-ahead to return the result of an external call..
+ if (cap->returning_tasks_hd != NULL) {
+ giveCapabilityToTask(cap,cap->returning_tasks_hd);
+ // The Task pops itself from the queue (see waitForReturnCapability())
+ return;
+ }
+
+ // If the next thread on the run queue is a bound thread,
+ // give this Capability to the appropriate Task.
+ if (!emptyRunQueue(cap) && cap->run_queue_hd->bound) {
+ // Make sure we're not about to try to wake ourselves up
+ ASSERT(task != cap->run_queue_hd->bound);
+ task = cap->run_queue_hd->bound;
+ giveCapabilityToTask(cap,task);
+ return;
+ }
+
+ if (!cap->spare_workers) {
+ // Create a worker thread if we don't have one. If the system
+ // is interrupted, we only create a worker task if there
+ // are threads that need to be completed. If the system is
+ // shutting down, we never create a new worker.
+ if (sched_state < SCHED_SHUTTING_DOWN || !emptyRunQueue(cap)) {
+ IF_DEBUG(scheduler,
+ sched_belch("starting new worker on capability %d", cap->no));
+ startWorkerTask(cap, workerStart);
+ return;
+ }
+ }
+
+ // If we have an unbound thread on the run queue, or if there's
+ // anything else to do, give the Capability to a worker thread.
+ if (!emptyRunQueue(cap) || !emptyWakeupQueue(cap)
+ || !emptySparkPoolCap(cap) || globalWorkToDo()) {
+ if (cap->spare_workers) {
+ giveCapabilityToTask(cap,cap->spare_workers);
+ // The worker Task pops itself from the queue;
+ return;
+ }
+ }
+
+ last_free_capability = cap;
+ IF_DEBUG(scheduler, sched_belch("freeing capability %d", cap->no));
+}
+
+void
+releaseCapability (Capability* cap USED_IF_THREADS)
+{
+ ACQUIRE_LOCK(&cap->lock);
+ releaseCapability_(cap);
+ RELEASE_LOCK(&cap->lock);
+}
+
+static void
+releaseCapabilityAndQueueWorker (Capability* cap USED_IF_THREADS)
+{
+ Task *task;
+
+ ACQUIRE_LOCK(&cap->lock);
+
+ task = cap->running_task;
+
+ // If the current task is a worker, save it on the spare_workers
+ // list of this Capability. A worker can mark itself as stopped,
+ // in which case it is not replaced on the spare_worker queue.
+ // This happens when the system is shutting down (see
+ // Schedule.c:workerStart()).
+ // Also, be careful to check that this task hasn't just exited
+ // Haskell to do a foreign call (task->suspended_tso).
+ if (!isBoundTask(task) && !task->stopped && !task->suspended_tso) {
+ task->next = cap->spare_workers;
+ cap->spare_workers = task;
+ }
+ // Bound tasks just float around attached to their TSOs.
+
+ releaseCapability_(cap);
+
+ RELEASE_LOCK(&cap->lock);
+}
+#endif
+
+/* ----------------------------------------------------------------------------
+ * waitForReturnCapability( Task *task )
+ *
+ * Purpose: when an OS thread returns from an external call,
+ * it calls waitForReturnCapability() (via Schedule.resumeThread())
+ * to wait for permission to enter the RTS & communicate the
+ * result of the external call back to the Haskell thread that
+ * made it.
+ *
+ * ------------------------------------------------------------------------- */
+void
+waitForReturnCapability (Capability **pCap, Task *task)
+{
+#if !defined(THREADED_RTS)
+
+ MainCapability.running_task = task;
+ task->cap = &MainCapability;
+ *pCap = &MainCapability;
+
+#else
+ Capability *cap = *pCap;
+
+ if (cap == NULL) {
+ // Try last_free_capability first
+ cap = last_free_capability;
+ if (!cap->running_task) {
+ nat i;
+ // otherwise, search for a free capability
+ for (i = 0; i < n_capabilities; i++) {
+ cap = &capabilities[i];
+ if (!cap->running_task) {
+ break;
+ }
+ }
+ // Fall back to last_free_capability. NOTE(review): this also overwrites a free cap the loop above may have found — verify this is intended.
+ cap = last_free_capability;
+ }
+
+ // record the Capability as the one this Task is now associated with.
+ task->cap = cap;
+
+ } else {
+ ASSERT(task->cap == cap);
+ }
+
+ ACQUIRE_LOCK(&cap->lock);
+
+ IF_DEBUG(scheduler,
+ sched_belch("returning; I want capability %d", cap->no));
+
+ if (!cap->running_task) {
+ // It's free; just grab it
+ cap->running_task = task;
+ RELEASE_LOCK(&cap->lock);
+ } else {
+ newReturningTask(cap,task);
+ RELEASE_LOCK(&cap->lock);
+
+ for (;;) {
+ ACQUIRE_LOCK(&task->lock);
+ // task->lock held, cap->lock not held
+ if (!task->wakeup) waitCondition(&task->cond, &task->lock);
+ cap = task->cap;
+ task->wakeup = rtsFalse;
+ RELEASE_LOCK(&task->lock);
+
+ // now check whether we should wake up...
+ ACQUIRE_LOCK(&cap->lock);
+ if (cap->running_task == NULL) {
+ if (cap->returning_tasks_hd != task) {
+ giveCapabilityToTask(cap,cap->returning_tasks_hd);
+ RELEASE_LOCK(&cap->lock);
+ continue;
+ }
+ cap->running_task = task;
+ popReturningTask(cap);
+ RELEASE_LOCK(&cap->lock);
+ break;
+ }
+ RELEASE_LOCK(&cap->lock);
+ }
+
+ }
+
+ ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
+
+ IF_DEBUG(scheduler,
+ sched_belch("returning; got capability %d", cap->no));
+
+ *pCap = cap;
+#endif
+}
+
+#if defined(THREADED_RTS)
+/* ----------------------------------------------------------------------------
+ * yieldCapability
+ * ------------------------------------------------------------------------- */
+
+void
+yieldCapability (Capability** pCap, Task *task)
+{
+ Capability *cap = *pCap;
+
+ // The fast path has no locking, if we don't enter this while loop
+
+ while ( cap->returning_tasks_hd != NULL || !anyWorkForMe(cap,task) ) {
+ IF_DEBUG(scheduler, sched_belch("giving up capability %d", cap->no));
+
+ // We must now release the capability and wait to be woken up
+ // again.
+ task->wakeup = rtsFalse;
+ releaseCapabilityAndQueueWorker(cap);
+
+ for (;;) {
+ ACQUIRE_LOCK(&task->lock);
+ // task->lock held, cap->lock not held
+ if (!task->wakeup) waitCondition(&task->cond, &task->lock);
+ cap = task->cap;
+ task->wakeup = rtsFalse;
+ RELEASE_LOCK(&task->lock);
+
+ IF_DEBUG(scheduler, sched_belch("woken up on capability %d", cap->no));
+ ACQUIRE_LOCK(&cap->lock);
+ if (cap->running_task != NULL) {
+ IF_DEBUG(scheduler, sched_belch("capability %d is owned by another task", cap->no));
+ RELEASE_LOCK(&cap->lock);
+ continue;
+ }
+
+ if (task->tso == NULL) {
+ ASSERT(cap->spare_workers != NULL);
+ // if we're not at the front of the queue, release it
+ // again. This is unlikely to happen.
+ if (cap->spare_workers != task) {
+ giveCapabilityToTask(cap,cap->spare_workers);
+ RELEASE_LOCK(&cap->lock);
+ continue;
+ }
+ cap->spare_workers = task->next;
+ task->next = NULL;
+ }
+ cap->running_task = task;
+ RELEASE_LOCK(&cap->lock);
+ break;
+ }
+
+ IF_DEBUG(scheduler, sched_belch("got capability %d", cap->no));
+ ASSERT(cap->running_task == task);
+ }
+
+ *pCap = cap;
+
+ ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
+
+ return;
+}
+
+/* ----------------------------------------------------------------------------
+ * Wake up a thread on a Capability.
+ *
+ * This is used when the current Task is running on a Capability and
+ * wishes to wake up a thread on a different Capability.
+ * ------------------------------------------------------------------------- */
+
+void
+wakeupThreadOnCapability (Capability *cap, StgTSO *tso)
+{
+ ASSERT(tso->cap == cap);
+ ASSERT(tso->bound ? tso->bound->cap == cap : 1);
+
+ ACQUIRE_LOCK(&cap->lock);
+ if (cap->running_task == NULL) {
+ // nobody is running this Capability, we can add our thread
+ // directly onto the run queue and start up a Task to run it.
+ appendToRunQueue(cap,tso);
+
+ // start it up
+ cap->running_task = myTask(); // precond for releaseCapability_()
+ releaseCapability_(cap);
+ } else {
+ appendToWakeupQueue(cap,tso);
+ // someone is running on this Capability, so it cannot be
+ // freed without first checking the wakeup queue (see
+ // releaseCapability_).
+ }
+ RELEASE_LOCK(&cap->lock);
+}
+
+/* ----------------------------------------------------------------------------
+ * prodCapabilities
+ *
+ * Used to indicate that the interrupted flag is now set, or some
+ * other global condition that might require waking up a Task on each
+ * Capability.
+ * ------------------------------------------------------------------------- */
+
+static void
+prodCapabilities(rtsBool all)
+{
+ nat i;
+ Capability *cap;
+ Task *task;
+
+ for (i=0; i < n_capabilities; i++) {
+ cap = &capabilities[i];
+ ACQUIRE_LOCK(&cap->lock);
+ if (!cap->running_task) {
+ if (cap->spare_workers) {
+ task = cap->spare_workers;
+ ASSERT(!task->stopped);
+ giveCapabilityToTask(cap,task);
+ if (!all) {
+ RELEASE_LOCK(&cap->lock);
+ return;
+ }
+ }
+ }
+ RELEASE_LOCK(&cap->lock);
+ }
+ return;
+}
+
+void
+prodAllCapabilities (void)
+{
+ prodCapabilities(rtsTrue);
+}
+
+/* ----------------------------------------------------------------------------
+ * prodOneCapability
+ *
+ * Like prodAllCapabilities, but we only require a single Task to wake
+ * up in order to service some global event, such as checking for
+ * deadlock after some idle time has passed.
+ * ------------------------------------------------------------------------- */
+
+void
+prodOneCapability (void)
+{
+ prodCapabilities(rtsFalse);
+}
+
+/* ----------------------------------------------------------------------------
+ * shutdownCapability
+ *
+ * At shutdown time, we want to let everything exit as cleanly as
+ * possible. For each capability, we let its run queue drain, and
+ * allow the workers to stop.
+ *
+ * This function should be called when interrupted and
+ * shutting_down_scheduler = rtsTrue, thus any worker that wakes up
+ * will exit the scheduler and call taskStop(), and any bound thread
+ * that wakes up will return to its caller. Runnable threads are
+ * killed.
+ *
+ * ------------------------------------------------------------------------- */
+
+void
+shutdownCapability (Capability *cap, Task *task)
+{
+ nat i;
+
+ ASSERT(sched_state == SCHED_SHUTTING_DOWN);
+
+ task->cap = cap;
+
+ for (i = 0; i < 50; i++) {
+ IF_DEBUG(scheduler, sched_belch("shutting down capability %d, attempt %d", cap->no, i));
+ ACQUIRE_LOCK(&cap->lock);
+ if (cap->running_task) {
+ RELEASE_LOCK(&cap->lock);
+ IF_DEBUG(scheduler, sched_belch("not owner, yielding"));
+ yieldThread();
+ continue;
+ }
+ cap->running_task = task;
+ if (!emptyRunQueue(cap) || cap->spare_workers) {
+ IF_DEBUG(scheduler, sched_belch("runnable threads or workers still alive, yielding"));
+ releaseCapability_(cap); // this will wake up a worker
+ RELEASE_LOCK(&cap->lock);
+ yieldThread();
+ continue;
+ }
+ IF_DEBUG(scheduler, sched_belch("capability %d is stopped.", cap->no));
+ RELEASE_LOCK(&cap->lock);
+ break;
+ }
+ // we now have the Capability, its run queue and spare workers
+ // list are both empty.
+}
+
+/* ----------------------------------------------------------------------------
+ * tryGrabCapability
+ *
+ * Attempt to gain control of a Capability if it is free.
+ *
+ * ------------------------------------------------------------------------- */
+
+rtsBool
+tryGrabCapability (Capability *cap, Task *task)
+{
+ if (cap->running_task != NULL) return rtsFalse;
+ ACQUIRE_LOCK(&cap->lock);
+ if (cap->running_task != NULL) {
+ RELEASE_LOCK(&cap->lock);
+ return rtsFalse;
+ }
+ task->cap = cap;
+ cap->running_task = task;
+ RELEASE_LOCK(&cap->lock);
+ return rtsTrue;
+}
+
+
+#endif /* THREADED_RTS */
+
+
diff --git a/rts/Capability.h b/rts/Capability.h
new file mode 100644
index 0000000000..a2551d0cc5
--- /dev/null
+++ b/rts/Capability.h
@@ -0,0 +1,250 @@
+/* ---------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001-2006
+ *
+ * Capabilities
+ *
+ * The notion of a capability is used when operating in multi-threaded
+ * environments (which the THREADED_RTS build of the RTS does), to
+ * hold all the state an OS thread/task needs to run Haskell code:
+ * its STG registers, a pointer to its TSO, a nursery etc. During
+ * STG execution, a pointer to the capability is kept in a
+ * register (BaseReg).
+ *
+ * Only in a THREADED_RTS build will there be multiple capabilities,
+ * in the non-threaded builds there is one global capability, namely
+ * MainCapability.
+ *
+ * This header file contains the functions for working with capabilities.
+ * (the main, and only, consumer of this interface is the scheduler).
+ *
+ * --------------------------------------------------------------------------*/
+
+#ifndef CAPABILITY_H
+#define CAPABILITY_H
+
+#include "RtsFlags.h"
+#include "Task.h"
+
+struct Capability_ {
+ // State required by the STG virtual machine when running Haskell
+ // code. During STG execution, the BaseReg register always points
+ // to the StgRegTable of the current Capability (&cap->r).
+ StgFunTable f;
+ StgRegTable r;
+
+ nat no; // capability number.
+
+ // The Task currently holding this Capability. This task has
+ // exclusive access to the contents of this Capability (apart from
+ // returning_tasks_hd/returning_tasks_tl).
+ // Locks required: cap->lock.
+ Task *running_task;
+
+ // true if this Capability is running Haskell code, used for
+ // catching unsafe call-ins.
+ rtsBool in_haskell;
+
+ // The run queue. The Task owning this Capability has exclusive
+ // access to its run queue, so can wake up threads without
+ // taking a lock, and the common path through the scheduler is
+ // also lock-free.
+ StgTSO *run_queue_hd;
+ StgTSO *run_queue_tl;
+
+ // Tasks currently making safe foreign calls. Doubly-linked.
+ // When returning, a task first acquires the Capability before
+ // removing itself from this list, so that the GC can find all
+ // the suspended TSOs easily. Hence, when migrating a Task from
+ // the returning_tasks list, we must also migrate its entry from
+ // this list.
+ Task *suspended_ccalling_tasks;
+
+ // One mutable list per generation, so we don't need to take any
+ // locks when updating an old-generation thunk. These
+ // mini-mut-lists are moved onto the respective gen->mut_list at
+ // each GC.
+ bdescr **mut_lists;
+
+#if defined(THREADED_RTS)
+ // Worker Tasks waiting in the wings. Singly-linked.
+ Task *spare_workers;
+
+ // This lock protects running_task, returning_tasks_{hd,tl}, wakeup_queue.
+ Mutex lock;
+
+ // Tasks waiting to return from a foreign call, or waiting to make
+ // a new call-in using this Capability (NULL if empty).
+ // NB. this field needs to be modified by tasks other than the
+ // running_task, so it requires cap->lock to modify. A task can
+ // check whether it is NULL without taking the lock, however.
+ Task *returning_tasks_hd; // Singly-linked, with head/tail
+ Task *returning_tasks_tl;
+
+ // A list of threads to append to this Capability's run queue at
+ // the earliest opportunity. These are threads that have been
+ // woken up by another Capability.
+ StgTSO *wakeup_queue_hd;
+ StgTSO *wakeup_queue_tl;
+#endif
+
+ // Per-capability STM-related data
+ StgTVarWaitQueue *free_tvar_wait_queues;
+ StgTRecChunk *free_trec_chunks;
+ StgTRecHeader *free_trec_headers;
+ nat transaction_tokens;
+}; // typedef Capability, defined in RtsAPI.h
+
+
+#if defined(THREADED_RTS)
+#define ASSERT_TASK_ID(task) ASSERT(task->id == osThreadId())
+#else
+#define ASSERT_TASK_ID(task) /*empty*/
+#endif
+
+// These properties should be true when a Task is holding a Capability
+#define ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task) \
+ ASSERT(cap->running_task != NULL && cap->running_task == task); \
+ ASSERT(task->cap == cap); \
+ ASSERT_PARTIAL_CAPABILITY_INVARIANTS(cap,task)
+
+// Sometimes a Task holds a Capability, but the Task is not associated
+// with that Capability (ie. task->cap != cap). This happens when
+// (a) a Task holds multiple Capabilities, and (b) when the current
+// Task is bound, its thread has just blocked, and it may have been
+// moved to another Capability.
+#define ASSERT_PARTIAL_CAPABILITY_INVARIANTS(cap,task) \
+ ASSERT(cap->run_queue_hd == END_TSO_QUEUE ? \
+ cap->run_queue_tl == END_TSO_QUEUE : 1); \
+ ASSERT(myTask() == task); \
+ ASSERT_TASK_ID(task);
+
+// Converts a *StgRegTable into a *Capability.
+//
+INLINE_HEADER Capability *
+regTableToCapability (StgRegTable *reg)
+{
+ return (Capability *)((void *)((unsigned char*)reg - sizeof(StgFunTable)));
+}
+
+// Initialise the available capabilities.
+//
+void initCapabilities (void);
+
+// Release a capability. This is called by a Task that is exiting
+// Haskell to make a foreign call, or in various other cases when we
+// want to relinquish a Capability that we currently hold.
+//
+// ASSUMES: cap->running_task is the current Task.
+//
+#if defined(THREADED_RTS)
+void releaseCapability (Capability* cap);
+void releaseCapability_ (Capability* cap); // assumes cap->lock is held
+#else
+// releaseCapability() is empty in non-threaded RTS
+INLINE_HEADER void releaseCapability (Capability* cap STG_UNUSED) {};
+INLINE_HEADER void releaseCapability_ (Capability* cap STG_UNUSED) {};
+#endif
+
+#if !IN_STG_CODE
+// one global capability
+extern Capability MainCapability;
+#endif
+
+// Array of all the capabilities
+//
+extern nat n_capabilities;
+extern Capability *capabilities;
+
+// The Capability that was last free. Used as a good guess for where
+// to assign new threads.
+//
+extern Capability *last_free_capability;
+
+// Acquires a capability at a return point. If *cap is non-NULL, then
+// this is taken as a preference for the Capability we wish to
+// acquire.
+//
+// OS threads waiting in this function get priority over those waiting
+// in waitForCapability().
+//
+// On return, *cap is non-NULL, and points to the Capability acquired.
+//
+void waitForReturnCapability (Capability **cap/*in/out*/, Task *task);
+
+INLINE_HEADER void recordMutableCap (StgClosure *p, Capability *cap, nat gen);
+
+#if defined(THREADED_RTS)
+
+// Gives up the current capability IFF there is a higher-priority
+// thread waiting for it. This happens in one of two ways:
+//
+// (a) we are passing the capability to another OS thread, so
+// that it can run a bound Haskell thread, or
+//
+// (b) there is an OS thread waiting to return from a foreign call
+//
+// NOTE(review): in this implementation *pCap is never NULL on return —
+// yieldCapability() loops until the Task re-acquires a Capability, so the
+// "NULL if released / re-acquire via waitForCapability()" wording is stale.
+void yieldCapability (Capability** pCap, Task *task);
+
+// Acquires a capability for doing some work.
+//
+// On return: pCap points to the capability.
+//
+void waitForCapability (Task *task, Mutex *mutex, Capability **pCap);
+
+// Wakes up a thread on a Capability (probably a different Capability
+// from the one held by the current Task).
+//
+void wakeupThreadOnCapability (Capability *cap, StgTSO *tso);
+
+// Wakes up a worker thread on just one Capability, used when we
+// need to service some global event.
+//
+void prodOneCapability (void);
+
+// Similar to prodOneCapability(), but prods all of them.
+//
+void prodAllCapabilities (void);
+
+// Waits for a capability to drain of runnable threads and workers,
+// and then acquires it. Used at shutdown time.
+//
+void shutdownCapability (Capability *cap, Task *task);
+
+// Attempt to gain control of a Capability if it is free.
+//
+rtsBool tryGrabCapability (Capability *cap, Task *task);
+
+#else // !THREADED_RTS
+
+// Grab a capability. (Only in the non-threaded RTS; in the threaded
+// RTS one of the waitFor*Capability() functions must be used).
+//
+extern void grabCapability (Capability **pCap);
+
+#endif /* !THREADED_RTS */
+
+/* -----------------------------------------------------------------------------
+ * INLINE functions... private below here
+ * -------------------------------------------------------------------------- */
+
+INLINE_HEADER void
+recordMutableCap (StgClosure *p, Capability *cap, nat gen)
+{
+ bdescr *bd;
+
+ bd = cap->mut_lists[gen];
+ if (bd->free >= bd->start + BLOCK_SIZE_W) {
+ bdescr *new_bd;
+ new_bd = allocBlock_lock();
+ new_bd->link = bd;
+ bd = new_bd;
+ cap->mut_lists[gen] = bd;
+ }
+ *bd->free++ = (StgWord)p;
+}
+
+#endif /* CAPABILITY_H */
diff --git a/rts/ClosureFlags.c b/rts/ClosureFlags.c
new file mode 100644
index 0000000000..5545693362
--- /dev/null
+++ b/rts/ClosureFlags.c
@@ -0,0 +1,107 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-1999
+ *
+ * Closure type flags
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+
+StgWord16 closure_flags[] = {
+
+/*
+ * These *must* be in the same order as the closure types in
+ * ClosureTypes.h.
+ */
+
+/* ToDo: some of these flags seem to be duplicated.
+ * - NS is the same as HNF, and the negation of THU
+ * (however, we set NS for indirections, which is probably the
+ * right thing to do, since we never get indirections pointing
+ * to thunks.)
+ */
+
+/* 0 1 2 3 4 5 6 7 */
+/* HNF BTM NS STA THU MUT UPT SRT */
+
+/* INVALID_OBJECT = */ ( 0 ),
+/* CONSTR = */ (_HNF| _NS ),
+/* CONSTR_1_0 = */ (_HNF| _NS ),
+/* CONSTR_0_1 = */ (_HNF| _NS ),
+/* CONSTR_2_0 = */ (_HNF| _NS ),
+/* CONSTR_1_1 = */ (_HNF| _NS ),
+/* CONSTR_0_2 = */ (_HNF| _NS ),
+/* CONSTR_INTLIKE = */ (_HNF| _NS|_STA ),
+/* CONSTR_CHARLIKE = */ (_HNF| _NS|_STA ),
+/* CONSTR_STATIC = */ (_HNF| _NS|_STA ),
+/* CONSTR_NOCAF_STATIC = */ (_HNF| _NS|_STA ),
+/* FUN = */ (_HNF| _NS| _SRT ),
+/* FUN_1_0 = */ (_HNF| _NS| _SRT ),
+/* FUN_0_1 = */ (_HNF| _NS| _SRT ),
+/* FUN_2_0 = */ (_HNF| _NS| _SRT ),
+/* FUN_1_1 = */ (_HNF| _NS| _SRT ),
+/* FUN_0_2 = */ (_HNF| _NS| _SRT ),
+/* FUN_STATIC = */ (_HNF| _NS|_STA| _SRT ),
+/* THUNK = */ ( _BTM| _THU| _SRT ),
+/* THUNK_1_0 = */ ( _BTM| _THU| _SRT ),
+/* THUNK_0_1 = */ ( _BTM| _THU| _SRT ),
+/* THUNK_2_0 = */ ( _BTM| _THU| _SRT ),
+/* THUNK_1_1 = */ ( _BTM| _THU| _SRT ),
+/* THUNK_0_2 = */ ( _BTM| _THU| _SRT ),
+/* THUNK_STATIC = */ ( _BTM| _STA|_THU| _SRT ),
+/* THUNK_SELECTOR = */ ( _BTM| _THU| _SRT ),
+/* BCO = */ (_HNF| _NS ),
+/* AP = */ ( _THU ),
+/* PAP = */ (_HNF| _NS ),
+/* AP_STACK = */ ( _THU ),
+/* IND = */ ( _NS| _IND ),
+/* IND_OLDGEN = */ ( _NS| _IND ),
+/* IND_PERM = */ ( _NS| _IND ),
+/* IND_OLDGEN_PERM = */ ( _NS| _IND ),
+/* IND_STATIC = */ ( _NS|_STA| _IND ),
+/* RET_BCO = */ ( _BTM ),
+/* RET_SMALL = */ ( _BTM| _SRT ),
+/* RET_VEC_SMALL = */ ( _BTM| _SRT ),
+/* RET_BIG = */ ( _SRT ),
+/* RET_VEC_BIG = */ ( _SRT ),
+/* RET_DYN = */ ( _SRT ),
+/* RET_FUN = */ ( 0 ),
+/* UPDATE_FRAME = */ ( _BTM ),
+/* CATCH_FRAME = */ ( _BTM ),
+/* STOP_FRAME = */ ( _BTM ),
+/* CAF_BLACKHOLE = */ ( _BTM|_NS| _UPT ),
+/* BLACKHOLE = */ ( _NS| _UPT ),
+/* SE_BLACKHOLE = */ ( _NS| _UPT ),
+/* SE_CAF_BLACKHOLE = */ ( _NS| _UPT ),
+/* MVAR = */ (_HNF| _NS| _MUT|_UPT ),
+/* ARR_WORDS = */ (_HNF| _NS| _UPT ),
+/* MUT_ARR_PTRS_CLEAN = */ (_HNF| _NS| _MUT|_UPT ),
+/* MUT_ARR_PTRS_DIRTY = */ (_HNF| _NS| _MUT|_UPT ),
+/* MUT_ARR_PTRS_FROZEN0 = */ (_HNF| _NS| _MUT|_UPT ),
+/* MUT_ARR_PTRS_FROZEN = */ (_HNF| _NS| _UPT ),
+/* MUT_VAR_CLEAN = */ (_HNF| _NS| _MUT|_UPT ),
+/* MUT_VAR_DIRTY = */ (_HNF| _NS| _MUT|_UPT ),
+/* WEAK = */ (_HNF| _NS| _UPT ),
+/* STABLE_NAME = */ (_HNF| _NS| _UPT ),
+/* TSO = */ (_HNF| _NS| _MUT|_UPT ),
+/* BLOCKED_FETCH = */ (_HNF| _NS| _MUT|_UPT ),
+/* FETCH_ME = */ (_HNF| _NS| _MUT|_UPT ),
+/* FETCH_ME_BQ = */ ( _NS| _MUT|_UPT ),
+/* RBH = */ ( _NS| _MUT|_UPT ),
+/* EVACUATED = */ ( 0 ),
+/* REMOTE_REF = */ (_HNF| _NS| _UPT ),
+/* TVAR_WAIT_QUEUE = */ ( _NS| _MUT|_UPT ),
+/* TVAR = */ (_HNF| _NS| _MUT|_UPT ),
+/* TREC_CHUNK = */ ( _NS| _MUT|_UPT ),
+/* TREC_HEADER = */ ( _NS| _MUT|_UPT ),
+/* ATOMICALLY_FRAME = */ ( _BTM ),
+/* CATCH_RETRY_FRAME = */ ( _BTM ),
+/* CATCH_STM_FRAME = */ ( _BTM )
+};
+
+#if N_CLOSURE_TYPES != 73
+#error Closure types changed: update ClosureFlags.c!
+#endif
+
diff --git a/rts/Disassembler.c b/rts/Disassembler.c
new file mode 100644
index 0000000000..b084a29b89
--- /dev/null
+++ b/rts/Disassembler.c
@@ -0,0 +1,281 @@
+/* -----------------------------------------------------------------------------
+ * Bytecode disassembler
+ *
+ * Copyright (c) 1994-2002.
+ *
+ * $RCSfile: Disassembler.c,v $
+ * $Revision: 1.29 $
+ * $Date: 2004/09/03 15:28:19 $
+ * ---------------------------------------------------------------------------*/
+
+#ifdef DEBUG
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsAPI.h"
+#include "RtsUtils.h"
+#include "Closures.h"
+#include "TSO.h"
+#include "Schedule.h"
+
+#include "Bytecodes.h"
+#include "Printer.h"
+#include "Disassembler.h"
+#include "Interpreter.h"
+
+/* --------------------------------------------------------------------------
+ * Disassembler
+ * ------------------------------------------------------------------------*/
+
+int
+disInstr ( StgBCO *bco, int pc )
+{
+ int i;
+
+ StgWord16* instrs = (StgWord16*)(bco->instrs->payload);
+
+ StgArrWords* literal_arr = bco->literals;
+ StgWord* literals = (StgWord*)(&literal_arr->payload[0]);
+
+ StgMutArrPtrs* ptrs_arr = bco->ptrs;
+ StgPtr* ptrs = (StgPtr*)(&ptrs_arr->payload[0]);
+
+ StgArrWords* itbls_arr = bco->itbls;
+ StgInfoTable** itbls = (StgInfoTable**)(&itbls_arr->payload[0]);
+
+ switch (instrs[pc++]) {
+ case bci_SWIZZLE:
+ debugBelch("SWIZZLE stkoff %d by %d\n",
+ instrs[pc], (signed int)instrs[pc+1]);
+ pc += 2; break;
+ case bci_CCALL:
+ debugBelch("CCALL marshaller at 0x%x\n",
+ literals[instrs[pc]] );
+ pc += 1; break;
+ case bci_STKCHECK:
+ debugBelch("STKCHECK %d\n", instrs[pc] );
+ pc += 1; break;
+ case bci_PUSH_L:
+ debugBelch("PUSH_L %d\n", instrs[pc] );
+ pc += 1; break;
+ case bci_PUSH_LL:
+ debugBelch("PUSH_LL %d %d\n", instrs[pc], instrs[pc+1] );
+ pc += 2; break;
+ case bci_PUSH_LLL:
+ debugBelch("PUSH_LLL %d %d %d\n", instrs[pc], instrs[pc+1],
+ instrs[pc+2] );
+ pc += 3; break;
+ case bci_PUSH_G:
+ debugBelch("PUSH_G " ); printPtr( ptrs[instrs[pc]] );
+ debugBelch("\n" );
+ pc += 1; break;
+
+ case bci_PUSH_ALTS:
+ debugBelch("PUSH_ALTS " ); printPtr( ptrs[instrs[pc]] );
+ debugBelch("\n");
+ pc += 1; break;
+ case bci_PUSH_ALTS_P:
+ debugBelch("PUSH_ALTS_P " ); printPtr( ptrs[instrs[pc]] );
+ debugBelch("\n");
+ pc += 1; break;
+ case bci_PUSH_ALTS_N:
+ debugBelch("PUSH_ALTS_N " ); printPtr( ptrs[instrs[pc]] );
+ debugBelch("\n");
+ pc += 1; break;
+ case bci_PUSH_ALTS_F:
+ debugBelch("PUSH_ALTS_F " ); printPtr( ptrs[instrs[pc]] );
+ debugBelch("\n");
+ pc += 1; break;
+ case bci_PUSH_ALTS_D:
+ debugBelch("PUSH_ALTS_D " ); printPtr( ptrs[instrs[pc]] );
+ debugBelch("\n");
+ pc += 1; break;
+ case bci_PUSH_ALTS_L:
+ debugBelch("PUSH_ALTS_L " ); printPtr( ptrs[instrs[pc]] );
+ debugBelch("\n");
+ pc += 1; break;
+ case bci_PUSH_ALTS_V:
+ debugBelch("PUSH_ALTS_V " ); printPtr( ptrs[instrs[pc]] );
+ debugBelch("\n");
+ pc += 1; break;
+
+ case bci_PUSH_UBX:
+ debugBelch("PUSH_UBX ");
+ for (i = 0; i < instrs[pc+1]; i++)
+ debugBelch("0x%x ", literals[i + instrs[pc]] );
+ debugBelch("\n");
+ pc += 2; break;
+ case bci_PUSH_APPLY_N:
+ debugBelch("PUSH_APPLY_N\n");
+ break;
+ case bci_PUSH_APPLY_V:
+ debugBelch("PUSH_APPLY_V\n");
+ break;
+ case bci_PUSH_APPLY_F:
+ debugBelch("PUSH_APPLY_F\n");
+ break;
+ case bci_PUSH_APPLY_D:
+ debugBelch("PUSH_APPLY_D\n");
+ break;
+ case bci_PUSH_APPLY_L:
+ debugBelch("PUSH_APPLY_L\n");
+ break;
+ case bci_PUSH_APPLY_P:
+ debugBelch("PUSH_APPLY_P\n");
+ break;
+ case bci_PUSH_APPLY_PP:
+ debugBelch("PUSH_APPLY_PP\n");
+ break;
+ case bci_PUSH_APPLY_PPP:
+ debugBelch("PUSH_APPLY_PPP\n");
+ break;
+ case bci_PUSH_APPLY_PPPP:
+ debugBelch("PUSH_APPLY_PPPP\n");
+ break;
+ case bci_PUSH_APPLY_PPPPP:
+ debugBelch("PUSH_APPLY_PPPPP\n");
+ break;
+ case bci_PUSH_APPLY_PPPPPP:
+ debugBelch("PUSH_APPLY_PPPPPP\n");
+ break;
+ case bci_SLIDE:
+ debugBelch("SLIDE %d down by %d\n", instrs[pc], instrs[pc+1] );
+ pc += 2; break;
+ case bci_ALLOC_AP:
+ debugBelch("ALLOC_AP %d words\n", instrs[pc] );
+ pc += 1; break;
+ case bci_ALLOC_PAP:
+ debugBelch("ALLOC_PAP %d words, %d arity\n",
+ instrs[pc], instrs[pc+1] );
+ pc += 2; break;
+ case bci_MKAP:
+ debugBelch("MKAP %d words, %d stkoff\n", instrs[pc+1],
+ instrs[pc] );
+ pc += 2; break;
+ case bci_UNPACK:
+ debugBelch("UNPACK %d\n", instrs[pc] );
+ pc += 1; break;
+ case bci_PACK:
+ debugBelch("PACK %d words with itbl ", instrs[pc+1] );
+ printPtr( (StgPtr)itbls[instrs[pc]] );
+ debugBelch("\n");
+ pc += 2; break;
+
+ case bci_TESTLT_I:
+ debugBelch("TESTLT_I %d, fail to %d\n", literals[instrs[pc]],
+ instrs[pc+1]);
+ pc += 2; break;
+ case bci_TESTEQ_I:
+ debugBelch("TESTEQ_I %d, fail to %d\n", literals[instrs[pc]],
+ instrs[pc+1]);
+ pc += 2; break;
+
+ case bci_TESTLT_F:
+ debugBelch("TESTLT_F %d, fail to %d\n", literals[instrs[pc]],
+ instrs[pc+1]);
+ pc += 2; break;
+ case bci_TESTEQ_F:
+ debugBelch("TESTEQ_F %d, fail to %d\n", literals[instrs[pc]],
+ instrs[pc+1]);
+ pc += 2; break;
+
+ case bci_TESTLT_D:
+ debugBelch("TESTLT_D %d, fail to %d\n", literals[instrs[pc]],
+ instrs[pc+1]);
+ pc += 2; break;
+ case bci_TESTEQ_D:
+ debugBelch("TESTEQ_D %d, fail to %d\n", literals[instrs[pc]],
+ instrs[pc+1]);
+ pc += 2; break;
+
+ case bci_TESTLT_P:
+ debugBelch("TESTLT_P %d, fail to %d\n", instrs[pc],
+ instrs[pc+1]);
+ pc += 2; break;
+ case bci_TESTEQ_P:
+ debugBelch("TESTEQ_P %d, fail to %d\n", instrs[pc],
+ instrs[pc+1]);
+ pc += 2; break;
+ case bci_CASEFAIL:
+ debugBelch("CASEFAIL\n" );
+ break;
+ case bci_JMP:
+ debugBelch("JMP to %d\n", instrs[pc]);
+ pc += 1; break;
+
+ case bci_ENTER:
+ debugBelch("ENTER\n");
+ break;
+
+ case bci_RETURN:
+ debugBelch("RETURN\n" );
+ break;
+ case bci_RETURN_P:
+ debugBelch("RETURN_P\n" );
+ break;
+ case bci_RETURN_N:
+ debugBelch("RETURN_N\n" );
+ break;
+ case bci_RETURN_F:
+ debugBelch("RETURN_F\n" );
+ break;
+ case bci_RETURN_D:
+ debugBelch("RETURN_D\n" );
+ break;
+ case bci_RETURN_L:
+ debugBelch("RETURN_L\n" );
+ break;
+ case bci_RETURN_V:
+ debugBelch("RETURN_V\n" );
+ break;
+
+ default:
+ barf("disInstr: unknown opcode");
+ }
+ return pc;
+}
+
+
+/* Something of a kludge .. how do we know where the end of the insn
+ array is, since it isn't recorded anywhere? Answer: the first
+ short is the number of bytecodes which follow it.
+ See ByteCodeGen.linkBCO.insns_arr for construction ...
+*/
+void disassemble( StgBCO *bco )
+{
+ nat i, j;
+ StgWord16* instrs = (StgWord16*)(bco->instrs->payload);
+ StgMutArrPtrs* ptrs = bco->ptrs;
+ nat nbcs = (int)instrs[0];
+ nat pc = 1;
+
+ debugBelch("BCO\n" );
+ pc = 1;
+ while (pc <= nbcs) {
+ debugBelch("\t%2d: ", pc );
+ pc = disInstr ( bco, pc );
+ }
+
+ debugBelch("INSTRS:\n " );
+ j = 16;
+ for (i = 0; i < nbcs; i++) {
+ debugBelch("%3d ", (int)instrs[i] );
+ j--;
+ if (j == 0) { j = 16; debugBelch("\n "); };
+ }
+ debugBelch("\n");
+
+ debugBelch("PTRS:\n " );
+ j = 8;
+ for (i = 0; i < ptrs->ptrs; i++) {
+ debugBelch("%8p ", ptrs->payload[i] );
+ j--;
+ if (j == 0) { j = 8; debugBelch("\n "); };
+ }
+ debugBelch("\n");
+
+ debugBelch("\n");
+ ASSERT(pc == nbcs+1);
+}
+
+#endif /* DEBUG */
diff --git a/rts/Disassembler.h b/rts/Disassembler.h
new file mode 100644
index 0000000000..2851097117
--- /dev/null
+++ b/rts/Disassembler.h
@@ -0,0 +1,19 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Prototypes for functions in Disassembler.c
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef DISASSEMBLER_H
+#define DISASSEMBLER_H
+
+#ifdef DEBUG
+
+extern int disInstr ( StgBCO *bco, int pc );
+extern void disassemble( StgBCO *bco );
+
+#endif
+
+#endif /* DISASSEMBLER_H */
diff --git a/rts/Exception.cmm b/rts/Exception.cmm
new file mode 100644
index 0000000000..b5c29626b2
--- /dev/null
+++ b/rts/Exception.cmm
@@ -0,0 +1,446 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Exception support
+ *
+ * This file is written in a subset of C--, extended with various
+ * features specific to GHC. It is compiled by GHC directly. For the
+ * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Cmm.h"
+
+/* -----------------------------------------------------------------------------
+ Exception Primitives
+
+ A thread can request that asynchronous exceptions not be delivered
+ ("blocked") for the duration of an I/O computation. The primitive
+
+ blockAsyncExceptions# :: IO a -> IO a
+
+ is used for this purpose. During a blocked section, asynchronous
+ exceptions may be unblocked again temporarily:
+
+ unblockAsyncExceptions# :: IO a -> IO a
+
+ Furthermore, asynchronous exceptions are blocked automatically during
+ the execution of an exception handler. Both of these primitives
+ leave a continuation on the stack which reverts to the previous
+ state (blocked or unblocked) on exit.
+
+ A thread which wants to raise an exception in another thread (using
+ killThread#) must block until the target thread is ready to receive
+ it. The action of unblocking exceptions in a thread will release all
+ the threads waiting to deliver exceptions to that thread.
+
+ NB. there's a bug in here. If a thread is inside an
+ unsafePerformIO, and inside blockAsyncExceptions# (there is an
+ unblockAsyncExceptions_ret on the stack), and it is blocked in an
+ interruptible operation, and it receives an exception, then the
+ unsafePerformIO thunk will be updated with a stack object
+ containing the unblockAsyncExceptions_ret frame. Later, when
+ someone else evaluates this thunk, the blocked exception state is
+ not restored, and the result is that unblockAsyncExceptions_ret
+ will attempt to unblock exceptions in the current thread, but it'll
+ find that the CurrentTSO->blocked_exceptions is NULL. Hence, we
+ work around this by checking for NULL in awakenBlockedQueue().
+
+ -------------------------------------------------------------------------- */
+
+INFO_TABLE_RET( stg_unblockAsyncExceptionszh_ret,
+ 0/*framesize*/, 0/*bitmap*/, RET_SMALL )
+{
+ // Not true: see comments above
+ // ASSERT(StgTSO_blocked_exceptions(CurrentTSO) != NULL);
+#if defined(GRAN) || defined(PAR)
+ foreign "C" awakenBlockedQueue(MyCapability() "ptr", StgTSO_blocked_exceptions(CurrentTSO) "ptr",
+ NULL "ptr");
+#else
+ foreign "C" awakenBlockedQueue(MyCapability() "ptr", StgTSO_blocked_exceptions(CurrentTSO) "ptr");
+#endif
+ StgTSO_blocked_exceptions(CurrentTSO) = NULL;
+#ifdef REG_R1
+ Sp_adj(1);
+ jump %ENTRY_CODE(Sp(0));
+#else
+ Sp(1) = Sp(0);
+ Sp_adj(1);
+ jump %ENTRY_CODE(Sp(1));
+#endif
+}
+
+INFO_TABLE_RET( stg_blockAsyncExceptionszh_ret,
+ 0/*framesize*/, 0/*bitmap*/, RET_SMALL )
+{
+ // Not true: see comments above
+ // ASSERT(StgTSO_blocked_exceptions(CurrentTSO) == NULL);
+ StgTSO_blocked_exceptions(CurrentTSO) = END_TSO_QUEUE;
+#ifdef REG_R1
+ Sp_adj(1);
+ jump %ENTRY_CODE(Sp(0));
+#else
+ Sp(1) = Sp(0);
+ Sp_adj(1);
+ jump %ENTRY_CODE(Sp(1));
+#endif
+}
+
+blockAsyncExceptionszh_fast
+{
+ /* Args: R1 :: IO a */
+ STK_CHK_GEN( WDS(2)/* worst case */, R1_PTR, blockAsyncExceptionszh_fast);
+
+ if (StgTSO_blocked_exceptions(CurrentTSO) == NULL) {
+ StgTSO_blocked_exceptions(CurrentTSO) = END_TSO_QUEUE;
+ /* avoid growing the stack unnecessarily */
+ if (Sp(0) == stg_blockAsyncExceptionszh_ret_info) {
+ Sp_adj(1);
+ } else {
+ Sp_adj(-1);
+ Sp(0) = stg_unblockAsyncExceptionszh_ret_info;
+ }
+ }
+ TICK_UNKNOWN_CALL();
+ TICK_SLOW_CALL_v();
+ jump stg_ap_v_fast;
+}
+
+unblockAsyncExceptionszh_fast
+{
+ /* Args: R1 :: IO a */
+ STK_CHK_GEN( WDS(2), R1_PTR, unblockAsyncExceptionszh_fast);
+
+ if (StgTSO_blocked_exceptions(CurrentTSO) != NULL) {
+#if defined(GRAN) || defined(PAR)
+ foreign "C" awakenBlockedQueue(MyCapability() "ptr", StgTSO_blocked_exceptions(CurrentTSO) "ptr",
+ StgTSO_block_info(CurrentTSO) "ptr");
+#else
+ foreign "C" awakenBlockedQueue(MyCapability() "ptr", StgTSO_blocked_exceptions(CurrentTSO) "ptr");
+#endif
+ StgTSO_blocked_exceptions(CurrentTSO) = NULL;
+
+ /* avoid growing the stack unnecessarily */
+ if (Sp(0) == stg_unblockAsyncExceptionszh_ret_info) {
+ Sp_adj(1);
+ } else {
+ Sp_adj(-1);
+ Sp(0) = stg_blockAsyncExceptionszh_ret_info;
+ }
+ }
+ TICK_UNKNOWN_CALL();
+ TICK_SLOW_CALL_v();
+ jump stg_ap_v_fast;
+}
+
+
+#define interruptible(what_next) \
+ ( what_next == BlockedOnMVar \
+ || what_next == BlockedOnException \
+ || what_next == BlockedOnRead \
+ || what_next == BlockedOnWrite \
+ || what_next == BlockedOnDelay \
+ || what_next == BlockedOnDoProc)
+
+killThreadzh_fast
+{
+ /* args: R1 = TSO to kill, R2 = Exception */
+
+ W_ why_blocked;
+
+ /* This thread may have been relocated.
+ * (see Schedule.c:threadStackOverflow)
+ */
+ while:
+ if (StgTSO_what_next(R1) == ThreadRelocated::I16) {
+ R1 = StgTSO_link(R1);
+ goto while;
+ }
+
+ /* Determine whether this thread is interruptible or not */
+
+ /* If the target thread is currently blocking async exceptions,
+ * we'll have to block until it's ready to accept them. The
+ * exception is interruptible threads - ie. those that are blocked
+ * on some resource.
+ */
+ why_blocked = TO_W_(StgTSO_why_blocked(R1));
+ if (StgTSO_blocked_exceptions(R1) != NULL && !interruptible(why_blocked))
+ {
+ StgTSO_link(CurrentTSO) = StgTSO_blocked_exceptions(R1);
+ StgTSO_blocked_exceptions(R1) = CurrentTSO;
+
+ StgTSO_why_blocked(CurrentTSO) = BlockedOnException::I16;
+ StgTSO_block_info(CurrentTSO) = R1;
+
+ BLOCK( R1_PTR & R2_PTR, killThreadzh_fast );
+ }
+
+ /* Killed threads turn into zombies, which might be garbage
+ * collected at a later date. That's why we don't have to
+ * explicitly remove them from any queues they might be on.
+ */
+
+ /* We might have killed ourselves. In which case, better be *very*
+ * careful. If the exception killed us, then return to the scheduler.
+ * If the exception went to a catch frame, we'll just continue from
+ * the handler.
+ */
+ if (R1 == CurrentTSO) {
+ SAVE_THREAD_STATE();
+ foreign "C" raiseAsync(MyCapability() "ptr", R1 "ptr", R2 "ptr");
+ if (StgTSO_what_next(CurrentTSO) == ThreadKilled::I16) {
+ R1 = ThreadFinished;
+ jump StgReturn;
+ } else {
+ LOAD_THREAD_STATE();
+ ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
+ jump %ENTRY_CODE(Sp(0));
+ }
+ } else {
+ foreign "C" raiseAsync(MyCapability() "ptr", R1 "ptr", R2 "ptr");
+ }
+
+ jump %ENTRY_CODE(Sp(0));
+}
+
+/* -----------------------------------------------------------------------------
+ Catch frames
+ -------------------------------------------------------------------------- */
+
+#ifdef REG_R1
+#define CATCH_FRAME_ENTRY_TEMPLATE(label,ret) \
+ label \
+ { \
+ Sp = Sp + SIZEOF_StgCatchFrame; \
+ jump ret; \
+ }
+#else
+#define CATCH_FRAME_ENTRY_TEMPLATE(label,ret) \
+ label \
+ { \
+ W_ rval; \
+ rval = Sp(0); \
+ Sp = Sp + SIZEOF_StgCatchFrame; \
+ Sp(0) = rval; \
+ jump ret; \
+ }
+#endif
+
+#ifdef REG_R1
+#define SP_OFF 0
+#else
+#define SP_OFF 1
+#endif
+
+CATCH_FRAME_ENTRY_TEMPLATE(stg_catch_frame_0_ret,%RET_VEC(Sp(SP_OFF),0))
+CATCH_FRAME_ENTRY_TEMPLATE(stg_catch_frame_1_ret,%RET_VEC(Sp(SP_OFF),1))
+CATCH_FRAME_ENTRY_TEMPLATE(stg_catch_frame_2_ret,%RET_VEC(Sp(SP_OFF),2))
+CATCH_FRAME_ENTRY_TEMPLATE(stg_catch_frame_3_ret,%RET_VEC(Sp(SP_OFF),3))
+CATCH_FRAME_ENTRY_TEMPLATE(stg_catch_frame_4_ret,%RET_VEC(Sp(SP_OFF),4))
+CATCH_FRAME_ENTRY_TEMPLATE(stg_catch_frame_5_ret,%RET_VEC(Sp(SP_OFF),5))
+CATCH_FRAME_ENTRY_TEMPLATE(stg_catch_frame_6_ret,%RET_VEC(Sp(SP_OFF),6))
+CATCH_FRAME_ENTRY_TEMPLATE(stg_catch_frame_7_ret,%RET_VEC(Sp(SP_OFF),7))
+
+#if MAX_VECTORED_RTN > 8
+#error MAX_VECTORED_RTN has changed: please modify stg_catch_frame too.
+#endif
+
+#if defined(PROFILING)
+#define CATCH_FRAME_BITMAP 7
+#define CATCH_FRAME_WORDS 4
+#else
+#define CATCH_FRAME_BITMAP 1
+#define CATCH_FRAME_WORDS 2
+#endif
+
+/* Catch frames are very similar to update frames, but when entering
+ * one we just pop the frame off the stack and perform the correct
+ * kind of return to the activation record underneath us on the stack.
+ */
+
+INFO_TABLE_RET(stg_catch_frame,
+ CATCH_FRAME_WORDS, CATCH_FRAME_BITMAP,
+ CATCH_FRAME,
+ stg_catch_frame_0_ret,
+ stg_catch_frame_1_ret,
+ stg_catch_frame_2_ret,
+ stg_catch_frame_3_ret,
+ stg_catch_frame_4_ret,
+ stg_catch_frame_5_ret,
+ stg_catch_frame_6_ret,
+ stg_catch_frame_7_ret)
+CATCH_FRAME_ENTRY_TEMPLATE(,%ENTRY_CODE(Sp(SP_OFF)))
+
+/* -----------------------------------------------------------------------------
+ * The catch infotable
+ *
+ * This should be exactly the same as would be generated by this STG code
+ *
+ * catch = {x,h} \n {} -> catch#{x,h}
+ *
+ * It is used in deleteThread when reverting blackholes.
+ * -------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_catch,2,0,FUN,"catch","catch")
+{
+ R2 = StgClosure_payload(R1,1); /* h */
+ R1 = StgClosure_payload(R1,0); /* x */
+ jump catchzh_fast;
+}
+
+catchzh_fast
+{
+ /* args: R1 = m :: IO a, R2 = handler :: Exception -> IO a */
+ STK_CHK_GEN(SIZEOF_StgCatchFrame + WDS(1), R1_PTR & R2_PTR, catchzh_fast);
+
+ /* Set up the catch frame */
+ Sp = Sp - SIZEOF_StgCatchFrame;
+ SET_HDR(Sp,stg_catch_frame_info,W_[CCCS]);
+
+ StgCatchFrame_handler(Sp) = R2;
+ StgCatchFrame_exceptions_blocked(Sp) =
+ (StgTSO_blocked_exceptions(CurrentTSO) != NULL);
+ TICK_CATCHF_PUSHED();
+
+ /* Apply R1 to the realworld token */
+ TICK_UNKNOWN_CALL();
+ TICK_SLOW_CALL_v();
+ jump stg_ap_v_fast;
+}
+
+/* -----------------------------------------------------------------------------
+ * The raise infotable
+ *
+ * This should be exactly the same as would be generated by this STG code
+ *
+ * raise = {err} \n {} -> raise#{err}
+ *
+ * It is used in raisezh_fast to update thunks on the update list
+ * -------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_raise,1,0,THUNK_1_0,"raise","raise")
+{
+ R1 = StgThunk_payload(R1,0);
+ jump raisezh_fast;
+}
+
+raisezh_fast
+{
+ W_ handler;
+ W_ raise_closure;
+ W_ frame_type;
+ /* args : R1 :: Exception */
+
+
+#if defined(PROFILING)
+ /* Debugging tool: on raising an exception, show where we are. */
+
+ /* ToDo: currently this is a hack. Would be much better if
+ * the info was only displayed for an *uncaught* exception.
+ */
+ if (RtsFlags_ProfFlags_showCCSOnException(RtsFlags)) {
+ foreign "C" fprintCCS_stderr(W_[CCCS] "ptr");
+ }
+#endif
+
+retry_pop_stack:
+ StgTSO_sp(CurrentTSO) = Sp;
+ frame_type = foreign "C" raiseExceptionHelper(BaseReg "ptr", CurrentTSO "ptr", R1 "ptr");
+ Sp = StgTSO_sp(CurrentTSO);
+ if (frame_type == ATOMICALLY_FRAME) {
+ /* The exception has reached the edge of a memory transaction. Check that
+ * the transaction is valid. If not then perhaps the exception should
+ * not have been thrown: re-run the transaction */
+ W_ trec;
+ W_ r;
+ trec = StgTSO_trec(CurrentTSO);
+ r = foreign "C" stmValidateNestOfTransactions(trec "ptr");
+ foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr");
+ StgTSO_trec(CurrentTSO) = NO_TREC;
+ if (r) {
+ // Transaction was valid: continue searching for a catch frame
+ Sp = Sp + SIZEOF_StgAtomicallyFrame;
+ goto retry_pop_stack;
+ } else {
+ // Transaction was not valid: we retry the exception (otherwise continue
+ // with a further call to raiseExceptionHelper)
+ "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr");
+ StgTSO_trec(CurrentTSO) = trec;
+ R1 = StgAtomicallyFrame_code(Sp);
+ jump stg_ap_v_fast;
+ }
+ }
+
+ if (frame_type == STOP_FRAME) {
+ /*
+ * We've stripped the entire stack, the thread is now dead.
+ * We will leave the stack in a GC'able state, see the stg_stop_thread
+ * entry code in StgStartup.cmm.
+ */
+ Sp = CurrentTSO + TSO_OFFSET_StgTSO_stack
+ + WDS(TO_W_(StgTSO_stack_size(CurrentTSO))) - WDS(2);
+ Sp(1) = R1; /* save the exception */
+ Sp(0) = stg_enter_info; /* so that GC can traverse this stack */
+ StgTSO_what_next(CurrentTSO) = ThreadKilled::I16;
+ SAVE_THREAD_STATE(); /* inline! */
+
+ /* The return code goes in BaseReg->rRet, and BaseReg is returned in R1 */
+ StgRegTable_rRet(BaseReg) = ThreadFinished;
+ R1 = BaseReg;
+
+ jump StgReturn;
+ }
+
+ /* Ok, Sp points to the enclosing CATCH_FRAME or CATCH_STM_FRAME. Pop everything
+ * down to and including this frame, update Su, push R1, and enter the handler.
+ */
+ if (frame_type == CATCH_FRAME) {
+ handler = StgCatchFrame_handler(Sp);
+ } else {
+ handler = StgCatchSTMFrame_handler(Sp);
+ }
+
+ /* Restore the blocked/unblocked state for asynchronous exceptions
+ * at the CATCH_FRAME.
+ *
+ * If exceptions were unblocked, arrange that they are unblocked
+ * again after executing the handler by pushing an
+ * unblockAsyncExceptions_ret stack frame.
+ */
+ W_ frame;
+ frame = Sp;
+ if (frame_type == CATCH_FRAME) {
+ Sp = Sp + SIZEOF_StgCatchFrame;
+ if (StgCatchFrame_exceptions_blocked(frame) == 0) {
+ Sp_adj(-1);
+ Sp(0) = stg_unblockAsyncExceptionszh_ret_info;
+ }
+ } else {
+ Sp = Sp + SIZEOF_StgCatchSTMFrame;
+ }
+
+ /* Ensure that async exceptions are blocked when running the handler.
+ */
+ if (StgTSO_blocked_exceptions(CurrentTSO) == NULL) {
+ StgTSO_blocked_exceptions(CurrentTSO) = END_TSO_QUEUE;
+ }
+
+ /* Call the handler, passing the exception value and a realworld
+ * token as arguments.
+ */
+ Sp_adj(-1);
+ Sp(0) = R1;
+ R1 = handler;
+ Sp_adj(-1);
+ TICK_UNKNOWN_CALL();
+ TICK_SLOW_CALL_pv();
+ jump RET_LBL(stg_ap_pv);
+}
+
+raiseIOzh_fast
+{
+ /* Args :: R1 :: Exception */
+ jump raisezh_fast;
+}
diff --git a/rts/Exception.h b/rts/Exception.h
new file mode 100644
index 0000000000..f7832f4045
--- /dev/null
+++ b/rts/Exception.h
@@ -0,0 +1,40 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Exception support
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef EXCEPTION_H
+#define EXCEPTION_H
+
+extern const StgRetInfoTable stg_blockAsyncExceptionszh_ret_info;
+extern const StgRetInfoTable stg_unblockAsyncExceptionszh_ret_info;
+
+/* Determine whether a thread is interruptible (ie. blocked
+ * indefinitely). Interruptible threads can be sent an exception with
+ * killThread# even if they have async exceptions blocked.
+ */
+STATIC_INLINE int
+interruptible(StgTSO *t)
+{
+ switch (t->why_blocked) {
+ case BlockedOnMVar:
+ case BlockedOnException:
+ case BlockedOnRead:
+ case BlockedOnWrite:
+#if defined(mingw32_HOST_OS)
+ case BlockedOnDoProc:
+#endif
+ case BlockedOnDelay:
+ return 1;
+ // NB. Threads blocked on foreign calls (BlockedOnCCall) are
+ // *not* interruptible. We can't send these threads an exception.
+ default:
+ return 0;
+ }
+}
+
+#endif /* EXCEPTION_H */
+
diff --git a/rts/FrontPanel.c b/rts/FrontPanel.c
new file mode 100644
index 0000000000..579b75bab3
--- /dev/null
+++ b/rts/FrontPanel.c
@@ -0,0 +1,802 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2000
+ *
+ * RTS GTK Front Panel
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifdef RTS_GTK_FRONTPANEL
+
+/* Alas, not Posix. */
+/* #include "PosixSource.h" */
+
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "MBlock.h"
+#include "FrontPanel.h"
+#include "Storage.h"
+#include "Stats.h"
+#include "RtsFlags.h"
+#include "Schedule.h"
+
+#include <gtk/gtk.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "VisSupport.h"
+#include "VisWindow.h"
+
+static GtkWidget *window, *map_drawing_area, *gen_drawing_area;
+static GtkWidget *res_drawing_area;
+static GtkWidget *continue_but, *stop_but, *quit_but;
+static GtkWidget *statusbar;
+static GtkWidget *live_label, *allocated_label;
+static GtkWidget *footprint_label, *alloc_rate_label;
+static GtkWidget *map_ruler, *gen_ruler;
+static GtkWidget *res_vruler, *res_hruler;
+static GtkWidget *running_label, *b_read_label, *b_write_label, *total_label;
+static GtkWidget *b_mvar_label, *b_bh_label, *b_throwto_label, *sleeping_label;
+
+static guint status_context_id;
+
+gboolean continue_now = FALSE, stop_now = FALSE, quit = FALSE;
+UpdateMode update_mode = Continuous;
+
+static GdkPixmap *map_pixmap = NULL;
+static GdkPixmap *gen_pixmap = NULL;
+static GdkPixmap *res_pixmap = NULL;
+
+#define N_GENS 10
+
+static GdkColor
+ bdescr_color = { 0, 0xffff, 0, 0 }, /* red */
+ free_color = { 0, 0, 0, 0xffff }, /* blue */
+ gen_colors[N_GENS] = {
+ { 0, 0, 0xffff, 0 },
+ { 0, 0, 0xf000, 0 },
+ { 0, 0, 0xe000, 0 },
+ { 0, 0, 0xd000, 0 },
+ { 0, 0, 0xc000, 0 },
+ { 0, 0, 0xb000, 0 },
+ { 0, 0, 0xa000, 0 },
+ { 0, 0, 0x9000, 0 },
+ { 0, 0, 0x8000, 0 },
+ { 0, 0, 0x7000, 0 }
+ };
+
+GdkGC *my_gc = NULL;
+
+static void *mem_start = (void *) 0x50000000;
+
+static void colorBlock( void *addr, GdkColor *color,
+ nat block_width, nat block_height,
+ nat blocks_per_line );
+
+static void residencyCensus( void );
+static void updateResidencyGraph( void );
+static void updateThreadsPanel( void );
+
+/* Some code pinched from examples/scribble-simple in the GTK+
+ * distribution.
+ */
+
+/* Create a new backing pixmap of the appropriate size */
+static gint
+configure_event( GtkWidget *widget, GdkEventConfigure *event STG_UNUSED,
+ GdkPixmap **pixmap )
+{
+ if (*pixmap)
+ gdk_pixmap_unref(*pixmap);
+
+ *pixmap = gdk_pixmap_new(widget->window,
+ widget->allocation.width,
+ widget->allocation.height,
+ -1);
+
+ gdk_draw_rectangle (*pixmap,
+ widget->style->white_gc,
+ TRUE,
+ 0, 0,
+ widget->allocation.width,
+ widget->allocation.height);
+
+ debugBelch("configure!\n");
+ updateFrontPanel();
+ return TRUE;
+}
+
+/* Redraw the screen from the backing pixmap */
+static gint
+expose_event( GtkWidget *widget, GdkEventExpose *event, GdkPixmap **pixmap )
+{
+ gdk_draw_pixmap(widget->window,
+ widget->style->fg_gc[GTK_WIDGET_STATE (widget)],
+ *pixmap,
+ event->area.x, event->area.y,
+ event->area.x, event->area.y,
+ event->area.width, event->area.height);
+
+ return FALSE;
+}
+
+void
+initFrontPanel( void )
+{
+ GdkColormap *colormap;
+ GtkWidget *gen_hbox;
+
+ gtk_init( &prog_argc, &prog_argv );
+
+ window = create_GHC_Front_Panel();
+ map_drawing_area = lookup_widget(window, "memmap");
+ gen_drawing_area = lookup_widget(window, "generations");
+ res_drawing_area = lookup_widget(window, "res_drawingarea");
+ stop_but = lookup_widget(window, "stop_but");
+ continue_but = lookup_widget(window, "continue_but");
+ quit_but = lookup_widget(window, "quit_but");
+ statusbar = lookup_widget(window, "statusbar");
+ live_label = lookup_widget(window, "live_label");
+ footprint_label = lookup_widget(window, "footprint_label");
+ allocated_label = lookup_widget(window, "allocated_label");
+ alloc_rate_label = lookup_widget(window, "alloc_rate_label");
+ gen_hbox = lookup_widget(window, "gen_hbox");
+ gen_ruler = lookup_widget(window, "gen_ruler");
+ map_ruler = lookup_widget(window, "map_ruler");
+ res_vruler = lookup_widget(window, "res_vruler");
+ res_hruler = lookup_widget(window, "res_hruler");
+ running_label = lookup_widget(window, "running_label");
+ b_read_label = lookup_widget(window, "blockread_label");
+ b_write_label = lookup_widget(window, "blockwrite_label");
+ b_mvar_label = lookup_widget(window, "blockmvar_label");
+ b_bh_label = lookup_widget(window, "blockbh_label");
+ b_throwto_label = lookup_widget(window, "blockthrowto_label");
+ sleeping_label = lookup_widget(window, "sleeping_label");
+ total_label = lookup_widget(window, "total_label");
+
+ status_context_id =
+ gtk_statusbar_get_context_id( GTK_STATUSBAR(statusbar), "context" );
+
+ /* hook up some signals for the mem map drawing area */
+ gtk_signal_connect (GTK_OBJECT(map_drawing_area), "expose_event",
+ (GtkSignalFunc)expose_event, &map_pixmap);
+ gtk_signal_connect (GTK_OBJECT(map_drawing_area), "configure_event",
+ (GtkSignalFunc)configure_event, &map_pixmap);
+
+ gtk_widget_set_events(map_drawing_area, GDK_EXPOSURE_MASK);
+
+ /* hook up some signals for the gen drawing area */
+ gtk_signal_connect (GTK_OBJECT(gen_drawing_area), "expose_event",
+ (GtkSignalFunc)expose_event, &gen_pixmap);
+ gtk_signal_connect (GTK_OBJECT(gen_drawing_area), "configure_event",
+ (GtkSignalFunc)configure_event, &gen_pixmap);
+
+ gtk_widget_set_events(gen_drawing_area, GDK_EXPOSURE_MASK);
+
+ /* hook up some signals for the res drawing area */
+ gtk_signal_connect (GTK_OBJECT(res_drawing_area), "expose_event",
+ (GtkSignalFunc)expose_event, &res_pixmap);
+ gtk_signal_connect (GTK_OBJECT(res_drawing_area), "configure_event",
+ (GtkSignalFunc)configure_event, &res_pixmap);
+
+ gtk_widget_set_events(res_drawing_area, GDK_EXPOSURE_MASK);
+
+ /* allocate our colors */
+ colormap = gdk_colormap_get_system();
+ gdk_colormap_alloc_color(colormap, &bdescr_color, TRUE, TRUE);
+ gdk_colormap_alloc_color(colormap, &free_color, TRUE, TRUE);
+
+ {
+ gboolean success[N_GENS];
+ gdk_colormap_alloc_colors(colormap, gen_colors, N_GENS, TRUE,
+ TRUE, success);
+ if (!success) { barf("can't allocate colors"); }
+ }
+
+ /* set the labels on the generation histogram */
+ {
+ char buf[64];
+ nat g, s;
+ GtkWidget *label;
+
+ for(g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for(s = 0; s < generations[g].n_steps; s++) {
+ g_snprintf( buf, 64, "%d.%d", g, s );
+ label = gtk_label_new( buf );
+ gtk_box_pack_start( GTK_BOX(gen_hbox), label,
+ TRUE, TRUE, 5 );
+ gtk_widget_show(label);
+ }
+ }
+ }
+
+ gtk_widget_show(window);
+
+ /* wait for the user to press "Continue" before getting going... */
+ gtk_statusbar_push( GTK_STATUSBAR(statusbar), status_context_id,
+ "Program start");
+ gtk_widget_set_sensitive( stop_but, FALSE );
+ continue_now = FALSE;
+ while (continue_now == FALSE) {
+ gtk_main_iteration();
+ }
+ gtk_statusbar_pop( GTK_STATUSBAR(statusbar), status_context_id );
+ gtk_statusbar_push( GTK_STATUSBAR(statusbar), status_context_id,
+ "Running");
+
+ gtk_widget_set_sensitive( continue_but, FALSE );
+ gtk_widget_set_sensitive( stop_but, TRUE );
+ gtk_widget_set_sensitive( quit_but, FALSE );
+
+ while (gtk_events_pending()) {
+ gtk_main_iteration();
+ }
+}
+
+void
+stopFrontPanel( void )
+{
+ gtk_widget_set_sensitive( quit_but, TRUE );
+ gtk_widget_set_sensitive( continue_but, FALSE );
+ gtk_widget_set_sensitive( stop_but, FALSE );
+
+ updateFrontPanel();
+
+ gtk_statusbar_push( GTK_STATUSBAR(statusbar), status_context_id,
+ "Program finished");
+
+ quit = FALSE;
+ while (quit == FALSE) {
+ gtk_main_iteration();
+ }
+}
+
+static void
+waitForContinue( void )
+{
+ gtk_widget_set_sensitive( continue_but, TRUE );
+ gtk_widget_set_sensitive( stop_but, FALSE );
+ stop_now = FALSE;
+ continue_now = FALSE;
+ while (continue_now == FALSE) {
+ gtk_main_iteration();
+ }
+ gtk_widget_set_sensitive( continue_but, FALSE );
+ gtk_widget_set_sensitive( stop_but, TRUE );
+}
+
+void
+updateFrontPanelBeforeGC( nat N )
+{
+ char buf[1000];
+
+ updateFrontPanel();
+
+ if (update_mode == BeforeGC
+ || update_mode == BeforeAfterGC
+ || stop_now == TRUE) {
+ g_snprintf( buf, 1000, "Stopped (before GC, generation %d)", N );
+ gtk_statusbar_push( GTK_STATUSBAR(statusbar), status_context_id, buf );
+ waitForContinue();
+ gtk_statusbar_pop( GTK_STATUSBAR(statusbar), status_context_id );
+ }
+
+ g_snprintf( buf, 1000, "Garbage collecting (generation %d)", N );
+ gtk_statusbar_push( GTK_STATUSBAR(statusbar), status_context_id, buf);
+
+ while (gtk_events_pending()) {
+ gtk_main_iteration();
+ }
+}
+
+static void
+numLabel( GtkWidget *lbl, nat n )
+{
+ char buf[64];
+ g_snprintf(buf, 64, "%d", n);
+ gtk_label_set_text( GTK_LABEL(lbl), buf );
+}
+
+void
+updateFrontPanelAfterGC( nat N, lnat live )
+{
+ char buf[1000];
+
+ gtk_statusbar_pop( GTK_STATUSBAR(statusbar), status_context_id );
+
+ /* is this a major GC? */
+ if (N == RtsFlags.GcFlags.generations-1) {
+ residencyCensus();
+ }
+
+ updateFrontPanel();
+
+ if (update_mode == AfterGC
+ || update_mode == BeforeAfterGC
+ || stop_now == TRUE) {
+ snprintf( buf, 1000, "Stopped (after GC, generation %d)", N );
+ gtk_statusbar_push( GTK_STATUSBAR(statusbar), status_context_id, buf );
+ waitForContinue();
+ gtk_statusbar_pop( GTK_STATUSBAR(statusbar), status_context_id );
+ }
+
+ {
+ double words_to_megs = (1024 * 1024) / sizeof(W_);
+ double time = mut_user_time();
+
+ snprintf( buf, 1000, "%.2f", (double)live / words_to_megs );
+ gtk_label_set_text( GTK_LABEL(live_label), buf );
+
+ snprintf( buf, 1000, "%.2f", (double)total_allocated / words_to_megs );
+ gtk_label_set_text( GTK_LABEL(allocated_label), buf );
+
+ snprintf( buf, 1000, "%.2f",
+ (double)(mblocks_allocated * MBLOCK_SIZE_W) / words_to_megs );
+ gtk_label_set_text( GTK_LABEL(footprint_label), buf );
+
+ if ( time == 0.0 )
+ snprintf( buf, 1000, "%.2f", time );
+ else
+ snprintf( buf, 1000, "%.2f",
+ (double)(total_allocated / words_to_megs) / time );
+ gtk_label_set_text( GTK_LABEL(alloc_rate_label), buf );
+ }
+
+ while (gtk_events_pending()) {
+ gtk_main_iteration();
+ }
+}
+
+void
+updateFrontPanel( void )
+{
+ void *m, *a;
+ bdescr *bd;
+
+ updateThreadsPanel();
+
+ if (my_gc == NULL) {
+ my_gc = gdk_gc_new( window->window );
+ }
+
+ if (map_pixmap != NULL) {
+ nat height, width, blocks_per_line,
+ block_height, block_width, mblock_height;
+
+ height = map_drawing_area->allocation.height;
+ width = map_drawing_area->allocation.width;
+
+ mblock_height = height / mblocks_allocated;
+ blocks_per_line = 16;
+ block_height = mblock_height /
+ ((MBLOCK_SIZE/BLOCK_SIZE) / blocks_per_line);
+ while (block_height == 0) {
+ blocks_per_line *= 2;
+ block_height = mblock_height /
+ ((MBLOCK_SIZE/BLOCK_SIZE) / blocks_per_line);
+ }
+ block_width = width / blocks_per_line;
+
+ gdk_draw_rectangle (map_pixmap,
+ map_drawing_area->style->bg_gc[GTK_STATE_NORMAL],
+ TRUE,
+ 0, 0,
+ map_drawing_area->allocation.width,
+ map_drawing_area->allocation.height);
+
+ for ( m = mem_start;
+ (char *)m < (char *)mem_start +
+ (mblocks_allocated * MBLOCK_SIZE);
+ (char *)m += MBLOCK_SIZE ) {
+
+ /* color the bdescr area first */
+ for (a = m; a < FIRST_BLOCK(m); (char *)a += BLOCK_SIZE) {
+ colorBlock( a, &bdescr_color,
+ block_width, block_height, blocks_per_line );
+ }
+
+#if 0 /* Segfaults because bd appears to be bogus but != NULL. stolz, 2003-06-24 */
+ /* color each block */
+ for (; a <= LAST_BLOCK(m); (char *)a += BLOCK_SIZE) {
+ bd = Bdescr((P_)a);
+ ASSERT(bd->start == a);
+ if (bd->flags & BF_FREE) {
+ colorBlock( a, &free_color,
+ block_width, block_height, blocks_per_line );
+ } else {
+ colorBlock( a, &gen_colors[bd->gen_no],
+ block_width, block_height, blocks_per_line );
+ }
+ }
+#endif
+ }
+
+
+ {
+ nat height = map_drawing_area->allocation.height,
+ block_height, mblock_height;
+
+ block_height = (height / mblocks_allocated) /
+ ((MBLOCK_SIZE/BLOCK_SIZE) / blocks_per_line);
+ if (block_height < 1) block_height = 1;
+ mblock_height = block_height *
+ ((MBLOCK_SIZE/BLOCK_SIZE) / blocks_per_line);
+
+ gtk_ruler_set_range( GTK_RULER(map_ruler), 0,
+ (double)(height * mblocks_allocated) /
+ (double)((mblock_height * mblocks_allocated)),
+ 0,
+ (double)(height * mblocks_allocated) /
+ (double)((mblock_height * mblocks_allocated))
+ );
+ }
+
+ gtk_widget_draw( map_drawing_area, NULL );
+ }
+
+ if (gen_pixmap != NULL) {
+
+ GdkRectangle rect;
+ nat g, s, columns, column, max_blocks, height_blocks,
+ width, height;
+
+ gdk_draw_rectangle (gen_pixmap,
+ gen_drawing_area->style->white_gc,
+ TRUE,
+ 0, 0,
+ gen_drawing_area->allocation.width,
+ gen_drawing_area->allocation.height);
+
+ height = gen_drawing_area->allocation.height;
+ width = gen_drawing_area->allocation.width;
+
+ columns = 0; max_blocks = 0;
+ for(g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ columns += generations[g].n_steps;
+ for(s = 0; s < generations[g].n_steps; s++) {
+ if (generations[g].steps[s].n_blocks > max_blocks) {
+ max_blocks = generations[g].steps[s].n_blocks;
+ }
+ }
+ }
+
+ /* find a reasonable height value larger than max_blocks */
+ {
+ nat n = 0;
+ while (max_blocks != 0) {
+ max_blocks >>= 1; n++;
+ }
+ height_blocks = 1 << n;
+ }
+
+ column = 0;
+ for(g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for(s = 0; s < generations[g].n_steps; s++, column++) {
+ gdk_gc_set_foreground(my_gc, &gen_colors[g]);
+
+ rect.x = column * (width / columns);
+
+ if (generations[g].steps[s].n_blocks == 0)
+ rect.y = height;
+ else
+ rect.y = height -
+ (height * generations[g].steps[s].n_blocks
+ / height_blocks);
+
+ rect.width = (width / columns);
+ rect.height = height - rect.y;
+
+ gdk_draw_rectangle( gen_pixmap, my_gc, TRUE/*filled*/,
+ rect.x, rect.y, rect.width,
+ rect.height );
+ }
+ }
+
+ gtk_ruler_set_range( GTK_RULER(gen_ruler),
+ height_blocks * BLOCK_SIZE / (1024 * 1024),
+ 0, 0,
+ height_blocks * BLOCK_SIZE / (1024 * 1024)
+ );
+
+ gtk_widget_draw( gen_drawing_area, NULL );
+ }
+
+ if (res_pixmap != NULL) {
+ updateResidencyGraph();
+ }
+
+ while (gtk_events_pending()) {
+ gtk_main_iteration_do(FALSE/*don't block*/);
+ }
+}
+
+static void
+colorBlock( void *addr, GdkColor *color,
+ nat block_width, nat block_height, nat blocks_per_line )
+{
+ GdkRectangle rect;
+ nat block_no;
+
+ gdk_gc_set_foreground(my_gc, color);
+
+ block_no = ((char *)addr - (char *)mem_start) / BLOCK_SIZE;
+
+ rect.x = (block_no % blocks_per_line) * block_width;
+ rect.y = block_no / blocks_per_line * block_height;
+ rect.width = block_width;
+ rect.height = block_height;
+ gdk_draw_rectangle( map_pixmap, my_gc, TRUE/*filled*/,
+ rect.x, rect.y, rect.width, rect.height );
+}
+
+static void
+updateThreadsPanel( void )
+{
+ nat running = 0,
+ b_read = 0,
+ b_write = 0,
+ b_mvar = 0,
+ b_throwto = 0,
+ b_bh = 0,
+ sleeping = 0,
+ total = 0;
+
+ StgTSO *t;
+
+ for (t = all_threads; t != END_TSO_QUEUE; t = t->global_link) {
+ switch (t->what_next) {
+ case ThreadKilled: break;
+ case ThreadComplete: break;
+ default:
+ switch (t->why_blocked) {
+ case BlockedOnRead: b_read++; break;
+ case BlockedOnWrite: b_write++; break;
+ case BlockedOnDelay: sleeping++; break;
+ case BlockedOnMVar: b_mvar++; break;
+ case BlockedOnException: b_throwto++; break;
+ case BlockedOnBlackHole: b_bh++; break;
+ case NotBlocked: running++; break;
+ }
+ }
+ }
+ total = running + b_read + b_write + b_mvar + b_throwto + b_bh + sleeping;
+ numLabel(running_label, running);
+ numLabel(b_read_label, b_read);
+ numLabel(b_write_label, b_write);
+ numLabel(b_mvar_label, b_mvar);
+ numLabel(b_bh_label, b_bh);
+ numLabel(b_throwto_label, b_throwto);
+ numLabel(sleeping_label, sleeping);
+ numLabel(total_label, total);
+}
+
+typedef enum { Thunk, Fun, Constr, BlackHole,
+ Array, Thread, Other, N_Cats } ClosureCategory;
+
+#define N_SLICES 100
+
+static nat *res_prof[N_SLICES];
+static double res_time[N_SLICES];
+static nat next_slice = 0;
+
+static void
+residencyCensus( void )
+{
+ nat slice = next_slice++, *prof;
+ bdescr *bd;
+ nat g, s, size, type;
+ StgPtr p;
+ StgInfoTable *info;
+
+ if (slice >= N_SLICES) {
+ barf("too many slices");
+ }
+ res_prof[slice] = stgMallocBytes(N_Cats * sizeof(nat), "residencyCensus");
+ prof = res_prof[slice];
+ memset(prof, 0, N_Cats * sizeof(nat));
+
+ res_time[slice] = mut_user_time();
+
+ for(g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for(s = 0; s < generations[g].n_steps; s++) {
+
+ /* skip over g0s0 if multi-generational */
+ if (RtsFlags.GcFlags.generations > 1 &&
+ g == 0 && s == 0) continue;
+
+ if (RtsFlags.GcFlags.generations == 1) {
+/* bd = generations[g].steps[s].to_blocks; FIXME to_blocks does not exist */
+ } else {
+ bd = generations[g].steps[s].blocks;
+ }
+
+ for (; bd != NULL; bd = bd->link) {
+
+ p = bd->start;
+
+ while (p < bd->free) {
+ info = get_itbl((StgClosure *)p);
+ type = Other;
+
+ switch (info->type) {
+
+ case CONSTR:
+ case BCO:
+ if (((StgClosure *)p)->header.info == &stg_DEAD_WEAK_info) {
+ size = sizeofW(StgWeak);
+ type = Other;
+ break;
+ }
+ /* else, fall through... */
+ case CONSTR_1_0:
+ case CONSTR_0_1:
+ case CONSTR_1_1:
+ case CONSTR_0_2:
+ case CONSTR_2_0:
+ size = sizeW_fromITBL(info);
+ type = Constr;
+ break;
+
+ case FUN_1_0:
+ case FUN_0_1:
+ size = sizeofW(StgHeader) + 1;
+ goto fun;
+ case FUN_1_1:
+ case FUN_0_2:
+ case FUN_2_0:
+ case FUN:
+ size = sizeW_fromITBL(info);
+ fun:
+ type = Fun;
+ break;
+
+ case THUNK_1_0:
+ case THUNK_0_1:
+ case THUNK_SELECTOR:
+ size = sizeofW(StgHeader) + 2;
+ goto thunk;
+ case THUNK_1_1:
+ case THUNK_0_2:
+ case THUNK_2_0:
+ case THUNK:
+ size = sizeW_fromITBL(info);
+ thunk:
+ type = Thunk;
+ break;
+
+ case CAF_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ case SE_BLACKHOLE:
+ case BLACKHOLE:
+/* case BLACKHOLE_BQ: FIXME: case does not exist */
+ size = sizeW_fromITBL(info);
+ type = BlackHole;
+ break;
+
+ case AP:
+ size = pap_sizeW((StgPAP *)p);
+ type = Thunk;
+ break;
+
+ case PAP:
+ size = pap_sizeW((StgPAP *)p);
+ type = Fun;
+ break;
+
+ case ARR_WORDS:
+ size = arr_words_sizeW(stgCast(StgArrWords*,p));
+ type = Array;
+ break;
+
+ case MUT_ARR_PTRS:
+ case MUT_ARR_PTRS_FROZEN:
+ size = mut_arr_ptrs_sizeW((StgMutArrPtrs *)p);
+ type = Array;
+ break;
+
+ case TSO:
+ size = tso_sizeW((StgTSO *)p);
+ type = Thread;
+ break;
+
+ case WEAK:
+ case STABLE_NAME:
+ case MVAR:
+ case MUT_VAR:
+/* case MUT_CONS: FIXME: case does not exist */
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ size = sizeW_fromITBL(info);
+ type = Other;
+ break;
+
+ default:
+ barf("updateResidencyGraph: strange closure "
+ "%d", info->type );
+ }
+
+ prof[type] += size;
+ p += size;
+ }
+ }
+ }
+ }
+
+}
+
+static void
+updateResidencyGraph( void )
+{
+ nat total, prev_total, i, max_res;
+ double time;
+ double time_scale = 1;
+ nat last_slice = next_slice-1;
+ double res_scale = 1; /* in megabytes, doubles */
+ nat *prof;
+ nat width, height;
+ GdkPoint points[4];
+
+ gdk_draw_rectangle (res_pixmap,
+ res_drawing_area->style->bg_gc[GTK_STATE_NORMAL],
+ TRUE,
+ 0, 0,
+ res_drawing_area->allocation.width,
+ res_drawing_area->allocation.height);
+
+ if (next_slice == 0) return;
+
+ time = res_time[last_slice];
+ while (time > time_scale) {
+ time_scale *= 2;
+ }
+
+ max_res = 0;
+ for (i = 0; i < next_slice; i++) {
+ prof = res_prof[i];
+ total = prof[Thunk] + prof[Fun] + prof[Constr] +
+ prof[BlackHole] + prof[Array] + prof[Other];
+ if (total > max_res) {
+ max_res = total;
+ }
+ }
+ while (max_res > res_scale) {
+ res_scale *= 2;
+ }
+
+ height = res_drawing_area->allocation.height;
+ width = res_drawing_area->allocation.width;
+
+ points[0].x = 0;
+ points[0].y = height;
+ points[1].y = height;
+ points[3].x = 0;
+ points[3].y = height;
+
+ gdk_gc_set_foreground(my_gc, &free_color);
+
+ prev_total = 0;
+ for (i = 0; i < next_slice; i++) {
+ prof = res_prof[i];
+ total = prof[Thunk] + prof[Fun] + prof[Constr] +
+ prof[BlackHole] + prof[Array] + prof[Other];
+ points[1].x = width * res_time[i] / time_scale;
+ points[2].x = points[1].x;
+ points[2].y = height - ((height * total) / res_scale);
+ gdk_draw_polygon(res_pixmap, my_gc, TRUE/*filled*/, points, 4);
+ points[3] = points[2];
+ points[0] = points[1];
+ }
+
+ gtk_ruler_set_range( GTK_RULER(res_vruler),
+ res_scale / ((1024*1024)/sizeof(W_)),
+ 0, 0,
+ res_scale / ((1024*1024)/sizeof(W_)) );
+
+ gtk_ruler_set_range( GTK_RULER(res_hruler),
+ 0, time_scale, 0, time_scale );
+
+
+ gtk_widget_draw( res_drawing_area, NULL );
+}
+
+#endif /* RTS_GTK_FRONTPANEL */
diff --git a/rts/FrontPanel.h b/rts/FrontPanel.h
new file mode 100644
index 0000000000..de3b741657
--- /dev/null
+++ b/rts/FrontPanel.h
@@ -0,0 +1,35 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2000-2005
+ *
+ * RTS GTK Front Panel
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef FRONTPANEL_H
+#define FRONTPANEL_H
+
+#ifdef RTS_GTK_FRONTPANEL
+
+#include "Rts.h" /* needed because this file gets included by
+ * auto-generated code */
+
+void initFrontPanel( void );
+void stopFrontPanel( void );
+void updateFrontPanelBeforeGC( nat N );
+void updateFrontPanelAfterGC( nat N, lnat live );
+void updateFrontPanel( void );
+
+
+/* --------- PRIVATE ----------------------------------------- */
+
+#include <gdk/gdktypes.h>
+
+typedef enum { BeforeGC, AfterGC, BeforeAfterGC, Continuous } UpdateMode;
+extern UpdateMode update_mode;
+extern gboolean continue_now, stop_now, quit;
+
+#endif /* RTS_GTK_FRONTPANEL */
+
+#endif /* FRONTPANEL_H */
+
diff --git a/rts/GC.c b/rts/GC.c
new file mode 100644
index 0000000000..a13cd33afa
--- /dev/null
+++ b/rts/GC.c
@@ -0,0 +1,4719 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-2003
+ *
+ * Generational garbage collector
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Apply.h"
+#include "OSThreads.h"
+#include "Storage.h"
+#include "LdvProfile.h"
+#include "Updates.h"
+#include "Stats.h"
+#include "Schedule.h"
+#include "Sanity.h"
+#include "BlockAlloc.h"
+#include "MBlock.h"
+#include "ProfHeap.h"
+#include "SchedAPI.h"
+#include "Weak.h"
+#include "Prelude.h"
+#include "ParTicky.h" // ToDo: move into Rts.h
+#include "GCCompact.h"
+#include "RtsSignals.h"
+#include "STM.h"
+#if defined(GRAN) || defined(PAR)
+# include "GranSimRts.h"
+# include "ParallelRts.h"
+# include "FetchMe.h"
+# if defined(DEBUG)
+# include "Printer.h"
+# include "ParallelDebug.h"
+# endif
+#endif
+#include "HsFFI.h"
+#include "Linker.h"
+#if defined(RTS_GTK_FRONTPANEL)
+#include "FrontPanel.h"
+#endif
+
+#include "RetainerProfile.h"
+
+#include <string.h>
+
+// Turn off inlining when debugging - it obfuscates things
+#ifdef DEBUG
+# undef STATIC_INLINE
+# define STATIC_INLINE static
+#endif
+
+/* STATIC OBJECT LIST.
+ *
+ * During GC:
+ * We maintain a linked list of static objects that are still live.
+ * The requirements for this list are:
+ *
+ * - we need to scan the list while adding to it, in order to
+ * scavenge all the static objects (in the same way that
+ * breadth-first scavenging works for dynamic objects).
+ *
+ * - we need to be able to tell whether an object is already on
+ * the list, to break loops.
+ *
+ * Each static object has a "static link field", which we use for
+ * linking objects on to the list. We use a stack-type list, consing
+ * objects on the front as they are added (this means that the
+ * scavenge phase is depth-first, not breadth-first, but that
+ * shouldn't matter).
+ *
+ * A separate list is kept for objects that have been scavenged
+ * already - this is so that we can zero all the marks afterwards.
+ *
+ * An object is on the list if its static link field is non-zero; this
+ * means that we have to mark the end of the list with '1', not NULL.
+ *
+ * Extra notes for generational GC:
+ *
+ * Each generation has a static object list associated with it. When
+ * collecting generations up to N, we treat the static object lists
+ * from generations > N as roots.
+ *
+ * We build up a static object list while collecting generations 0..N,
+ * which is then appended to the static object list of generation N+1.
+ */
+static StgClosure* static_objects; // live static objects
+StgClosure* scavenged_static_objects; // static objects scavenged so far
+
+/* N is the oldest generation being collected, where the generations
+ * are numbered starting at 0. A major GC (indicated by the major_gc
+ * flag) is when we're collecting all generations. We only attempt to
+ * deal with static objects and GC CAFs when doing a major GC.
+ */
+static nat N;
+static rtsBool major_gc;
+
+/* Youngest generation that objects should be evacuated to in
+ * evacuate(). (Logically an argument to evacuate, but it's static
+ * a lot of the time so we optimise it into a global variable).
+ */
+static nat evac_gen;
+
+/* Whether to do eager promotion or not.
+ */
+static rtsBool eager_promotion;
+
+/* Weak pointers
+ */
+StgWeak *old_weak_ptr_list; // also pending finaliser list
+
+/* Which stage of processing various kinds of weak pointer are we at?
+ * (see traverse_weak_ptr_list() below for discussion).
+ */
+typedef enum { WeakPtrs, WeakThreads, WeakDone } WeakStage;
+static WeakStage weak_stage;
+
+/* List of all threads during GC
+ */
+static StgTSO *old_all_threads;
+StgTSO *resurrected_threads;
+
+/* Flag indicating failure to evacuate an object to the desired
+ * generation.
+ */
+static rtsBool failed_to_evac;
+
+/* Saved nursery (used for 2-space collector only)
+ */
+static bdescr *saved_nursery;
+static nat saved_n_blocks;
+
+/* Data used for allocation area sizing.
+ */
+static lnat new_blocks; // blocks allocated during this GC
+static lnat new_scavd_blocks; // ditto, but depth-first blocks
+static lnat g0s0_pcnt_kept = 30; // percentage of g0s0 live at last minor GC
+
+/* Used to avoid long recursion due to selector thunks
+ */
+static lnat thunk_selector_depth = 0;
+#define MAX_THUNK_SELECTOR_DEPTH 8
+
+/* Mut-list stats */
+#ifdef DEBUG
+static nat
+ mutlist_MUTVARS,
+ mutlist_MUTARRS,
+ mutlist_OTHERS;
+#endif
+
+/* -----------------------------------------------------------------------------
+ Static function declarations
+ -------------------------------------------------------------------------- */
+
+static bdescr * gc_alloc_block ( step *stp );
+static void mark_root ( StgClosure **root );
+
+// Use a register argument for evacuate, if available.
+#if __GNUC__ >= 2
+#define REGPARM1 __attribute__((regparm(1)))
+#else
+#define REGPARM1
+#endif
+
+REGPARM1 static StgClosure * evacuate (StgClosure *q);
+
+static void zero_static_object_list ( StgClosure* first_static );
+
+static rtsBool traverse_weak_ptr_list ( void );
+static void mark_weak_ptr_list ( StgWeak **list );
+
+static StgClosure * eval_thunk_selector ( nat field, StgSelector * p );
+
+
+static void scavenge ( step * );
+static void scavenge_mark_stack ( void );
+static void scavenge_stack ( StgPtr p, StgPtr stack_end );
+static rtsBool scavenge_one ( StgPtr p );
+static void scavenge_large ( step * );
+static void scavenge_static ( void );
+static void scavenge_mutable_list ( generation *g );
+
+static void scavenge_large_bitmap ( StgPtr p,
+ StgLargeBitmap *large_bitmap,
+ nat size );
+
+#if 0 && defined(DEBUG)
+static void gcCAFs ( void );
+#endif
+
+/* -----------------------------------------------------------------------------
+ inline functions etc. for dealing with the mark bitmap & stack.
+ -------------------------------------------------------------------------- */
+
+#define MARK_STACK_BLOCKS 4
+
+static bdescr *mark_stack_bdescr;
+static StgPtr *mark_stack;
+static StgPtr *mark_sp;
+static StgPtr *mark_splim;
+
+// Flag and pointers used for falling back to a linear scan when the
+// mark stack overflows.
+static rtsBool mark_stack_overflowed;
+static bdescr *oldgen_scan_bd;
+static StgPtr oldgen_scan;
+
+STATIC_INLINE rtsBool
+mark_stack_empty(void)
+{
+ return mark_sp == mark_stack;
+}
+
+STATIC_INLINE rtsBool
+mark_stack_full(void)
+{
+ return mark_sp >= mark_splim;
+}
+
+STATIC_INLINE void
+reset_mark_stack(void)
+{
+ mark_sp = mark_stack;
+}
+
+STATIC_INLINE void
+push_mark_stack(StgPtr p)
+{
+ *mark_sp++ = p;
+}
+
+STATIC_INLINE StgPtr
+pop_mark_stack(void)
+{
+ return *--mark_sp;
+}
+
+/* -----------------------------------------------------------------------------
+ Allocate a new to-space block in the given step.
+ -------------------------------------------------------------------------- */
+
+static bdescr *
+gc_alloc_block(step *stp)
+{
+ bdescr *bd = allocBlock();
+ bd->gen_no = stp->gen_no;
+ bd->step = stp;
+ bd->link = NULL;
+
+ // blocks in to-space in generations up to and including N
+ // get the BF_EVACUATED flag.
+ if (stp->gen_no <= N) {
+ bd->flags = BF_EVACUATED;
+ } else {
+ bd->flags = 0;
+ }
+
+ // Start a new to-space block, chain it on after the previous one.
+ if (stp->hp_bd != NULL) {
+ stp->hp_bd->free = stp->hp;
+ stp->hp_bd->link = bd;
+ }
+
+ stp->hp_bd = bd;
+ stp->hp = bd->start;
+ stp->hpLim = stp->hp + BLOCK_SIZE_W;
+
+ stp->n_blocks++;
+ new_blocks++;
+
+ return bd;
+}
+
+static bdescr *
+gc_alloc_scavd_block(step *stp)
+{
+ bdescr *bd = allocBlock();
+ bd->gen_no = stp->gen_no;
+ bd->step = stp;
+
+ // blocks in to-space in generations up to and including N
+ // get the BF_EVACUATED flag.
+ if (stp->gen_no <= N) {
+ bd->flags = BF_EVACUATED;
+ } else {
+ bd->flags = 0;
+ }
+
+ bd->link = stp->blocks;
+ stp->blocks = bd;
+
+ if (stp->scavd_hp != NULL) {
+ Bdescr(stp->scavd_hp)->free = stp->scavd_hp;
+ }
+ stp->scavd_hp = bd->start;
+ stp->scavd_hpLim = stp->scavd_hp + BLOCK_SIZE_W;
+
+ stp->n_blocks++;
+ new_scavd_blocks++;
+
+ return bd;
+}
+
+/* -----------------------------------------------------------------------------
+ GarbageCollect
+
+ Rough outline of the algorithm: for garbage collecting generation N
+ (and all younger generations):
+
+ - follow all pointers in the root set. the root set includes all
+ mutable objects in all generations (mutable_list).
+
+ - for each pointer, evacuate the object it points to into either
+
+ + to-space of the step given by step->to, which is the next
+ highest step in this generation or the first step in the next
+ generation if this is the last step.
+
+ + to-space of generations[evac_gen]->steps[0], if evac_gen != 0.
+ When we evacuate an object we attempt to evacuate
+ everything it points to into the same generation - this is
+ achieved by setting evac_gen to the desired generation. If
+ we can't do this, then an entry in the mut list has to
+ be made for the cross-generation pointer.
+
+ + if the object is already in a generation > N, then leave
+ it alone.
+
+ - repeatedly scavenge to-space from each step in each generation
+ being collected until no more objects can be evacuated.
+
+ - free from-space in each step, and set from-space = to-space.
+
+ Locks held: all capabilities are held throughout GarbageCollect().
+
+ -------------------------------------------------------------------------- */
+
+void
+GarbageCollect ( void (*get_roots)(evac_fn), rtsBool force_major_gc )
+{
+ bdescr *bd;
+ step *stp;
+ lnat live, allocated, copied = 0, scavd_copied = 0;
+ lnat oldgen_saved_blocks = 0;
+ nat g, s, i;
+
+ ACQUIRE_SM_LOCK;
+
+#ifdef PROFILING
+ CostCentreStack *prev_CCS;
+#endif
+
+#if defined(DEBUG) && defined(GRAN)
+ IF_DEBUG(gc, debugBelch("@@ Starting garbage collection at %ld (%lx)\n",
+ Now, Now));
+#endif
+
+#if defined(RTS_USER_SIGNALS)
+ // block signals
+ blockUserSignals();
+#endif
+
+ // tell the STM to discard any cached closures its hoping to re-use
+ stmPreGCHook();
+
+ // tell the stats department that we've started a GC
+ stat_startGC();
+
+#ifdef DEBUG
+ // check for memory leaks if DEBUG is on
+ memInventory();
+#endif
+
+#ifdef DEBUG
+ mutlist_MUTVARS = 0;
+ mutlist_MUTARRS = 0;
+ mutlist_OTHERS = 0;
+#endif
+
+ // Init stats and print par specific (timing) info
+ PAR_TICKY_PAR_START();
+
+ // attribute any costs to CCS_GC
+#ifdef PROFILING
+ prev_CCS = CCCS;
+ CCCS = CCS_GC;
+#endif
+
+ /* Approximate how much we allocated.
+ * Todo: only when generating stats?
+ */
+ allocated = calcAllocated();
+
+ /* Figure out which generation to collect
+ */
+ if (force_major_gc) {
+ N = RtsFlags.GcFlags.generations - 1;
+ major_gc = rtsTrue;
+ } else {
+ N = 0;
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ if (generations[g].steps[0].n_blocks +
+ generations[g].steps[0].n_large_blocks
+ >= generations[g].max_blocks) {
+ N = g;
+ }
+ }
+ major_gc = (N == RtsFlags.GcFlags.generations-1);
+ }
+
+#ifdef RTS_GTK_FRONTPANEL
+ if (RtsFlags.GcFlags.frontpanel) {
+ updateFrontPanelBeforeGC(N);
+ }
+#endif
+
+ // check stack sanity *before* GC (ToDo: check all threads)
+#if defined(GRAN)
+ // ToDo!: check sanity IF_DEBUG(sanity, checkTSOsSanity());
+#endif
+ IF_DEBUG(sanity, checkFreeListSanity());
+
+ /* Initialise the static object lists
+ */
+ static_objects = END_OF_STATIC_LIST;
+ scavenged_static_objects = END_OF_STATIC_LIST;
+
+ /* Save the nursery if we're doing a two-space collection.
+ * g0s0->blocks will be used for to-space, so we need to get the
+ * nursery out of the way.
+ */
+ if (RtsFlags.GcFlags.generations == 1) {
+ saved_nursery = g0s0->blocks;
+ saved_n_blocks = g0s0->n_blocks;
+ g0s0->blocks = NULL;
+ g0s0->n_blocks = 0;
+ }
+
+ /* Keep a count of how many new blocks we allocated during this GC
+ * (used for resizing the allocation area, later).
+ */
+ new_blocks = 0;
+ new_scavd_blocks = 0;
+
+ // Initialise to-space in all the generations/steps that we're
+ // collecting.
+ //
+ for (g = 0; g <= N; g++) {
+
+ // throw away the mutable list. Invariant: the mutable list
+ // always has at least one block; this means we can avoid a check for
+ // NULL in recordMutable().
+ if (g != 0) {
+ freeChain(generations[g].mut_list);
+ generations[g].mut_list = allocBlock();
+ for (i = 0; i < n_capabilities; i++) {
+ freeChain(capabilities[i].mut_lists[g]);
+ capabilities[i].mut_lists[g] = allocBlock();
+ }
+ }
+
+ for (s = 0; s < generations[g].n_steps; s++) {
+
+ // generation 0, step 0 doesn't need to-space
+ if (g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1) {
+ continue;
+ }
+
+ stp = &generations[g].steps[s];
+ ASSERT(stp->gen_no == g);
+
+ // start a new to-space for this step.
+ stp->old_blocks = stp->blocks;
+ stp->n_old_blocks = stp->n_blocks;
+
+ // allocate the first to-space block; extra blocks will be
+ // chained on as necessary.
+ stp->hp_bd = NULL;
+ bd = gc_alloc_block(stp);
+ stp->blocks = bd;
+ stp->n_blocks = 1;
+ stp->scan = bd->start;
+ stp->scan_bd = bd;
+
+ // allocate a block for "already scavenged" objects. This goes
+ // on the front of the stp->blocks list, so it won't be
+ // traversed by the scavenging sweep.
+ gc_alloc_scavd_block(stp);
+
+ // initialise the large object queues.
+ stp->new_large_objects = NULL;
+ stp->scavenged_large_objects = NULL;
+ stp->n_scavenged_large_blocks = 0;
+
+ // mark the large objects as not evacuated yet
+ for (bd = stp->large_objects; bd; bd = bd->link) {
+ bd->flags &= ~BF_EVACUATED;
+ }
+
+ // for a compacted step, we need to allocate the bitmap
+ if (stp->is_compacted) {
+ nat bitmap_size; // in bytes
+ bdescr *bitmap_bdescr;
+ StgWord *bitmap;
+
+ bitmap_size = stp->n_old_blocks * BLOCK_SIZE / (sizeof(W_)*BITS_PER_BYTE);
+
+ if (bitmap_size > 0) {
+ bitmap_bdescr = allocGroup((lnat)BLOCK_ROUND_UP(bitmap_size)
+ / BLOCK_SIZE);
+ stp->bitmap = bitmap_bdescr;
+ bitmap = bitmap_bdescr->start;
+
+ IF_DEBUG(gc, debugBelch("bitmap_size: %d, bitmap: %p",
+ bitmap_size, bitmap););
+
+ // don't forget to fill it with zeros!
+ memset(bitmap, 0, bitmap_size);
+
+ // For each block in this step, point to its bitmap from the
+ // block descriptor.
+ for (bd=stp->old_blocks; bd != NULL; bd = bd->link) {
+ bd->u.bitmap = bitmap;
+ bitmap += BLOCK_SIZE_W / (sizeof(W_)*BITS_PER_BYTE);
+
+ // Also at this point we set the BF_COMPACTED flag
+ // for this block. The invariant is that
+ // BF_COMPACTED is always unset, except during GC
+ // when it is set on those blocks which will be
+ // compacted.
+ bd->flags |= BF_COMPACTED;
+ }
+ }
+ }
+ }
+ }
+
+ /* make sure the older generations have at least one block to
+ * allocate into (this makes things easier for copy(), see below).
+ */
+ for (g = N+1; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ stp = &generations[g].steps[s];
+ if (stp->hp_bd == NULL) {
+ ASSERT(stp->blocks == NULL);
+ bd = gc_alloc_block(stp);
+ stp->blocks = bd;
+ stp->n_blocks = 1;
+ }
+ if (stp->scavd_hp == NULL) {
+ gc_alloc_scavd_block(stp);
+ stp->n_blocks++;
+ }
+ /* Set the scan pointer for older generations: remember we
+ * still have to scavenge objects that have been promoted. */
+ stp->scan = stp->hp;
+ stp->scan_bd = stp->hp_bd;
+ stp->new_large_objects = NULL;
+ stp->scavenged_large_objects = NULL;
+ stp->n_scavenged_large_blocks = 0;
+ }
+
+ /* Move the private mutable lists from each capability onto the
+ * main mutable list for the generation.
+ */
+ for (i = 0; i < n_capabilities; i++) {
+ for (bd = capabilities[i].mut_lists[g];
+ bd->link != NULL; bd = bd->link) {
+ /* nothing */
+ }
+ bd->link = generations[g].mut_list;
+ generations[g].mut_list = capabilities[i].mut_lists[g];
+ capabilities[i].mut_lists[g] = allocBlock();
+ }
+ }
+
+ /* Allocate a mark stack if we're doing a major collection.
+ */
+ if (major_gc) {
+ mark_stack_bdescr = allocGroup(MARK_STACK_BLOCKS);
+ mark_stack = (StgPtr *)mark_stack_bdescr->start;
+ mark_sp = mark_stack;
+ mark_splim = mark_stack + (MARK_STACK_BLOCKS * BLOCK_SIZE_W);
+ } else {
+ mark_stack_bdescr = NULL;
+ }
+
+ eager_promotion = rtsTrue; // for now
+
+ /* -----------------------------------------------------------------------
+ * follow all the roots that we know about:
+ * - mutable lists from each generation > N
+ * we want to *scavenge* these roots, not evacuate them: they're not
+ * going to move in this GC.
+ * Also: do them in reverse generation order. This is because we
+ * often want to promote objects that are pointed to by older
+ * generations early, so we don't have to repeatedly copy them.
+ * Doing the generations in reverse order ensures that we don't end
+ * up in the situation where we want to evac an object to gen 3 and
+ * it has already been evaced to gen 2.
+ */
+ {
+ int st;
+ for (g = RtsFlags.GcFlags.generations-1; g > N; g--) {
+ generations[g].saved_mut_list = generations[g].mut_list;
+ generations[g].mut_list = allocBlock();
+ // mut_list always has at least one block.
+ }
+
+ for (g = RtsFlags.GcFlags.generations-1; g > N; g--) {
+ IF_PAR_DEBUG(verbose, printMutableList(&generations[g]));
+ scavenge_mutable_list(&generations[g]);
+ evac_gen = g;
+ for (st = generations[g].n_steps-1; st >= 0; st--) {
+ scavenge(&generations[g].steps[st]);
+ }
+ }
+ }
+
+ /* follow roots from the CAF list (used by GHCi)
+ */
+ evac_gen = 0;
+ markCAFs(mark_root);
+
+ /* follow all the roots that the application knows about.
+ */
+ evac_gen = 0;
+ get_roots(mark_root);
+
+#if defined(PAR)
+ /* And don't forget to mark the TSO if we got here direct from
+ * Haskell! */
+ /* Not needed in a seq version?
+ if (CurrentTSO) {
+ CurrentTSO = (StgTSO *)MarkRoot((StgClosure *)CurrentTSO);
+ }
+ */
+
+ // Mark the entries in the GALA table of the parallel system
+ markLocalGAs(major_gc);
+ // Mark all entries on the list of pending fetches
+ markPendingFetches(major_gc);
+#endif
+
+ /* Mark the weak pointer list, and prepare to detect dead weak
+ * pointers.
+ */
+ mark_weak_ptr_list(&weak_ptr_list);
+ old_weak_ptr_list = weak_ptr_list;
+ weak_ptr_list = NULL;
+ weak_stage = WeakPtrs;
+
+ /* The all_threads list is like the weak_ptr_list.
+ * See traverse_weak_ptr_list() for the details.
+ */
+ old_all_threads = all_threads;
+ all_threads = END_TSO_QUEUE;
+ resurrected_threads = END_TSO_QUEUE;
+
+ /* Mark the stable pointer table.
+ */
+ markStablePtrTable(mark_root);
+
+ /* -------------------------------------------------------------------------
+ * Repeatedly scavenge all the areas we know about until there's no
+ * more scavenging to be done.
+ */
+ {
+ rtsBool flag;
+ loop:
+ flag = rtsFalse;
+
+ // scavenge static objects
+ if (major_gc && static_objects != END_OF_STATIC_LIST) {
+ IF_DEBUG(sanity, checkStaticObjects(static_objects));
+ scavenge_static();
+ }
+
+ /* When scavenging the older generations: Objects may have been
+ * evacuated from generations <= N into older generations, and we
+ * need to scavenge these objects. We're going to try to ensure that
+ * any evacuations that occur move the objects into at least the
+ * same generation as the object being scavenged, otherwise we
+ * have to create new entries on the mutable list for the older
+ * generation.
+ */
+
+ // scavenge each step in generations 0..maxgen
+ {
+ long gen;
+ int st;
+
+ loop2:
+ // scavenge objects in compacted generation
+ if (mark_stack_overflowed || oldgen_scan_bd != NULL ||
+ (mark_stack_bdescr != NULL && !mark_stack_empty())) {
+ scavenge_mark_stack();
+ flag = rtsTrue;
+ }
+
+ for (gen = RtsFlags.GcFlags.generations; --gen >= 0; ) {
+ for (st = generations[gen].n_steps; --st >= 0; ) {
+ if (gen == 0 && st == 0 && RtsFlags.GcFlags.generations > 1) {
+ continue;
+ }
+ stp = &generations[gen].steps[st];
+ evac_gen = gen;
+ if (stp->hp_bd != stp->scan_bd || stp->scan < stp->hp) {
+ scavenge(stp);
+ flag = rtsTrue;
+ goto loop2;
+ }
+ if (stp->new_large_objects != NULL) {
+ scavenge_large(stp);
+ flag = rtsTrue;
+ goto loop2;
+ }
+ }
+ }
+ }
+
+ if (flag) { goto loop; }
+
+ // must be last... invariant is that everything is fully
+ // scavenged at this point.
+ if (traverse_weak_ptr_list()) { // returns rtsTrue if evaced something
+ goto loop;
+ }
+ }
+
+ /* Update the pointers from the task list - these are
+ * treated as weak pointers because we want to allow a main thread
+ * to get a BlockedOnDeadMVar exception in the same way as any other
+ * thread. Note that the threads should all have been retained by
+ * GC by virtue of being on the all_threads list, we're just
+ * updating pointers here.
+ */
+ {
+ Task *task;
+ StgTSO *tso;
+ for (task = all_tasks; task != NULL; task = task->all_link) {
+ if (!task->stopped && task->tso) {
+ ASSERT(task->tso->bound == task);
+ tso = (StgTSO *) isAlive((StgClosure *)task->tso);
+ if (tso == NULL) {
+ barf("task %p: main thread %d has been GC'd",
+#ifdef THREADED_RTS
+ (void *)task->id,
+#else
+ (void *)task,
+#endif
+ task->tso->id);
+ }
+ task->tso = tso;
+ }
+ }
+ }
+
+#if defined(PAR)
+ // Reconstruct the Global Address tables used in GUM
+ rebuildGAtables(major_gc);
+ IF_DEBUG(sanity, checkLAGAtable(rtsTrue/*check closures, too*/));
+#endif
+
+ // Now see which stable names are still alive.
+ gcStablePtrTable();
+
+ // Tidy the end of the to-space chains
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ stp = &generations[g].steps[s];
+ if (!(g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1)) {
+ ASSERT(Bdescr(stp->hp) == stp->hp_bd);
+ stp->hp_bd->free = stp->hp;
+ Bdescr(stp->scavd_hp)->free = stp->scavd_hp;
+ }
+ }
+ }
+
+#ifdef PROFILING
+ // We call processHeapClosureForDead() on every closure destroyed during
+ // the current garbage collection, so we invoke LdvCensusForDead().
+ if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV
+ || RtsFlags.ProfFlags.bioSelector != NULL)
+ LdvCensusForDead(N);
+#endif
+
+ // NO MORE EVACUATION AFTER THIS POINT!
+ // Finally: compaction of the oldest generation.
+ if (major_gc && oldest_gen->steps[0].is_compacted) {
+ // save number of blocks for stats
+ oldgen_saved_blocks = oldest_gen->steps[0].n_old_blocks;
+ compact(get_roots);
+ }
+
+ IF_DEBUG(sanity, checkGlobalTSOList(rtsFalse));
+
+ /* run through all the generations/steps and tidy up
+ */
+ copied = new_blocks * BLOCK_SIZE_W;
+ scavd_copied = new_scavd_blocks * BLOCK_SIZE_W;
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+
+ if (g <= N) {
+ generations[g].collections++; // for stats
+ }
+
+ // Count the mutable list as bytes "copied" for the purposes of
+ // stats. Every mutable list is copied during every GC.
+ if (g > 0) {
+ nat mut_list_size = 0;
+ for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
+ mut_list_size += bd->free - bd->start;
+ }
+ copied += mut_list_size;
+
+ IF_DEBUG(gc, debugBelch("mut_list_size: %ld (%d vars, %d arrays, %d others)\n", mut_list_size * sizeof(W_), mutlist_MUTVARS, mutlist_MUTARRS, mutlist_OTHERS));
+ }
+
+ for (s = 0; s < generations[g].n_steps; s++) {
+ bdescr *next;
+ stp = &generations[g].steps[s];
+
+ if (!(g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1)) {
+ // stats information: how much we copied
+ if (g <= N) {
+ copied -= stp->hp_bd->start + BLOCK_SIZE_W -
+ stp->hp_bd->free;
+ scavd_copied -= (P_)(BLOCK_ROUND_UP(stp->scavd_hp)) - stp->scavd_hp;
+ }
+ }
+
+ // for generations we collected...
+ if (g <= N) {
+
+ /* free old memory and shift to-space into from-space for all
+ * the collected steps (except the allocation area). These
+ * freed blocks will probaby be quickly recycled.
+ */
+ if (!(g == 0 && s == 0)) {
+ if (stp->is_compacted) {
+ // for a compacted step, just shift the new to-space
+ // onto the front of the now-compacted existing blocks.
+ for (bd = stp->blocks; bd != NULL; bd = bd->link) {
+ bd->flags &= ~BF_EVACUATED; // now from-space
+ }
+ // tack the new blocks on the end of the existing blocks
+ if (stp->old_blocks != NULL) {
+ for (bd = stp->old_blocks; bd != NULL; bd = next) {
+ // NB. this step might not be compacted next
+ // time, so reset the BF_COMPACTED flags.
+ // They are set before GC if we're going to
+ // compact. (search for BF_COMPACTED above).
+ bd->flags &= ~BF_COMPACTED;
+ next = bd->link;
+ if (next == NULL) {
+ bd->link = stp->blocks;
+ }
+ }
+ stp->blocks = stp->old_blocks;
+ }
+ // add the new blocks to the block tally
+ stp->n_blocks += stp->n_old_blocks;
+ ASSERT(countBlocks(stp->blocks) == stp->n_blocks);
+ } else {
+ freeChain(stp->old_blocks);
+ for (bd = stp->blocks; bd != NULL; bd = bd->link) {
+ bd->flags &= ~BF_EVACUATED; // now from-space
+ }
+ }
+ stp->old_blocks = NULL;
+ stp->n_old_blocks = 0;
+ }
+
+ /* LARGE OBJECTS. The current live large objects are chained on
+ * scavenged_large, having been moved during garbage
+ * collection from large_objects. Any objects left on
+ * large_objects list are therefore dead, so we free them here.
+ */
+ for (bd = stp->large_objects; bd != NULL; bd = next) {
+ next = bd->link;
+ freeGroup(bd);
+ bd = next;
+ }
+
+ // update the count of blocks used by large objects
+ for (bd = stp->scavenged_large_objects; bd != NULL; bd = bd->link) {
+ bd->flags &= ~BF_EVACUATED;
+ }
+ stp->large_objects = stp->scavenged_large_objects;
+ stp->n_large_blocks = stp->n_scavenged_large_blocks;
+
+ } else {
+ // for older generations...
+
+ /* For older generations, we need to append the
+ * scavenged_large_object list (i.e. large objects that have been
+ * promoted during this GC) to the large_object list for that step.
+ */
+ for (bd = stp->scavenged_large_objects; bd; bd = next) {
+ next = bd->link;
+ bd->flags &= ~BF_EVACUATED;
+ dbl_link_onto(bd, &stp->large_objects);
+ }
+
+ // add the new blocks we promoted during this GC
+ stp->n_large_blocks += stp->n_scavenged_large_blocks;
+ }
+ }
+ }
+
+ /* Reset the sizes of the older generations when we do a major
+ * collection.
+ *
+ * CURRENT STRATEGY: make all generations except zero the same size.
+ * We have to stay within the maximum heap size, and leave a certain
+ * percentage of the maximum heap size available to allocate into.
+ */
+ if (major_gc && RtsFlags.GcFlags.generations > 1) {
+ nat live, size, min_alloc;
+ nat max = RtsFlags.GcFlags.maxHeapSize;
+ nat gens = RtsFlags.GcFlags.generations;
+
+ // live in the oldest generations
+ live = oldest_gen->steps[0].n_blocks +
+ oldest_gen->steps[0].n_large_blocks;
+
+ // default max size for all generations except zero
+ size = stg_max(live * RtsFlags.GcFlags.oldGenFactor,
+ RtsFlags.GcFlags.minOldGenSize);
+
+ // minimum size for generation zero
+ min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200,
+ RtsFlags.GcFlags.minAllocAreaSize);
+
+ // Auto-enable compaction when the residency reaches a
+ // certain percentage of the maximum heap size (default: 30%).
+ if (RtsFlags.GcFlags.generations > 1 &&
+ (RtsFlags.GcFlags.compact ||
+ (max > 0 &&
+ oldest_gen->steps[0].n_blocks >
+ (RtsFlags.GcFlags.compactThreshold * max) / 100))) {
+ oldest_gen->steps[0].is_compacted = 1;
+// debugBelch("compaction: on\n", live);
+ } else {
+ oldest_gen->steps[0].is_compacted = 0;
+// debugBelch("compaction: off\n", live);
+ }
+
+ // if we're going to go over the maximum heap size, reduce the
+ // size of the generations accordingly. The calculation is
+ // different if compaction is turned on, because we don't need
+ // to double the space required to collect the old generation.
+ if (max != 0) {
+
+ // this test is necessary to ensure that the calculations
+ // below don't have any negative results - we're working
+ // with unsigned values here.
+ if (max < min_alloc) {
+ heapOverflow();
+ }
+
+ if (oldest_gen->steps[0].is_compacted) {
+ if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) {
+ size = (max - min_alloc) / ((gens - 1) * 2 - 1);
+ }
+ } else {
+ if ( (size * (gens - 1) * 2) + min_alloc > max ) {
+ size = (max - min_alloc) / ((gens - 1) * 2);
+ }
+ }
+
+ if (size < live) {
+ heapOverflow();
+ }
+ }
+
+#if 0
+ debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live,
+ min_alloc, size, max);
+#endif
+
+ for (g = 0; g < gens; g++) {
+ generations[g].max_blocks = size;
+ }
+ }
+
+ // Guess the amount of live data for stats.
+ live = calcLive();
+
+ /* Free the small objects allocated via allocate(), since this will
+ * all have been copied into G0S1 now.
+ */
+ if (small_alloc_list != NULL) {
+ freeChain(small_alloc_list);
+ }
+ small_alloc_list = NULL;
+ alloc_blocks = 0;
+ alloc_Hp = NULL;
+ alloc_HpLim = NULL;
+ alloc_blocks_lim = RtsFlags.GcFlags.minAllocAreaSize;
+
+ // Start a new pinned_object_block
+ pinned_object_block = NULL;
+
+ /* Free the mark stack.
+ */
+ if (mark_stack_bdescr != NULL) {
+ freeGroup(mark_stack_bdescr);
+ }
+
+ /* Free any bitmaps.
+ */
+ for (g = 0; g <= N; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ stp = &generations[g].steps[s];
+ if (stp->bitmap != NULL) {
+ freeGroup(stp->bitmap);
+ stp->bitmap = NULL;
+ }
+ }
+ }
+
+ /* Two-space collector:
+ * Free the old to-space, and estimate the amount of live data.
+ */
+ if (RtsFlags.GcFlags.generations == 1) {
+ nat blocks;
+
+ if (g0s0->old_blocks != NULL) {
+ freeChain(g0s0->old_blocks);
+ }
+ for (bd = g0s0->blocks; bd != NULL; bd = bd->link) {
+ bd->flags = 0; // now from-space
+ }
+ g0s0->old_blocks = g0s0->blocks;
+ g0s0->n_old_blocks = g0s0->n_blocks;
+ g0s0->blocks = saved_nursery;
+ g0s0->n_blocks = saved_n_blocks;
+
+ /* For a two-space collector, we need to resize the nursery. */
+
+ /* set up a new nursery. Allocate a nursery size based on a
+ * function of the amount of live data (by default a factor of 2)
+ * Use the blocks from the old nursery if possible, freeing up any
+ * left over blocks.
+ *
+ * If we get near the maximum heap size, then adjust our nursery
+ * size accordingly. If the nursery is the same size as the live
+ * data (L), then we need 3L bytes. We can reduce the size of the
+ * nursery to bring the required memory down near 2L bytes.
+ *
+ * A normal 2-space collector would need 4L bytes to give the same
+ * performance we get from 3L bytes, reducing to the same
+ * performance at 2L bytes.
+ */
+ blocks = g0s0->n_old_blocks;
+
+ if ( RtsFlags.GcFlags.maxHeapSize != 0 &&
+ blocks * RtsFlags.GcFlags.oldGenFactor * 2 >
+ RtsFlags.GcFlags.maxHeapSize ) {
+ long adjusted_blocks; // signed on purpose
+ int pc_free;
+
+ adjusted_blocks = (RtsFlags.GcFlags.maxHeapSize - 2 * blocks);
+ IF_DEBUG(gc, debugBelch("@@ Near maximum heap size of 0x%x blocks, blocks = %d, adjusted to %ld", RtsFlags.GcFlags.maxHeapSize, blocks, adjusted_blocks));
+ pc_free = adjusted_blocks * 100 / RtsFlags.GcFlags.maxHeapSize;
+ if (pc_free < RtsFlags.GcFlags.pcFreeHeap) /* might even be < 0 */ {
+ heapOverflow();
+ }
+ blocks = adjusted_blocks;
+
+ } else {
+ blocks *= RtsFlags.GcFlags.oldGenFactor;
+ if (blocks < RtsFlags.GcFlags.minAllocAreaSize) {
+ blocks = RtsFlags.GcFlags.minAllocAreaSize;
+ }
+ }
+ resizeNurseries(blocks);
+
+ } else {
+ /* Generational collector:
+ * If the user has given us a suggested heap size, adjust our
+ * allocation area to make best use of the memory available.
+ */
+
+ if (RtsFlags.GcFlags.heapSizeSuggestion) {
+ long blocks;
+ nat needed = calcNeeded(); // approx blocks needed at next GC
+
+ /* Guess how much will be live in generation 0 step 0 next time.
+ * A good approximation is obtained by finding the
+ * percentage of g0s0 that was live at the last minor GC.
+ */
+ if (N == 0) {
+ g0s0_pcnt_kept = (new_blocks * 100) / countNurseryBlocks();
+ }
+
+ /* Estimate a size for the allocation area based on the
+ * information available. We might end up going slightly under
+ * or over the suggested heap size, but we should be pretty
+ * close on average.
+ *
+ * Formula: suggested - needed
+ * ----------------------------
+ * 1 + g0s0_pcnt_kept/100
+ *
+ * where 'needed' is the amount of memory needed at the next
+ * collection for collecting all steps except g0s0.
+ */
+ blocks =
+ (((long)RtsFlags.GcFlags.heapSizeSuggestion - (long)needed) * 100) /
+ (100 + (long)g0s0_pcnt_kept);
+
+ if (blocks < (long)RtsFlags.GcFlags.minAllocAreaSize) {
+ blocks = RtsFlags.GcFlags.minAllocAreaSize;
+ }
+
+ resizeNurseries((nat)blocks);
+
+ } else {
+ // we might have added extra large blocks to the nursery, so
+ // resize back to minAllocAreaSize again.
+ resizeNurseriesFixed(RtsFlags.GcFlags.minAllocAreaSize);
+ }
+ }
+
+ // mark the garbage collected CAFs as dead
+#if 0 && defined(DEBUG) // doesn't work at the moment
+ if (major_gc) { gcCAFs(); }
+#endif
+
+#ifdef PROFILING
+ // resetStaticObjectForRetainerProfiling() must be called before
+ // zeroing below.
+ resetStaticObjectForRetainerProfiling();
+#endif
+
+ // zero the scavenged static object list
+ if (major_gc) {
+ zero_static_object_list(scavenged_static_objects);
+ }
+
+ // Reset the nursery
+ resetNurseries();
+
+ // start any pending finalizers
+ RELEASE_SM_LOCK;
+ scheduleFinalizers(last_free_capability, old_weak_ptr_list);
+ ACQUIRE_SM_LOCK;
+
+ // send exceptions to any threads which were about to die
+ RELEASE_SM_LOCK;
+ resurrectThreads(resurrected_threads);
+ ACQUIRE_SM_LOCK;
+
+ // Update the stable pointer hash table.
+ updateStablePtrTable(major_gc);
+
+ // check sanity after GC
+ IF_DEBUG(sanity, checkSanity());
+
+ // extra GC trace info
+ IF_DEBUG(gc, statDescribeGens());
+
+#ifdef DEBUG
+ // symbol-table based profiling
+ /* heapCensus(to_blocks); */ /* ToDo */
+#endif
+
+ // restore enclosing cost centre
+#ifdef PROFILING
+ CCCS = prev_CCS;
+#endif
+
+#ifdef DEBUG
+ // check for memory leaks if DEBUG is on
+ memInventory();
+#endif
+
+#ifdef RTS_GTK_FRONTPANEL
+ if (RtsFlags.GcFlags.frontpanel) {
+ updateFrontPanelAfterGC( N, live );
+ }
+#endif
+
+ // ok, GC over: tell the stats department what happened.
+ stat_endGC(allocated, live, copied, scavd_copied, N);
+
+#if defined(RTS_USER_SIGNALS)
+ // unblock signals again
+ unblockUserSignals();
+#endif
+
+ RELEASE_SM_LOCK;
+
+ //PAR_TICKY_TP();
+}
+
+
+/* -----------------------------------------------------------------------------
+ Weak Pointers
+
+ traverse_weak_ptr_list is called possibly many times during garbage
+ collection. It returns a flag indicating whether it did any work
+ (i.e. called evacuate on any live pointers).
+
+ Invariant: traverse_weak_ptr_list is called when the heap is in an
+ idempotent state. That means that there are no pending
+ evacuate/scavenge operations. This invariant helps the weak
+ pointer code decide which weak pointers are dead - if there are no
+ new live weak pointers, then all the currently unreachable ones are
+ dead.
+
+ For generational GC: we just don't try to finalize weak pointers in
+ older generations than the one we're collecting. This could
+ probably be optimised by keeping per-generation lists of weak
+ pointers, but for a few weak pointers this scheme will work.
+
+ There are three distinct stages to processing weak pointers:
+
+ - weak_stage == WeakPtrs
+
+ We process all the weak pointers whose keys are alive (evacuate
+ their values and finalizers), and repeat until we can find no new
+ live keys. If no live keys are found in this pass, then we
+ evacuate the finalizers of all the dead weak pointers in order to
+ run them.
+
+ - weak_stage == WeakThreads
+
+ Now, we discover which *threads* are still alive. Pointers to
+ threads from the all_threads and main thread lists are the
+ weakest of all: a pointer from the finalizer of a dead weak
+ pointer can keep a thread alive. Any threads found to be unreachable
+ are evacuated and placed on the resurrected_threads list so we
+ can send them a signal later.
+
+ - weak_stage == WeakDone
+
+ No more evacuation is done.
+
+ -------------------------------------------------------------------------- */
+
+static rtsBool
+traverse_weak_ptr_list(void)
+{
+ StgWeak *w, **last_w, *next_w;
+ StgClosure *new;
+ rtsBool flag = rtsFalse;
+
+ switch (weak_stage) {
+
+ case WeakDone:
+ return rtsFalse;
+
+ case WeakPtrs:
+ /* doesn't matter where we evacuate values/finalizers to, since
+ * these pointers are treated as roots (iff the keys are alive).
+ */
+ evac_gen = 0;
+
+ last_w = &old_weak_ptr_list;
+ for (w = old_weak_ptr_list; w != NULL; w = next_w) {
+
+ /* There might be a DEAD_WEAK on the list if finalizeWeak# was
+ * called on a live weak pointer object. Just remove it.
+ */
+ if (w->header.info == &stg_DEAD_WEAK_info) {
+ next_w = ((StgDeadWeak *)w)->link;
+ *last_w = next_w;
+ continue;
+ }
+
+ switch (get_itbl(w)->type) {
+
+ case EVACUATED:
+ next_w = (StgWeak *)((StgEvacuated *)w)->evacuee;
+ *last_w = next_w;
+ continue;
+
+ case WEAK:
+ /* Now, check whether the key is reachable.
+ */
+ new = isAlive(w->key);
+ if (new != NULL) {
+ w->key = new;
+ // evacuate the value and finalizer
+ w->value = evacuate(w->value);
+ w->finalizer = evacuate(w->finalizer);
+ // remove this weak ptr from the old_weak_ptr list
+ *last_w = w->link;
+ // and put it on the new weak ptr list
+ next_w = w->link;
+ w->link = weak_ptr_list;
+ weak_ptr_list = w;
+ flag = rtsTrue;
+ IF_DEBUG(weak, debugBelch("Weak pointer still alive at %p -> %p",
+ w, w->key));
+ continue;
+ }
+ else {
+ last_w = &(w->link);
+ next_w = w->link;
+ continue;
+ }
+
+ default:
+ barf("traverse_weak_ptr_list: not WEAK");
+ }
+ }
+
+ /* If we didn't make any changes, then we can go round and kill all
+ * the dead weak pointers. The old_weak_ptr list is used as a list
+ * of pending finalizers later on.
+ */
+ if (flag == rtsFalse) {
+ for (w = old_weak_ptr_list; w; w = w->link) {
+ w->finalizer = evacuate(w->finalizer);
+ }
+
+ // Next, move to the WeakThreads stage after fully
+ // scavenging the finalizers we've just evacuated.
+ weak_stage = WeakThreads;
+ }
+
+ return rtsTrue;
+
+ case WeakThreads:
+ /* Now deal with the all_threads list, which behaves somewhat like
+ * the weak ptr list. If we discover any threads that are about to
+ * become garbage, we wake them up and administer an exception.
+ */
+ {
+ StgTSO *t, *tmp, *next, **prev;
+
+ prev = &old_all_threads;
+ for (t = old_all_threads; t != END_TSO_QUEUE; t = next) {
+
+ tmp = (StgTSO *)isAlive((StgClosure *)t);
+
+ if (tmp != NULL) {
+ t = tmp;
+ }
+
+ ASSERT(get_itbl(t)->type == TSO);
+ switch (t->what_next) {
+ case ThreadRelocated:
+ next = t->link;
+ *prev = next;
+ continue;
+ case ThreadKilled:
+ case ThreadComplete:
+ // finshed or died. The thread might still be alive, but we
+ // don't keep it on the all_threads list. Don't forget to
+ // stub out its global_link field.
+ next = t->global_link;
+ t->global_link = END_TSO_QUEUE;
+ *prev = next;
+ continue;
+ default:
+ ;
+ }
+
+ // Threads blocked on black holes: if the black hole
+ // is alive, then the thread is alive too.
+ if (tmp == NULL && t->why_blocked == BlockedOnBlackHole) {
+ if (isAlive(t->block_info.closure)) {
+ t = (StgTSO *)evacuate((StgClosure *)t);
+ tmp = t;
+ flag = rtsTrue;
+ }
+ }
+
+ if (tmp == NULL) {
+ // not alive (yet): leave this thread on the
+ // old_all_threads list.
+ prev = &(t->global_link);
+ next = t->global_link;
+ }
+ else {
+ // alive: move this thread onto the all_threads list.
+ next = t->global_link;
+ t->global_link = all_threads;
+ all_threads = t;
+ *prev = next;
+ }
+ }
+ }
+
+ /* If we evacuated any threads, we need to go back to the scavenger.
+ */
+ if (flag) return rtsTrue;
+
+ /* And resurrect any threads which were about to become garbage.
+ */
+ {
+ StgTSO *t, *tmp, *next;
+ for (t = old_all_threads; t != END_TSO_QUEUE; t = next) {
+ next = t->global_link;
+ tmp = (StgTSO *)evacuate((StgClosure *)t);
+ tmp->global_link = resurrected_threads;
+ resurrected_threads = tmp;
+ }
+ }
+
+ /* Finally, we can update the blackhole_queue. This queue
+ * simply strings together TSOs blocked on black holes, it is
+ * not intended to keep anything alive. Hence, we do not follow
+ * pointers on the blackhole_queue until now, when we have
+ * determined which TSOs are otherwise reachable. We know at
+ * this point that all TSOs have been evacuated, however.
+ */
+ {
+ StgTSO **pt;
+ for (pt = &blackhole_queue; *pt != END_TSO_QUEUE; pt = &((*pt)->link)) {
+ *pt = (StgTSO *)isAlive((StgClosure *)*pt);
+ ASSERT(*pt != NULL);
+ }
+ }
+
+ weak_stage = WeakDone; // *now* we're done,
+ return rtsTrue; // but one more round of scavenging, please
+
+ default:
+ barf("traverse_weak_ptr_list");
+ return rtsTrue;
+ }
+
+}
+
+/* -----------------------------------------------------------------------------
+ After GC, the live weak pointer list may have forwarding pointers
+ on it, because a weak pointer object was evacuated after being
+ moved to the live weak pointer list. We remove those forwarding
+ pointers here.
+
+ Also, we don't consider weak pointer objects to be reachable, but
+ we must nevertheless consider them to be "live" and retain them.
+ Therefore any weak pointer objects which haven't as yet been
+ evacuated need to be evacuated now.
+ -------------------------------------------------------------------------- */
+
+
+static void
+mark_weak_ptr_list ( StgWeak **list )
+{
+ StgWeak *w, **last_w;
+
+ last_w = list;
+ for (w = *list; w; w = w->link) {
+ // w might be WEAK, EVACUATED, or DEAD_WEAK (actually CON_STATIC) here
+ ASSERT(w->header.info == &stg_DEAD_WEAK_info
+ || get_itbl(w)->type == WEAK || get_itbl(w)->type == EVACUATED);
+ w = (StgWeak *)evacuate((StgClosure *)w);
+ *last_w = w;
+ last_w = &(w->link);
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ isAlive determines whether the given closure is still alive (after
+ a garbage collection) or not. It returns the new address of the
+ closure if it is alive, or NULL otherwise.
+
+ NOTE: Use it before compaction only!
+ -------------------------------------------------------------------------- */
+
+
+StgClosure *
+isAlive(StgClosure *p)
+{
+ const StgInfoTable *info;
+ bdescr *bd;
+
+ while (1) {
+
+ ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+ info = get_itbl(p);
+
+ // ignore static closures
+ //
+ // ToDo: for static closures, check the static link field.
+ // Problem here is that we sometimes don't set the link field, eg.
+ // for static closures with an empty SRT or CONSTR_STATIC_NOCAFs.
+ //
+ if (!HEAP_ALLOCED(p)) {
+ return p;
+ }
+
+ // ignore closures in generations that we're not collecting.
+ bd = Bdescr((P_)p);
+ if (bd->gen_no > N) {
+ return p;
+ }
+
+ // if it's a pointer into to-space, then we're done
+ if (bd->flags & BF_EVACUATED) {
+ return p;
+ }
+
+ // large objects use the evacuated flag
+ if (bd->flags & BF_LARGE) {
+ return NULL;
+ }
+
+ // check the mark bit for compacted steps
+ if ((bd->flags & BF_COMPACTED) && is_marked((P_)p,bd)) {
+ return p;
+ }
+
+ switch (info->type) {
+
+ case IND:
+ case IND_STATIC:
+ case IND_PERM:
+ case IND_OLDGEN: // rely on compatible layout with StgInd
+ case IND_OLDGEN_PERM:
+ // follow indirections
+ p = ((StgInd *)p)->indirectee;
+ continue;
+
+ case EVACUATED:
+ // alive!
+ return ((StgEvacuated *)p)->evacuee;
+
+ case TSO:
+ if (((StgTSO *)p)->what_next == ThreadRelocated) {
+ p = (StgClosure *)((StgTSO *)p)->link;
+ continue;
+ }
+ return NULL;
+
+ default:
+ // dead.
+ return NULL;
+ }
+ }
+}
+
+static void
+mark_root(StgClosure **root)
+{
+ *root = evacuate(*root);
+}
+
+STATIC_INLINE void
+upd_evacuee(StgClosure *p, StgClosure *dest)
+{
+ // not true: (ToDo: perhaps it should be)
+ // ASSERT(Bdescr((P_)dest)->flags & BF_EVACUATED);
+ SET_INFO(p, &stg_EVACUATED_info);
+ ((StgEvacuated *)p)->evacuee = dest;
+}
+
+
+STATIC_INLINE StgClosure *
+copy(StgClosure *src, nat size, step *stp)
+{
+ StgPtr to, from;
+ nat i;
+#ifdef PROFILING
+ // @LDV profiling
+ nat size_org = size;
+#endif
+
+ TICK_GC_WORDS_COPIED(size);
+ /* Find out where we're going, using the handy "to" pointer in
+ * the step of the source object. If it turns out we need to
+ * evacuate to an older generation, adjust it here (see comment
+ * by evacuate()).
+ */
+ if (stp->gen_no < evac_gen) {
+ if (eager_promotion) {
+ stp = &generations[evac_gen].steps[0];
+ } else {
+ failed_to_evac = rtsTrue;
+ }
+ }
+
+ /* chain a new block onto the to-space for the destination step if
+ * necessary.
+ */
+ if (stp->hp + size >= stp->hpLim) {
+ gc_alloc_block(stp);
+ }
+
+ to = stp->hp;
+ from = (StgPtr)src;
+ stp->hp = to + size;
+ for (i = 0; i < size; i++) { // unroll for small i
+ to[i] = from[i];
+ }
+ upd_evacuee((StgClosure *)from,(StgClosure *)to);
+
+#ifdef PROFILING
+ // We store the size of the just evacuated object in the LDV word so that
+ // the profiler can guess the position of the next object later.
+ SET_EVACUAEE_FOR_LDV(from, size_org);
+#endif
+ return (StgClosure *)to;
+}
+
+// Same as copy() above, except the object will be allocated in memory
+// that will not be scavenged. Used for object that have no pointer
+// fields.
+STATIC_INLINE StgClosure *
+copy_noscav(StgClosure *src, nat size, step *stp)
+{
+ StgPtr to, from;
+ nat i;
+#ifdef PROFILING
+ // @LDV profiling
+ nat size_org = size;
+#endif
+
+ TICK_GC_WORDS_COPIED(size);
+ /* Find out where we're going, using the handy "to" pointer in
+ * the step of the source object. If it turns out we need to
+ * evacuate to an older generation, adjust it here (see comment
+ * by evacuate()).
+ */
+ if (stp->gen_no < evac_gen) {
+ if (eager_promotion) {
+ stp = &generations[evac_gen].steps[0];
+ } else {
+ failed_to_evac = rtsTrue;
+ }
+ }
+
+ /* chain a new block onto the to-space for the destination step if
+ * necessary.
+ */
+ if (stp->scavd_hp + size >= stp->scavd_hpLim) {
+ gc_alloc_scavd_block(stp);
+ }
+
+ to = stp->scavd_hp;
+ from = (StgPtr)src;
+ stp->scavd_hp = to + size;
+ for (i = 0; i < size; i++) { // unroll for small i
+ to[i] = from[i];
+ }
+ upd_evacuee((StgClosure *)from,(StgClosure *)to);
+
+#ifdef PROFILING
+ // We store the size of the just evacuated object in the LDV word so that
+ // the profiler can guess the position of the next object later.
+ SET_EVACUAEE_FOR_LDV(from, size_org);
+#endif
+ return (StgClosure *)to;
+}
+
+/* Special version of copy() for when we only want to copy the info
+ * pointer of an object, but reserve some padding after it. This is
+ * used to optimise evacuation of BLACKHOLEs.
+ */
+
+
+static StgClosure *
+copyPart(StgClosure *src, nat size_to_reserve, nat size_to_copy, step *stp)
+{
+ P_ dest, to, from;
+#ifdef PROFILING
+ // @LDV profiling
+ nat size_to_copy_org = size_to_copy;
+#endif
+
+ TICK_GC_WORDS_COPIED(size_to_copy);
+ if (stp->gen_no < evac_gen) {
+ if (eager_promotion) {
+ stp = &generations[evac_gen].steps[0];
+ } else {
+ failed_to_evac = rtsTrue;
+ }
+ }
+
+ if (stp->hp + size_to_reserve >= stp->hpLim) {
+ gc_alloc_block(stp);
+ }
+
+ for(to = stp->hp, from = (P_)src; size_to_copy>0; --size_to_copy) {
+ *to++ = *from++;
+ }
+
+ dest = stp->hp;
+ stp->hp += size_to_reserve;
+ upd_evacuee(src,(StgClosure *)dest);
+#ifdef PROFILING
+ // We store the size of the just evacuated object in the LDV word so that
+ // the profiler can guess the position of the next object later.
+ // size_to_copy_org is wrong because the closure already occupies size_to_reserve
+ // words.
+ SET_EVACUAEE_FOR_LDV(src, size_to_reserve);
+ // fill the slop
+ if (size_to_reserve - size_to_copy_org > 0)
+ LDV_FILL_SLOP(stp->hp - 1, (int)(size_to_reserve - size_to_copy_org));
+#endif
+ return (StgClosure *)dest;
+}
+
+
+/* -----------------------------------------------------------------------------
+ Evacuate a large object
+
+ This just consists of removing the object from the (doubly-linked)
+ step->large_objects list, and linking it on to the (singly-linked)
+ step->new_large_objects list, from where it will be scavenged later.
+
+ Convention: bd->flags has BF_EVACUATED set for a large object
+ that has been evacuated, or unset otherwise.
+ -------------------------------------------------------------------------- */
+
+
+STATIC_INLINE void
+evacuate_large(StgPtr p)
+{
+ bdescr *bd = Bdescr(p);
+ step *stp;
+
+ // object must be at the beginning of the block (or be a ByteArray)
+ ASSERT(get_itbl((StgClosure *)p)->type == ARR_WORDS ||
+ (((W_)p & BLOCK_MASK) == 0));
+
+ // already evacuated?
+ if (bd->flags & BF_EVACUATED) {
+ /* Don't forget to set the failed_to_evac flag if we didn't get
+ * the desired destination (see comments in evacuate()).
+ */
+ if (bd->gen_no < evac_gen) {
+ failed_to_evac = rtsTrue;
+ TICK_GC_FAILED_PROMOTION();
+ }
+ return;
+ }
+
+ stp = bd->step;
+ // remove from large_object list
+ if (bd->u.back) {
+ bd->u.back->link = bd->link;
+ } else { // first object in the list
+ stp->large_objects = bd->link;
+ }
+ if (bd->link) {
+ bd->link->u.back = bd->u.back;
+ }
+
+ /* link it on to the evacuated large object list of the destination step
+ */
+ stp = bd->step->to;
+ if (stp->gen_no < evac_gen) {
+ if (eager_promotion) {
+ stp = &generations[evac_gen].steps[0];
+ } else {
+ failed_to_evac = rtsTrue;
+ }
+ }
+
+ bd->step = stp;
+ bd->gen_no = stp->gen_no;
+ bd->link = stp->new_large_objects;
+ stp->new_large_objects = bd;
+ bd->flags |= BF_EVACUATED;
+}
+
+/* -----------------------------------------------------------------------------
+ Evacuate
+
+ This is called (eventually) for every live object in the system.
+
+ The caller to evacuate specifies a desired generation in the
+ evac_gen global variable. The following conditions apply to
+ evacuating an object which resides in generation M when we're
+ collecting up to generation N
+
+ if M >= evac_gen
+ if M > N do nothing
+ else evac to step->to
+
+ if M < evac_gen evac to evac_gen, step 0
+
+ if the object is already evacuated, then we check which generation
+ it now resides in.
+
+ if M >= evac_gen do nothing
+ if M < evac_gen set failed_to_evac flag to indicate that we
+ didn't manage to evacuate this object into evac_gen.
+
+
+ OPTIMISATION NOTES:
+
+ evacuate() is the single most important function performance-wise
+ in the GC. Various things have been tried to speed it up, but as
+ far as I can tell the code generated by gcc 3.2 with -O2 is about
+ as good as it's going to get. We pass the argument to evacuate()
+ in a register using the 'regparm' attribute (see the prototype for
+ evacuate() near the top of this file).
+
+ Changing evacuate() to take an (StgClosure **) rather than
+ returning the new pointer seems attractive, because we can avoid
+ writing back the pointer when it hasn't changed (eg. for a static
+ object, or an object in a generation > N). However, I tried it and
+ it doesn't help. One reason is that the (StgClosure **) pointer
+ gets spilled to the stack inside evacuate(), resulting in far more
+ extra reads/writes than we save.
+ -------------------------------------------------------------------------- */
+
+/* Evacuate the closure q: ensure it survives this GC (by copying it
+ * to to-space, marking it, or re-linking it, as appropriate for its
+ * kind) and return its new address.  See the block comment above for
+ * the full contract: evac_gen and failed_to_evac record whether the
+ * object could be promoted to the requested generation.
+ */
+REGPARM1 static StgClosure *
+evacuate(StgClosure *q)
+{
+#if defined(PAR)
+  StgClosure *to;
+#endif
+  bdescr *bd = NULL;
+  step *stp;
+  const StgInfoTable *info;
+
+loop:
+  ASSERT(LOOKS_LIKE_CLOSURE_PTR(q));
+
+  if (!HEAP_ALLOCED(q)) {
+
+      if (!major_gc) return q;
+
+      info = get_itbl(q);
+      switch (info->type) {
+
+      case THUNK_STATIC:
+          if (info->srt_bitmap != 0 &&
+              *THUNK_STATIC_LINK((StgClosure *)q) == NULL) {
+              *THUNK_STATIC_LINK((StgClosure *)q) = static_objects;
+              static_objects = (StgClosure *)q;
+          }
+          return q;
+
+      case FUN_STATIC:
+          if (info->srt_bitmap != 0 &&
+              *FUN_STATIC_LINK((StgClosure *)q) == NULL) {
+              *FUN_STATIC_LINK((StgClosure *)q) = static_objects;
+              static_objects = (StgClosure *)q;
+          }
+          return q;
+
+      case IND_STATIC:
+          /* If q->saved_info != NULL, then it's a revertible CAF - it'll be
+           * on the CAF list, so don't do anything with it here (we'll
+           * scavenge it later).
+           */
+          if (((StgIndStatic *)q)->saved_info == NULL
+              && *IND_STATIC_LINK((StgClosure *)q) == NULL) {
+              *IND_STATIC_LINK((StgClosure *)q) = static_objects;
+              static_objects = (StgClosure *)q;
+          }
+          return q;
+
+      case CONSTR_STATIC:
+          if (*STATIC_LINK(info,(StgClosure *)q) == NULL) {
+              *STATIC_LINK(info,(StgClosure *)q) = static_objects;
+              static_objects = (StgClosure *)q;
+          }
+          return q;
+
+      case CONSTR_INTLIKE:
+      case CONSTR_CHARLIKE:
+      case CONSTR_NOCAF_STATIC:
+          /* no need to put these on the static linked list, they don't need
+           * to be scavenged.
+           */
+          return q;
+
+      default:
+          barf("evacuate(static): strange closure type %d", (int)(info->type));
+      }
+  }
+
+  bd = Bdescr((P_)q);
+
+  if (bd->gen_no > N) {
+      /* Can't evacuate this object, because it's in a generation
+       * older than the ones we're collecting.  Let's hope that it's
+       * in evac_gen or older, or we will have to arrange to track
+       * this pointer using the mutable list.
+       */
+      if (bd->gen_no < evac_gen) {
+          // nope
+          failed_to_evac = rtsTrue;
+          TICK_GC_FAILED_PROMOTION();
+      }
+      return q;
+  }
+
+  if ((bd->flags & (BF_LARGE | BF_COMPACTED | BF_EVACUATED)) != 0) {
+
+      /* pointer into to-space: just return it.  This normally
+       * shouldn't happen, but allowing it makes certain things
+       * slightly easier (eg. the mutable list can contain the same
+       * object twice, for example).
+       */
+      if (bd->flags & BF_EVACUATED) {
+          if (bd->gen_no < evac_gen) {
+              failed_to_evac = rtsTrue;
+              TICK_GC_FAILED_PROMOTION();
+          }
+          return q;
+      }
+
+      /* evacuate large objects by re-linking them onto a different list.
+       */
+      if (bd->flags & BF_LARGE) {
+          info = get_itbl(q);
+          if (info->type == TSO &&
+              ((StgTSO *)q)->what_next == ThreadRelocated) {
+              q = (StgClosure *)((StgTSO *)q)->link;
+              goto loop;
+          }
+          evacuate_large((P_)q);
+          return q;
+      }
+
+      /* If the object is in a step that we're compacting, then we
+       * need to use an alternative evacuate procedure.
+       */
+      if (bd->flags & BF_COMPACTED) {
+          if (!is_marked((P_)q,bd)) {
+              mark((P_)q,bd);
+              if (mark_stack_full()) {
+                  mark_stack_overflowed = rtsTrue;
+                  reset_mark_stack();
+              }
+              push_mark_stack((P_)q);
+          }
+          return q;
+      }
+  }
+
+  stp = bd->step->to;
+
+  info = get_itbl(q);
+
+  switch (info->type) {
+
+  case MUT_VAR_CLEAN:
+  case MUT_VAR_DIRTY:
+  case MVAR:
+      return copy(q,sizeW_fromITBL(info),stp);
+
+  case CONSTR_0_1:
+  {
+      StgWord w = (StgWord)q->payload[0];
+      if (q->header.info == Czh_con_info &&
+          // unsigned, so always true:  (StgChar)w >= MIN_CHARLIKE &&
+          (StgChar)w <= MAX_CHARLIKE) {
+          return (StgClosure *)CHARLIKE_CLOSURE((StgChar)w);
+      }
+      if (q->header.info == Izh_con_info &&
+          (StgInt)w >= MIN_INTLIKE && (StgInt)w <= MAX_INTLIKE) {
+          return (StgClosure *)INTLIKE_CLOSURE((StgInt)w);
+      }
+      // else
+      return copy_noscav(q,sizeofW(StgHeader)+1,stp);
+  }
+
+  case FUN_0_1:
+  case FUN_1_0:
+  case CONSTR_1_0:
+      return copy(q,sizeofW(StgHeader)+1,stp);
+
+  case THUNK_1_0:
+  case THUNK_0_1:
+      return copy(q,sizeofW(StgThunk)+1,stp);
+
+  case THUNK_1_1:
+  case THUNK_2_0:
+  case THUNK_0_2:
+#ifdef NO_PROMOTE_THUNKS
+      if (bd->gen_no == 0 &&
+          bd->step->no != 0 &&
+          bd->step->no == generations[bd->gen_no].n_steps-1) {
+          stp = bd->step;
+      }
+#endif
+      return copy(q,sizeofW(StgThunk)+2,stp);
+
+  case FUN_1_1:
+  case FUN_2_0:
+  case CONSTR_1_1:
+  case CONSTR_2_0:
+  case FUN_0_2:
+      return copy(q,sizeofW(StgHeader)+2,stp);
+
+  case CONSTR_0_2:
+      return copy_noscav(q,sizeofW(StgHeader)+2,stp);
+
+  case THUNK:
+      return copy(q,thunk_sizeW_fromITBL(info),stp);
+
+  case FUN:
+  case CONSTR:
+  case IND_PERM:
+  case IND_OLDGEN_PERM:
+  case WEAK:
+  case STABLE_NAME:
+      return copy(q,sizeW_fromITBL(info),stp);
+
+  case BCO:
+      return copy(q,bco_sizeW((StgBCO *)q),stp);
+
+  case CAF_BLACKHOLE:
+  case SE_CAF_BLACKHOLE:
+  case SE_BLACKHOLE:
+  case BLACKHOLE:
+      return copyPart(q,BLACKHOLE_sizeW(),sizeofW(StgHeader),stp);
+
+  case THUNK_SELECTOR:
+  {
+      StgClosure *p;
+      const StgInfoTable *info_ptr;
+
+      if (thunk_selector_depth > MAX_THUNK_SELECTOR_DEPTH) {
+          return copy(q,THUNK_SELECTOR_sizeW(),stp);
+      }
+
+      // stashed away for LDV profiling, see below
+      info_ptr = q->header.info;
+
+      p = eval_thunk_selector(info->layout.selector_offset,
+                              (StgSelector *)q);
+
+      if (p == NULL) {
+          return copy(q,THUNK_SELECTOR_sizeW(),stp);
+      } else {
+          StgClosure *val;
+          // q is still BLACKHOLE'd.
+          thunk_selector_depth++;
+          val = evacuate(p);
+          thunk_selector_depth--;
+
+#ifdef PROFILING
+          // For the purposes of LDV profiling, we have destroyed
+          // the original selector thunk.
+          SET_INFO(q, info_ptr);
+          LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(q);
+#endif
+
+          // Update the THUNK_SELECTOR with an indirection to the
+          // EVACUATED closure now at p.  Why do this rather than
+          // upd_evacuee(q,p)?  Because we have an invariant that an
+          // EVACUATED closure always points to an object in the
+          // same or an older generation (required by the short-cut
+          // test in the EVACUATED case, below).
+          SET_INFO(q, &stg_IND_info);
+          ((StgInd *)q)->indirectee = p;
+
+          // For the purposes of LDV profiling, we have created an
+          // indirection.
+          LDV_RECORD_CREATE(q);
+
+          return val;
+      }
+  }
+
+  case IND:
+  case IND_OLDGEN:
+      // follow chains of indirections, don't evacuate them
+      q = ((StgInd*)q)->indirectee;
+      goto loop;
+
+  case RET_BCO:
+  case RET_SMALL:
+  case RET_VEC_SMALL:
+  case RET_BIG:
+  case RET_VEC_BIG:
+  case RET_DYN:
+  case UPDATE_FRAME:
+  case STOP_FRAME:
+  case CATCH_FRAME:
+  case CATCH_STM_FRAME:
+  case CATCH_RETRY_FRAME:
+  case ATOMICALLY_FRAME:
+      // shouldn't see these
+      barf("evacuate: stack frame at %p\n", q);
+
+  case PAP:
+      return copy(q,pap_sizeW((StgPAP*)q),stp);
+
+  case AP:
+      return copy(q,ap_sizeW((StgAP*)q),stp);
+
+  case AP_STACK:
+      return copy(q,ap_stack_sizeW((StgAP_STACK*)q),stp);
+
+  case EVACUATED:
+      /* Already evacuated, just return the forwarding address.
+       * HOWEVER: if the requested destination generation (evac_gen) is
+       * older than the actual generation (because the object was
+       * already evacuated to a younger generation) then we have to
+       * set the failed_to_evac flag to indicate that we couldn't
+       * manage to promote the object to the desired generation.
+       */
+      /*
+       * Optimisation: the check is fairly expensive, but we can often
+       * shortcut it if either the required generation is 0, or the
+       * current object (the EVACUATED) is in a high enough generation.
+       * We know that an EVACUATED always points to an object in the
+       * same or an older generation.  stp is the lowest step that the
+       * current object would be evacuated to, so we only do the full
+       * check if stp is too low.
+       */
+      if (evac_gen > 0 && stp->gen_no < evac_gen) {  // optimisation
+          StgClosure *p = ((StgEvacuated*)q)->evacuee;
+          if (HEAP_ALLOCED(p) && Bdescr((P_)p)->gen_no < evac_gen) {
+              failed_to_evac = rtsTrue;
+              TICK_GC_FAILED_PROMOTION();
+          }
+      }
+      return ((StgEvacuated*)q)->evacuee;
+
+  case ARR_WORDS:
+      // just copy the block
+      return copy_noscav(q,arr_words_sizeW((StgArrWords *)q),stp);
+
+  case MUT_ARR_PTRS_CLEAN:
+  case MUT_ARR_PTRS_DIRTY:
+  case MUT_ARR_PTRS_FROZEN:
+  case MUT_ARR_PTRS_FROZEN0:
+      // just copy the block
+      return copy(q,mut_arr_ptrs_sizeW((StgMutArrPtrs *)q),stp);
+
+  case TSO:
+  {
+      StgTSO *tso = (StgTSO *)q;
+
+      /* Deal with redirected TSOs (a TSO that's had its stack enlarged).
+       */
+      if (tso->what_next == ThreadRelocated) {
+          q = (StgClosure *)tso->link;
+          goto loop;
+      }
+
+      /* To evacuate a small TSO, we need to relocate the update frame
+       * list it contains.
+       */
+      {
+          StgTSO *new_tso;
+          // renamed from p/q so they don't shadow the function argument q
+          StgPtr sp_src, sp_dst;
+
+          new_tso = (StgTSO *)copyPart((StgClosure *)tso,
+                                       tso_sizeW(tso),
+                                       sizeofW(StgTSO), stp);
+          move_TSO(tso, new_tso);
+          for (sp_src = tso->sp, sp_dst = new_tso->sp;
+               sp_src < tso->stack+tso->stack_size;) {
+              *sp_dst++ = *sp_src++;
+          }
+
+          return (StgClosure *)new_tso;
+      }
+  }
+
+#if defined(PAR)
+  case RBH:
+  {
+      //StgInfoTable *rip = get_closure_info(q, &size, &ptrs, &nonptrs, &vhs, str);
+      to = copy(q,BLACKHOLE_sizeW(),stp);
+      //ToDo: derive size etc from reverted IP
+      //to = copy(q,size,stp);
+      IF_DEBUG(gc,
+               debugBelch("@@ evacuate: RBH %p (%s) to %p (%s)",
+                          q, info_type(q), to, info_type(to)));
+      return to;
+  }
+
+  case BLOCKED_FETCH:
+      // fixed typo: was MIN_PAYLOD_SIZE, which doesn't compile under PAR
+      ASSERT(sizeofW(StgBlockedFetch) >= MIN_PAYLOAD_SIZE);
+      to = copy(q,sizeofW(StgBlockedFetch),stp);
+      IF_DEBUG(gc,
+               debugBelch("@@ evacuate: %p (%s) to %p (%s)",
+                          q, info_type(q), to, info_type(to)));
+      return to;
+
+# ifdef DIST
+  case REMOTE_REF:
+# endif
+  case FETCH_ME:
+      // assert about the closure we actually copy (the old assertion
+      // checked StgBlockedFetch, a copy-and-paste from the case above)
+      ASSERT(sizeofW(StgFetchMe) >= MIN_PAYLOAD_SIZE);
+      to = copy(q,sizeofW(StgFetchMe),stp);
+      IF_DEBUG(gc,
+               debugBelch("@@ evacuate: %p (%s) to %p (%s)",
+                          q, info_type(q), to, info_type(to)));
+      return to;
+
+  case FETCH_ME_BQ:
+      // likewise: check the size of StgFetchMeBlockingQueue, the
+      // closure being copied here
+      ASSERT(sizeofW(StgFetchMeBlockingQueue) >= MIN_PAYLOAD_SIZE);
+      to = copy(q,sizeofW(StgFetchMeBlockingQueue),stp);
+      IF_DEBUG(gc,
+               debugBelch("@@ evacuate: %p (%s) to %p (%s)",
+                          q, info_type(q), to, info_type(to)));
+      return to;
+#endif
+
+  case TREC_HEADER:
+      return copy(q,sizeofW(StgTRecHeader),stp);
+
+  case TVAR_WAIT_QUEUE:
+      return copy(q,sizeofW(StgTVarWaitQueue),stp);
+
+  case TVAR:
+      return copy(q,sizeofW(StgTVar),stp);
+
+  case TREC_CHUNK:
+      return copy(q,sizeofW(StgTRecChunk),stp);
+
+  default:
+      barf("evacuate: strange closure type %d", (int)(info->type));
+  }
+
+  barf("evacuate");
+}
+
+/* -----------------------------------------------------------------------------
+ Evaluate a THUNK_SELECTOR if possible.
+
+ returns: NULL if we couldn't evaluate this THUNK_SELECTOR, or
+ a closure pointer if we evaluated it and this is the result. Note
+ that "evaluating" the THUNK_SELECTOR doesn't necessarily mean
+ reducing it to HNF, just that we have eliminated the selection.
+ The result might be another thunk, or even another THUNK_SELECTOR.
+
+ If the return value is non-NULL, the original selector thunk has
+ been BLACKHOLE'd, and should be updated with an indirection or a
+ forwarding pointer. If the return value is NULL, then the selector
+ thunk is unchanged.
+
+ ***
+ ToDo: the treatment of THUNK_SELECTORS could be improved in the
+ following way (from a suggestion by Ian Lynagh):
+
+ We can have a chain like this:
+
+ sel_0 --> (a,b)
+ |
+ |-----> sel_0 --> (a,b)
+ |
+ |-----> sel_0 --> ...
+
+ and the depth limit means we don't go all the way to the end of the
+ chain, which results in a space leak. This affects the recursive
+ call to evacuate() in the THUNK_SELECTOR case in evacuate(): *not*
+ the recursive call to eval_thunk_selector() in
+ eval_thunk_selector().
+
+ We could eliminate the depth bound in this case, in the following
+ way:
+
+ - traverse the chain once to discover the *value* of the
+ THUNK_SELECTOR. Mark all THUNK_SELECTORS that we
+ visit on the way as having been visited already (somehow).
+
+ - in a second pass, traverse the chain again updating all
+ THUNK_SELECTORS that we find on the way with indirections to
+ the value.
+
+ - if we encounter a "marked" THUNK_SELECTOR in a normal
+ evacuate(), we know it can't be updated so just evac it.
+
+ Program that illustrates the problem:
+
+ foo [] = ([], [])
+ foo (x:xs) = let (ys, zs) = foo xs
+ in if x >= 0 then (x:ys, zs) else (ys, x:zs)
+
+ main = bar [1..(100000000::Int)]
+ bar xs = (\(ys, zs) -> print ys >> print zs) (foo xs)
+
+ -------------------------------------------------------------------------- */
+
+/* Does p point into to-space (or, in a step being compacted, at an
+ * already-marked object)?  Used by eval_thunk_selector() to avoid
+ * returning to-space pointers.
+ */
+static inline rtsBool
+is_to_space ( StgClosure *p )
+{
+    bdescr *bd;
+
+    // Static (non-heap) objects are never in to-space.  Test this
+    // before deriving the block descriptor: Bdescr() is only
+    // meaningful for heap-allocated addresses.  (The old code
+    // computed Bdescr(p) unconditionally; harmless thanks to the
+    // short-circuit &&, but clearer this way.)
+    if (!HEAP_ALLOCED(p)) {
+        return rtsFalse;
+    }
+
+    bd = Bdescr((StgPtr)p);
+    if (bd->flags & BF_EVACUATED) {
+        return rtsTrue;
+    }
+    // In a compacted step, a marked object may already have been
+    // scavenged in place and so may point into to-space.
+    if ((bd->flags & BF_COMPACTED) && is_marked((P_)p,bd)) {
+        return rtsTrue;
+    }
+    return rtsFalse;
+}
+
+/* Try to perform the selection described by the THUNK_SELECTOR p
+ * (select field 'field' of p's selectee).  Returns the selected
+ * closure on success -- in which case p has been BLACKHOLE'd and the
+ * caller must update it with an indirection or forwarding pointer --
+ * or NULL on failure, in which case p's original info pointer has
+ * been restored.  See the large comment block above for the full
+ * contract and the to-space subtleties.
+ */
+static StgClosure *
+eval_thunk_selector( nat field, StgSelector * p )
+{
+ StgInfoTable *info;
+ const StgInfoTable *info_ptr;
+ StgClosure *selectee;
+
+ selectee = p->selectee;
+
+ // Save the real info pointer (NOTE: not the same as get_itbl()).
+ info_ptr = p->header.info;
+
+ // If the THUNK_SELECTOR is in a generation that we are not
+ // collecting, then bail out early. We won't be able to save any
+ // space in any case, and updating with an indirection is trickier
+ // in an old gen.
+ if (Bdescr((StgPtr)p)->gen_no > N) {
+ return NULL;
+ }
+
+ // BLACKHOLE the selector thunk, since it is now under evaluation.
+ // This is important to stop us going into an infinite loop if
+ // this selector thunk eventually refers to itself.
+ SET_INFO(p,&stg_BLACKHOLE_info);
+
+selector_loop:
+
+ // We don't want to end up in to-space, because this causes
+ // problems when the GC later tries to evacuate the result of
+ // eval_thunk_selector(). There are various ways this could
+ // happen:
+ //
+ // 1. following an IND_STATIC
+ //
+ // 2. when the old generation is compacted, the mark phase updates
+ // from-space pointers to be to-space pointers, and we can't
+ // reliably tell which we're following (eg. from an IND_STATIC).
+ //
+ // 3. compacting GC again: if we're looking at a constructor in
+ // the compacted generation, it might point directly to objects
+ // in to-space. We must bale out here, otherwise doing the selection
+ // will result in a to-space pointer being returned.
+ //
+ // (1) is dealt with using a BF_EVACUATED test on the
+ // selectee. (2) and (3): we can tell if we're looking at an
+ // object in the compacted generation that might point to
+ // to-space objects by testing that (a) it is BF_COMPACTED, (b)
+ // the compacted generation is being collected, and (c) the
+ // object is marked. Only a marked object may have pointers that
+ // point to to-space objects, because that happens when
+ // scavenging.
+ //
+ // The to-space test is now embodied in the in_to_space() inline
+ // function, as it is re-used below.
+ //
+ if (is_to_space(selectee)) {
+ goto bale_out;
+ }
+
+ info = get_itbl(selectee);
+ switch (info->type) {
+ case CONSTR:
+ case CONSTR_1_0:
+ case CONSTR_0_1:
+ case CONSTR_2_0:
+ case CONSTR_1_1:
+ case CONSTR_0_2:
+ case CONSTR_STATIC:
+ case CONSTR_NOCAF_STATIC:
+ // check that the size is in range
+ ASSERT(field < (StgWord32)(info->layout.payload.ptrs +
+ info->layout.payload.nptrs));
+
+ // Select the right field from the constructor, and check
+ // that the result isn't in to-space. It might be in
+ // to-space if, for example, this constructor contains
+ // pointers to younger-gen objects (and is on the mut-once
+ // list).
+ //
+ {
+ StgClosure *q;
+ q = selectee->payload[field];
+ if (is_to_space(q)) {
+ goto bale_out;
+ } else {
+ return q;
+ }
+ }
+
+ case IND:
+ case IND_PERM:
+ case IND_OLDGEN:
+ case IND_OLDGEN_PERM:
+ case IND_STATIC:
+ // follow the indirection and retry the selection on its target
+ selectee = ((StgInd *)selectee)->indirectee;
+ goto selector_loop;
+
+ case EVACUATED:
+ // We don't follow pointers into to-space; the constructor
+ // has already been evacuated, so we won't save any space
+ // leaks by evaluating this selector thunk anyhow.
+ break;
+
+ case THUNK_SELECTOR:
+ {
+ StgClosure *val;
+
+ // check that we don't recurse too much, re-using the
+ // depth bound also used in evacuate().
+ if (thunk_selector_depth >= MAX_THUNK_SELECTOR_DEPTH) {
+ break;
+ }
+ thunk_selector_depth++;
+
+ val = eval_thunk_selector(info->layout.selector_offset,
+ (StgSelector *)selectee);
+
+ thunk_selector_depth--;
+
+ if (val == NULL) {
+ break;
+ } else {
+ // We evaluated this selector thunk, so update it with
+ // an indirection. NOTE: we don't use UPD_IND here,
+ // because we are guaranteed that p is in a generation
+ // that we are collecting, and we never want to put the
+ // indirection on a mutable list.
+#ifdef PROFILING
+ // For the purposes of LDV profiling, we have destroyed
+ // the original selector thunk.
+ // NOTE(review): this restores p's info pointer while
+ // recording the death of 'selectee' -- presumably
+ // intentional for LDV accounting, but worth confirming.
+ SET_INFO(p, info_ptr);
+ LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(selectee);
+#endif
+ ((StgInd *)selectee)->indirectee = val;
+ SET_INFO(selectee,&stg_IND_info);
+
+ // For the purposes of LDV profiling, we have created an
+ // indirection.
+ LDV_RECORD_CREATE(selectee);
+
+ selectee = val;
+ goto selector_loop;
+ }
+ }
+
+ case AP:
+ case AP_STACK:
+ case THUNK:
+ case THUNK_1_0:
+ case THUNK_0_1:
+ case THUNK_2_0:
+ case THUNK_1_1:
+ case THUNK_0_2:
+ case THUNK_STATIC:
+ case CAF_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ case SE_BLACKHOLE:
+ case BLACKHOLE:
+#if defined(PAR)
+ case RBH:
+ case BLOCKED_FETCH:
+# ifdef DIST
+ case REMOTE_REF:
+# endif
+ case FETCH_ME:
+ case FETCH_ME_BQ:
+#endif
+ // not evaluated yet
+ break;
+
+ default:
+ barf("eval_thunk_selector: strange selectee %d",
+ (int)(info->type));
+ }
+
+bale_out:
+ // We didn't manage to evaluate this thunk; restore the old info pointer
+ SET_INFO(p, info_ptr);
+ return NULL;
+}
+
+/* -----------------------------------------------------------------------------
+ move_TSO is called to update the TSO structure after it has been
+ moved from one place to another.
+ -------------------------------------------------------------------------- */
+
+void
+move_TSO (StgTSO *src, StgTSO *dest)
+{
+ // Word offset between the TSO's old and new locations.
+ ptrdiff_t offset;
+
+ offset = (StgPtr)dest - (StgPtr)src;
+
+ // The stack pointer points within the TSO's own stack, so it must
+ // be shifted by the same offset the TSO itself moved.
+ dest->sp = dest->sp + offset;
+}
+
+/* Similar to scavenge_large_bitmap(), but we don't write back the
+ * pointers we get back from evacuate().
+ */
+static void
+scavenge_large_srt_bitmap( StgLargeSRT *large_srt )
+{
+    nat i, b, size;
+    StgWord bitmap;
+    StgClosure **p;
+
+    b = 0;
+    bitmap = large_srt->l.bitmap[b];
+    size   = (nat)large_srt->l.size;
+    p      = (StgClosure **)large_srt->srt;
+    for (i = 0; i < size; ) {
+        if ((bitmap & 1) != 0) {
+            evacuate(*p);
+        }
+        i++;
+        p++;
+        if (i % BITS_IN(W_) == 0) {
+            b++;
+            // Only refetch when entries remain: the old code read
+            // bitmap[b] here unconditionally, which indexed one word
+            // past the end of the bitmap array whenever size is an
+            // exact multiple of BITS_IN(W_).
+            if (i < size) {
+                bitmap = large_srt->l.bitmap[b];
+            }
+        } else {
+            bitmap = bitmap >> 1;
+        }
+    }
+}
+
+/* evacuate the SRT.  If srt_bitmap is zero, then there isn't an
+ * srt field in the info table.  That's ok, because we'll
+ * never dereference it (the loop below does nothing).
+ */
+STATIC_INLINE void
+scavenge_srt (StgClosure **srt, nat srt_bitmap)
+{
+  nat bitmap;
+  StgClosure **p;
+
+  bitmap = srt_bitmap;
+  p = srt;
+
+  // An all-ones half-word bitmap marks an SRT too large for a small
+  // bitmap; it is a StgLargeSRT instead.
+  if (bitmap == (StgHalfWord)(-1)) {
+      scavenge_large_srt_bitmap( (StgLargeSRT *)srt );
+      return;
+  }
+
+  while (bitmap != 0) {
+      if ((bitmap & 1) != 0) {
+#ifdef ENABLE_WIN32_DLL_SUPPORT
+          // Special-case to handle references to closures hiding out in DLLs, since
+          // double indirections required to get at those. The code generator knows
+          // which is which when generating the SRT, so it stores the (indirect)
+          // reference to the DLL closure in the table by first adding one to it.
+          // We check for this here, and undo the addition before evacuating it.
+          //
+          // If the SRT entry hasn't got bit 0 set, the SRT entry points to a
+          // closure that's fixed at link-time, and no extra magic is required.
+          //
+          // Bug fix: test and dereference the CURRENT entry (*p), not
+          // the first entry (*srt) -- 'srt' is never advanced in this
+          // loop, so the old code inspected entry 0 on every iteration.
+          if ( (unsigned long)(*p) & 0x1 ) {
+              evacuate(*stgCast(StgClosure**,(stgCast(unsigned long, *p) & ~0x1)));
+          } else {
+              evacuate(*p);
+          }
+#else
+          evacuate(*p);
+#endif
+      }
+      p++;
+      bitmap = bitmap >> 1;
+  }
+}
+
+
+STATIC_INLINE void
+scavenge_thunk_srt(const StgInfoTable *info)
+{
+    StgThunkInfoTable *tinfo;
+
+    /* SRTs reference static objects, which are only traced during a
+     * major collection; skip the work otherwise. */
+    if (!major_gc) {
+        return;
+    }
+
+    tinfo = itbl_to_thunk_itbl(info);
+    scavenge_srt((StgClosure **)GET_SRT(tinfo), tinfo->i.srt_bitmap);
+}
+
+STATIC_INLINE void
+scavenge_fun_srt(const StgInfoTable *info)
+{
+    StgFunInfoTable *finfo;
+
+    /* SRTs reference static objects, which are only traced during a
+     * major collection; skip the work otherwise. */
+    if (!major_gc) {
+        return;
+    }
+
+    finfo = itbl_to_fun_itbl(info);
+    scavenge_srt((StgClosure **)GET_FUN_SRT(finfo), finfo->i.srt_bitmap);
+}
+
+/* -----------------------------------------------------------------------------
+ Scavenge a TSO.
+ -------------------------------------------------------------------------- */
+
+/* Evacuate every closure reachable from the fields of a TSO, then
+ * scavenge its stack.  Note that the link field is deliberately NOT
+ * followed when the TSO is blocked on a black hole (see below).
+ */
+static void
+scavengeTSO (StgTSO *tso)
+{
+ // The block_info closure is only a GC root for these blocking
+ // reasons; for others it isn't a closure pointer at all.
+ if ( tso->why_blocked == BlockedOnMVar
+ || tso->why_blocked == BlockedOnBlackHole
+ || tso->why_blocked == BlockedOnException
+#if defined(PAR)
+ || tso->why_blocked == BlockedOnGA
+ || tso->why_blocked == BlockedOnGA_NoSend
+#endif
+ ) {
+ tso->block_info.closure = evacuate(tso->block_info.closure);
+ }
+ if ( tso->blocked_exceptions != NULL ) {
+ tso->blocked_exceptions =
+ (StgTSO *)evacuate((StgClosure *)tso->blocked_exceptions);
+ }
+
+ // We don't always chase the link field: TSOs on the blackhole
+ // queue are not automatically alive, so the link field is a
+ // "weak" pointer in that case.
+ if (tso->why_blocked != BlockedOnBlackHole) {
+ tso->link = (StgTSO *)evacuate((StgClosure *)tso->link);
+ }
+
+ // scavenge current transaction record
+ tso->trec = (StgTRecHeader *)evacuate((StgClosure *)tso->trec);
+
+ // scavenge this thread's stack
+ scavenge_stack(tso->sp, &(tso->stack[tso->stack_size]));
+}
+
+/* -----------------------------------------------------------------------------
+ Blocks of function args occur on the stack (at the top) and
+ in PAPs.
+ -------------------------------------------------------------------------- */
+
+/* Scavenge a block of function arguments laid out according to
+ * fun_info's argument bitmap: evacuate every argument word whose
+ * bitmap bit is clear (clear = pointer) and write the new address
+ * back.  Returns a pointer to the word just past the arguments.
+ */
+STATIC_INLINE StgPtr
+scavenge_arg_block (StgFunInfoTable *fun_info, StgClosure **args)
+{
+ StgPtr p;
+ StgWord bitmap;
+ nat size;
+
+ p = (StgPtr)args;
+ switch (fun_info->f.fun_type) {
+ case ARG_GEN:
+ // general case: bitmap stored inline in the fun info table
+ bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+ size = BITMAP_SIZE(fun_info->f.b.bitmap);
+ goto small_bitmap;
+ case ARG_GEN_BIG:
+ // too big for an inline bitmap: use the out-of-line large bitmap
+ size = GET_FUN_LARGE_BITMAP(fun_info)->size;
+ scavenge_large_bitmap(p, GET_FUN_LARGE_BITMAP(fun_info), size);
+ p += size;
+ break;
+ default:
+ // one of the standard argument patterns: look up its bitmap
+ bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+ size = BITMAP_SIZE(stg_arg_bitmaps[fun_info->f.fun_type]);
+ small_bitmap:
+ // shared small-bitmap walk (also reached by goto from ARG_GEN)
+ while (size > 0) {
+ if ((bitmap & 1) == 0) {
+ *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+ }
+ p++;
+ bitmap = bitmap >> 1;
+ size--;
+ }
+ break;
+ }
+ return p;
+}
+
+/* Scavenge the payload of a PAP or AP: 'size' words of arguments laid
+ * out according to the bitmap of 'fun' (or, for a BCO, its own
+ * bitmap).  A clear bitmap bit marks a pointer word, which is
+ * evacuated and written back.  Returns the address just past the
+ * payload.
+ */
+STATIC_INLINE StgPtr
+scavenge_PAP_payload (StgClosure *fun, StgClosure **payload, StgWord size)
+{
+ StgPtr p;
+ StgWord bitmap;
+ StgFunInfoTable *fun_info;
+
+ fun_info = get_fun_itbl(fun);
+ ASSERT(fun_info->i.type != PAP);
+ p = (StgPtr)payload;
+
+ switch (fun_info->f.fun_type) {
+ case ARG_GEN:
+ bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+ goto small_bitmap;
+ case ARG_GEN_BIG:
+ scavenge_large_bitmap(p, GET_FUN_LARGE_BITMAP(fun_info), size);
+ p += size;
+ break;
+ case ARG_BCO:
+ // byte-code object: the layout comes from the BCO's own bitmap
+ scavenge_large_bitmap((StgPtr)payload, BCO_BITMAP(fun), size);
+ p += size;
+ break;
+ default:
+ bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+ small_bitmap:
+ // shared small-bitmap walk (also reached by goto from ARG_GEN);
+ // note 'size' is the caller-supplied payload size here
+ while (size > 0) {
+ if ((bitmap & 1) == 0) {
+ *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+ }
+ p++;
+ bitmap = bitmap >> 1;
+ size--;
+ }
+ break;
+ }
+ return p;
+}
+
+STATIC_INLINE StgPtr
+scavenge_PAP (StgPAP *pap)
+{
+    StgClosure *fun;
+
+    /* Evacuate the function closure first; its info table is needed
+     * to decode the argument payload. */
+    fun = evacuate(pap->fun);
+    pap->fun = fun;
+    return scavenge_PAP_payload(fun, pap->payload, pap->n_args);
+}
+
+STATIC_INLINE StgPtr
+scavenge_AP (StgAP *ap)
+{
+    StgClosure *fun;
+
+    /* Evacuate the function closure first; its info table is needed
+     * to decode the argument payload. */
+    fun = evacuate(ap->fun);
+    ap->fun = fun;
+    return scavenge_PAP_payload(fun, ap->payload, ap->n_args);
+}
+
+/* -----------------------------------------------------------------------------
+ Scavenge a given step until there are no more objects in this step
+ to scavenge.
+
+ evac_gen is set by the caller to be either zero (for a step in a
+ generation < N) or G where G is the generation of the step being
+ scavenged.
+
+ We sometimes temporarily change evac_gen back to zero if we're
+ scavenging a mutable object where early promotion isn't such a good
+ idea.
+ -------------------------------------------------------------------------- */
+
+static void
+scavenge(step *stp)
+{
+ StgPtr p, q;
+ StgInfoTable *info;
+ bdescr *bd;
+ nat saved_evac_gen = evac_gen;
+
+ p = stp->scan;
+ bd = stp->scan_bd;
+
+ failed_to_evac = rtsFalse;
+
+ /* scavenge phase - standard breadth-first scavenging of the
+ * evacuated objects
+ */
+
+ while (bd != stp->hp_bd || p < stp->hp) {
+
+ // If we're at the end of this block, move on to the next block
+ if (bd != stp->hp_bd && p == bd->free) {
+ bd = bd->link;
+ p = bd->start;
+ continue;
+ }
+
+ ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+ info = get_itbl((StgClosure *)p);
+
+ ASSERT(thunk_selector_depth == 0);
+
+ q = p;
+ switch (info->type) {
+
+ case MVAR:
+ {
+ StgMVar *mvar = ((StgMVar *)p);
+ evac_gen = 0;
+ mvar->head = (StgTSO *)evacuate((StgClosure *)mvar->head);
+ mvar->tail = (StgTSO *)evacuate((StgClosure *)mvar->tail);
+ mvar->value = evacuate((StgClosure *)mvar->value);
+ evac_gen = saved_evac_gen;
+ failed_to_evac = rtsTrue; // mutable.
+ p += sizeofW(StgMVar);
+ break;
+ }
+
+ case FUN_2_0:
+ scavenge_fun_srt(info);
+ ((StgClosure *)p)->payload[1] = evacuate(((StgClosure *)p)->payload[1]);
+ ((StgClosure *)p)->payload[0] = evacuate(((StgClosure *)p)->payload[0]);
+ p += sizeofW(StgHeader) + 2;
+ break;
+
+ case THUNK_2_0:
+ scavenge_thunk_srt(info);
+ ((StgThunk *)p)->payload[1] = evacuate(((StgThunk *)p)->payload[1]);
+ ((StgThunk *)p)->payload[0] = evacuate(((StgThunk *)p)->payload[0]);
+ p += sizeofW(StgThunk) + 2;
+ break;
+
+ case CONSTR_2_0:
+ ((StgClosure *)p)->payload[1] = evacuate(((StgClosure *)p)->payload[1]);
+ ((StgClosure *)p)->payload[0] = evacuate(((StgClosure *)p)->payload[0]);
+ p += sizeofW(StgHeader) + 2;
+ break;
+
+ case THUNK_1_0:
+ scavenge_thunk_srt(info);
+ ((StgThunk *)p)->payload[0] = evacuate(((StgThunk *)p)->payload[0]);
+ p += sizeofW(StgThunk) + 1;
+ break;
+
+ case FUN_1_0:
+ scavenge_fun_srt(info);
+ case CONSTR_1_0:
+ ((StgClosure *)p)->payload[0] = evacuate(((StgClosure *)p)->payload[0]);
+ p += sizeofW(StgHeader) + 1;
+ break;
+
+ case THUNK_0_1:
+ scavenge_thunk_srt(info);
+ p += sizeofW(StgThunk) + 1;
+ break;
+
+ case FUN_0_1:
+ scavenge_fun_srt(info);
+ case CONSTR_0_1:
+ p += sizeofW(StgHeader) + 1;
+ break;
+
+ case THUNK_0_2:
+ scavenge_thunk_srt(info);
+ p += sizeofW(StgThunk) + 2;
+ break;
+
+ case FUN_0_2:
+ scavenge_fun_srt(info);
+ case CONSTR_0_2:
+ p += sizeofW(StgHeader) + 2;
+ break;
+
+ case THUNK_1_1:
+ scavenge_thunk_srt(info);
+ ((StgThunk *)p)->payload[0] = evacuate(((StgThunk *)p)->payload[0]);
+ p += sizeofW(StgThunk) + 2;
+ break;
+
+ case FUN_1_1:
+ scavenge_fun_srt(info);
+ case CONSTR_1_1:
+ ((StgClosure *)p)->payload[0] = evacuate(((StgClosure *)p)->payload[0]);
+ p += sizeofW(StgHeader) + 2;
+ break;
+
+ case FUN:
+ scavenge_fun_srt(info);
+ goto gen_obj;
+
+ case THUNK:
+ {
+ StgPtr end;
+
+ scavenge_thunk_srt(info);
+ end = (P_)((StgThunk *)p)->payload + info->layout.payload.ptrs;
+ for (p = (P_)((StgThunk *)p)->payload; p < end; p++) {
+ *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+ }
+ p += info->layout.payload.nptrs;
+ break;
+ }
+
+ gen_obj:
+ case CONSTR:
+ case WEAK:
+ case STABLE_NAME:
+ {
+ StgPtr end;
+
+ end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+ for (p = (P_)((StgClosure *)p)->payload; p < end; p++) {
+ *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+ }
+ p += info->layout.payload.nptrs;
+ break;
+ }
+
+ case BCO: {
+ StgBCO *bco = (StgBCO *)p;
+ bco->instrs = (StgArrWords *)evacuate((StgClosure *)bco->instrs);
+ bco->literals = (StgArrWords *)evacuate((StgClosure *)bco->literals);
+ bco->ptrs = (StgMutArrPtrs *)evacuate((StgClosure *)bco->ptrs);
+ bco->itbls = (StgArrWords *)evacuate((StgClosure *)bco->itbls);
+ p += bco_sizeW(bco);
+ break;
+ }
+
+ case IND_PERM:
+ if (stp->gen->no != 0) {
+#ifdef PROFILING
+ // @LDV profiling
+ // No need to call LDV_recordDead_FILL_SLOP_DYNAMIC() because an
+ // IND_OLDGEN_PERM closure is larger than an IND_PERM closure.
+ LDV_recordDead((StgClosure *)p, sizeofW(StgInd));
+#endif
+ //
+ // Todo: maybe use SET_HDR() and remove LDV_RECORD_CREATE()?
+ //
+ SET_INFO(((StgClosure *)p), &stg_IND_OLDGEN_PERM_info);
+
+ // We pretend that p has just been created.
+ LDV_RECORD_CREATE((StgClosure *)p);
+ }
+ // fall through
+ case IND_OLDGEN_PERM:
+ ((StgInd *)p)->indirectee = evacuate(((StgInd *)p)->indirectee);
+ p += sizeofW(StgInd);
+ break;
+
+ case MUT_VAR_CLEAN:
+ case MUT_VAR_DIRTY: {
+ rtsBool saved_eager_promotion = eager_promotion;
+
+ eager_promotion = rtsFalse;
+ ((StgMutVar *)p)->var = evacuate(((StgMutVar *)p)->var);
+ eager_promotion = saved_eager_promotion;
+
+ if (failed_to_evac) {
+ ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info;
+ } else {
+ ((StgClosure *)q)->header.info = &stg_MUT_VAR_CLEAN_info;
+ }
+ p += sizeofW(StgMutVar);
+ break;
+ }
+
+ case CAF_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ case SE_BLACKHOLE:
+ case BLACKHOLE:
+ p += BLACKHOLE_sizeW();
+ break;
+
+ case THUNK_SELECTOR:
+ {
+ StgSelector *s = (StgSelector *)p;
+ s->selectee = evacuate(s->selectee);
+ p += THUNK_SELECTOR_sizeW();
+ break;
+ }
+
+ // A chunk of stack saved in a heap object
+ case AP_STACK:
+ {
+ StgAP_STACK *ap = (StgAP_STACK *)p;
+
+ ap->fun = evacuate(ap->fun);
+ scavenge_stack((StgPtr)ap->payload, (StgPtr)ap->payload + ap->size);
+ p = (StgPtr)ap->payload + ap->size;
+ break;
+ }
+
+ case PAP:
+ p = scavenge_PAP((StgPAP *)p);
+ break;
+
+ case AP:
+ p = scavenge_AP((StgAP *)p);
+ break;
+
+ case ARR_WORDS:
+ // nothing to follow
+ p += arr_words_sizeW((StgArrWords *)p);
+ break;
+
+ case MUT_ARR_PTRS_CLEAN:
+ case MUT_ARR_PTRS_DIRTY:
+ // follow everything
+ {
+ StgPtr next;
+ rtsBool saved_eager;
+
+ // We don't eagerly promote objects pointed to by a mutable
+ // array, but if we find the array only points to objects in
+ // the same or an older generation, we mark it "clean" and
+ // avoid traversing it during minor GCs.
+ saved_eager = eager_promotion;
+ eager_promotion = rtsFalse;
+ next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+ for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+ *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+ }
+ eager_promotion = saved_eager;
+
+ if (failed_to_evac) {
+ ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
+ } else {
+ ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_CLEAN_info;
+ }
+
+ failed_to_evac = rtsTrue; // always put it on the mutable list.
+ break;
+ }
+
+ case MUT_ARR_PTRS_FROZEN:
+ case MUT_ARR_PTRS_FROZEN0:
+ // follow everything
+ {
+ StgPtr next;
+
+ next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+ for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+ *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+ }
+
+ // If we're going to put this object on the mutable list, then
+ // set its info ptr to MUT_ARR_PTRS_FROZEN0 to indicate that.
+ if (failed_to_evac) {
+ ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN0_info;
+ } else {
+ ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_info;
+ }
+ break;
+ }
+
+ case TSO:
+ {
+ StgTSO *tso = (StgTSO *)p;
+ rtsBool saved_eager = eager_promotion;
+
+ eager_promotion = rtsFalse;
+ scavengeTSO(tso);
+ eager_promotion = saved_eager;
+
+ if (failed_to_evac) {
+ tso->flags |= TSO_DIRTY;
+ } else {
+ tso->flags &= ~TSO_DIRTY;
+ }
+
+ failed_to_evac = rtsTrue; // always on the mutable list
+ p += tso_sizeW(tso);
+ break;
+ }
+
+#if defined(PAR)
+ case RBH:
+ {
+#if 0
+ nat size, ptrs, nonptrs, vhs;
+ char str[80];
+ StgInfoTable *rip = get_closure_info(p, &size, &ptrs, &nonptrs, &vhs, str);
+#endif
+ StgRBH *rbh = (StgRBH *)p;
+ (StgClosure *)rbh->blocking_queue =
+ evacuate((StgClosure *)rbh->blocking_queue);
+ failed_to_evac = rtsTrue; // mutable anyhow.
+ IF_DEBUG(gc,
+ debugBelch("@@ scavenge: RBH %p (%s) (new blocking_queue link=%p)",
+ p, info_type(p), (StgClosure *)rbh->blocking_queue));
+ // ToDo: use size of reverted closure here!
+ p += BLACKHOLE_sizeW();
+ break;
+ }
+
+ case BLOCKED_FETCH:
+ {
+ StgBlockedFetch *bf = (StgBlockedFetch *)p;
+ // follow the pointer to the node which is being demanded
+ (StgClosure *)bf->node =
+ evacuate((StgClosure *)bf->node);
+ // follow the link to the rest of the blocking queue
+ (StgClosure *)bf->link =
+ evacuate((StgClosure *)bf->link);
+ IF_DEBUG(gc,
+ debugBelch("@@ scavenge: %p (%s); node is now %p; exciting, isn't it",
+ bf, info_type((StgClosure *)bf),
+ bf->node, info_type(bf->node)));
+ p += sizeofW(StgBlockedFetch);
+ break;
+ }
+
+#ifdef DIST
+ case REMOTE_REF:
+#endif
+ case FETCH_ME:
+ p += sizeofW(StgFetchMe);
+ break; // nothing to do in this case
+
+ case FETCH_ME_BQ:
+ {
+ StgFetchMeBlockingQueue *fmbq = (StgFetchMeBlockingQueue *)p;
+ (StgClosure *)fmbq->blocking_queue =
+ evacuate((StgClosure *)fmbq->blocking_queue);
+ IF_DEBUG(gc,
+ debugBelch("@@ scavenge: %p (%s) exciting, isn't it",
+ p, info_type((StgClosure *)p)));
+ p += sizeofW(StgFetchMeBlockingQueue);
+ break;
+ }
+#endif
+
+ case TVAR_WAIT_QUEUE:
+ {
+ StgTVarWaitQueue *wq = ((StgTVarWaitQueue *) p);
+ evac_gen = 0;
+ wq->waiting_tso = (StgTSO *)evacuate((StgClosure*)wq->waiting_tso);
+ wq->next_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)wq->next_queue_entry);
+ wq->prev_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)wq->prev_queue_entry);
+ evac_gen = saved_evac_gen;
+ failed_to_evac = rtsTrue; // mutable
+ p += sizeofW(StgTVarWaitQueue);
+ break;
+ }
+
+ case TVAR:
+ {
+ StgTVar *tvar = ((StgTVar *) p);
+ evac_gen = 0;
+ tvar->current_value = evacuate((StgClosure*)tvar->current_value);
+ tvar->first_wait_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)tvar->first_wait_queue_entry);
+ evac_gen = saved_evac_gen;
+ failed_to_evac = rtsTrue; // mutable
+ p += sizeofW(StgTVar);
+ break;
+ }
+
+ case TREC_HEADER:
+ {
+ StgTRecHeader *trec = ((StgTRecHeader *) p);
+ evac_gen = 0;
+ trec->enclosing_trec = (StgTRecHeader *)evacuate((StgClosure*)trec->enclosing_trec);
+ trec->current_chunk = (StgTRecChunk *)evacuate((StgClosure*)trec->current_chunk);
+ evac_gen = saved_evac_gen;
+ failed_to_evac = rtsTrue; // mutable
+ p += sizeofW(StgTRecHeader);
+ break;
+ }
+
+ case TREC_CHUNK:
+ {
+ StgWord i;
+ StgTRecChunk *tc = ((StgTRecChunk *) p);
+ TRecEntry *e = &(tc -> entries[0]);
+ evac_gen = 0;
+ tc->prev_chunk = (StgTRecChunk *)evacuate((StgClosure*)tc->prev_chunk);
+ for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) {
+ e->tvar = (StgTVar *)evacuate((StgClosure*)e->tvar);
+ e->expected_value = evacuate((StgClosure*)e->expected_value);
+ e->new_value = evacuate((StgClosure*)e->new_value);
+ }
+ evac_gen = saved_evac_gen;
+ failed_to_evac = rtsTrue; // mutable
+ p += sizeofW(StgTRecChunk);
+ break;
+ }
+
+ default:
+ barf("scavenge: unimplemented/strange closure type %d @ %p",
+ info->type, p);
+ }
+
+ /*
+ * We need to record the current object on the mutable list if
+ * (a) It is actually mutable, or
+ * (b) It contains pointers to a younger generation.
+ * Case (b) arises if we didn't manage to promote everything that
+ * the current object points to into the current generation.
+ */
+ if (failed_to_evac) {
+ failed_to_evac = rtsFalse;
+ if (stp->gen_no > 0) {
+ recordMutableGen((StgClosure *)q, stp->gen);
+ }
+ }
+ }
+
+ stp->scan_bd = bd;
+ stp->scan = p;
+}
+
+/* -----------------------------------------------------------------------------
+ Scavenge everything on the mark stack.
+
+ This is slightly different from scavenge():
+ - we don't walk linearly through the objects, so the scavenger
+ doesn't need to advance the pointer on to the next object.
+ -------------------------------------------------------------------------- */
+
+/* Scavenge everything reachable from the mark stack (used by the
+ * compacting collector).  Unlike scavenge(), objects are popped one at
+ * a time, so we never need to advance a pointer past the object just
+ * scavenged.  If the mark stack overflowed earlier, we fall back to a
+ * linear scan of the oldest generation's old_blocks looking for
+ * closures that are marked but not yet scavenged.
+ *
+ * Fixes over the previous version:
+ *  - the PAR-only RBH case referred to an undeclared variable "bh";
+ *    the closure in hand is "rbh" (would not compile under PAR).
+ *  - the BLOCKED_FETCH debug format string had 3 conversions for 4
+ *    arguments; added the missing "(%s)" for info_type(bf->node).
+ */
+static void
+scavenge_mark_stack(void)
+{
+    StgPtr p, q;
+    StgInfoTable *info;
+    nat saved_evac_gen;
+
+    // everything popped from the mark stack lives in the oldest
+    // generation, so evacuate into it by default.
+    evac_gen = oldest_gen->no;
+    saved_evac_gen = evac_gen;
+
+linear_scan:
+    while (!mark_stack_empty()) {
+        p = pop_mark_stack();
+
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+        info = get_itbl((StgClosure *)p);
+
+        q = p;   // q remembers the start of the object for the
+                 // mutable-list / mark bookkeeping below
+        switch (info->type) {
+
+        case MVAR:
+        {
+            // always mutable: scavenge contents in generation 0
+            StgMVar *mvar = ((StgMVar *)p);
+            evac_gen = 0;
+            mvar->head = (StgTSO *)evacuate((StgClosure *)mvar->head);
+            mvar->tail = (StgTSO *)evacuate((StgClosure *)mvar->tail);
+            mvar->value = evacuate((StgClosure *)mvar->value);
+            evac_gen = saved_evac_gen;
+            failed_to_evac = rtsTrue; // mutable.
+            break;
+        }
+
+        case FUN_2_0:
+            scavenge_fun_srt(info);
+            ((StgClosure *)p)->payload[1] = evacuate(((StgClosure *)p)->payload[1]);
+            ((StgClosure *)p)->payload[0] = evacuate(((StgClosure *)p)->payload[0]);
+            break;
+
+        case THUNK_2_0:
+            scavenge_thunk_srt(info);
+            ((StgThunk *)p)->payload[1] = evacuate(((StgThunk *)p)->payload[1]);
+            ((StgThunk *)p)->payload[0] = evacuate(((StgThunk *)p)->payload[0]);
+            break;
+
+        case CONSTR_2_0:
+            ((StgClosure *)p)->payload[1] = evacuate(((StgClosure *)p)->payload[1]);
+            ((StgClosure *)p)->payload[0] = evacuate(((StgClosure *)p)->payload[0]);
+            break;
+
+        case FUN_1_0:
+        case FUN_1_1:
+            scavenge_fun_srt(info);
+            ((StgClosure *)p)->payload[0] = evacuate(((StgClosure *)p)->payload[0]);
+            break;
+
+        case THUNK_1_0:
+        case THUNK_1_1:
+            scavenge_thunk_srt(info);
+            ((StgThunk *)p)->payload[0] = evacuate(((StgThunk *)p)->payload[0]);
+            break;
+
+        case CONSTR_1_0:
+        case CONSTR_1_1:
+            ((StgClosure *)p)->payload[0] = evacuate(((StgClosure *)p)->payload[0]);
+            break;
+
+        case FUN_0_1:
+        case FUN_0_2:
+            scavenge_fun_srt(info);
+            break;
+
+        case THUNK_0_1:
+        case THUNK_0_2:
+            scavenge_thunk_srt(info);
+            break;
+
+        case CONSTR_0_1:
+        case CONSTR_0_2:
+            break;
+
+        case FUN:
+            scavenge_fun_srt(info);
+            goto gen_obj;
+
+        case THUNK:
+        {
+            StgPtr end;
+
+            scavenge_thunk_srt(info);
+            end = (P_)((StgThunk *)p)->payload + info->layout.payload.ptrs;
+            for (p = (P_)((StgThunk *)p)->payload; p < end; p++) {
+                *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+            }
+            break;
+        }
+
+        gen_obj:
+        case CONSTR:
+        case WEAK:
+        case STABLE_NAME:
+        {
+            // generic case: evacuate every pointer in the payload
+            StgPtr end;
+
+            end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+            for (p = (P_)((StgClosure *)p)->payload; p < end; p++) {
+                *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+            }
+            break;
+        }
+
+        case BCO: {
+            StgBCO *bco = (StgBCO *)p;
+            bco->instrs = (StgArrWords *)evacuate((StgClosure *)bco->instrs);
+            bco->literals = (StgArrWords *)evacuate((StgClosure *)bco->literals);
+            bco->ptrs = (StgMutArrPtrs *)evacuate((StgClosure *)bco->ptrs);
+            bco->itbls = (StgArrWords *)evacuate((StgClosure *)bco->itbls);
+            break;
+        }
+
+        case IND_PERM:
+            // don't need to do anything here: the only possible case
+            // is that we're in a 1-space compacting collector, with
+            // no "old" generation.
+            break;
+
+        case IND_OLDGEN:
+        case IND_OLDGEN_PERM:
+            ((StgInd *)p)->indirectee =
+                evacuate(((StgInd *)p)->indirectee);
+            break;
+
+        case MUT_VAR_CLEAN:
+        case MUT_VAR_DIRTY: {
+            rtsBool saved_eager_promotion = eager_promotion;
+
+            eager_promotion = rtsFalse;
+            ((StgMutVar *)p)->var = evacuate(((StgMutVar *)p)->var);
+            eager_promotion = saved_eager_promotion;
+
+            // mark the var clean/dirty depending on whether it still
+            // points into a younger generation
+            if (failed_to_evac) {
+                ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info;
+            } else {
+                ((StgClosure *)q)->header.info = &stg_MUT_VAR_CLEAN_info;
+            }
+            break;
+        }
+
+        case CAF_BLACKHOLE:
+        case SE_CAF_BLACKHOLE:
+        case SE_BLACKHOLE:
+        case BLACKHOLE:
+        case ARR_WORDS:
+            // no pointers to follow
+            break;
+
+        case THUNK_SELECTOR:
+        {
+            StgSelector *s = (StgSelector *)p;
+            s->selectee = evacuate(s->selectee);
+            break;
+        }
+
+        // A chunk of stack saved in a heap object
+        case AP_STACK:
+        {
+            StgAP_STACK *ap = (StgAP_STACK *)p;
+
+            ap->fun = evacuate(ap->fun);
+            scavenge_stack((StgPtr)ap->payload, (StgPtr)ap->payload + ap->size);
+            break;
+        }
+
+        case PAP:
+            scavenge_PAP((StgPAP *)p);
+            break;
+
+        case AP:
+            scavenge_AP((StgAP *)p);
+            break;
+
+        case MUT_ARR_PTRS_CLEAN:
+        case MUT_ARR_PTRS_DIRTY:
+            // follow everything
+        {
+            StgPtr next;
+            rtsBool saved_eager;
+
+            // We don't eagerly promote objects pointed to by a mutable
+            // array, but if we find the array only points to objects in
+            // the same or an older generation, we mark it "clean" and
+            // avoid traversing it during minor GCs.
+            saved_eager = eager_promotion;
+            eager_promotion = rtsFalse;
+            next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+            for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+                *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+            }
+            eager_promotion = saved_eager;
+
+            if (failed_to_evac) {
+                ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
+            } else {
+                ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_CLEAN_info;
+            }
+
+            failed_to_evac = rtsTrue; // mutable anyhow.
+            break;
+        }
+
+        case MUT_ARR_PTRS_FROZEN:
+        case MUT_ARR_PTRS_FROZEN0:
+            // follow everything
+        {
+            StgPtr next, q = p;
+
+            next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+            for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+                *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+            }
+
+            // If we're going to put this object on the mutable list, then
+            // set its info ptr to MUT_ARR_PTRS_FROZEN0 to indicate that.
+            if (failed_to_evac) {
+                ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN0_info;
+            } else {
+                ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_info;
+            }
+            break;
+        }
+
+        case TSO:
+        {
+            StgTSO *tso = (StgTSO *)p;
+            rtsBool saved_eager = eager_promotion;
+
+            eager_promotion = rtsFalse;
+            scavengeTSO(tso);
+            eager_promotion = saved_eager;
+
+            if (failed_to_evac) {
+                tso->flags |= TSO_DIRTY;
+            } else {
+                tso->flags &= ~TSO_DIRTY;
+            }
+
+            failed_to_evac = rtsTrue; // always on the mutable list
+            break;
+        }
+
+#if defined(PAR)
+        case RBH:
+        {
+#if 0
+            nat size, ptrs, nonptrs, vhs;
+            char str[80];
+            StgInfoTable *rip = get_closure_info(p, &size, &ptrs, &nonptrs, &vhs, str);
+#endif
+            StgRBH *rbh = (StgRBH *)p;
+            // fix: this previously used an undeclared identifier "bh";
+            // the closure in hand is "rbh".
+            rbh->blocking_queue =
+                (StgTSO *)evacuate((StgClosure *)rbh->blocking_queue);
+            failed_to_evac = rtsTrue;  // mutable anyhow.
+            IF_DEBUG(gc,
+                     debugBelch("@@ scavenge: RBH %p (%s) (new blocking_queue link=%p)",
+                                p, info_type(p), (StgClosure *)rbh->blocking_queue));
+            break;
+        }
+
+        case BLOCKED_FETCH:
+        {
+            StgBlockedFetch *bf = (StgBlockedFetch *)p;
+            // follow the pointer to the node which is being demanded
+            (StgClosure *)bf->node =
+                evacuate((StgClosure *)bf->node);
+            // follow the link to the rest of the blocking queue
+            (StgClosure *)bf->link =
+                evacuate((StgClosure *)bf->link);
+            // fix: format string was missing the conversion for
+            // info_type(bf->node)
+            IF_DEBUG(gc,
+                     debugBelch("@@ scavenge: %p (%s); node is now %p (%s); exciting, isn't it",
+                                bf, info_type((StgClosure *)bf),
+                                bf->node, info_type(bf->node)));
+            break;
+        }
+
+#ifdef DIST
+        case REMOTE_REF:
+#endif
+        case FETCH_ME:
+            break; // nothing to do in this case
+
+        case FETCH_ME_BQ:
+        {
+            StgFetchMeBlockingQueue *fmbq = (StgFetchMeBlockingQueue *)p;
+            (StgClosure *)fmbq->blocking_queue =
+                evacuate((StgClosure *)fmbq->blocking_queue);
+            IF_DEBUG(gc,
+                     debugBelch("@@ scavenge: %p (%s) exciting, isn't it",
+                                p, info_type((StgClosure *)p)));
+            break;
+        }
+#endif /* PAR */
+
+        case TVAR_WAIT_QUEUE:
+        {
+            StgTVarWaitQueue *wq = ((StgTVarWaitQueue *) p);
+            evac_gen = 0;
+            wq->waiting_tso = (StgTSO *)evacuate((StgClosure*)wq->waiting_tso);
+            wq->next_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)wq->next_queue_entry);
+            wq->prev_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)wq->prev_queue_entry);
+            evac_gen = saved_evac_gen;
+            failed_to_evac = rtsTrue; // mutable
+            break;
+        }
+
+        case TVAR:
+        {
+            StgTVar *tvar = ((StgTVar *) p);
+            evac_gen = 0;
+            tvar->current_value = evacuate((StgClosure*)tvar->current_value);
+            tvar->first_wait_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)tvar->first_wait_queue_entry);
+            evac_gen = saved_evac_gen;
+            failed_to_evac = rtsTrue; // mutable
+            break;
+        }
+
+        case TREC_CHUNK:
+        {
+            StgWord i;
+            StgTRecChunk *tc = ((StgTRecChunk *) p);
+            TRecEntry *e = &(tc -> entries[0]);
+            evac_gen = 0;
+            tc->prev_chunk = (StgTRecChunk *)evacuate((StgClosure*)tc->prev_chunk);
+            for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) {
+                e->tvar = (StgTVar *)evacuate((StgClosure*)e->tvar);
+                e->expected_value = evacuate((StgClosure*)e->expected_value);
+                e->new_value = evacuate((StgClosure*)e->new_value);
+            }
+            evac_gen = saved_evac_gen;
+            failed_to_evac = rtsTrue; // mutable
+            break;
+        }
+
+        case TREC_HEADER:
+        {
+            StgTRecHeader *trec = ((StgTRecHeader *) p);
+            evac_gen = 0;
+            trec->enclosing_trec = (StgTRecHeader *)evacuate((StgClosure*)trec->enclosing_trec);
+            trec->current_chunk = (StgTRecChunk *)evacuate((StgClosure*)trec->current_chunk);
+            evac_gen = saved_evac_gen;
+            failed_to_evac = rtsTrue; // mutable
+            break;
+        }
+
+        default:
+            barf("scavenge_mark_stack: unimplemented/strange closure type %d @ %p",
+                 info->type, p);
+        }
+
+        // if the object still points into a younger generation, it
+        // has to go (back) on the mutable list.
+        if (failed_to_evac) {
+            failed_to_evac = rtsFalse;
+            if (evac_gen > 0) {
+                recordMutableGen((StgClosure *)q, &generations[evac_gen]);
+            }
+        }
+
+        // mark the next bit to indicate "scavenged"
+        mark(q+1, Bdescr(q));
+
+    } // while (!mark_stack_empty())
+
+    // start a new linear scan if the mark stack overflowed at some point
+    if (mark_stack_overflowed && oldgen_scan_bd == NULL) {
+        IF_DEBUG(gc, debugBelch("scavenge_mark_stack: starting linear scan"));
+        mark_stack_overflowed = rtsFalse;
+        oldgen_scan_bd = oldest_gen->steps[0].old_blocks;
+        oldgen_scan = oldgen_scan_bd->start;
+    }
+
+    if (oldgen_scan_bd) {
+        // push a new thing on the mark stack
+    loop:
+        // find a closure that is marked but not scavenged, and start
+        // from there.
+        while (oldgen_scan < oldgen_scan_bd->free
+               && !is_marked(oldgen_scan,oldgen_scan_bd)) {
+            oldgen_scan++;
+        }
+
+        if (oldgen_scan < oldgen_scan_bd->free) {
+
+            // already scavenged?  (the "scavenged" mark is on q+1)
+            if (is_marked(oldgen_scan+1,oldgen_scan_bd)) {
+                oldgen_scan += sizeofW(StgHeader) + MIN_PAYLOAD_SIZE;
+                goto loop;
+            }
+            push_mark_stack(oldgen_scan);
+            // ToDo: bump the linear scan by the actual size of the object
+            oldgen_scan += sizeofW(StgHeader) + MIN_PAYLOAD_SIZE;
+            goto linear_scan;
+        }
+
+        oldgen_scan_bd = oldgen_scan_bd->link;
+        if (oldgen_scan_bd != NULL) {
+            oldgen_scan = oldgen_scan_bd->start;
+            goto loop;
+        }
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ Scavenge one object.
+
+ This is used for objects that are temporarily marked as mutable
+ because they contain old-to-new generation pointers. Only certain
+ objects can have this property.
+ -------------------------------------------------------------------------- */
+
+/* Scavenge the single object at P: evacuate everything it points to.
+ *
+ * Returns rtsTrue ("no luck") if the object must stay on a mutable
+ * list, i.e. it is inherently mutable or still points into a younger
+ * generation after scavenging; rtsFalse otherwise.  Resets the global
+ * failed_to_evac flag before returning.
+ */
+static rtsBool
+scavenge_one(StgPtr p)
+{
+    const StgInfoTable *info;
+    nat saved_evac_gen = evac_gen;   // restored after mutable cases that force evac_gen = 0
+    rtsBool no_luck;
+
+    ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+    info = get_itbl((StgClosure *)p);
+
+    switch (info->type) {
+
+    case MVAR:
+    {
+        // always mutable: scavenge contents into generation 0
+        StgMVar *mvar = ((StgMVar *)p);
+        evac_gen = 0;
+        mvar->head = (StgTSO *)evacuate((StgClosure *)mvar->head);
+        mvar->tail = (StgTSO *)evacuate((StgClosure *)mvar->tail);
+        mvar->value = evacuate((StgClosure *)mvar->value);
+        evac_gen = saved_evac_gen;
+        failed_to_evac = rtsTrue; // mutable.
+        break;
+    }
+
+    case THUNK:
+    case THUNK_1_0:
+    case THUNK_0_1:
+    case THUNK_1_1:
+    case THUNK_0_2:
+    case THUNK_2_0:
+    {
+        // evacuate every pointer in the thunk's payload
+        StgPtr q, end;
+
+        end = (StgPtr)((StgThunk *)p)->payload + info->layout.payload.ptrs;
+        for (q = (StgPtr)((StgThunk *)p)->payload; q < end; q++) {
+            *q = (StgWord)(StgPtr)evacuate((StgClosure *)*q);
+        }
+        break;
+    }
+
+    case FUN:
+    case FUN_1_0:          // hardly worth specialising these guys
+    case FUN_0_1:
+    case FUN_1_1:
+    case FUN_0_2:
+    case FUN_2_0:
+    case CONSTR:
+    case CONSTR_1_0:
+    case CONSTR_0_1:
+    case CONSTR_1_1:
+    case CONSTR_0_2:
+    case CONSTR_2_0:
+    case WEAK:
+    case IND_PERM:
+    {
+        // generic pointers-first layout: evacuate the payload
+        StgPtr q, end;
+
+        end = (StgPtr)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+        for (q = (StgPtr)((StgClosure *)p)->payload; q < end; q++) {
+            *q = (StgWord)(StgPtr)evacuate((StgClosure *)*q);
+        }
+        break;
+    }
+
+    case MUT_VAR_CLEAN:
+    case MUT_VAR_DIRTY: {
+        StgPtr q = p;
+        rtsBool saved_eager_promotion = eager_promotion;
+
+        eager_promotion = rtsFalse;
+        ((StgMutVar *)p)->var = evacuate(((StgMutVar *)p)->var);
+        eager_promotion = saved_eager_promotion;
+
+        // flip clean/dirty depending on whether the var still points
+        // into a younger generation
+        if (failed_to_evac) {
+            ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info;
+        } else {
+            ((StgClosure *)q)->header.info = &stg_MUT_VAR_CLEAN_info;
+        }
+        break;
+    }
+
+    case CAF_BLACKHOLE:
+    case SE_CAF_BLACKHOLE:
+    case SE_BLACKHOLE:
+    case BLACKHOLE:
+        // no pointers to follow
+        break;
+
+    case THUNK_SELECTOR:
+    {
+        StgSelector *s = (StgSelector *)p;
+        s->selectee = evacuate(s->selectee);
+        break;
+    }
+
+    case AP_STACK:
+    {
+        // a chunk of stack saved in a heap object
+        StgAP_STACK *ap = (StgAP_STACK *)p;
+
+        ap->fun = evacuate(ap->fun);
+        scavenge_stack((StgPtr)ap->payload, (StgPtr)ap->payload + ap->size);
+        p = (StgPtr)ap->payload + ap->size;
+        break;
+    }
+
+    case PAP:
+        p = scavenge_PAP((StgPAP *)p);
+        break;
+
+    case AP:
+        p = scavenge_AP((StgAP *)p);
+        break;
+
+    case ARR_WORDS:
+        // nothing to follow
+        break;
+
+    case MUT_ARR_PTRS_CLEAN:
+    case MUT_ARR_PTRS_DIRTY:
+    {
+        StgPtr next, q;
+        rtsBool saved_eager;
+
+        // We don't eagerly promote objects pointed to by a mutable
+        // array, but if we find the array only points to objects in
+        // the same or an older generation, we mark it "clean" and
+        // avoid traversing it during minor GCs.
+        saved_eager = eager_promotion;
+        eager_promotion = rtsFalse;
+        q = p;
+        next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+        for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+            *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+        }
+        eager_promotion = saved_eager;
+
+        if (failed_to_evac) {
+            ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info;
+        } else {
+            ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_CLEAN_info;
+        }
+
+        failed_to_evac = rtsTrue;   // always stays on the mutable list
+        break;
+    }
+
+    case MUT_ARR_PTRS_FROZEN:
+    case MUT_ARR_PTRS_FROZEN0:
+    {
+        // follow everything
+        StgPtr next, q=p;
+
+        next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+        for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+            *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+        }
+
+        // If we're going to put this object on the mutable list, then
+        // set its info ptr to MUT_ARR_PTRS_FROZEN0 to indicate that.
+        if (failed_to_evac) {
+            ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN0_info;
+        } else {
+            ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_FROZEN_info;
+        }
+        break;
+    }
+
+    case TSO:
+    {
+        StgTSO *tso = (StgTSO *)p;
+        rtsBool saved_eager = eager_promotion;
+
+        eager_promotion = rtsFalse;
+        scavengeTSO(tso);
+        eager_promotion = saved_eager;
+
+        if (failed_to_evac) {
+            tso->flags |= TSO_DIRTY;
+        } else {
+            tso->flags &= ~TSO_DIRTY;
+        }
+
+        failed_to_evac = rtsTrue; // always on the mutable list
+        break;
+    }
+
+#if defined(PAR)
+    case RBH:
+    {
+#if 0
+        nat size, ptrs, nonptrs, vhs;
+        char str[80];
+        StgInfoTable *rip = get_closure_info(p, &size, &ptrs, &nonptrs, &vhs, str);
+#endif
+        StgRBH *rbh = (StgRBH *)p;
+        (StgClosure *)rbh->blocking_queue =
+            evacuate((StgClosure *)rbh->blocking_queue);
+        failed_to_evac = rtsTrue;  // mutable anyhow.
+        IF_DEBUG(gc,
+                 debugBelch("@@ scavenge: RBH %p (%s) (new blocking_queue link=%p)",
+                            p, info_type(p), (StgClosure *)rbh->blocking_queue));
+        // ToDo: use size of reverted closure here!
+        break;
+    }
+
+    case BLOCKED_FETCH:
+    {
+        StgBlockedFetch *bf = (StgBlockedFetch *)p;
+        // follow the pointer to the node which is being demanded
+        (StgClosure *)bf->node =
+            evacuate((StgClosure *)bf->node);
+        // follow the link to the rest of the blocking queue
+        (StgClosure *)bf->link =
+            evacuate((StgClosure *)bf->link);
+        IF_DEBUG(gc,
+                 debugBelch("@@ scavenge: %p (%s); node is now %p; exciting, isn't it",
+                            bf, info_type((StgClosure *)bf),
+                            bf->node, info_type(bf->node)));
+        break;
+    }
+
+#ifdef DIST
+    case REMOTE_REF:
+#endif
+    case FETCH_ME:
+        break; // nothing to do in this case
+
+    case FETCH_ME_BQ:
+    {
+        StgFetchMeBlockingQueue *fmbq = (StgFetchMeBlockingQueue *)p;
+        (StgClosure *)fmbq->blocking_queue =
+            evacuate((StgClosure *)fmbq->blocking_queue);
+        IF_DEBUG(gc,
+                 debugBelch("@@ scavenge: %p (%s) exciting, isn't it",
+                            p, info_type((StgClosure *)p)));
+        break;
+    }
+#endif
+
+    case TVAR_WAIT_QUEUE:
+    {
+        StgTVarWaitQueue *wq = ((StgTVarWaitQueue *) p);
+        evac_gen = 0;
+        wq->waiting_tso = (StgTSO *)evacuate((StgClosure*)wq->waiting_tso);
+        wq->next_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)wq->next_queue_entry);
+        wq->prev_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)wq->prev_queue_entry);
+        evac_gen = saved_evac_gen;
+        failed_to_evac = rtsTrue; // mutable
+        break;
+    }
+
+    case TVAR:
+    {
+        StgTVar *tvar = ((StgTVar *) p);
+        evac_gen = 0;
+        tvar->current_value = evacuate((StgClosure*)tvar->current_value);
+        tvar->first_wait_queue_entry = (StgTVarWaitQueue *)evacuate((StgClosure*)tvar->first_wait_queue_entry);
+        evac_gen = saved_evac_gen;
+        failed_to_evac = rtsTrue; // mutable
+        break;
+    }
+
+    case TREC_HEADER:
+    {
+        StgTRecHeader *trec = ((StgTRecHeader *) p);
+        evac_gen = 0;
+        trec->enclosing_trec = (StgTRecHeader *)evacuate((StgClosure*)trec->enclosing_trec);
+        trec->current_chunk = (StgTRecChunk *)evacuate((StgClosure*)trec->current_chunk);
+        evac_gen = saved_evac_gen;
+        failed_to_evac = rtsTrue; // mutable
+        break;
+    }
+
+    case TREC_CHUNK:
+    {
+        StgWord i;
+        StgTRecChunk *tc = ((StgTRecChunk *) p);
+        TRecEntry *e = &(tc -> entries[0]);
+        evac_gen = 0;
+        tc->prev_chunk = (StgTRecChunk *)evacuate((StgClosure*)tc->prev_chunk);
+        for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) {
+            e->tvar = (StgTVar *)evacuate((StgClosure*)e->tvar);
+            e->expected_value = evacuate((StgClosure*)e->expected_value);
+            e->new_value = evacuate((StgClosure*)e->new_value);
+        }
+        evac_gen = saved_evac_gen;
+        failed_to_evac = rtsTrue; // mutable
+        break;
+    }
+
+    case IND_OLDGEN:
+    case IND_OLDGEN_PERM:
+    case IND_STATIC:
+    {
+        /* Careful here: a THUNK can be on the mutable list because
+         * it contains pointers to young gen objects.  If such a thunk
+         * is updated, the IND_OLDGEN will be added to the mutable
+         * list again, and we'll scavenge it twice.  evacuate()
+         * doesn't check whether the object has already been
+         * evacuated, so we perform that check here.
+         */
+        StgClosure *q = ((StgInd *)p)->indirectee;
+        if (HEAP_ALLOCED(q) && Bdescr((StgPtr)q)->flags & BF_EVACUATED) {
+            break;
+        }
+        ((StgInd *)p)->indirectee = evacuate(q);
+    }
+
+    // note: the block above intentionally falls into this (disabled)
+    // debug snippet and then hits the break below.
+#if 0 && defined(DEBUG)
+    if (RtsFlags.DebugFlags.gc)
+        /* Debugging code to print out the size of the thing we just
+         * promoted
+         */
+    {
+        StgPtr start = gen->steps[0].scan;
+        bdescr *start_bd = gen->steps[0].scan_bd;
+        nat size = 0;
+        scavenge(&gen->steps[0]);
+        if (start_bd != gen->steps[0].scan_bd) {
+            size += (P_)BLOCK_ROUND_UP(start) - start;
+            start_bd = start_bd->link;
+            while (start_bd != gen->steps[0].scan_bd) {
+                size += BLOCK_SIZE_W;
+                start_bd = start_bd->link;
+            }
+            size += gen->steps[0].scan -
+                (P_)BLOCK_ROUND_DOWN(gen->steps[0].scan);
+        } else {
+            size = gen->steps[0].scan - start;
+        }
+        debugBelch("evac IND_OLDGEN: %ld bytes", size * sizeof(W_));
+    }
+#endif
+    break;
+
+    default:
+        barf("scavenge_one: strange object %d", (int)(info->type));
+    }
+
+    no_luck = failed_to_evac;
+    failed_to_evac = rtsFalse;
+    return (no_luck);
+}
+
+/* -----------------------------------------------------------------------------
+ Scavenging mutable lists.
+
+ We treat the mutable list of each generation > N (i.e. all the
+ generations older than the one being collected) as roots. We also
+ remove non-mutable objects from the mutable list at this point.
+ -------------------------------------------------------------------------- */
+
+/* Scavenge the saved mutable list of generation GEN.  Objects that
+ * still need to be on the mutable list afterwards (because they are
+ * mutable, or still point into a younger generation) are re-recorded;
+ * the old list is freed at the end.
+ */
+static void
+scavenge_mutable_list(generation *gen)
+{
+    bdescr *bd;
+    StgPtr p, q;
+
+    bd = gen->saved_mut_list;
+
+    evac_gen = gen->no;
+    for (; bd != NULL; bd = bd->link) {
+        // each word on the mutable-list block is a closure pointer
+        for (q = bd->start; q < bd->free; q++) {
+            p = (StgPtr)*q;
+            ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+
+#ifdef DEBUG
+            // statistics only: count what kinds of object are here
+            switch (get_itbl((StgClosure *)p)->type) {
+            case MUT_VAR_CLEAN:
+                barf("MUT_VAR_CLEAN on mutable list");
+            case MUT_VAR_DIRTY:
+                mutlist_MUTVARS++; break;
+            case MUT_ARR_PTRS_CLEAN:
+            case MUT_ARR_PTRS_DIRTY:
+            case MUT_ARR_PTRS_FROZEN:
+            case MUT_ARR_PTRS_FROZEN0:
+                mutlist_MUTARRS++; break;
+            default:
+                mutlist_OTHERS++; break;
+            }
+#endif
+
+            // Check whether this object is "clean", that is it
+            // definitely doesn't point into a young generation.
+            // Clean objects don't need to be scavenged.  Some clean
+            // objects (MUT_VAR_CLEAN) are not kept on the mutable
+            // list at all; others, such as MUT_ARR_PTRS_CLEAN and
+            // TSO, are always on the mutable list.
+            //
+            switch (get_itbl((StgClosure *)p)->type) {
+            case MUT_ARR_PTRS_CLEAN:
+                recordMutableGen((StgClosure *)p,gen);
+                continue;
+            case TSO: {
+                StgTSO *tso = (StgTSO *)p;
+                if ((tso->flags & TSO_DIRTY) == 0) {
+                    // A clean TSO: we don't have to traverse its
+                    // stack.  However, we *do* follow the link field:
+                    // we don't want to have to mark a TSO dirty just
+                    // because we put it on a different queue.
+                    if (tso->why_blocked != BlockedOnBlackHole) {
+                        tso->link = (StgTSO *)evacuate((StgClosure *)tso->link);
+                    }
+                    recordMutableGen((StgClosure *)p,gen);
+                    continue;
+                }
+                // note: a *dirty* TSO deliberately falls through to
+                // the default case and gets fully scavenged below.
+            }
+            default:
+                ;
+            }
+
+            if (scavenge_one(p)) {
+                // didn't manage to promote everything, so put the
+                // object back on the list.
+                recordMutableGen((StgClosure *)p,gen);
+            }
+        }
+    }
+
+    // free the old mut_list
+    freeChain(gen->saved_mut_list);
+    gen->saved_mut_list = NULL;
+}
+
+
+/* Scavenge the static-object list.  Each object is moved from
+ * static_objects onto scavenged_static_objects before being
+ * scavenged; evacuation may push new entries onto static_objects,
+ * which is why the list head is re-read each iteration.
+ */
+static void
+scavenge_static(void)
+{
+    StgClosure* p = static_objects;
+    const StgInfoTable *info;
+
+    /* Always evacuate straight to the oldest generation for static
+     * objects */
+    evac_gen = oldest_gen->no;
+
+    /* keep going until we've scavenged all the objects on the linked
+       list... */
+    while (p != END_OF_STATIC_LIST) {
+
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+        info = get_itbl(p);
+        /*
+          if (info->type==RBH)
+          info = REVERT_INFOPTR(info); // if it's an RBH, look at the orig closure
+        */
+        // make sure the info pointer is into text space
+
+        /* Take this object *off* the static_objects list,
+         * and put it on the scavenged_static_objects list.
+         */
+        static_objects = *STATIC_LINK(info,p);
+        *STATIC_LINK(info,p) = scavenged_static_objects;
+        scavenged_static_objects = p;
+
+        switch (info -> type) {
+
+        case IND_STATIC:
+        {
+            StgInd *ind = (StgInd *)p;
+            ind->indirectee = evacuate(ind->indirectee);
+
+            /* might fail to evacuate it, in which case we have to pop it
+             * back on the mutable list of the oldest generation.  We
+             * leave it *on* the scavenged_static_objects list, though,
+             * in case we visit this object again.
+             */
+            if (failed_to_evac) {
+                failed_to_evac = rtsFalse;
+                recordMutableGen((StgClosure *)p,oldest_gen);
+            }
+            break;
+        }
+
+        case THUNK_STATIC:
+            scavenge_thunk_srt(info);
+            break;
+
+        case FUN_STATIC:
+            scavenge_fun_srt(info);
+            break;
+
+        case CONSTR_STATIC:
+        {
+            StgPtr q, next;
+
+            next = (P_)p->payload + info->layout.payload.ptrs;
+            // evacuate the pointers
+            for (q = (P_)p->payload; q < next; q++) {
+                *q = (StgWord)(StgPtr)evacuate((StgClosure *)*q);
+            }
+            break;
+        }
+
+        default:
+            barf("scavenge_static: strange closure %d", (int)(info->type));
+        }
+
+        // only IND_STATIC can set failed_to_evac, and it clears it again
+        ASSERT(failed_to_evac == rtsFalse);
+
+        /* get the next static object from the list.  Remember, there might
+         * be more stuff on this list now that we've done some evacuating!
+         * (static_objects is a global)
+         */
+        p = static_objects;
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ scavenge a chunk of memory described by a bitmap
+ -------------------------------------------------------------------------- */
+
+/* Scavenge SIZE stack words starting at P, using LARGE_BITMAP to
+ * distinguish pointers from non-pointers.  A *clear* bit marks a
+ * pointer, which is evacuated in place.
+ *
+ * Fix: the previous version pre-loaded bitmap words and fetched the
+ * next word immediately after consuming the last bit of the current
+ * one, which read one word past the end of the bitmap array whenever
+ * SIZE was an exact multiple of BITS_IN(W_).  Each word is now loaded
+ * only when bits from it are actually needed.
+ */
+static void
+scavenge_large_bitmap( StgPtr p, StgLargeBitmap *large_bitmap, nat size )
+{
+    nat i, j, b;
+    StgWord bitmap;
+
+    b = 0;
+    for (i = 0; i < size; b++) {
+        bitmap = large_bitmap->bitmap[b];
+        // consume up to one word's worth of bits
+        for (j = 0; j < BITS_IN(W_) && i < size; j++, i++, p++) {
+            if ((bitmap & 1) == 0) {
+                *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+            }
+            bitmap = bitmap >> 1;
+        }
+    }
+}
+
+/* Scavenge SIZE stack words at P using the small bitmap BITMAP: a
+ * clear bit marks a pointer, which is evacuated in place.  Returns
+ * the first word past the scavenged area.
+ */
+STATIC_INLINE StgPtr
+scavenge_small_bitmap (StgPtr p, nat size, StgWord bitmap)
+{
+    nat i;
+
+    for (i = 0; i < size; i++, p++, bitmap >>= 1) {
+        if ((bitmap & 1) == 0) {
+            *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+        }
+    }
+    return p;
+}
+
+/* -----------------------------------------------------------------------------
+ scavenge_stack walks over a section of stack and evacuates all the
+ objects pointed to by it. We can use the same code for walking
+ AP_STACK_UPDs, since these are just sections of copied stack.
+ -------------------------------------------------------------------------- */
+
+
+/* Walk the stack section [P, STACK_END), evacuating every pointer
+ * found in the activation records.  Each iteration of the main loop
+ * handles one complete frame, identified by its return info table.
+ */
+static void
+scavenge_stack(StgPtr p, StgPtr stack_end)
+{
+    const StgRetInfoTable* info;
+    StgWord bitmap;
+    nat size;
+
+    //IF_DEBUG(sanity, debugBelch("  scavenging stack between %p and %p", p, stack_end));
+
+    /*
+     * Each time around this loop, we are looking at a chunk of stack
+     * that starts with an activation record.
+     */
+
+    while (p < stack_end) {
+        info  = get_ret_itbl((StgClosure *)p);
+
+        switch (info->i.type) {
+
+        case UPDATE_FRAME:
+            // In SMP, we can get update frames that point to indirections
+            // when two threads evaluate the same thunk.  We do attempt to
+            // discover this situation in threadPaused(), but it's
+            // possible that the following sequence occurs:
+            //
+            //        A             B
+            //                  enter T
+            //     enter T
+            //     blackhole T
+            //                  update T
+            //     GC
+            //
+            // Now T is an indirection, and the update frame is already
+            // marked on A's stack, so we won't traverse it again in
+            // threadPaused().  We could traverse the whole stack again
+            // before GC, but that seems like overkill.
+            //
+            // Scavenging this update frame as normal would be disastrous;
+            // the updatee would end up pointing to the value.  So we turn
+            // the indirection into an IND_PERM, so that evacuate will
+            // copy the indirection into the old generation instead of
+            // discarding it.
+            if (get_itbl(((StgUpdateFrame *)p)->updatee)->type == IND) {
+                ((StgUpdateFrame *)p)->updatee->header.info =
+                    (StgInfoTable *)&stg_IND_PERM_info;
+            }
+            ((StgUpdateFrame *)p)->updatee
+                = evacuate(((StgUpdateFrame *)p)->updatee);
+            p += sizeofW(StgUpdateFrame);
+            continue;
+
+            // small bitmap (< 32 entries, or 64 on a 64-bit machine)
+        case CATCH_STM_FRAME:
+        case CATCH_RETRY_FRAME:
+        case ATOMICALLY_FRAME:
+        case STOP_FRAME:
+        case CATCH_FRAME:
+        case RET_SMALL:
+        case RET_VEC_SMALL:
+            bitmap = BITMAP_BITS(info->i.layout.bitmap);
+            size   = BITMAP_SIZE(info->i.layout.bitmap);
+            // NOTE: the payload starts immediately after the info-ptr, we
+            // don't have an StgHeader in the same sense as a heap closure.
+            p++;
+            p = scavenge_small_bitmap(p, size, bitmap);
+
+        follow_srt:
+            // SRTs only matter in a major GC (static objects)
+            if (major_gc)
+                scavenge_srt((StgClosure **)GET_SRT(info), info->i.srt_bitmap);
+            continue;
+
+        case RET_BCO: {
+            StgBCO *bco;
+            nat size;
+
+            p++;
+            // the word after the frame header is the BCO itself
+            *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+            bco = (StgBCO *)*p;
+            p++;
+            size = BCO_BITMAP_SIZE(bco);
+            scavenge_large_bitmap(p, BCO_BITMAP(bco), size);
+            p += size;
+            continue;
+        }
+
+            // large bitmap (> 32 entries, or > 64 on a 64-bit machine)
+        case RET_BIG:
+        case RET_VEC_BIG:
+        {
+            nat size;
+
+            size = GET_LARGE_BITMAP(&info->i)->size;
+            p++;
+            scavenge_large_bitmap(p, GET_LARGE_BITMAP(&info->i), size);
+            p += size;
+            // and don't forget to follow the SRT
+            goto follow_srt;
+        }
+
+            // Dynamic bitmap: the mask is stored on the stack, and
+            // there are a number of non-pointers followed by a number
+            // of pointers above the bitmapped area.  (see StgMacros.h,
+            // HEAP_CHK_GEN).
+        case RET_DYN:
+        {
+            StgWord dyn;
+            dyn = ((StgRetDyn *)p)->liveness;
+
+            // traverse the bitmap first
+            bitmap = RET_DYN_LIVENESS(dyn);
+            p      = (P_)&((StgRetDyn *)p)->payload[0];
+            size   = RET_DYN_BITMAP_SIZE;
+            p = scavenge_small_bitmap(p, size, bitmap);
+
+            // skip over the non-ptr words
+            p += RET_DYN_NONPTRS(dyn) + RET_DYN_NONPTR_REGS_SIZE;
+
+            // follow the ptr words
+            for (size = RET_DYN_PTRS(dyn); size > 0; size--) {
+                *p = (StgWord)(StgPtr)evacuate((StgClosure *)*p);
+                p++;
+            }
+            continue;
+        }
+
+        case RET_FUN:
+        {
+            StgRetFun *ret_fun = (StgRetFun *)p;
+            StgFunInfoTable *fun_info;
+
+            ret_fun->fun = evacuate(ret_fun->fun);
+            // the argument layout is determined by the function's
+            // info table, which we must look up *after* evacuation
+            fun_info = get_fun_itbl(ret_fun->fun);
+            p = scavenge_arg_block(fun_info, ret_fun->payload);
+            goto follow_srt;
+        }
+
+        default:
+            barf("scavenge_stack: weird activation record found on stack: %d", (int)(info->i.type));
+        }
+    }
+}
+
+/*-----------------------------------------------------------------------------
+ scavenge the large object list.
+
+ evac_gen set by caller; similar games played with evac_gen as with
+ scavenge() - see comment at the top of scavenge(). Most large
+ objects are (repeatedly) mutable, so most of the time evac_gen will
+ be zero.
+ --------------------------------------------------------------------------- */
+
+/* Scavenge the large-object list of step STP.  Objects are moved from
+ * new_large_objects onto scavenged_large_objects as they are
+ * processed; evacuation may push fresh objects onto the front of
+ * new_large_objects, which is why the list head is re-read each time.
+ */
+static void
+scavenge_large(step *stp)
+{
+    bdescr *bd;
+    StgPtr p;
+
+    bd = stp->new_large_objects;
+
+    for (; bd != NULL; bd = stp->new_large_objects) {
+
+        /* take this object *off* the large objects list and put it on
+         * the scavenged large objects list.  This is so that we can
+         * treat new_large_objects as a stack and push new objects on
+         * the front when evacuating.
+         */
+        stp->new_large_objects = bd->link;
+        dbl_link_onto(bd, &stp->scavenged_large_objects);
+
+        // update the block count in this step.
+        stp->n_scavenged_large_blocks += bd->blocks;
+
+        p = bd->start;
+        if (scavenge_one(p)) {
+            // still points into a younger generation: keep it on the
+            // mutable list of this step's generation
+            if (stp->gen_no > 0) {
+                recordMutableGen((StgClosure *)p, stp->gen);
+            }
+        }
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ Initialising the static object & mutable lists
+ -------------------------------------------------------------------------- */
+
+// Walk the static-object list rooted at first_static and reset every
+// object's static-link field to NULL, unchaining the whole list so the
+// next GC starts from a clean slate.
+static void
+zero_static_object_list(StgClosure* first_static)
+{
+ StgClosure* p;
+ StgClosure* link;
+ const StgInfoTable *info;
+
+ for (p = first_static; p != END_OF_STATIC_LIST; p = link) {
+ info = get_itbl(p);
+ // read the link *before* clearing it, or we lose the rest of the list
+ link = *STATIC_LINK(info, p);
+ *STATIC_LINK(info,p) = NULL;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Reverting CAFs
+ -------------------------------------------------------------------------- */
+
+// Restore every CAF on revertible_caf_list to its pre-evaluation state by
+// reinstating the saved info table, then empty the list. (Used by GHCi to
+// revert CAFs when code is reloaded.)
+void
+revertCAFs( void )
+{
+ StgIndStatic *c;
+
+ for (c = (StgIndStatic *)revertible_caf_list; c != NULL;
+ c = (StgIndStatic *)c->static_link)
+ {
+ SET_INFO(c, c->saved_info);
+ c->saved_info = NULL;
+ // could, but not necessary: c->static_link = NULL;
+ }
+ revertible_caf_list = NULL;
+}
+
+// Evacuate the indirectee of every CAF on both caf_list and
+// revertible_caf_list, keeping their current values alive across GC.
+void
+markCAFs( evac_fn evac )
+{
+ StgIndStatic *c;
+
+ for (c = (StgIndStatic *)caf_list; c != NULL;
+ c = (StgIndStatic *)c->static_link)
+ {
+ evac(&c->indirectee);
+ }
+ for (c = (StgIndStatic *)revertible_caf_list; c != NULL;
+ c = (StgIndStatic *)c->static_link)
+ {
+ evac(&c->indirectee);
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Sanity code for CAF garbage collection.
+
+ With DEBUG turned on, we manage a CAF list in addition to the SRT
+ mechanism. After GC, we run down the CAF list and blackhole any
+ CAFs which have been garbage collected. This means we get an error
+ whenever the program tries to enter a garbage collected CAF.
+
+ Any garbage collected CAFs are taken off the CAF list at the same
+ time.
+ -------------------------------------------------------------------------- */
+
+#if 0 && defined(DEBUG)
+
+// Debug-only (currently disabled): walk the CAF list after GC, black-holing
+// and unlinking any CAF whose static link is NULL -- i.e. one that was not
+// re-linked during this GC and is therefore presumed garbage. Entering such
+// a CAF afterwards then fails loudly instead of silently using dead data.
+static void
+gcCAFs(void)
+{
+ StgClosure* p;
+ StgClosure** pp;
+ const StgInfoTable *info;
+ nat i;
+
+ i = 0;
+ p = caf_list;
+ pp = &caf_list;
+
+ while (p != NULL) {
+
+ info = get_itbl(p);
+
+ ASSERT(info->type == IND_STATIC);
+
+ if (STATIC_LINK(info,p) == NULL) {
+ IF_DEBUG(gccafs, debugBelch("CAF gc'd at 0x%04lx", (long)p));
+ // black hole it
+ SET_INFO(p,&stg_BLACKHOLE_info);
+ // splice the dead CAF out of the list
+ p = STATIC_LINK2(info,p);
+ *pp = p;
+ }
+ else {
+ pp = &STATIC_LINK2(info,p);
+ p = *pp;
+ i++;
+ }
+
+ }
+
+ // debugBelch("%d CAFs live", i);
+}
+#endif
+
+
+/* -----------------------------------------------------------------------------
+ * Stack squeezing
+ *
+ * Code largely pinched from old RTS, then hacked to bits. We also do
+ * lazy black holing here.
+ *
+ * -------------------------------------------------------------------------- */
+
+struct stack_gap { StgWord gap_size; struct stack_gap *next_gap; };
+
+// Squeeze chains of adjacent update frames on tso's stack (between tso->sp
+// and bottom) down to a single update frame each, then slide the live stack
+// data down over the resulting gaps.
+static void
+stackSqueeze(StgTSO *tso, StgPtr bottom)
+{
+ StgPtr frame;
+ rtsBool prev_was_update_frame;
+ StgClosure *updatee = NULL;
+ StgRetInfoTable *info;
+ StgWord current_gap_size;
+ struct stack_gap *gap;
+
+ // Stage 1:
+ // Traverse the stack upwards, replacing adjacent update frames
+ // with a single update frame and a "stack gap". A stack gap
+ // contains two values: the size of the gap, and the distance
+ // to the next gap (or the stack top).
+
+ frame = tso->sp;
+
+ ASSERT(frame < bottom);
+
+ prev_was_update_frame = rtsFalse;
+ current_gap_size = 0;
+ gap = (struct stack_gap *) (tso->sp - sizeofW(StgUpdateFrame));
+
+ while (frame < bottom) {
+
+ info = get_ret_itbl((StgClosure *)frame);
+ switch (info->i.type) {
+
+ case UPDATE_FRAME:
+ {
+ StgUpdateFrame *upd = (StgUpdateFrame *)frame;
+
+ if (prev_was_update_frame) {
+
+ TICK_UPD_SQUEEZED();
+ /* wasn't there something about update squeezing and ticky to be
+ * sorted out? oh yes: we aren't counting each enter properly
+ * in this case. See the log somewhere. KSW 1999-04-21
+ *
+ * Check two things: that the two update frames don't point to
+ * the same object, and that the updatee_bypass isn't already an
+ * indirection. Both of these cases only happen when we're in a
+ * black hole-style loop (and there are multiple update frames
+ * on the stack pointing to the same closure), but they can both
+ * screw us up if we don't check.
+ */
+ if (upd->updatee != updatee && !closure_IND(upd->updatee)) {
+ UPD_IND_NOLOCK(upd->updatee, updatee);
+ }
+
+ // now mark this update frame as a stack gap. The gap
+ // marker resides in the bottom-most update frame of
+ // the series of adjacent frames, and covers all the
+ // frames in this series.
+ current_gap_size += sizeofW(StgUpdateFrame);
+ ((struct stack_gap *)frame)->gap_size = current_gap_size;
+ ((struct stack_gap *)frame)->next_gap = gap;
+
+ frame += sizeofW(StgUpdateFrame);
+ continue;
+ }
+
+ // single update frame, or the topmost update frame in a series
+ else {
+ prev_was_update_frame = rtsTrue;
+ updatee = upd->updatee;
+ frame += sizeofW(StgUpdateFrame);
+ continue;
+ }
+ }
+
+ default:
+ prev_was_update_frame = rtsFalse;
+
+ // we're not in a gap... check whether this is the end of a gap
+ // (an update frame can't be the end of a gap).
+ if (current_gap_size != 0) {
+ gap = (struct stack_gap *) (frame - sizeofW(StgUpdateFrame));
+ }
+ current_gap_size = 0;
+
+ frame += stack_frame_sizeW((StgClosure *)frame);
+ continue;
+ }
+ }
+
+ // close off a gap that runs right up to 'bottom'
+ if (current_gap_size != 0) {
+ gap = (struct stack_gap *) (frame - sizeofW(StgUpdateFrame));
+ }
+
+ // Now we have a stack with gaps in it, and we have to walk down
+ // shoving the stack up to fill in the gaps. A diagram might
+ // help:
+ //
+ // +| ********* |
+ // | ********* | <- sp
+ // | |
+ // | | <- gap_start
+ // | ......... | |
+ // | stack_gap | <- gap | chunk_size
+ // | ......... | |
+ // | ......... | <- gap_end v
+ // | ********* |
+ // | ********* |
+ // | ********* |
+ // -| ********* |
+ //
+ // 'sp' points to the current top-of-stack
+ // 'gap' points to the stack_gap structure inside the gap
+ // ***** indicates real stack data
+ // ..... indicates gap
+ // <empty> indicates unused
+ //
+ {
+ void *sp;
+ void *gap_start, *next_gap_start, *gap_end;
+ nat chunk_size;
+
+ next_gap_start = (void *)((unsigned char*)gap + sizeof(StgUpdateFrame));
+ sp = next_gap_start;
+
+ while ((StgPtr)gap > tso->sp) {
+
+ // we're working in *bytes* now...
+ gap_start = next_gap_start;
+ gap_end = (void*) ((unsigned char*)gap_start - gap->gap_size * sizeof(W_));
+
+ gap = gap->next_gap;
+ next_gap_start = (void *)((unsigned char*)gap + sizeof(StgUpdateFrame));
+
+ chunk_size = (unsigned char*)gap_end - (unsigned char*)next_gap_start;
+ sp -= chunk_size;
+ memmove(sp, next_gap_start, chunk_size);
+ }
+
+ tso->sp = (StgPtr)sp;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Pausing a thread
+ *
+ * We have to prepare for GC - this means doing lazy black holing
+ * here. We also take the opportunity to do stack squeezing if it's
+ * turned on.
+ * -------------------------------------------------------------------------- */
+// Prepare a paused thread for GC: lazily black-hole thunks under its update
+// frames, suspend any duplicated computation, and optionally squeeze chains
+// of adjacent update frames out of the stack (heuristic below).
+void
+threadPaused(Capability *cap, StgTSO *tso)
+{
+ StgClosure *frame;
+ StgRetInfoTable *info;
+ StgClosure *bh;
+ nat words_to_squeeze = 0;
+ nat weight = 0;
+ nat weight_pending = 0;
+ // FIX: this was previously declared without an initializer but is read
+ // below on the first UPDATE_FRAME encountered -- a read of an
+ // indeterminate value (undefined behaviour). Start with "no previous
+ // update frame".
+ rtsBool prev_was_update_frame = rtsFalse;
+
+ frame = (StgClosure *)tso->sp;
+
+ while (1) {
+ // If we've already marked this frame, then stop here.
+ if (frame->header.info == (StgInfoTable *)&stg_marked_upd_frame_info) {
+ goto end;
+ }
+
+ info = get_ret_itbl(frame);
+
+ switch (info->i.type) {
+
+ case UPDATE_FRAME:
+
+ // mark the frame so a later threadPaused stops here
+ SET_INFO(frame, (StgInfoTable *)&stg_marked_upd_frame_info);
+
+ bh = ((StgUpdateFrame *)frame)->updatee;
+
+ if (closure_IND(bh) || bh->header.info == &stg_BLACKHOLE_info) {
+ IF_DEBUG(squeeze, debugBelch("suspending duplicate work: %ld words of stack\n", (StgPtr)frame - tso->sp));
+
+ // If this closure is already an indirection, then
+ // suspend the computation up to this point:
+ suspendComputation(cap,tso,(StgPtr)frame);
+
+ // Now drop the update frame, and arrange to return
+ // the value to the frame underneath:
+ tso->sp = (StgPtr)frame + sizeofW(StgUpdateFrame) - 2;
+ tso->sp[1] = (StgWord)bh;
+ tso->sp[0] = (W_)&stg_enter_info;
+
+ // And continue with threadPaused; there might be
+ // yet more computation to suspend.
+ threadPaused(cap,tso);
+ return;
+ }
+
+ if (bh->header.info != &stg_CAF_BLACKHOLE_info) {
+#if (!defined(LAZY_BLACKHOLING)) && defined(DEBUG)
+ debugBelch("Unexpected lazy BHing required at 0x%04lx\n",(long)bh);
+#endif
+ // zero out the slop so that the sanity checker can tell
+ // where the next closure is.
+ DEBUG_FILL_SLOP(bh);
+#ifdef PROFILING
+ // @LDV profiling
+ // We pretend that bh is now dead.
+ LDV_recordDead_FILL_SLOP_DYNAMIC((StgClosure *)bh);
+#endif
+ SET_INFO(bh,&stg_BLACKHOLE_info);
+
+ // We pretend that bh has just been created.
+ LDV_RECORD_CREATE(bh);
+ }
+
+ frame = (StgClosure *) ((StgUpdateFrame *)frame + 1);
+ // two update frames in a row: a candidate for squeezing
+ if (prev_was_update_frame) {
+ words_to_squeeze += sizeofW(StgUpdateFrame);
+ weight += weight_pending;
+ weight_pending = 0;
+ }
+ prev_was_update_frame = rtsTrue;
+ break;
+
+ case STOP_FRAME:
+ goto end;
+
+ // normal stack frames; do nothing except advance the pointer
+ default:
+ {
+ nat frame_size = stack_frame_sizeW(frame);
+ weight_pending += frame_size;
+ frame = (StgClosure *)((StgPtr)frame + frame_size);
+ prev_was_update_frame = rtsFalse;
+ }
+ }
+ }
+
+end:
+ IF_DEBUG(squeeze,
+ debugBelch("words_to_squeeze: %d, weight: %d, squeeze: %s\n",
+ words_to_squeeze, weight,
+ weight < words_to_squeeze ? "YES" : "NO"));
+
+ // Should we squeeze or not? Arbitrary heuristic: we squeeze if
+ // the number of words we have to shift down is less than the
+ // number of stack words we squeeze away by doing so.
+ if (RtsFlags.GcFlags.squeezeUpdFrames == rtsTrue &&
+ weight < words_to_squeeze) {
+ stackSqueeze(tso, (StgPtr)frame);
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Debugging
+ * -------------------------------------------------------------------------- */
+
+#if DEBUG
+// Debug helper: dump every entry on a generation's mutable list, printing
+// each closure's address and info-table type.
+void
+printMutableList(generation *gen)
+{
+ bdescr *bd;
+ StgPtr p;
+
+ debugBelch("@@ Mutable list %p: ", gen->mut_list);
+
+ for (bd = gen->mut_list; bd != NULL; bd = bd->link) {
+ for (p = bd->start; p < bd->free; p++) {
+ debugBelch("%p (%s), ", (void *)*p, info_type((StgClosure *)*p));
+ }
+ }
+ debugBelch("\n");
+}
+#endif /* DEBUG */
diff --git a/rts/GCCompact.c b/rts/GCCompact.c
new file mode 100644
index 0000000000..4dfe84bbe0
--- /dev/null
+++ b/rts/GCCompact.c
@@ -0,0 +1,949 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2001
+ *
+ * Compacting garbage collector
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "OSThreads.h"
+#include "Storage.h"
+#include "BlockAlloc.h"
+#include "MBlock.h"
+#include "GCCompact.h"
+#include "Schedule.h"
+#include "Apply.h"
+
+// Turn off inlining when debugging - it obfuscates things
+#ifdef DEBUG
+# undef STATIC_INLINE
+# define STATIC_INLINE static
+#endif
+
+/* -----------------------------------------------------------------------------
+ Threading / unthreading pointers.
+
+ The basic idea here is to chain together all the fields pointing at
+ a particular object, with the root of the chain in the object's
+ info table field. The original contents of the info pointer goes
+ at the end of the chain.
+
+ Adding a new field to the chain is a matter of swapping the
+ contents of the field with the contents of the object's info table
+ field.
+
+ To unthread the chain, we walk down it updating all the fields on
+ the chain with the new location of the object. We stop when we
+ reach the info pointer at the end.
+
+ We use a trick to identify the info pointer: when swapping pointers
+ for threading, we set the low bit of the original pointer, with the
+ result that all the pointers in the chain have their low bits set
+ except for the info pointer.
+ -------------------------------------------------------------------------- */
+
+// Thread the pointer field at p: if *p points into the compacted area,
+// splice p onto the chain of fields referring to that object (rooted in the
+// object's info-table word), tagging p with a low bit set.
+STATIC_INLINE void
+thread( StgPtr p )
+{
+ StgPtr q = (StgPtr)*p;
+ bdescr *bd;
+
+ // It doesn't look like a closure at the moment, because the info
+ // ptr is possibly threaded:
+ // ASSERT(LOOKS_LIKE_CLOSURE_PTR(q));
+
+ if (HEAP_ALLOCED(q)) {
+ bd = Bdescr(q);
+ // a handy way to discover whether the ptr is into the
+ // compacted area of the old gen, is that the EVACUATED flag
+ // is zero (it's non-zero for all the other areas of live
+ // memory).
+ if ((bd->flags & BF_EVACUATED) == 0) {
+ // swap *p with the object's info word, chaining this field in
+ *p = (StgWord)*q;
+ *q = (StgWord)p + 1; // set the low bit
+ }
+ }
+}
+
+// Unthread the chain rooted at p: write the object's new address 'free'
+// into every field on the chain (each tagged with a low bit), and finally
+// restore the untagged info pointer found at the end of the chain into *p.
+STATIC_INLINE void
+unthread( StgPtr p, StgPtr free )
+{
+ StgWord q = *p, r;
+
+ while ((q & 1) != 0) {
+ q -= 1; // unset the low bit again
+ r = *((StgPtr)q);
+ *((StgPtr)q) = (StgWord)free;
+ q = r;
+ }
+ *p = q;
+}
+
+// Fetch the real info table of a possibly-threaded closure: follow the
+// chain of tagged (low-bit-set) pointers until the untagged info pointer
+// at the end is found.
+STATIC_INLINE StgInfoTable *
+get_threaded_info( StgPtr p )
+{
+ StgPtr q = (P_)GET_INFO((StgClosure *)p);
+
+ while (((StgWord)q & 1) != 0) {
+ q = (P_)*((StgPtr)((StgWord)q-1));
+ }
+
+ ASSERT(LOOKS_LIKE_INFO_PTR(q));
+ return INFO_PTR_TO_STRUCT((StgInfoTable *)q);
+}
+
+// A word-aligned memmove will be faster for small objects than libc's or gcc's.
+// Remember, the two regions *might* overlap, but: to <= from.
+// 'size' is the number of words to copy.
+STATIC_INLINE void
+move(StgPtr to, StgPtr from, nat size)
+{
+ for(; size > 0; --size) {
+ *to++ = *from++;
+ }
+}
+
+// Thread the pointer fields of every object on the given static-object
+// list (only IND_STATIC has a pointer field; the others just link on).
+static void
+thread_static( StgClosure* p )
+{
+ const StgInfoTable *info;
+
+ // keep going until we've threaded all the objects on the linked
+ // list...
+ while (p != END_OF_STATIC_LIST) {
+
+ info = get_itbl(p);
+ switch (info->type) {
+
+ case IND_STATIC:
+ thread((StgPtr)&((StgInd *)p)->indirectee;
+ p = *IND_STATIC_LINK(p);
+ continue;
+
+ case THUNK_STATIC:
+ p = *THUNK_STATIC_LINK(p);
+ continue;
+ case FUN_STATIC:
+ p = *FUN_STATIC_LINK(p);
+ continue;
+ case CONSTR_STATIC:
+ p = *STATIC_LINK(info,p);
+ continue;
+
+ default:
+ barf("thread_static: strange closure %d", (int)(info->type));
+ }
+
+ }
+}
+
+// Thread 'size' words starting at p, using a large bitmap to decide which
+// words are pointers. NB: a *clear* bit marks a pointer word.
+STATIC_INLINE void
+thread_large_bitmap( StgPtr p, StgLargeBitmap *large_bitmap, nat size )
+{
+ nat i, b;
+ StgWord bitmap;
+
+ b = 0;
+ bitmap = large_bitmap->bitmap[b];
+ for (i = 0; i < size; ) {
+ if ((bitmap & 1) == 0) {
+ thread(p);
+ }
+ i++;
+ p++;
+ // move on to the next bitmap word when this one is exhausted
+ if (i % BITS_IN(W_) == 0) {
+ b++;
+ bitmap = large_bitmap->bitmap[b];
+ } else {
+ bitmap = bitmap >> 1;
+ }
+ }
+}
+
+// Thread the argument block of a function call, using the function's
+// argument bitmap (small, large, or one of the standard stg_arg_bitmaps).
+// Returns a pointer to the first word past the arguments.
+STATIC_INLINE StgPtr
+thread_arg_block (StgFunInfoTable *fun_info, StgClosure **args)
+{
+ StgPtr p;
+ StgWord bitmap;
+ nat size;
+
+ p = (StgPtr)args;
+ switch (fun_info->f.fun_type) {
+ case ARG_GEN:
+ bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+ size = BITMAP_SIZE(fun_info->f.b.bitmap);
+ goto small_bitmap;
+ case ARG_GEN_BIG:
+ size = GET_FUN_LARGE_BITMAP(fun_info)->size;
+ thread_large_bitmap(p, GET_FUN_LARGE_BITMAP(fun_info), size);
+ p += size;
+ break;
+ default:
+ bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+ size = BITMAP_SIZE(stg_arg_bitmaps[fun_info->f.fun_type]);
+ small_bitmap:
+ // a clear bit marks a pointer word
+ while (size > 0) {
+ if ((bitmap & 1) == 0) {
+ thread(p);
+ }
+ p++;
+ bitmap = bitmap >> 1;
+ size--;
+ }
+ break;
+ }
+ return p;
+}
+
+// Thread all the pointer words in a stack chunk [p, stack_end), frame by
+// frame, using each frame's layout info.
+static void
+thread_stack(StgPtr p, StgPtr stack_end)
+{
+ const StgRetInfoTable* info;
+ StgWord bitmap;
+ nat size;
+
+ // highly similar to scavenge_stack, but we do pointer threading here.
+
+ while (p < stack_end) {
+
+ // *p must be the info pointer of an activation
+ // record. All activation records have 'bitmap' style layout
+ // info.
+ //
+ info = get_ret_itbl((StgClosure *)p);
+
+ switch (info->i.type) {
+
+ // Dynamic bitmap: the mask is stored on the stack
+ case RET_DYN:
+ {
+ StgWord dyn;
+ dyn = ((StgRetDyn *)p)->liveness;
+
+ // traverse the bitmap first
+ bitmap = RET_DYN_LIVENESS(dyn);
+ p = (P_)&((StgRetDyn *)p)->payload[0];
+ size = RET_DYN_BITMAP_SIZE;
+ while (size > 0) {
+ if ((bitmap & 1) == 0) {
+ thread(p);
+ }
+ p++;
+ bitmap = bitmap >> 1;
+ size--;
+ }
+
+ // skip over the non-ptr words
+ p += RET_DYN_NONPTRS(dyn) + RET_DYN_NONPTR_REGS_SIZE;
+
+ // follow the ptr words
+ for (size = RET_DYN_PTRS(dyn); size > 0; size--) {
+ thread(p);
+ p++;
+ }
+ continue;
+ }
+
+ // small bitmap (<= 32 entries, or 64 on a 64-bit machine)
+ case CATCH_RETRY_FRAME:
+ case CATCH_STM_FRAME:
+ case ATOMICALLY_FRAME:
+ case UPDATE_FRAME:
+ case STOP_FRAME:
+ case CATCH_FRAME:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ bitmap = BITMAP_BITS(info->i.layout.bitmap);
+ size = BITMAP_SIZE(info->i.layout.bitmap);
+ p++;
+ // NOTE: the payload starts immediately after the info-ptr, we
+ // don't have an StgHeader in the same sense as a heap closure.
+ while (size > 0) {
+ if ((bitmap & 1) == 0) {
+ thread(p);
+ }
+ p++;
+ bitmap = bitmap >> 1;
+ size--;
+ }
+ continue;
+
+ case RET_BCO: {
+ StgBCO *bco;
+ nat size;
+
+ p++;
+ // grab the BCO pointer *before* threading the field
+ bco = (StgBCO *)*p;
+ thread(p);
+ p++;
+ size = BCO_BITMAP_SIZE(bco);
+ thread_large_bitmap(p, BCO_BITMAP(bco), size);
+ p += size;
+ continue;
+ }
+
+ // large bitmap (> 32 entries, or 64 on a 64-bit machine)
+ case RET_BIG:
+ case RET_VEC_BIG:
+ p++;
+ size = GET_LARGE_BITMAP(&info->i)->size;
+ thread_large_bitmap(p, GET_LARGE_BITMAP(&info->i), size);
+ p += size;
+ continue;
+
+ case RET_FUN:
+ {
+ StgRetFun *ret_fun = (StgRetFun *)p;
+ StgFunInfoTable *fun_info;
+
+ // need the fun's info table to find the argument layout,
+ fun_info = itbl_to_fun_itbl(
+ get_threaded_info((StgPtr)ret_fun->fun));
+ // *before* threading it!
+ thread((StgPtr)&ret_fun->fun);
+ p = thread_arg_block(fun_info, ret_fun->payload);
+ continue;
+ }
+
+ default:
+ barf("thread_stack: weird activation record found on stack: %d",
+ (int)(info->i.type));
+ }
+ }
+}
+
+// Thread the payload of a PAP/AP, using the function's argument bitmap to
+// find the pointer words. 'size' is the payload size in words. Returns a
+// pointer to the first word past the payload.
+STATIC_INLINE StgPtr
+thread_PAP_payload (StgClosure *fun, StgClosure **payload, StgWord size)
+{
+ StgPtr p;
+ StgWord bitmap;
+ StgFunInfoTable *fun_info;
+
+ // fun's info word may itself be threaded, so follow the chain
+ fun_info = itbl_to_fun_itbl(get_threaded_info((StgPtr)fun));
+ ASSERT(fun_info->i.type != PAP);
+
+ p = (StgPtr)payload;
+
+ switch (fun_info->f.fun_type) {
+ case ARG_GEN:
+ bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+ goto small_bitmap;
+ case ARG_GEN_BIG:
+ thread_large_bitmap(p, GET_FUN_LARGE_BITMAP(fun_info), size);
+ p += size;
+ break;
+ case ARG_BCO:
+ thread_large_bitmap((StgPtr)payload, BCO_BITMAP(fun), size);
+ p += size;
+ break;
+ default:
+ bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+ small_bitmap:
+ // a clear bit marks a pointer word
+ while (size > 0) {
+ if ((bitmap & 1) == 0) {
+ thread(p);
+ }
+ p++;
+ bitmap = bitmap >> 1;
+ size--;
+ }
+ break;
+ }
+
+ return p;
+}
+
+// Thread a PAP: its payload (via the fun's bitmap) and then its fun field.
+// Returns a pointer to the first word past the PAP.
+STATIC_INLINE StgPtr
+thread_PAP (StgPAP *pap)
+{
+ StgPtr p;
+ p = thread_PAP_payload(pap->fun, pap->payload, pap->n_args);
+ thread((StgPtr)&pap->fun);
+ return p;
+}
+
+// Thread an AP: same layout handling as a PAP.
+// Returns a pointer to the first word past the AP.
+STATIC_INLINE StgPtr
+thread_AP (StgAP *ap)
+{
+ StgPtr p;
+ p = thread_PAP_payload(ap->fun, ap->payload, ap->n_args);
+ thread((StgPtr)&ap->fun);
+ return p;
+}
+
+// Thread an AP_STACK: its fun field plus the stack chunk in its payload.
+// Returns a pointer to the first word past the AP_STACK.
+STATIC_INLINE StgPtr
+thread_AP_STACK (StgAP_STACK *ap)
+{
+ thread((StgPtr)&ap->fun);
+ thread_stack((P_)ap->payload, (P_)ap->payload + ap->size);
+ return (P_)ap + sizeofW(StgAP_STACK) + ap->size;
+}
+
+// Thread all the pointer fields of a TSO (links, blocking info, blocked
+// exceptions, STM transaction record) and then its stack.
+// Returns a pointer to the first word past the TSO.
+static StgPtr
+thread_TSO (StgTSO *tso)
+{
+ thread((StgPtr)&tso->link);
+ thread((StgPtr)&tso->global_link);
+
+ // block_info.closure is only a closure pointer for these blocking reasons
+ if ( tso->why_blocked == BlockedOnMVar
+ || tso->why_blocked == BlockedOnBlackHole
+ || tso->why_blocked == BlockedOnException
+#if defined(PAR)
+ || tso->why_blocked == BlockedOnGA
+ || tso->why_blocked == BlockedOnGA_NoSend
+#endif
+ ) {
+ thread((StgPtr)&tso->block_info.closure);
+ }
+ if ( tso->blocked_exceptions != NULL ) {
+ thread((StgPtr)&tso->blocked_exceptions);
+ }
+
+ thread((StgPtr)&tso->trec);
+
+ thread_stack(tso->sp, &(tso->stack[tso->stack_size]));
+ return (StgPtr)tso + tso_sizeW(tso);
+}
+
+
+// Thread the pointer fields of every large object on the given block list
+// (one object per block chain entry, at bd->start).
+static void
+update_fwd_large( bdescr *bd )
+{
+ StgPtr p;
+ const StgInfoTable* info;
+
+ for (; bd != NULL; bd = bd->link) {
+
+ p = bd->start;
+ info = get_itbl((StgClosure *)p);
+
+ switch (info->type) {
+
+ case ARR_WORDS:
+ // nothing to follow
+ continue;
+
+ case MUT_ARR_PTRS_CLEAN:
+ case MUT_ARR_PTRS_DIRTY:
+ case MUT_ARR_PTRS_FROZEN:
+ case MUT_ARR_PTRS_FROZEN0:
+ // follow everything
+ {
+ StgPtr next;
+
+ next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+ for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+ thread(p);
+ }
+ continue;
+ }
+
+ case TSO:
+ thread_TSO((StgTSO *)p);
+ continue;
+
+ case AP_STACK:
+ thread_AP_STACK((StgAP_STACK *)p);
+ continue;
+
+ case PAP:
+ thread_PAP((StgPAP *)p);
+ continue;
+
+ case TREC_CHUNK:
+ {
+ StgWord i;
+ StgTRecChunk *tc = (StgTRecChunk *)p;
+ TRecEntry *e = &(tc -> entries[0]);
+ thread((StgPtr)&tc->prev_chunk);
+ for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) {
+ thread((StgPtr)&e->tvar);
+ thread((StgPtr)&e->expected_value);
+ thread((StgPtr)&e->new_value);
+ }
+ continue;
+ }
+
+ default:
+ barf("update_fwd_large: unknown/strange object %d", (int)(info->type));
+ }
+ }
+}
+
+// Thread all the pointer fields of the single closure at p, given its
+// (already unthreaded) info table. Returns a pointer to the first word
+// past the closure -- i.e. its size in words added to p.
+STATIC_INLINE StgPtr
+thread_obj (StgInfoTable *info, StgPtr p)
+{
+ switch (info->type) {
+ case THUNK_0_1:
+ return p + sizeofW(StgThunk) + 1;
+
+ case FUN_0_1:
+ case CONSTR_0_1:
+ return p + sizeofW(StgHeader) + 1;
+
+ case FUN_1_0:
+ case CONSTR_1_0:
+ thread((StgPtr)&((StgClosure *)p)->payload[0]);
+ return p + sizeofW(StgHeader) + 1;
+
+ case THUNK_1_0:
+ thread((StgPtr)&((StgThunk *)p)->payload[0]);
+ return p + sizeofW(StgThunk) + 1;
+
+ case THUNK_0_2:
+ return p + sizeofW(StgThunk) + 2;
+
+ case FUN_0_2:
+ case CONSTR_0_2:
+ return p + sizeofW(StgHeader) + 2;
+
+ case THUNK_1_1:
+ thread((StgPtr)&((StgThunk *)p)->payload[0]);
+ return p + sizeofW(StgThunk) + 2;
+
+ case FUN_1_1:
+ case CONSTR_1_1:
+ thread((StgPtr)&((StgClosure *)p)->payload[0]);
+ return p + sizeofW(StgHeader) + 2;
+
+ case THUNK_2_0:
+ thread((StgPtr)&((StgThunk *)p)->payload[0]);
+ thread((StgPtr)&((StgThunk *)p)->payload[1]);
+ return p + sizeofW(StgThunk) + 2;
+
+ case FUN_2_0:
+ case CONSTR_2_0:
+ thread((StgPtr)&((StgClosure *)p)->payload[0]);
+ thread((StgPtr)&((StgClosure *)p)->payload[1]);
+ return p + sizeofW(StgHeader) + 2;
+
+ case BCO: {
+ StgBCO *bco = (StgBCO *)p;
+ thread((StgPtr)&bco->instrs);
+ thread((StgPtr)&bco->literals);
+ thread((StgPtr)&bco->ptrs);
+ thread((StgPtr)&bco->itbls);
+ return p + bco_sizeW(bco);
+ }
+
+ case THUNK:
+ {
+ StgPtr end;
+
+ // pointers first, then nptrs words of non-pointers
+ end = (P_)((StgThunk *)p)->payload +
+ info->layout.payload.ptrs;
+ for (p = (P_)((StgThunk *)p)->payload; p < end; p++) {
+ thread(p);
+ }
+ return p + info->layout.payload.nptrs;
+ }
+
+ case FUN:
+ case CONSTR:
+ case STABLE_NAME:
+ case IND_PERM:
+ case MUT_VAR_CLEAN:
+ case MUT_VAR_DIRTY:
+ case CAF_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ case SE_BLACKHOLE:
+ case BLACKHOLE:
+ {
+ StgPtr end;
+
+ end = (P_)((StgClosure *)p)->payload +
+ info->layout.payload.ptrs;
+ for (p = (P_)((StgClosure *)p)->payload; p < end; p++) {
+ thread(p);
+ }
+ return p + info->layout.payload.nptrs;
+ }
+
+ case WEAK:
+ {
+ StgWeak *w = (StgWeak *)p;
+ thread((StgPtr)&w->key);
+ thread((StgPtr)&w->value);
+ thread((StgPtr)&w->finalizer);
+ if (w->link != NULL) {
+ thread((StgPtr)&w->link);
+ }
+ return p + sizeofW(StgWeak);
+ }
+
+ case MVAR:
+ {
+ StgMVar *mvar = (StgMVar *)p;
+ thread((StgPtr)&mvar->head);
+ thread((StgPtr)&mvar->tail);
+ thread((StgPtr)&mvar->value);
+ return p + sizeofW(StgMVar);
+ }
+
+ case IND_OLDGEN:
+ case IND_OLDGEN_PERM:
+ thread((StgPtr)&((StgInd *)p)->indirectee);
+ return p + sizeofW(StgInd);
+
+ case THUNK_SELECTOR:
+ {
+ StgSelector *s = (StgSelector *)p;
+ thread((StgPtr)&s->selectee);
+ return p + THUNK_SELECTOR_sizeW();
+ }
+
+ case AP_STACK:
+ return thread_AP_STACK((StgAP_STACK *)p);
+
+ case PAP:
+ return thread_PAP((StgPAP *)p);
+
+ case AP:
+ return thread_AP((StgAP *)p);
+
+ case ARR_WORDS:
+ return p + arr_words_sizeW((StgArrWords *)p);
+
+ case MUT_ARR_PTRS_CLEAN:
+ case MUT_ARR_PTRS_DIRTY:
+ case MUT_ARR_PTRS_FROZEN:
+ case MUT_ARR_PTRS_FROZEN0:
+ // follow everything
+ {
+ StgPtr next;
+
+ next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+ for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+ thread(p);
+ }
+ return p;
+ }
+
+ case TSO:
+ return thread_TSO((StgTSO *)p);
+
+ case TVAR_WAIT_QUEUE:
+ {
+ StgTVarWaitQueue *wq = (StgTVarWaitQueue *)p;
+ thread((StgPtr)&wq->waiting_tso);
+ thread((StgPtr)&wq->next_queue_entry);
+ thread((StgPtr)&wq->prev_queue_entry);
+ return p + sizeofW(StgTVarWaitQueue);
+ }
+
+ case TVAR:
+ {
+ StgTVar *tvar = (StgTVar *)p;
+ thread((StgPtr)&tvar->current_value);
+ thread((StgPtr)&tvar->first_wait_queue_entry);
+ return p + sizeofW(StgTVar);
+ }
+
+ case TREC_HEADER:
+ {
+ StgTRecHeader *trec = (StgTRecHeader *)p;
+ thread((StgPtr)&trec->enclosing_trec);
+ thread((StgPtr)&trec->current_chunk);
+ return p + sizeofW(StgTRecHeader);
+ }
+
+ case TREC_CHUNK:
+ {
+ StgWord i;
+ StgTRecChunk *tc = (StgTRecChunk *)p;
+ TRecEntry *e = &(tc -> entries[0]);
+ thread((StgPtr)&tc->prev_chunk);
+ for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) {
+ thread((StgPtr)&e->tvar);
+ thread((StgPtr)&e->expected_value);
+ thread((StgPtr)&e->new_value);
+ }
+ return p + sizeofW(StgTRecChunk);
+ }
+
+ default:
+ barf("update_fwd: unknown/strange object %d", (int)(info->type));
+ return NULL;
+ }
+}
+
+// Thread every closure in a (non-compacted) block chain by a linear scan;
+// the info pointers here are not threaded, so get_itbl is safe.
+static void
+update_fwd( bdescr *blocks )
+{
+ StgPtr p;
+ bdescr *bd;
+ StgInfoTable *info;
+
+ bd = blocks;
+
+#if defined(PAR)
+ barf("update_fwd: ToDo");
+#endif
+
+ // cycle through all the blocks in the step
+ for (; bd != NULL; bd = bd->link) {
+ p = bd->start;
+
+ // linearly scan the objects in this block
+ while (p < bd->free) {
+ ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+ info = get_itbl((StgClosure *)p);
+ p = thread_obj(info, p);
+ }
+ }
+}
+
+// Forward pass over the to-be-compacted blocks: for each marked (live)
+// object, compute its destination address and write it into every field
+// that points at it (by unthreading its chain), while threading the
+// object's own pointer fields for the next pass.
+static void
+update_fwd_compact( bdescr *blocks )
+{
+ StgPtr p, q, free;
+#if 0
+ StgWord m;
+#endif
+ bdescr *bd, *free_bd;
+ StgInfoTable *info;
+ nat size;
+
+ bd = blocks;
+ free_bd = blocks;
+ free = free_bd->start;
+
+#if defined(PAR)
+ barf("update_fwd: ToDo");
+#endif
+
+ // cycle through all the blocks in the step
+ for (; bd != NULL; bd = bd->link) {
+ p = bd->start;
+
+ while (p < bd->free ) {
+
+ // skip to the next marked (live) object
+ while ( p < bd->free && !is_marked(p,bd) ) {
+ p++;
+ }
+ if (p >= bd->free) {
+ break;
+ }
+
+#if 0
+ next:
+ m = * ((StgPtr)bd->u.bitmap + ((p - bd->start) / (BITS_IN(StgWord))));
+ m >>= ((p - bd->start) & (BITS_IN(StgWord) - 1));
+
+ while ( p < bd->free ) {
+
+ if ((m & 1) == 0) {
+ m >>= 1;
+ p++;
+ if (((StgWord)p & (sizeof(W_) * BITS_IN(StgWord))) == 0) {
+ goto next;
+ } else {
+ continue;
+ }
+ }
+#endif
+
+ // Problem: we need to know the destination for this cell
+ // in order to unthread its info pointer. But we can't
+ // know the destination without the size, because we may
+ // spill into the next block. So we have to run down the
+ // threaded list and get the info ptr first.
+ info = get_threaded_info(p);
+
+ q = p;
+
+ p = thread_obj(info, p);
+
+ size = p - q;
+ if (free + size > free_bd->start + BLOCK_SIZE_W) {
+ // unset the next bit in the bitmap to indicate that
+ // this object needs to be pushed into the next
+ // block. This saves us having to run down the
+ // threaded info pointer list twice during the next pass.
+ unmark(q+1,bd);
+ free_bd = free_bd->link;
+ free = free_bd->start;
+ } else {
+ ASSERT(is_marked(q+1,bd));
+ }
+
+ unthread(q,free);
+ free += size;
+#if 0
+ goto next;
+#endif
+ }
+ }
+}
+
+// Backward pass: slide each marked object down to its destination (already
+// recorded in the forward pass), unthreading its remaining chain and fixing
+// up the block descriptors. Returns the number of blocks now in use.
+static nat
+update_bkwd_compact( step *stp )
+{
+ StgPtr p, free;
+#if 0
+ StgWord m;
+#endif
+ bdescr *bd, *free_bd;
+ StgInfoTable *info;
+ nat size, free_blocks;
+
+ bd = free_bd = stp->old_blocks;
+ free = free_bd->start;
+ free_blocks = 1;
+
+#if defined(PAR)
+ barf("update_bkwd: ToDo");
+#endif
+
+ // cycle through all the blocks in the step
+ for (; bd != NULL; bd = bd->link) {
+ p = bd->start;
+
+ while (p < bd->free ) {
+
+ // skip to the next marked (live) object
+ while ( p < bd->free && !is_marked(p,bd) ) {
+ p++;
+ }
+ if (p >= bd->free) {
+ break;
+ }
+
+#if 0
+ next:
+ m = * ((StgPtr)bd->u.bitmap + ((p - bd->start) / (BITS_IN(StgWord))));
+ m >>= ((p - bd->start) & (BITS_IN(StgWord) - 1));
+
+ while ( p < bd->free ) {
+
+ if ((m & 1) == 0) {
+ m >>= 1;
+ p++;
+ if (((StgWord)p & (sizeof(W_) * BITS_IN(StgWord))) == 0) {
+ goto next;
+ } else {
+ continue;
+ }
+ }
+#endif
+
+ // forward pass cleared this bit to say "spills into next block"
+ if (!is_marked(p+1,bd)) {
+ // don't forget to update the free ptr in the block desc.
+ free_bd->free = free;
+ free_bd = free_bd->link;
+ free = free_bd->start;
+ free_blocks++;
+ }
+
+ unthread(p,free);
+ ASSERT(LOOKS_LIKE_INFO_PTR(((StgClosure *)p)->header.info));
+ info = get_itbl((StgClosure *)p);
+ size = closure_sizeW_((StgClosure *)p,info);
+
+ if (free != p) {
+ move(free,p,size);
+ }
+
+ // relocate TSOs
+ if (info->type == TSO) {
+ move_TSO((StgTSO *)p, (StgTSO *)free);
+ }
+
+ free += size;
+ p += size;
+#if 0
+ goto next;
+#endif
+ }
+ }
+
+ // free the remaining blocks and count what's left.
+ free_bd->free = free;
+ if (free_bd->link != NULL) {
+ freeChain(free_bd->link);
+ free_bd->link = NULL;
+ }
+
+ return free_blocks;
+}
+
+// Top-level compacting collection, in three phases:
+// 1. thread all roots into the field chains,
+// 2. forward pass: compute destinations and fix forward pointers,
+// 3. backward pass: slide objects down and fix the remaining pointers.
+void
+compact( void (*get_roots)(evac_fn) )
+{
+ nat g, s, blocks;
+ step *stp;
+
+ // 1. thread the roots
+ get_roots((evac_fn)thread);
+
+ // the weak pointer lists...
+ if (weak_ptr_list != NULL) {
+ thread((StgPtr)(void *)&weak_ptr_list);
+ }
+ if (old_weak_ptr_list != NULL) {
+ thread((StgPtr)(void *)&old_weak_ptr_list); // tmp
+ }
+
+ // mutable lists (every word on a mut_list is a closure pointer)
+ for (g = 1; g < RtsFlags.GcFlags.generations; g++) {
+ bdescr *bd;
+ StgPtr p;
+ for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
+ for (p = bd->start; p < bd->free; p++) {
+ thread(p);
+ }
+ }
+ }
+
+ // the global thread list
+ thread((StgPtr)(void *)&all_threads);
+
+ // any threads resurrected during this GC
+ thread((StgPtr)(void *)&resurrected_threads);
+
+ // the task list
+ {
+ Task *task;
+ for (task = all_tasks; task != NULL; task = task->all_link) {
+ if (task->tso) {
+ thread((StgPtr)&task->tso);
+ }
+ }
+ }
+
+ // the static objects
+ thread_static(scavenged_static_objects);
+
+ // the stable pointer table
+ threadStablePtrTable((evac_fn)thread);
+
+ // the CAF list (used by GHCi)
+ markCAFs((evac_fn)thread);
+
+ // 2. update forward ptrs
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ // gen 0 step 0 is the nursery; it is never compacted
+ if (g==0 && s ==0) continue;
+ stp = &generations[g].steps[s];
+ IF_DEBUG(gc, debugBelch("update_fwd: %d.%d\n", stp->gen->no, stp->no););
+
+ update_fwd(stp->blocks);
+ update_fwd_large(stp->scavenged_large_objects);
+ if (g == RtsFlags.GcFlags.generations-1 && stp->old_blocks != NULL) {
+ IF_DEBUG(gc, debugBelch("update_fwd: %d.%d (compact)\n", stp->gen->no, stp->no););
+ update_fwd_compact(stp->old_blocks);
+ }
+ }
+ }
+
+ // 3. update backward ptrs
+ stp = &oldest_gen->steps[0];
+ if (stp->old_blocks != NULL) {
+ blocks = update_bkwd_compact(stp);
+ IF_DEBUG(gc, debugBelch("update_bkwd: %d.%d (compact, old: %d blocks, now %d blocks)\n",
+ stp->gen->no, stp->no,
+ stp->n_old_blocks, blocks););
+ stp->n_old_blocks = blocks;
+ }
+}
diff --git a/rts/GCCompact.h b/rts/GCCompact.h
new file mode 100644
index 0000000000..0fb39b3b12
--- /dev/null
+++ b/rts/GCCompact.h
@@ -0,0 +1,44 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-2005
+ *
+ * Compacting garbage collector
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef GCCOMPACT_H
+#define GCCOMPACT_H
+
+/* The mark-bitmap operations below all need the same two pieces of
+ * arithmetic: the address of the bitmap word covering object p, and the
+ * bit within that word.  Previously this was triplicated in mark(),
+ * unmark() and is_marked(); factor it into two helpers so the three
+ * operations cannot drift apart. */
+STATIC_INLINE StgPtr
+mark_bitmap_word(StgPtr p, bdescr *bd)
+{
+    nat offset_within_block = p - bd->start; // in words
+    return (StgPtr)bd->u.bitmap +
+	(offset_within_block / (sizeof(W_)*BITS_PER_BYTE));
+}
+
+STATIC_INLINE StgWord
+mark_bit_mask(StgPtr p, bdescr *bd)
+{
+    nat offset_within_block = p - bd->start; // in words
+    return (StgWord)1 << (offset_within_block & (sizeof(W_)*BITS_PER_BYTE - 1));
+}
+
+/* Set the mark bit for the object at p in block bd. */
+STATIC_INLINE void
+mark(StgPtr p, bdescr *bd)
+{
+    *mark_bitmap_word(p,bd) |= mark_bit_mask(p,bd);
+}
+
+/* Clear the mark bit for the object at p in block bd. */
+STATIC_INLINE void
+unmark(StgPtr p, bdescr *bd)
+{
+    *mark_bitmap_word(p,bd) &= ~mark_bit_mask(p,bd);
+}
+
+/* Non-zero iff the mark bit for the object at p is set. */
+STATIC_INLINE StgWord
+is_marked(StgPtr p, bdescr *bd)
+{
+    return (*mark_bitmap_word(p,bd) & mark_bit_mask(p,bd));
+}
+
+void compact( void (*get_roots)(evac_fn) );
+
+#endif /* GCCOMPACT_H */
diff --git a/rts/GetTime.h b/rts/GetTime.h
new file mode 100644
index 0000000000..5f02df0625
--- /dev/null
+++ b/rts/GetTime.h
@@ -0,0 +1,26 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2005
+ *
+ * Machine-independent interface to time measurement
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef GETTIME_H
+#define GETTIME_H
+
+// We'll use a fixed resolution of usec for now. The machine
+// dependent implementation may have a different resolution, but we'll
+// normalise to this for the machine independent interface.
+#define TICKS_PER_SECOND 1000000
+typedef StgInt64 Ticks;   // a time quantity, in units of 1/TICKS_PER_SECOND
+
+Ticks getProcessCPUTime (void);      // CPU time used by the process
+Ticks getThreadCPUTime (void);       // CPU time used by the calling OS thread
+Ticks getProcessElapsedTime (void);  // elapsed (wall-clock) time
+void getProcessTimes (Ticks *user, Ticks *elapsed); // fills both out-params
+
+// Not strictly timing, but related
+// NOTE(review): presumably the process-wide fault count -- confirm
+// against the platform implementations.
+nat getPageFaults (void);
+#endif /* GETTIME_H */
diff --git a/rts/HSprel.def b/rts/HSprel.def
new file mode 100644
index 0000000000..0ffe00b48c
--- /dev/null
+++ b/rts/HSprel.def
@@ -0,0 +1,28 @@
+; list of entry points that the RTS imports from
+; the Prelude.
+EXPORTS
+PrelBase_False_closure
+PrelBase_True_closure
+PrelBase_Czh_con_info DATA
+PrelBase_Czh_static_info DATA
+PrelBase_Izh_con_info DATA
+PrelBase_Izh_static_info DATA
+PrelAddr_I64zh_con_info DATA
+PrelAddr_W64zh_con_info DATA
+PrelAddr_Azh_con_info DATA
+PrelAddr_Azh_static_info DATA
+PrelFloat_Fzh_con_info DATA
+PrelFloat_Fzh_static_info DATA
+PrelFloat_Dzh_con_info DATA
+PrelFloat_Dzh_static_info DATA
+PrelAddr_Wzh_con_info DATA
+PrelAddr_Wzh_static_info DATA
+PrelStable_StablePtr_con_info DATA
+PrelStable_StablePtr_static_info DATA
+PrelPack_unpackCString_closure
+PrelIOBase_stackOverflow_closure
+PrelIOBase_BlockedOnDeadMVar_closure
+PrelIOBase_BlockedIndefinitely_closure
+PrelIOBase_NonTermination_closure
+PrelWeak_runFinalizzerBatch_closure
+__stginit_Prelude
diff --git a/rts/Hash.c b/rts/Hash.c
new file mode 100644
index 0000000000..ada11a6a85
--- /dev/null
+++ b/rts/Hash.c
@@ -0,0 +1,376 @@
+/*-----------------------------------------------------------------------------
+ *
+ * (c) The AQUA Project, Glasgow University, 1995-1998
+ * (c) The GHC Team, 1999
+ *
+ * Dynamically expanding linear hash tables, as described in
+ * Per-\AAke Larson, ``Dynamic Hash Tables,'' CACM 31(4), April 1988,
+ * pp. 446 -- 457.
+ * -------------------------------------------------------------------------- */
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "Hash.h"
+#include "RtsUtils.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#define HSEGSIZE 1024 /* Size of a single hash table segment */
+ /* Also the minimum size of a hash table */
+#define HDIRSIZE 1024 /* Size of the segment directory */
+ /* Maximum hash table size is HSEGSIZE * HDIRSIZE */
+#define HLOAD 5 /* Maximum average load of a single hash bucket */
+
+#define HCHUNK (1024 * sizeof(W_) / sizeof(HashList))
+ /* Number of HashList cells to allocate in one go */
+
+
+/* Linked list of (key, data) pairs for separate chaining */
+struct hashlist {
+ StgWord key;
+ void *data;
+ struct hashlist *next; /* Next cell in bucket chain (same hash value) */
+};
+
+typedef struct hashlist HashList;
+
+typedef int HashFunction(HashTable *table, StgWord key);
+typedef int CompareFunction(StgWord key1, StgWord key2);
+
+struct hashtable {
+ int split; /* Next bucket to split when expanding */
+ int max; /* Max bucket of smaller table */
+ int mask1; /* Mask for doing the mod of h_1 (smaller table) */
+ int mask2; /* Mask for doing the mod of h_2 (larger table) */
+ int kcount; /* Number of keys */
+ int bcount; /* Number of buckets */
+ HashList **dir[HDIRSIZE]; /* Directory of segments */
+ HashFunction *hash; /* hash function */
+ CompareFunction *compare; /* key comparison function */
+};
+
+/* -----------------------------------------------------------------------------
+ * Hash first using the smaller table. If the bucket is less than the
+ * next bucket to be split, re-hash using the larger table.
+ * -------------------------------------------------------------------------- */
+
+static int
+hashWord(HashTable *table, StgWord key)
+{
+    int slot;
+
+    /* Drop the low-order bits, which carry no information for
+     * word-aligned keys. */
+    key /= sizeof(StgWord);
+
+    /* Reduce modulo the smaller table size (a power of 2)... */
+    slot = key & table->mask1;
+
+    /* ...but buckets below the split point have already been rehashed
+     * into the larger table, so those use the larger modulus. */
+    if (slot < table->split) {
+	slot = key & table->mask2;
+    }
+    return slot;
+}
+
+/* Hash a NUL-terminated string: simple multiplicative hash reduced mod a
+ * large prime, then folded into a bucket number exactly as hashWord does.
+ *
+ * Bytes are read as unsigned char: with the old "h += *s" and a platform
+ * where plain char is signed, h could go negative, making the resulting
+ * bucket choice depend on the platform's char signedness. */
+static int
+hashStr(HashTable *table, char *key)
+{
+    int h, bucket;
+    char *s;
+
+    h = 0;
+    for (s = key; *s; s++) {
+	h *= 128;
+	h += (unsigned char)*s;
+	h = h % 1048583;	/* some random large prime */
+    }
+
+    /* Mod the size of the hash table (a power of 2) */
+    bucket = h & table->mask1;
+
+    if (bucket < table->split) {
+	/* Mod the size of the expanded hash table (also a power of 2) */
+	bucket = h & table->mask2;
+    }
+
+    return bucket;
+}
+
+/* Word keys match exactly or not at all. */
+static int
+compareWord(StgWord key1, StgWord key2)
+{
+    return key1 == key2 ? 1 : 0;
+}
+
+/* Keys are really C strings; they match when strcmp finds no difference. */
+static int
+compareStr(StgWord key1, StgWord key2)
+{
+    int diff = strcmp((char *)key1, (char *)key2);
+    return diff == 0;
+}
+
+
+/* -----------------------------------------------------------------------------
+ * Allocate a new segment of the dynamically growing hash table.
+ * -------------------------------------------------------------------------- */
+
+/* Install a fresh segment (HSEGSIZE bucket headers) in the directory.
+ * The buckets themselves are initialised by the caller. */
+static void
+allocSegment(HashTable *table, int segment)
+{
+    table->dir[segment] =
+	stgMallocBytes(HSEGSIZE * sizeof(HashList *), "allocSegment");
+}
+
+
+/* -----------------------------------------------------------------------------
+ * Expand the larger hash table by one bucket, and split one bucket
+ * from the smaller table into two parts. Only the bucket referenced
+ * by @table->split@ is affected by the expansion.
+ * -------------------------------------------------------------------------- */
+
+/* Grow the table by one bucket: split the bucket at table->split into
+ * itself and a brand-new bucket at (max + split), re-hashing its chain. */
+static void
+expand(HashTable *table)
+{
+    int bucket;			/* number of the newly added bucket */
+    int osegment, oindex;	/* location of the bucket being split */
+    int nsegment, nindex;	/* location of the new bucket */
+    HashList *cell, *rest;
+    HashList *keep, *moved;
+
+    /* Refuse to grow past the directory's capacity. */
+    if (table->split + table->max >= HDIRSIZE * HSEGSIZE) {
+	return;
+    }
+
+    osegment = table->split / HSEGSIZE;
+    oindex   = table->split % HSEGSIZE;
+
+    bucket   = table->max + table->split;
+    nsegment = bucket / HSEGSIZE;
+    nindex   = bucket % HSEGSIZE;
+
+    /* Crossing into a fresh segment: allocate it first. */
+    if (nindex == 0) {
+	allocSegment(table, nsegment);
+    }
+
+    /* Advance the split pointer; once every bucket of the smaller table
+     * has been split, double the nominal size and start over. */
+    table->split++;
+    if (table->split == table->max) {
+	table->split = 0;
+	table->max *= 2;
+	table->mask1 = table->mask2;
+	table->mask2 = (table->mask2 << 1) | 1;
+    }
+    table->bcount++;
+
+    /* Re-hash every cell of the old chain into either the old or the new
+     * bucket; order within a chain is not preserved. */
+    keep = moved = NULL;
+    cell = table->dir[osegment][oindex];
+    while (cell != NULL) {
+	rest = cell->next;
+	if (table->hash(table, cell->key) == bucket) {
+	    cell->next = moved;
+	    moved = cell;
+	} else {
+	    cell->next = keep;
+	    keep = cell;
+	}
+	cell = rest;
+    }
+    table->dir[osegment][oindex] = keep;
+    table->dir[nsegment][nindex] = moved;
+}
+
+/* Look up key; returns the stored data pointer, or NULL if absent. */
+void *
+lookupHashTable(HashTable *table, StgWord key)
+{
+    int bucket = table->hash(table, key);
+    HashList *hl = table->dir[bucket / HSEGSIZE][bucket % HSEGSIZE];
+
+    /* Walk the bucket's chain until a matching key turns up. */
+    while (hl != NULL) {
+	if (table->compare(hl->key, key)) {
+	    return hl->data;
+	}
+	hl = hl->next;
+    }
+    return NULL;	/* not present */
+}
+
+/* -----------------------------------------------------------------------------
+ * We allocate the hashlist cells in large chunks to cut down on malloc
+ * overhead. Although we keep a free list of hashlist cells, we make
+ * no effort to actually return the space to the malloc arena.
+ * -------------------------------------------------------------------------- */
+
+static HashList *freeList = NULL;
+
+/* Hand out one HashList cell, preferring the free list; when that is
+ * empty, malloc a whole chunk and thread the spare cells onto it. */
+static HashList *
+allocHashList(void)
+{
+    HashList *cell, *cursor;
+
+    cell = freeList;
+    if (cell != NULL) {
+	freeList = cell->next;
+    } else {
+	cell = stgMallocBytes(HCHUNK * sizeof(HashList), "allocHashList");
+
+	/* First cell is the result; chain the remaining HCHUNK-1 cells
+	 * together as the new free list. */
+	freeList = cell + 1;
+	for (cursor = freeList; cursor < cell + HCHUNK - 1; cursor++) {
+	    cursor->next = cursor + 1;
+	}
+	cursor->next = NULL;
+    }
+    return cell;
+}
+
+/* Cells are never returned to malloc; they go back on our free list. */
+static void
+freeHashList(HashList *hl)
+{
+    hl->next = freeList;
+    freeList = hl;
+}
+
+/* Insert (key, data).  Duplicate keys are permitted: the newest entry
+ * shadows older ones because insertion is at the head of the chain. */
+void
+insertHashTable(HashTable *table, StgWord key, void *data)
+{
+    int bucket;
+    HashList *cell;
+
+    // Disable this assert; sometimes it's useful to be able to
+    // overwrite entries in the hash table.
+    // ASSERT(lookupHashTable(table, key) == NULL);
+
+    /* Grow the table whenever the average chain length would pass HLOAD. */
+    table->kcount++;
+    if (table->kcount >= HLOAD * table->bcount) {
+	expand(table);
+    }
+
+    /* Hash AFTER any expansion, so the bucket number is current. */
+    bucket = table->hash(table, key);
+
+    cell = allocHashList();
+    cell->key  = key;
+    cell->data = data;
+    cell->next = table->dir[bucket / HSEGSIZE][bucket % HSEGSIZE];
+    table->dir[bucket / HSEGSIZE][bucket % HSEGSIZE] = cell;
+}
+
+/* Remove an entry for key.  If data is non-NULL, only an entry whose data
+ * pointer equals it is removed (used to disambiguate duplicate keys).
+ * Returns the removed entry's data, or NULL if nothing matched. */
+void *
+removeHashTable(HashTable *table, StgWord key, void *data)
+{
+    int bucket;
+    int segment;
+    int index;
+    HashList *hl;
+    HashList *prev = NULL;
+    void *result;
+
+    bucket = table->hash(table, key);
+    segment = bucket / HSEGSIZE;
+    index = bucket % HSEGSIZE;
+
+    for (hl = table->dir[segment][index]; hl != NULL; hl = hl->next) {
+	if (table->compare(hl->key,key) && (data == NULL || hl->data == data)) {
+	    /* Unlink the cell from its chain... */
+	    if (prev == NULL)
+		table->dir[segment][index] = hl->next;
+	    else
+		prev->next = hl->next;
+	    /* ...and grab its payload BEFORE recycling the cell: the old
+	     * code read hl->data after freeHashList(hl) had already put
+	     * the cell back on the free list. */
+	    result = hl->data;
+	    freeHashList(hl);
+	    table->kcount--;
+	    return result;
+	}
+	prev = hl;
+    }
+
+    /* It's not there */
+    ASSERT(data == NULL);
+    return NULL;
+}
+
+/* -----------------------------------------------------------------------------
+ * When we free a hash table, we are also good enough to free the
+ * data part of each (key, data) pair, as long as our caller can tell
+ * us how to do it.
+ * -------------------------------------------------------------------------- */
+
+/* Free the whole table: every chain cell, every allocated segment, and
+ * the table itself.  If freeDataFun is non-NULL it is applied to each
+ * entry's data pointer as well. */
+void
+freeHashTable(HashTable *table, void (*freeDataFun)(void *) )
+{
+    long segment, index;
+    HashList *cell, *rest;
+
+    /* The highest bucket in use is (max + split - 1); walk backwards
+     * from there so each segment is emptied before it is freed. */
+    segment = (table->max + table->split - 1) / HSEGSIZE;
+    index   = (table->max + table->split - 1) % HSEGSIZE;
+
+    for (; segment >= 0; segment--) {
+	for (; index >= 0; index--) {
+	    cell = table->dir[segment][index];
+	    while (cell != NULL) {
+		rest = cell->next;
+		if (freeDataFun != NULL) {
+		    (*freeDataFun)(cell->data);
+		}
+		freeHashList(cell);
+		cell = rest;
+	    }
+	}
+	stgFree(table->dir[segment]);
+	index = HSEGSIZE - 1;	/* earlier segments are completely full */
+    }
+    stgFree(table);
+}
+
+/* -----------------------------------------------------------------------------
+ * When we initialize a hash table, we set up the first segment as well,
+ * initializing all of the first segment's hash buckets to NULL.
+ * -------------------------------------------------------------------------- */
+
+/* Build a table with the given hash and comparison functions, starting
+ * from a single fully-allocated segment of empty buckets. */
+static HashTable *
+allocHashTable_(HashFunction *hash, CompareFunction *compare)
+{
+    HashTable *table;
+    HashList **bucket;
+
+    table = stgMallocBytes(sizeof(HashTable),"allocHashTable");
+
+    allocSegment(table, 0);
+    for (bucket = table->dir[0]; bucket < table->dir[0] + HSEGSIZE; bucket++) {
+	*bucket = NULL;
+    }
+
+    /* Initially nothing is split: the "smaller" table is the whole
+     * table, one segment's worth of buckets. */
+    table->split  = 0;
+    table->max    = HSEGSIZE;
+    table->mask1  = HSEGSIZE - 1;
+    table->mask2  = 2 * HSEGSIZE - 1;
+    table->kcount = 0;
+    table->bcount = HSEGSIZE;
+    table->hash    = hash;
+    table->compare = compare;
+
+    return table;
+}
+
+/* A table keyed on StgWord values, compared for exact equality. */
+HashTable *
+allocHashTable(void)
+{
+    return allocHashTable_(hashWord, compareWord);
+}
+
+/* A table keyed on C strings (cast to/from StgWord at the call sites).
+ * NOTE(review): calling hashStr/compareStr through function-pointer
+ * types whose parameters are StgWord relies on char* and StgWord having
+ * interchangeable representations -- undefined behaviour in ISO C,
+ * though it works on the platforms GHC targets; confirm before porting. */
+HashTable *
+allocStrHashTable(void)
+{
+    return allocHashTable_((HashFunction *)hashStr,
+			   (CompareFunction *)compareStr);
+}
diff --git a/rts/Hash.h b/rts/Hash.h
new file mode 100644
index 0000000000..ad55953da4
--- /dev/null
+++ b/rts/Hash.h
@@ -0,0 +1,40 @@
+/*-----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1999
+ *
+ * Prototypes for Hash.c
+ *
+ * -------------------------------------------------------------------------- */
+
+#ifndef HASH_H
+#define HASH_H
+
+typedef struct hashtable HashTable; /* abstract */
+
+/* Hash table access where the keys are StgWords */
+HashTable * allocHashTable ( void );
+void * lookupHashTable ( HashTable *table, StgWord key ); /* NULL if absent */
+void insertHashTable ( HashTable *table, StgWord key, void *data );
+void * removeHashTable ( HashTable *table, StgWord key, void *data );
+/* removeHashTable: pass data == NULL to remove any entry with the key;
+ * returns the removed entry's data, or NULL if nothing matched. */
+
+/* Hash table access where the keys are C strings (the strings are
+ * assumed to be allocated by the caller, and mustn't be deallocated
+ * until the corresponding hash table entry has been removed).
+ */
+HashTable * allocStrHashTable ( void );
+
+/* String-keyed convenience wrappers.  Each argument is parenthesised so
+ * that expression arguments are cast/passed as a whole: the old
+ * "(StgWord)key" would, for key == p + 1, expand to (StgWord)p + 1. */
+#define lookupStrHashTable(table, key) \
+   (lookupHashTable((table), (StgWord)(key)))
+
+#define insertStrHashTable(table, key, data) \
+   (insertHashTable((table), (StgWord)(key), (data)))
+
+#define removeStrHashTable(table, key, data) \
+   (removeHashTable((table), (StgWord)(key), (data)))
+
+/* Freeing hash tables
+ */
+void freeHashTable ( HashTable *table, void (*freeDataFun)(void *) );
+
+#endif /* HASH_H */
+
diff --git a/rts/HeapStackCheck.cmm b/rts/HeapStackCheck.cmm
new file mode 100644
index 0000000000..4e5dd24596
--- /dev/null
+++ b/rts/HeapStackCheck.cmm
@@ -0,0 +1,964 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Canned Heap-Check and Stack-Check sequences.
+ *
+ * This file is written in a subset of C--, extended with various
+ * features specific to GHC. It is compiled by GHC directly. For the
+ * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Cmm.h"
+
+/* Stack/Heap Check Failure
+ * ------------------------
+ *
+ * On discovering that a stack or heap check has failed, we do the following:
+ *
+ * - If the context_switch flag is set, indicating that there are more
+ * threads waiting to run, we yield to the scheduler
+ * (return ThreadYielding).
+ *
+ * - If Hp > HpLim, we've had a heap check failure. This means we've
+ * come to the end of the current heap block, so we try to chain
+ * another block on with ExtendNursery().
+ *
+ * - If this succeeds, we carry on without returning to the
+ * scheduler.
+ *
+ * - If it fails, we return to the scheduler claiming HeapOverflow
+ * so that a garbage collection can be performed.
+ *
+ * - If Hp <= HpLim, it must have been a stack check that failed. In
+ * which case, we return to the scheduler claiming StackOverflow, the
+ * scheduler will either increase the size of our stack, or raise
+ * an exception if the stack is already too big.
+ *
+ * The effect of checking for context switch only in the heap/stack check
+ * failure code is that we'll switch threads after the current thread has
+ * reached the end of its heap block. If a thread isn't allocating
+ * at all, it won't yield. Hopefully this won't be a problem in practice.
+ */
+
+/* Common return-to-scheduler epilogue: record the thread's next state in
+ * its TSO, stash the return reason in the register table, and put
+ * BaseReg in R1 for StgReturn. */
+#define PRE_RETURN(why,what_next) \
+ StgTSO_what_next(CurrentTSO) = what_next::I16; \
+ StgRegTable_rRet(BaseReg) = why; \
+ R1 = BaseReg;
+
+/* Remember that the return address is *removed* when returning to a
+ * ThreadRunGHC thread.
+ */
+
+/* The standard heap/stack check failure path described at the top of
+ * this file: retract the failed allocation, try to chain another
+ * nursery block (yielding if context_switch is set), and otherwise
+ * return to the scheduler with HeapOverflow -- or StackOverflow when
+ * Hp <= HpLim, i.e. when it was the stack check that failed. */
+#define GC_GENERIC \
+ DEBUG_ONLY(foreign "C" heapCheckFail()); \
+ if (Hp > HpLim) { \
+ Hp = Hp - HpAlloc/*in bytes*/; \
+ if (HpAlloc <= BLOCK_SIZE \
+ && bdescr_link(CurrentNursery) != NULL) { \
+ CLOSE_NURSERY(); \
+ CurrentNursery = bdescr_link(CurrentNursery); \
+ OPEN_NURSERY(); \
+ if (CInt[context_switch] != 0 :: CInt) { \
+ R1 = ThreadYielding; \
+ goto sched; \
+ } else { \
+ jump %ENTRY_CODE(Sp(0)); \
+ } \
+ } else { \
+ R1 = HeapOverflow; \
+ goto sched; \
+ } \
+ } else { \
+ R1 = StackOverflow; \
+ } \
+ sched: \
+ PRE_RETURN(R1,ThreadRunGHC); \
+ jump stg_returnToSched;
+
+/* Return to the scheduler claiming HeapOverflow unconditionally
+ * (used via stg_gc_gen_hp for MAYBE_GC -- see below). */
+#define HP_GENERIC \
+ PRE_RETURN(HeapOverflow, ThreadRunGHC) \
+ jump stg_returnToSched;
+
+/* Return to the scheduler with the thread marked blocked. */
+#define BLOCK_GENERIC \
+ PRE_RETURN(ThreadBlocked, ThreadRunGHC) \
+ jump stg_returnToSched;
+
+/* Return to the scheduler, merely yielding the capability. */
+#define YIELD_GENERIC \
+ PRE_RETURN(ThreadYielding, ThreadRunGHC) \
+ jump stg_returnToSched;
+
+/* As BLOCK_GENERIC, but R2 carries something for the scheduler to run
+ * first (see stg_returnToSchedButFirst). */
+#define BLOCK_BUT_FIRST(c) \
+ PRE_RETURN(ThreadBlocked, ThreadRunGHC) \
+ R2 = c; \
+ jump stg_returnToSchedButFirst;
+
+/* Yield with what_next = ThreadInterpret, so the byte-code interpreter
+ * resumes this thread. */
+#define YIELD_TO_INTERPRETER \
+ PRE_RETURN(ThreadYielding, ThreadInterpret) \
+ jump stg_returnToSchedNotPaused;
+
+/* -----------------------------------------------------------------------------
+ Heap checks in thunks/functions.
+
+ In these cases, node always points to the function closure. This gives
+ us an easy way to return to the function: just leave R1 on the top of
+ the stack, and have the scheduler enter it to return.
+
+ There are canned sequences for 'n' pointer values in registers.
+ -------------------------------------------------------------------------- */
+
+/* Return frame that re-enters the closure saved on the stack; pushed by
+ * __stg_gc_enter_1 below so that R1's closure is retried after GC. */
+INFO_TABLE_RET( stg_enter, 1/*framesize*/, 0/*bitmap*/, RET_SMALL)
+{
+ R1 = Sp(1);
+ Sp_adj(2);
+ ENTER();
+}
+
+/* Heap-check failure with one live pointer (the closure to re-enter)
+ * in R1: save it under an stg_enter frame, then the generic GC path. */
+__stg_gc_enter_1
+{
+ Sp_adj(-2);
+ Sp(1) = R1;
+ Sp(0) = stg_enter_info;
+ GC_GENERIC
+}
+
+/* GranSim-only canned sequences: save 0..8 pointer registers on the
+ * stack, then return to the scheduler yielding (gran_yield_N) or
+ * blocked (gran_block_N).  Compiled out in ordinary builds. */
+#if defined(GRAN)
+/*
+ ToDo: merge the block and yield macros, calling something like BLOCK(N)
+ at the end;
+*/
+
+/*
+ Should we actually ever do a yield in such a case?? -- HWL
+*/
+gran_yield_0
+{
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+gran_yield_1
+{
+ Sp_adj(-1);
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+/*- 2 Regs--------------------------------------------------------------------*/
+
+gran_yield_2
+{
+ Sp_adj(-2);
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+/*- 3 Regs -------------------------------------------------------------------*/
+
+gran_yield_3
+{
+ Sp_adj(-3);
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+/*- 4 Regs -------------------------------------------------------------------*/
+
+gran_yield_4
+{
+ Sp_adj(-4);
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+/*- 5 Regs -------------------------------------------------------------------*/
+
+gran_yield_5
+{
+ Sp_adj(-5);
+ Sp(4) = R5;
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+/*- 6 Regs -------------------------------------------------------------------*/
+
+gran_yield_6
+{
+ Sp_adj(-6);
+ Sp(5) = R6;
+ Sp(4) = R5;
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+/*- 7 Regs -------------------------------------------------------------------*/
+
+gran_yield_7
+{
+ Sp_adj(-7);
+ Sp(6) = R7;
+ Sp(5) = R6;
+ Sp(4) = R5;
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+/*- 8 Regs -------------------------------------------------------------------*/
+
+gran_yield_8
+{
+ Sp_adj(-8);
+ Sp(7) = R8;
+ Sp(6) = R7;
+ Sp(5) = R6;
+ Sp(4) = R5;
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadYielding;
+ jump StgReturn;
+}
+
+// the same routines but with a block rather than a yield
+
+gran_block_1
+{
+ Sp_adj(-1);
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+/*- 2 Regs--------------------------------------------------------------------*/
+
+gran_block_2
+{
+ Sp_adj(-2);
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+/*- 3 Regs -------------------------------------------------------------------*/
+
+gran_block_3
+{
+ Sp_adj(-3);
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+/*- 4 Regs -------------------------------------------------------------------*/
+
+gran_block_4
+{
+ Sp_adj(-4);
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+/*- 5 Regs -------------------------------------------------------------------*/
+
+gran_block_5
+{
+ Sp_adj(-5);
+ Sp(4) = R5;
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+/*- 6 Regs -------------------------------------------------------------------*/
+
+gran_block_6
+{
+ Sp_adj(-6);
+ Sp(5) = R6;
+ Sp(4) = R5;
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+/*- 7 Regs -------------------------------------------------------------------*/
+
+gran_block_7
+{
+ Sp_adj(-7);
+ Sp(6) = R7;
+ Sp(5) = R6;
+ Sp(4) = R5;
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+/*- 8 Regs -------------------------------------------------------------------*/
+
+gran_block_8
+{
+ Sp_adj(-8);
+ Sp(7) = R8;
+ Sp(6) = R7;
+ Sp(5) = R6;
+ Sp(4) = R5;
+ Sp(3) = R4;
+ Sp(2) = R3;
+ Sp(1) = R2;
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+#endif
+
+/* Parallel-only variants; note the "#if 0 &&": currently dead code. */
+#if 0 && defined(PAR)
+
+/*
+ Similar to stg_block_1 (called via StgMacro BLOCK_NP) but separates the
+ saving of the thread state from the actual jump via an StgReturn.
+ We need this separation because we call RTS routines in blocking entry codes
+ before jumping back into the RTS (see parallel/FetchMe.hc).
+*/
+
+par_block_1_no_jump
+{
+ Sp_adj(-1);
+ Sp(0) = R1;
+ SAVE_THREAD_STATE();
+}
+
+par_jump
+{
+ TSO_what_next(CurrentTSO) = ThreadRunGHC;
+ R1 = ThreadBlocked;
+ jump StgReturn;
+}
+
+#endif
+
+/* -----------------------------------------------------------------------------
+ Heap checks in Primitive case alternatives
+
+ A primitive case alternative is entered with a value either in
+ R1, FloatReg1 or D1 depending on the return convention. All the
+ cases are covered below.
+ -------------------------------------------------------------------------- */
+
+/*-- No Registers live ------------------------------------------------------ */
+
+/* Heap-check failure with no live registers at all. */
+stg_gc_noregs
+{
+ GC_GENERIC
+}
+
+/*-- void return ------------------------------------------------------------ */
+
+INFO_TABLE_RET( stg_gc_void, 0/*framesize*/, 0/*bitmap*/, RET_SMALL)
+{
+ Sp_adj(1);
+ jump %ENTRY_CODE(Sp(0));
+}
+
+/*-- R1 is boxed/unpointed -------------------------------------------------- */
+
+/* Each of the pairs below follows the same shape: the plain entry saves
+ * the live value under a matching return frame and runs GC_GENERIC; the
+ * INFO_TABLE_RET frame restores the value and returns to the real
+ * continuation when the thread is resumed. */
+INFO_TABLE_RET( stg_gc_unpt_r1, 1/*framesize*/, 0/*bitmap*/, RET_SMALL)
+{
+ R1 = Sp(1);
+ Sp_adj(2);
+ jump %ENTRY_CODE(Sp(0));
+}
+
+stg_gc_unpt_r1
+{
+ Sp_adj(-2);
+ Sp(1) = R1;
+ Sp(0) = stg_gc_unpt_r1_info;
+ GC_GENERIC
+}
+
+/*-- R1 is unboxed -------------------------------------------------- */
+
+/* the 1 is a bitmap - i.e. 1 non-pointer word on the stack. */
+INFO_TABLE_RET( stg_gc_unbx_r1, 1/*framesize*/, 1/*bitmap*/, RET_SMALL )
+{
+ R1 = Sp(1);
+ Sp_adj(2);
+ jump %ENTRY_CODE(Sp(0));
+}
+
+stg_gc_unbx_r1
+{
+ Sp_adj(-2);
+ Sp(1) = R1;
+ Sp(0) = stg_gc_unbx_r1_info;
+ GC_GENERIC
+}
+
+/*-- F1 contains a float ------------------------------------------------- */
+
+INFO_TABLE_RET( stg_gc_f1, 1/*framesize*/, 1/*bitmap*/, RET_SMALL )
+{
+ F1 = F_[Sp+WDS(1)];
+ Sp_adj(2);
+ jump %ENTRY_CODE(Sp(0));
+}
+
+stg_gc_f1
+{
+ Sp_adj(-2);
+ F_[Sp + WDS(1)] = F1;
+ Sp(0) = stg_gc_f1_info;
+ GC_GENERIC
+}
+
+/*-- D1 contains a double ------------------------------------------------- */
+
+/* we support doubles of either 1 or 2 words in size */
+
+#if SIZEOF_DOUBLE == SIZEOF_VOID_P
+# define DBL_BITMAP 1
+# define DBL_WORDS 1
+#else
+# define DBL_BITMAP 3
+# define DBL_WORDS 2
+#endif
+
+INFO_TABLE_RET( stg_gc_d1, DBL_WORDS/*framesize*/, DBL_BITMAP/*bitmap*/, RET_SMALL )
+{
+ D1 = D_[Sp + WDS(1)];
+ Sp = Sp + WDS(1) + SIZEOF_StgDouble;
+ jump %ENTRY_CODE(Sp(0));
+}
+
+stg_gc_d1
+{
+ Sp = Sp - WDS(1) - SIZEOF_StgDouble;
+ D_[Sp + WDS(1)] = D1;
+ Sp(0) = stg_gc_d1_info;
+ GC_GENERIC
+}
+
+
+/*-- L1 contains an int64 ------------------------------------------------- */
+
+/* we support int64s of either 1 or 2 words in size */
+
+#if SIZEOF_VOID_P == 8
+# define LLI_BITMAP 1
+# define LLI_WORDS 1
+#else
+# define LLI_BITMAP 3
+# define LLI_WORDS 2
+#endif
+
+INFO_TABLE_RET( stg_gc_l1, LLI_WORDS/*framesize*/, LLI_BITMAP/*bitmap*/, RET_SMALL )
+{
+ L1 = L_[Sp + WDS(1)];
+ Sp_adj(1) + SIZEOF_StgWord64;
+ jump %ENTRY_CODE(Sp(0));
+}
+
+stg_gc_l1
+{
+ Sp_adj(-1) - SIZEOF_StgWord64;
+ L_[Sp + WDS(1)] = L1;
+ Sp(0) = stg_gc_l1_info;
+ GC_GENERIC
+}
+
+/*-- Unboxed tuple return, one pointer (unregisterised build only) ---------- */
+
+INFO_TABLE_RET( stg_ut_1_0_unreg, 1/*size*/, 0/*BITMAP*/, RET_SMALL )
+{
+ Sp_adj(1);
+ // one ptr is on the stack (Sp(0))
+ jump %ENTRY_CODE(Sp(1));
+}
+
+/* -----------------------------------------------------------------------------
+ Generic function entry heap check code.
+
+ At a function entry point, the arguments are as per the calling convention,
+ i.e. some in regs and some on the stack. There may or may not be
+ a pointer to the function closure in R1 - if there isn't, then the heap
+ check failure code in the function will arrange to load it.
+
+ The function's argument types are described in its info table, so we
+ can just jump to this bit of generic code to save away all the
+ registers and return to the scheduler.
+
+ This code arranges the stack like this:
+
+ | .... |
+ | args |
+ +---------------------+
+ | f_closure |
+ +---------------------+
+ | size |
+ +---------------------+
+ | stg_gc_fun_info |
+ +---------------------+
+
+ The size is the number of words of arguments on the stack, and is cached
+ in the frame in order to simplify stack walking: otherwise the size of
+ this stack frame would have to be calculated by looking at f's info table.
+
+ -------------------------------------------------------------------------- */
+
+/* Generic heap-check failure at a function entry point: compute the
+ * number of argument words on the stack from the function's info table,
+ * push the (f_closure, size, stg_gc_fun_info) frame described in the
+ * comment above, and return to the scheduler. */
+__stg_gc_fun
+{
+ W_ size;
+ W_ info;
+ W_ type;
+
+ info = %GET_FUN_INFO(R1);
+
+ // cache the size
+ type = TO_W_(StgFunInfoExtra_fun_type(info));
+ if (type == ARG_GEN) {
+ size = BITMAP_SIZE(StgFunInfoExtra_bitmap(info));
+ } else {
+ if (type == ARG_GEN_BIG) {
+#ifdef TABLES_NEXT_TO_CODE
+ // bitmap field holds an offset
+ size = StgLargeBitmap_size( StgFunInfoExtra_bitmap(info)
+ + %GET_ENTRY(R1) /* ### */ );
+#else
+ size = StgLargeBitmap_size( StgFunInfoExtra_bitmap(info) );
+#endif
+ } else {
+ size = BITMAP_SIZE(W_[stg_arg_bitmaps + WDS(type)]);
+ }
+ }
+
+#ifdef NO_ARG_REGS
+ // we don't have to save any registers away
+ Sp_adj(-3);
+ Sp(2) = R1;
+ Sp(1) = size;
+ Sp(0) = stg_gc_fun_info;
+ GC_GENERIC
+#else
+ // NOTE(review): 'type' is re-declared and re-computed here although it
+ // already holds exactly this value from above -- confirm whether the
+ // duplication is intentional or left over from an edit.
+ W_ type;
+ type = TO_W_(StgFunInfoExtra_fun_type(info));
+ // cache the size
+ if (type == ARG_GEN || type == ARG_GEN_BIG) {
+ // regs already saved by the heap check code
+ Sp_adj(-3);
+ Sp(2) = R1;
+ Sp(1) = size;
+ Sp(0) = stg_gc_fun_info;
+ // DEBUG_ONLY(foreign "C" debugBelch("stg_fun_gc_gen(ARG_GEN)"););
+ GC_GENERIC
+ } else {
+ jump W_[stg_stack_save_entries + WDS(type)];
+ // jumps to stg_gc_noregs after saving stuff
+ }
+#endif /* !NO_ARG_REGS */
+}
+
+/* -----------------------------------------------------------------------------
+ Generic Apply (return point)
+
+ The dual to stg_fun_gc_gen (above): this fragment returns to the
+ function, passing arguments in the stack and in registers
+ appropriately. The stack layout is given above.
+ -------------------------------------------------------------------------- */
+
+/* Return point for the frame pushed by __stg_gc_fun: pop the frame,
+ * reload the function closure into R1, and re-enter it via whichever
+ * calling convention its info table prescribes. */
+INFO_TABLE_RET( stg_gc_fun, 0/*framesize*/, 0/*bitmap*/, RET_FUN )
+{
+ R1 = Sp(2);
+ Sp_adj(3);
+#ifdef NO_ARG_REGS
+ // Minor optimisation: there are no argument registers to load up,
+ // so we can just jump straight to the function's entry point.
+ jump %GET_ENTRY(R1);
+#else
+ W_ info;
+ W_ type;
+
+ info = %GET_FUN_INFO(R1);
+ type = TO_W_(StgFunInfoExtra_fun_type(info));
+ if (type == ARG_GEN || type == ARG_GEN_BIG) {
+ jump StgFunInfoExtra_slow_apply(info);
+ } else {
+ if (type == ARG_BCO) {
+ // cover this case just to be on the safe side
+ Sp_adj(-2);
+ Sp(1) = R1;
+ Sp(0) = stg_apply_interp_info;
+ jump stg_yield_to_interpreter;
+ } else {
+ jump W_[stg_ap_stack_entries + WDS(type)];
+ }
+ }
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ Generic Heap Check Code.
+
+ Called with Liveness mask in R9, Return address in R10.
+ Stack must be consistent (containing all necessary info pointers
+ to relevant SRTs).
+
+ See StgMacros.h for a description of the RET_DYN stack frame.
+
+ We also define an stg_gen_yield here, because it's very similar.
+ -------------------------------------------------------------------------- */
+
+// For simplicity, we assume that SIZEOF_DOUBLE == 2*SIZEOF_VOID_P
+// on a 64-bit machine, we'll end up wasting a couple of words, but
+// it's not a big deal.
+
+/* Pop the 21-word RET_DYN frame built by SAVE_EVERYTHING below,
+ * reloading every saved register (slot offsets must mirror
+ * SAVE_EVERYTHING exactly). */
+#define RESTORE_EVERYTHING \
+ L1 = L_[Sp + WDS(19)]; \
+ D2 = D_[Sp + WDS(17)]; \
+ D1 = D_[Sp + WDS(15)]; \
+ F4 = F_[Sp + WDS(14)]; \
+ F3 = F_[Sp + WDS(13)]; \
+ F2 = F_[Sp + WDS(12)]; \
+ F1 = F_[Sp + WDS(11)]; \
+ R8 = Sp(10); \
+ R7 = Sp(9); \
+ R6 = Sp(8); \
+ R5 = Sp(7); \
+ R4 = Sp(6); \
+ R3 = Sp(5); \
+ R2 = Sp(4); \
+ R1 = Sp(3); \
+ Sp_adj(21);
+
+/* After RESTORE_EVERYTHING's Sp_adj(21), Sp(-19) addresses the saved
+ * return address: slot 2 of the just-popped frame (21 - 19 = 2). */
+#define RET_OFFSET (-19)
+
+/* Push a 21-word frame saving all general/float/double/long registers,
+ * plus the liveness mask (R9) and return address (R10), headed by
+ * stg_gc_gen_info. */
+#define SAVE_EVERYTHING \
+ Sp_adj(-21); \
+ L_[Sp + WDS(19)] = L1; \
+ D_[Sp + WDS(17)] = D2; \
+ D_[Sp + WDS(15)] = D1; \
+ F_[Sp + WDS(14)] = F4; \
+ F_[Sp + WDS(13)] = F3; \
+ F_[Sp + WDS(12)] = F2; \
+ F_[Sp + WDS(11)] = F1; \
+ Sp(10) = R8; \
+ Sp(9) = R7; \
+ Sp(8) = R6; \
+ Sp(7) = R5; \
+ Sp(6) = R4; \
+ Sp(5) = R3; \
+ Sp(4) = R2; \
+ Sp(3) = R1; \
+ Sp(2) = R10; /* return address */ \
+ Sp(1) = R9; /* liveness mask */ \
+ Sp(0) = stg_gc_gen_info;
+
+// Return frame that undoes SAVE_EVERYTHING: restore all the saved
+// registers and jump to the return address stored in the frame.
+INFO_TABLE_RET( stg_gc_gen, 0/*framesize*/, 0/*bitmap*/, RET_DYN )
+/* bitmap in the above info table is unused, the real one is on the stack. */
+{
+    RESTORE_EVERYTHING;
+    jump Sp(RET_OFFSET); /* No %ENTRY_CODE( - this is an actual code ptr */
+}
+
+// Generic heap-check failure entry: called with the liveness mask in
+// R9 and the return address in R10 (see the section comment above);
+// saves all registers and invokes the GC.
+stg_gc_gen
+{
+    SAVE_EVERYTHING;
+    GC_GENERIC
+}
+
+// A heap check at an unboxed tuple return point.  The return address
+// is on the stack, and we can find it by using the offsets given
+// to us in the liveness mask.
+stg_gc_ut
+{
+    // R9 holds the liveness mask; RET_DYN_NONPTRS/RET_DYN_PTRS extract
+    // the counts of saved non-pointer and pointer words, whose sum is
+    // the stack offset of the real return address.
+    R10 = %ENTRY_CODE(Sp(RET_DYN_NONPTRS(R9) + RET_DYN_PTRS(R9)));
+    SAVE_EVERYTHING;
+    GC_GENERIC
+}
+
+/*
+ * stg_gc_gen_hp is used by MAYBE_GC, where we can't use GC_GENERIC
+ * because we've just failed doYouWantToGC(), not a standard heap
+ * check.  GC_GENERIC would end up returning StackOverflow.
+ */
+stg_gc_gen_hp
+{
+    SAVE_EVERYTHING;
+    HP_GENERIC
+}
+
+/* -----------------------------------------------------------------------------
+ Yields
+ -------------------------------------------------------------------------- */
+
+// Yield to the scheduler with all argument registers live: save
+// everything; the stg_gc_gen frame restores the registers when the
+// thread is resumed.
+stg_gen_yield
+{
+    SAVE_EVERYTHING;
+    YIELD_GENERIC
+}
+
+// Yield when no registers hold live values; nothing needs saving.
+stg_yield_noregs
+{
+    YIELD_GENERIC;
+}
+
+/* -----------------------------------------------------------------------------
+ Yielding to the interpreter... top of stack says what to do next.
+ -------------------------------------------------------------------------- */
+
+// Hand control to the byte-code interpreter; the top of the stack
+// says what to do next (see the section comment above and the frame
+// dispatch at the top of interpretBCO in Interpreter.c).
+stg_yield_to_interpreter
+{
+    YIELD_TO_INTERPRETER;
+}
+
+/* -----------------------------------------------------------------------------
+ Blocks
+ -------------------------------------------------------------------------- */
+
+// Block the current thread, with all argument registers live.
+stg_gen_block
+{
+    SAVE_EVERYTHING;
+    BLOCK_GENERIC;
+}
+
+// Block the current thread; no registers hold live values.
+stg_block_noregs
+{
+    BLOCK_GENERIC;
+}
+
+// Block the current thread with a single live pointer in R1: push an
+// stg_enter frame so the closure is re-entered when the thread wakes.
+stg_block_1
+{
+    Sp_adj(-2);
+    Sp(1) = R1;
+    Sp(0) = stg_enter_info;
+    BLOCK_GENERIC;
+}
+
+/* -----------------------------------------------------------------------------
+ * takeMVar/putMVar-specific blocks
+ *
+ * Stack layout for a thread blocked in takeMVar:
+ *
+ * ret. addr
+ * ptr to MVar (R1)
+ * stg_block_takemvar_info
+ *
+ * Stack layout for a thread blocked in putMVar:
+ *
+ * ret. addr
+ * ptr to Value (R2)
+ * ptr to MVar (R1)
+ * stg_block_putmvar_info
+ *
+ * See PrimOps.cmm for a description of the workings of take/putMVar.
+ *
+ * -------------------------------------------------------------------------- */
+
+// Return frame for a thread that blocked in takeMVar: reload the MVar
+// pointer into R1 and retry the operation.
+INFO_TABLE_RET( stg_block_takemvar, 1/*framesize*/, 0/*bitmap*/, RET_SMALL )
+{
+    R1 = Sp(1);
+    Sp_adj(2);
+    jump takeMVarzh_fast;
+}
+
+// code fragment executed just before we return to the scheduler
+stg_block_takemvar_finally
+{
+#ifdef THREADED_RTS
+    // Unlock the MVar (passed in R3), restoring the EMPTY header, only
+    // once the thread is safely descheduled.
+    foreign "C" unlockClosure(R3 "ptr", stg_EMPTY_MVAR_info);
+#endif
+    jump StgReturn;
+}
+
+// Block in takeMVar (MVar in R1): push the retry frame above, then
+// block, running stg_block_takemvar_finally last to release the lock.
+stg_block_takemvar
+{
+    Sp_adj(-2);
+    Sp(1) = R1;
+    Sp(0) = stg_block_takemvar_info;
+    R3 = R1;
+    BLOCK_BUT_FIRST(stg_block_takemvar_finally);
+}
+
+// Return frame for a thread that blocked in putMVar: reload the MVar
+// (R1) and the value being put (R2) and retry the operation.
+INFO_TABLE_RET( stg_block_putmvar, 2/*framesize*/, 0/*bitmap*/, RET_SMALL )
+{
+    R2 = Sp(2);
+    R1 = Sp(1);
+    Sp_adj(3);
+    jump putMVarzh_fast;
+}
+
+// code fragment executed just before we return to the scheduler
+stg_block_putmvar_finally
+{
+#ifdef THREADED_RTS
+    // Unlock the MVar (passed in R3), restoring the FULL header, only
+    // once the thread is safely descheduled.
+    foreign "C" unlockClosure(R3 "ptr", stg_FULL_MVAR_info);
+#endif
+    jump StgReturn;
+}
+
+// Block in putMVar: push the retry frame above (saving R1 and R2),
+// then block, releasing the MVar lock last.
+stg_block_putmvar
+{
+    Sp_adj(-3);
+    Sp(2) = R2;
+    Sp(1) = R1;
+    Sp(0) = stg_block_putmvar_info;
+    R3 = R1;
+    BLOCK_BUT_FIRST(stg_block_putmvar_finally);
+}
+
+// code fragment executed just before we return to the scheduler
+stg_block_blackhole_finally
+{
+#if defined(THREADED_RTS)
+    // The last thing we do is release sched_lock, which is
+    // preventing other threads from accessing blackhole_queue and
+    // picking up this thread before we are finished with it.
+    foreign "C" RELEASE_LOCK(sched_mutex "ptr");
+#endif
+    jump StgReturn;
+}
+
+// Block on a black hole (closure in R1): push an stg_enter frame so
+// the closure is re-entered on wakeup, then block, releasing
+// sched_mutex last via stg_block_blackhole_finally.
+stg_block_blackhole
+{
+    Sp_adj(-2);
+    Sp(1) = R1;
+    Sp(0) = stg_enter_info;
+    BLOCK_BUT_FIRST(stg_block_blackhole_finally);
+}
+
+#ifdef mingw32_HOST_OS
+// Return frame for a thread that blocked on an async I/O request
+// (mingw32 only): unpack the StgAsyncIOResult hung off the TSO's
+// block_info, free it, and return with the length in R1 and the error
+// code on the stack.
+INFO_TABLE_RET( stg_block_async, 0/*framesize*/, 0/*bitmap*/, RET_SMALL )
+{
+    W_ ares;
+    W_ len, errC;
+
+    ares = StgTSO_block_info(CurrentTSO);
+    len = StgAsyncIOResult_len(ares);
+    errC = StgAsyncIOResult_errCode(ares);
+    StgTSO_block_info(CurrentTSO) = NULL;
+    foreign "C" free(ares "ptr");
+    R1 = len;
+    // overwrite our own frame slot with the error code for the caller
+    Sp(0) = errC;
+    jump %ENTRY_CODE(Sp(1));
+}
+
+// Block until the async I/O request completes.
+stg_block_async
+{
+    Sp_adj(-1);
+    Sp(0) = stg_block_async_info;
+    BLOCK_GENERIC;
+}
+
+/* Used by threadDelay implementation; it would be desirable to get rid of
+ * this free()'ing void return continuation.
+ */
+// Return frame for an async request with no interesting result: just
+// free the StgAsyncIOResult hanging off the TSO and return.
+INFO_TABLE_RET( stg_block_async_void, 0/*framesize*/, 0/*bitmap*/, RET_SMALL )
+{
+    W_ ares;
+
+    ares = StgTSO_block_info(CurrentTSO);
+    StgTSO_block_info(CurrentTSO) = NULL;
+    foreign "C" free(ares "ptr");
+    Sp_adj(1);
+    jump %ENTRY_CODE(Sp(0));
+}
+
+// Block until the async request completes, discarding its result.
+stg_block_async_void
+{
+    Sp_adj(-1);
+    Sp(0) = stg_block_async_void_info;
+    BLOCK_GENERIC;
+}
+
+#endif
+
+/* -----------------------------------------------------------------------------
+ STM-specific waiting
+ -------------------------------------------------------------------------- */
+
+// Run just before we return to the scheduler: release the STM wait
+// lock for this thread.  NOTE(review): R3 appears to carry the token
+// stmWaitUnlock() expects -- confirm against STM.c.
+stg_block_stmwait_finally
+{
+    foreign "C" stmWaitUnlock(MyCapability() "ptr", R3 "ptr");
+    jump StgReturn;
+}
+
+// Block the thread after an STM retry, unlocking the STM state last.
+stg_block_stmwait
+{
+    BLOCK_BUT_FIRST(stg_block_stmwait_finally);
+}
diff --git a/rts/HsFFI.c b/rts/HsFFI.c
new file mode 100644
index 0000000000..350bcfbdec
--- /dev/null
+++ b/rts/HsFFI.c
@@ -0,0 +1,40 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2005
+ *
+ * RTS entry points as mandated by the FFI addendum to the Haskell 98 report
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "HsFFI.h"
+#include "Rts.h"
+
+// hs_init and hs_exit are defined in RtsStartup.c
+
+/* FFI entry point: record the program arguments (argc/argv) for the
+ * Haskell program, by forwarding to the RTS's setProgArgv(). */
+void
+hs_set_argv(int argc, char *argv[])
+{
+    setProgArgv(argc,argv);
+}
+
+/* FFI entry point: request a garbage collection. */
+void
+hs_perform_gc(void)
+{
+    /* Hmmm, the FFI spec is a bit vague, but it seems to imply a major GC... */
+    performMajorGC();
+}
+
+/* FFI entry point: release a stable pointer previously obtained from
+ * a Haskell StablePtr, making its referent eligible for GC again. */
+void
+hs_free_stable_ptr(HsStablePtr sp)
+{
+    /* The cast is for clarity only, both HsStablePtr and StgStablePtr are
+       typedefs for void*. */
+    freeStablePtr((StgStablePtr)sp);
+}
+
+/* FFI entry point: free the function pointer produced by a
+ * "foreign export dynamic"/wrapper, via freeHaskellFunctionPtr(). */
+void
+hs_free_fun_ptr(HsFunPtr fp)
+{
+    freeHaskellFunctionPtr(fp);
+}
diff --git a/rts/Interpreter.c b/rts/Interpreter.c
new file mode 100644
index 0000000000..56e9bb67ce
--- /dev/null
+++ b/rts/Interpreter.c
@@ -0,0 +1,1261 @@
+/* -----------------------------------------------------------------------------
+ * Bytecode interpreter
+ *
+ * Copyright (c) The GHC Team, 1994-2002.
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsAPI.h"
+#include "RtsUtils.h"
+#include "Closures.h"
+#include "TSO.h"
+#include "Schedule.h"
+#include "RtsFlags.h"
+#include "Storage.h"
+#include "LdvProfile.h"
+#include "Updates.h"
+#include "Sanity.h"
+#include "Liveness.h"
+
+#include "Bytecodes.h"
+#include "Printer.h"
+#include "Disassembler.h"
+#include "Interpreter.h"
+
+#include <string.h> /* for memcpy */
+#ifdef HAVE_ERRNO_H
+#include <errno.h>
+#endif
+
+
+/* --------------------------------------------------------------------------
+ * The bytecode interpreter
+ * ------------------------------------------------------------------------*/
+
+/* Gather stats about entry, opcode, opcode-pair frequencies. For
+ tuning the interpreter. */
+
+/* #define INTERP_STATS */
+
+
+/* Sp points to the lowest live word on the stack. */
+
+// Fetch the next bytecode, advancing the instruction pointer, and
+// index the BCO's pointer / literal / info-table arrays.
+#define BCO_NEXT         instrs[bciPtr++]
+#define BCO_PTR(n)       (W_)ptrs[n]
+#define BCO_LIT(n)       literals[n]
+#define BCO_ITBL(n)      itbls[n]
+
+// Cache the current TSO's stack pointer (and its fixed stack limit)
+// in interpreter locals on entry.
+#define LOAD_STACK_POINTERS                                     \
+    Sp = cap->r.rCurrentTSO->sp;                                \
+    /* We don't change this ... */                              \
+    SpLim = cap->r.rCurrentTSO->stack + RESERVED_STACK_WORDS;
+
+// Write the local stack pointer back into the TSO.
+#define SAVE_STACK_POINTERS                     \
+    cap->r.rCurrentTSO->sp = Sp
+
+// Leave the interpreter: flush Sp, record what the thread should do
+// next (what_next) and why we stopped (rRet), notify the RTS that the
+// thread is pausing via threadPaused(), and return the capability.
+#define RETURN_TO_SCHEDULER(todo,retcode)       \
+   SAVE_STACK_POINTERS;                         \
+   cap->r.rCurrentTSO->what_next = (todo);      \
+   threadPaused(cap,cap->r.rCurrentTSO);        \
+   cap->r.rRet = (retcode);                     \
+   return cap;
+
+// As RETURN_TO_SCHEDULER but without the threadPaused() call.
+#define RETURN_TO_SCHEDULER_NO_PAUSE(todo,retcode)      \
+   SAVE_STACK_POINTERS;                                 \
+   cap->r.rCurrentTSO->what_next = (todo);              \
+   cap->r.rRet = (retcode);                             \
+   return cap;
+
+
+STATIC_INLINE StgPtr
+allocate_NONUPD (int n_words)
+{
+    /* A non-updatable closure must still be at least as large as the
+     * smallest updatable closure (header + MIN_PAYLOAD_SIZE payload
+     * words), so round the request up to that minimum.  The compare is
+     * done in unsigned arithmetic, matching the original stg_max(). */
+    W_ min_words = sizeofW(StgHeader) + MIN_PAYLOAD_SIZE;
+    return allocate((W_)n_words > min_words ? (W_)n_words : min_words);
+}
+
+
+#ifdef INTERP_STATS
+
+/* Hacky stats, for tuning the interpreter ... */
+
+// Per-closure-type counts of objects the interpreter could not handle
+// itself (entered via the "unknown closure" default case).
+int it_unknown_entries[N_CLOSURE_TYPES];
+int it_total_unknown_entries;
+int it_total_entries;
+
+// Counts of the kinds of return frame we returned to.
+int it_retto_BCO;
+int it_retto_UPDATE;
+int it_retto_other;
+
+// Instruction, stack-slide and BCO-entry counters.
+int it_slides;
+int it_insns;
+int it_BCO_entries;
+
+// Opcode frequency and opcode-pair (bigram) frequency tables; the
+// dimension 27 matches the opcode-range ASSERT in the dispatch loop.
+int it_ofreq[27];
+int it_oofreq[27][27];
+int it_lastopc;
+
+// Bump a statistics counter (compiled away when !INTERP_STATS).
+#define INTERP_TICK(n) (n)++
+
+/* Reset every interpreter-statistics counter and table to zero,
+ * ready for a fresh run. */
+void interp_startup ( void )
+{
+    int i, j;
+
+    it_retto_BCO             = 0;
+    it_retto_UPDATE          = 0;
+    it_retto_other           = 0;
+    it_total_entries         = 0;
+    it_total_unknown_entries = 0;
+    it_slides                = 0;
+    it_insns                 = 0;
+    it_BCO_entries           = 0;
+    it_lastopc               = 0;
+
+    for (i = 0; i < N_CLOSURE_TYPES; i++) {
+        it_unknown_entries[i] = 0;
+    }
+    for (i = 0; i < 27; i++) {
+        it_ofreq[i] = 0;
+        for (j = 0; j < 27; j++) {
+            it_oofreq[i][j] = 0;
+        }
+    }
+}
+
+/* Dump the statistics gathered under INTERP_STATS to the debug log:
+ * return-frame counts, unknown-closure entries broken down by closure
+ * type, per-opcode frequencies, and the 19 most frequent opcode pairs.
+ * NOTE: destructively zeroes it_oofreq entries as it selects maxima. */
+void interp_shutdown ( void )
+{
+    int i, j, k, o_max, i_max, j_max;
+    debugBelch("%d constrs entered -> (%d BCO, %d UPD, %d ??? )\n",
+                   it_retto_BCO + it_retto_UPDATE + it_retto_other,
+                   it_retto_BCO, it_retto_UPDATE, it_retto_other );
+    debugBelch("%d total entries, %d unknown entries \n",
+                   it_total_entries, it_total_unknown_entries);
+    for (i = 0; i < N_CLOSURE_TYPES; i++) {
+        if (it_unknown_entries[i] == 0) continue;
+        debugBelch("   type %2d: unknown entries (%4.1f%%) == %d\n",
+                i, 100.0 * ((double)it_unknown_entries[i]) /
+                           ((double)it_total_unknown_entries),
+                it_unknown_entries[i]);
+    }
+    debugBelch("%d insns, %d slides, %d BCO_entries\n",
+                   it_insns, it_slides, it_BCO_entries);
+    for (i = 0; i < 27; i++)
+        debugBelch("opcode %2d got %d\n", i, it_ofreq[i] );
+
+    // Repeatedly find and print the most frequent remaining opcode
+    // pair, zeroing it so the next iteration finds the runner-up.
+    for (k = 1; k < 20; k++) {
+        o_max = 0;
+        i_max = j_max = 0;
+        for (i = 0; i < 27; i++) {
+            for (j = 0; j < 27; j++) {
+                if (it_oofreq[i][j] > o_max) {
+                    o_max = it_oofreq[i][j];
+                    i_max = i; j_max = j;
+                }
+            }
+        }
+
+        debugBelch("%d:  count (%4.1f%%) %6d   is %d then %d\n",
+                      k, ((double)o_max) * 100.0 / ((double)it_insns), o_max,
+                         i_max, j_max );
+        it_oofreq[i_max][j_max] = 0;
+
+    }
+}
+
+#else // !INTERP_STATS
+
+#define INTERP_TICK(n) /* nothing */
+
+#endif
+
+// Info tables for the all-pointer application frames stg_ap_p ..
+// stg_ap_pppppp, indexed by (number of pointer arguments - 1).  Used
+// in do_apply when a function is applied to more arguments than its
+// arity: the surplus pointer args get one of these frames pushed.
+static StgWord app_ptrs_itbl[] = {
+    (W_)&stg_ap_p_info,
+    (W_)&stg_ap_pp_info,
+    (W_)&stg_ap_ppp_info,
+    (W_)&stg_ap_pppp_info,
+    (W_)&stg_ap_ppppp_info,
+    (W_)&stg_ap_pppppp_info,
+};
+
+Capability *
+interpretBCO (Capability* cap)
+{
+ // Use of register here is primarily to make it clear to compilers
+ // that these entities are non-aliasable.
+ register StgPtr Sp; // local state -- stack pointer
+ register StgPtr SpLim; // local state -- stack lim pointer
+ register StgClosure* obj;
+ nat n, m;
+
+ LOAD_STACK_POINTERS;
+
+ // ------------------------------------------------------------------------
+ // Case 1:
+ //
+ // We have a closure to evaluate. Stack looks like:
+ //
+ // | XXXX_info |
+ // +---------------+
+ // Sp | -------------------> closure
+ // +---------------+
+ //
+ if (Sp[0] == (W_)&stg_enter_info) {
+ Sp++;
+ goto eval;
+ }
+
+ // ------------------------------------------------------------------------
+ // Case 2:
+ //
+ // We have a BCO application to perform. Stack looks like:
+ //
+ // | .... |
+ // +---------------+
+ // | arg1 |
+ // +---------------+
+ // | BCO |
+ // +---------------+
+ // Sp | RET_BCO |
+ // +---------------+
+ //
+ else if (Sp[0] == (W_)&stg_apply_interp_info) {
+ obj = (StgClosure *)Sp[1];
+ Sp += 2;
+ goto run_BCO_fun;
+ }
+
+ // ------------------------------------------------------------------------
+ // Case 3:
+ //
+ // We have an unboxed value to return. See comment before
+ // do_return_unboxed, below.
+ //
+ else {
+ goto do_return_unboxed;
+ }
+
+ // Evaluate the object on top of the stack.
+eval:
+ obj = (StgClosure*)Sp[0]; Sp++;
+
+eval_obj:
+ INTERP_TICK(it_total_evals);
+
+ IF_DEBUG(interpreter,
+ debugBelch(
+ "\n---------------------------------------------------------------\n");
+ debugBelch("Evaluating: "); printObj(obj);
+ debugBelch("Sp = %p\n", Sp);
+ debugBelch("\n" );
+
+ printStackChunk(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size);
+ debugBelch("\n\n");
+ );
+
+ IF_DEBUG(sanity,checkStackChunk(Sp, cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size));
+
+ switch ( get_itbl(obj)->type ) {
+
+ case IND:
+ case IND_OLDGEN:
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ case IND_STATIC:
+ {
+ obj = ((StgInd*)obj)->indirectee;
+ goto eval_obj;
+ }
+
+ case CONSTR:
+ case CONSTR_1_0:
+ case CONSTR_0_1:
+ case CONSTR_2_0:
+ case CONSTR_1_1:
+ case CONSTR_0_2:
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ case CONSTR_STATIC:
+ case CONSTR_NOCAF_STATIC:
+ case FUN:
+ case FUN_1_0:
+ case FUN_0_1:
+ case FUN_2_0:
+ case FUN_1_1:
+ case FUN_0_2:
+ case FUN_STATIC:
+ case PAP:
+ // already in WHNF
+ break;
+
+ case BCO:
+ ASSERT(((StgBCO *)obj)->arity > 0);
+ break;
+
+ case AP: /* Copied from stg_AP_entry. */
+ {
+ nat i, words;
+ StgAP *ap;
+
+ ap = (StgAP*)obj;
+ words = ap->n_args;
+
+ // Stack check
+ if (Sp - (words+sizeofW(StgUpdateFrame)) < SpLim) {
+ Sp -= 2;
+ Sp[1] = (W_)obj;
+ Sp[0] = (W_)&stg_enter_info;
+ RETURN_TO_SCHEDULER(ThreadInterpret, StackOverflow);
+ }
+
+ /* Ok; we're safe. Party on. Push an update frame. */
+ Sp -= sizeofW(StgUpdateFrame);
+ {
+ StgUpdateFrame *__frame;
+ __frame = (StgUpdateFrame *)Sp;
+ SET_INFO(__frame, (StgInfoTable *)&stg_upd_frame_info);
+ __frame->updatee = (StgClosure *)(ap);
+ }
+
+ /* Reload the stack */
+ Sp -= words;
+ for (i=0; i < words; i++) {
+ Sp[i] = (W_)ap->payload[i];
+ }
+
+ obj = (StgClosure*)ap->fun;
+ ASSERT(get_itbl(obj)->type == BCO);
+ goto run_BCO_fun;
+ }
+
+ default:
+#ifdef INTERP_STATS
+ {
+ int j;
+
+ j = get_itbl(obj)->type;
+ ASSERT(j >= 0 && j < N_CLOSURE_TYPES);
+ it_unknown_entries[j]++;
+ it_total_unknown_entries++;
+ }
+#endif
+ {
+ // Can't handle this object; yield to scheduler
+ IF_DEBUG(interpreter,
+ debugBelch("evaluating unknown closure -- yielding to sched\n");
+ printObj(obj);
+ );
+ Sp -= 2;
+ Sp[1] = (W_)obj;
+ Sp[0] = (W_)&stg_enter_info;
+ RETURN_TO_SCHEDULER_NO_PAUSE(ThreadRunGHC, ThreadYielding);
+ }
+ }
+
+ // ------------------------------------------------------------------------
+ // We now have an evaluated object (obj). The next thing to
+ // do is return it to the stack frame on top of the stack.
+do_return:
+ ASSERT(closure_HNF(obj));
+
+ IF_DEBUG(interpreter,
+ debugBelch(
+ "\n---------------------------------------------------------------\n");
+ debugBelch("Returning: "); printObj(obj);
+ debugBelch("Sp = %p\n", Sp);
+ debugBelch("\n" );
+ printStackChunk(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size);
+ debugBelch("\n\n");
+ );
+
+ IF_DEBUG(sanity,checkStackChunk(Sp, cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size));
+
+ switch (get_itbl((StgClosure *)Sp)->type) {
+
+ case RET_SMALL: {
+ const StgInfoTable *info;
+
+ // NOTE: not using get_itbl().
+ info = ((StgClosure *)Sp)->header.info;
+ if (info == (StgInfoTable *)&stg_ap_v_info) {
+ n = 1; m = 0; goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_f_info) {
+ n = 1; m = 1; goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_d_info) {
+ n = 1; m = sizeofW(StgDouble); goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_l_info) {
+ n = 1; m = sizeofW(StgInt64); goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_n_info) {
+ n = 1; m = 1; goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_p_info) {
+ n = 1; m = 1; goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_pp_info) {
+ n = 2; m = 2; goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_ppp_info) {
+ n = 3; m = 3; goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_pppp_info) {
+ n = 4; m = 4; goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_ppppp_info) {
+ n = 5; m = 5; goto do_apply;
+ }
+ if (info == (StgInfoTable *)&stg_ap_pppppp_info) {
+ n = 6; m = 6; goto do_apply;
+ }
+ goto do_return_unrecognised;
+ }
+
+ case UPDATE_FRAME:
+ // Returning to an update frame: do the update, pop the update
+ // frame, and continue with the next stack frame.
+ INTERP_TICK(it_retto_UPDATE);
+ UPD_IND(((StgUpdateFrame *)Sp)->updatee, obj);
+ Sp += sizeofW(StgUpdateFrame);
+ goto do_return;
+
+ case RET_BCO:
+ // Returning to an interpreted continuation: put the object on
+ // the stack, and start executing the BCO.
+ INTERP_TICK(it_retto_BCO);
+ Sp--;
+ Sp[0] = (W_)obj;
+ obj = (StgClosure*)Sp[2];
+ ASSERT(get_itbl(obj)->type == BCO);
+ goto run_BCO_return;
+
+ default:
+ do_return_unrecognised:
+ {
+ // Can't handle this return address; yield to scheduler
+ INTERP_TICK(it_retto_other);
+ IF_DEBUG(interpreter,
+ debugBelch("returning to unknown frame -- yielding to sched\n");
+ printStackChunk(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size);
+ );
+ Sp -= 2;
+ Sp[1] = (W_)obj;
+ Sp[0] = (W_)&stg_enter_info;
+ RETURN_TO_SCHEDULER_NO_PAUSE(ThreadRunGHC, ThreadYielding);
+ }
+ }
+
+ // -------------------------------------------------------------------------
+ // Returning an unboxed value. The stack looks like this:
+ //
+ // | .... |
+ // +---------------+
+ // | fv2 |
+ // +---------------+
+ // | fv1 |
+ // +---------------+
+ // | BCO |
+ // +---------------+
+ // | stg_ctoi_ret_ |
+ // +---------------+
+ // | retval |
+ // +---------------+
+ // | XXXX_info |
+ // +---------------+
+ //
+ // where XXXX_info is one of the stg_gc_unbx_r1_info family.
+ //
+ // We're only interested in the case when the real return address
+ // is a BCO; otherwise we'll return to the scheduler.
+
+do_return_unboxed:
+ {
+ int offset;
+
+ ASSERT( Sp[0] == (W_)&stg_gc_unbx_r1_info
+ || Sp[0] == (W_)&stg_gc_unpt_r1_info
+ || Sp[0] == (W_)&stg_gc_f1_info
+ || Sp[0] == (W_)&stg_gc_d1_info
+ || Sp[0] == (W_)&stg_gc_l1_info
+ || Sp[0] == (W_)&stg_gc_void_info // VoidRep
+ );
+
+ // get the offset of the stg_ctoi_ret_XXX itbl
+ offset = stack_frame_sizeW((StgClosure *)Sp);
+
+ switch (get_itbl((StgClosure *)Sp+offset)->type) {
+
+ case RET_BCO:
+ // Returning to an interpreted continuation: put the object on
+ // the stack, and start executing the BCO.
+ INTERP_TICK(it_retto_BCO);
+ obj = (StgClosure*)Sp[offset+1];
+ ASSERT(get_itbl(obj)->type == BCO);
+ goto run_BCO_return_unboxed;
+
+ default:
+ {
+ // Can't handle this return address; yield to scheduler
+ INTERP_TICK(it_retto_other);
+ IF_DEBUG(interpreter,
+ debugBelch("returning to unknown frame -- yielding to sched\n");
+ printStackChunk(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size);
+ );
+ RETURN_TO_SCHEDULER_NO_PAUSE(ThreadRunGHC, ThreadYielding);
+ }
+ }
+ }
+ // not reached.
+
+
+ // -------------------------------------------------------------------------
+ // Application...
+
+do_apply:
+ // we have a function to apply (obj), and n arguments taking up m
+ // words on the stack. The info table (stg_ap_pp_info or whatever)
+ // is on top of the arguments on the stack.
+ {
+ switch (get_itbl(obj)->type) {
+
+ case PAP: {
+ StgPAP *pap;
+ nat i, arity;
+
+ pap = (StgPAP *)obj;
+
+ // we only cope with PAPs whose function is a BCO
+ if (get_itbl(pap->fun)->type != BCO) {
+ goto defer_apply_to_sched;
+ }
+
+ Sp++;
+ arity = pap->arity;
+ ASSERT(arity > 0);
+ if (arity < n) {
+ // n must be greater than 1, and the only kinds of
+ // application we support with more than one argument
+ // are all pointers...
+ //
+ // Shuffle the args for this function down, and put
+ // the appropriate info table in the gap.
+ for (i = 0; i < arity; i++) {
+ Sp[(int)i-1] = Sp[i];
+ // ^^^^^ careful, i-1 might be negative, but i in unsigned
+ }
+ Sp[arity-1] = app_ptrs_itbl[n-arity-1];
+ Sp--;
+ // unpack the PAP's arguments onto the stack
+ Sp -= pap->n_args;
+ for (i = 0; i < pap->n_args; i++) {
+ Sp[i] = (W_)pap->payload[i];
+ }
+ obj = pap->fun;
+ goto run_BCO_fun;
+ }
+ else if (arity == n) {
+ Sp -= pap->n_args;
+ for (i = 0; i < pap->n_args; i++) {
+ Sp[i] = (W_)pap->payload[i];
+ }
+ obj = pap->fun;
+ goto run_BCO_fun;
+ }
+ else /* arity > n */ {
+ // build a new PAP and return it.
+ StgPAP *new_pap;
+ new_pap = (StgPAP *)allocate(PAP_sizeW(pap->n_args + m));
+ SET_HDR(new_pap,&stg_PAP_info,CCCS);
+ new_pap->arity = pap->arity - n;
+ new_pap->n_args = pap->n_args + m;
+ new_pap->fun = pap->fun;
+ for (i = 0; i < pap->n_args; i++) {
+ new_pap->payload[i] = pap->payload[i];
+ }
+ for (i = 0; i < m; i++) {
+ new_pap->payload[pap->n_args + i] = (StgClosure *)Sp[i];
+ }
+ obj = (StgClosure *)new_pap;
+ Sp += m;
+ goto do_return;
+ }
+ }
+
+ case BCO: {
+ nat arity, i;
+
+ Sp++;
+ arity = ((StgBCO *)obj)->arity;
+ ASSERT(arity > 0);
+ if (arity < n) {
+ // n must be greater than 1, and the only kinds of
+ // application we support with more than one argument
+ // are all pointers...
+ //
+ // Shuffle the args for this function down, and put
+ // the appropriate info table in the gap.
+ for (i = 0; i < arity; i++) {
+ Sp[(int)i-1] = Sp[i];
+ // ^^^^^ careful, i-1 might be negative, but i in unsigned
+ }
+ Sp[arity-1] = app_ptrs_itbl[n-arity-1];
+ Sp--;
+ goto run_BCO_fun;
+ }
+ else if (arity == n) {
+ goto run_BCO_fun;
+ }
+ else /* arity > n */ {
+ // build a PAP and return it.
+ StgPAP *pap;
+ nat i;
+ pap = (StgPAP *)allocate(PAP_sizeW(m));
+ SET_HDR(pap, &stg_PAP_info,CCCS);
+ pap->arity = arity - n;
+ pap->fun = obj;
+ pap->n_args = m;
+ for (i = 0; i < m; i++) {
+ pap->payload[i] = (StgClosure *)Sp[i];
+ }
+ obj = (StgClosure *)pap;
+ Sp += m;
+ goto do_return;
+ }
+ }
+
+ // No point in us applying machine-code functions
+ default:
+ defer_apply_to_sched:
+ Sp -= 2;
+ Sp[1] = (W_)obj;
+ Sp[0] = (W_)&stg_enter_info;
+ RETURN_TO_SCHEDULER_NO_PAUSE(ThreadRunGHC, ThreadYielding);
+ }
+
+ // ------------------------------------------------------------------------
+ // Ok, we now have a bco (obj), and its arguments are all on the
+ // stack. We can start executing the byte codes.
+ //
+ // The stack is in one of two states. First, if this BCO is a
+ // function:
+ //
+ // | .... |
+ // +---------------+
+ // | arg2 |
+ // +---------------+
+ // | arg1 |
+ // +---------------+
+ //
+ // Second, if this BCO is a continuation:
+ //
+ // | .... |
+ // +---------------+
+ // | fv2 |
+ // +---------------+
+ // | fv1 |
+ // +---------------+
+ // | BCO |
+ // +---------------+
+ // | stg_ctoi_ret_ |
+ // +---------------+
+ // | retval |
+ // +---------------+
+ //
+ // where retval is the value being returned to this continuation.
+ // In the event of a stack check, heap check, or context switch,
+ // we need to leave the stack in a sane state so the garbage
+ // collector can find all the pointers.
+ //
+ // (1) BCO is a function: the BCO's bitmap describes the
+ // pointerhood of the arguments.
+ //
+ // (2) BCO is a continuation: BCO's bitmap describes the
+ // pointerhood of the free variables.
+ //
+ // Sadly we have three different kinds of stack/heap/cswitch check
+ // to do:
+
+run_BCO_return:
+ // Heap check
+ if (doYouWantToGC()) {
+ Sp--; Sp[0] = (W_)&stg_enter_info;
+ RETURN_TO_SCHEDULER(ThreadInterpret, HeapOverflow);
+ }
+ // Stack checks aren't necessary at return points, the stack use
+ // is aggregated into the enclosing function entry point.
+ goto run_BCO;
+
+run_BCO_return_unboxed:
+ // Heap check
+ if (doYouWantToGC()) {
+ RETURN_TO_SCHEDULER(ThreadInterpret, HeapOverflow);
+ }
+ // Stack checks aren't necessary at return points, the stack use
+ // is aggregated into the enclosing function entry point.
+ goto run_BCO;
+
+run_BCO_fun:
+ IF_DEBUG(sanity,
+ Sp -= 2;
+ Sp[1] = (W_)obj;
+ Sp[0] = (W_)&stg_apply_interp_info;
+ checkStackChunk(Sp,SpLim);
+ Sp += 2;
+ );
+
+ // Heap check
+ if (doYouWantToGC()) {
+ Sp -= 2;
+ Sp[1] = (W_)obj;
+ Sp[0] = (W_)&stg_apply_interp_info; // placeholder, really
+ RETURN_TO_SCHEDULER(ThreadInterpret, HeapOverflow);
+ }
+
+ // Stack check
+ if (Sp - INTERP_STACK_CHECK_THRESH < SpLim) {
+ Sp -= 2;
+ Sp[1] = (W_)obj;
+ Sp[0] = (W_)&stg_apply_interp_info; // placeholder, really
+ RETURN_TO_SCHEDULER(ThreadInterpret, StackOverflow);
+ }
+ goto run_BCO;
+
+ // Now, actually interpret the BCO... (no returning to the
+ // scheduler again until the stack is in an orderly state).
+run_BCO:
+ INTERP_TICK(it_BCO_entries);
+ {
+ register int bciPtr = 1; /* instruction pointer */
+ register StgBCO* bco = (StgBCO*)obj;
+ register StgWord16* instrs = (StgWord16*)(bco->instrs->payload);
+ register StgWord* literals = (StgWord*)(&bco->literals->payload[0]);
+ register StgPtr* ptrs = (StgPtr*)(&bco->ptrs->payload[0]);
+ register StgInfoTable** itbls = (StgInfoTable**)
+ (&bco->itbls->payload[0]);
+
+#ifdef INTERP_STATS
+ it_lastopc = 0; /* no opcode */
+#endif
+
+ nextInsn:
+ ASSERT(bciPtr <= instrs[0]);
+ IF_DEBUG(interpreter,
+ //if (do_print_stack) {
+ //debugBelch("\n-- BEGIN stack\n");
+ //printStack(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size,iSu);
+ //debugBelch("-- END stack\n\n");
+ //}
+ debugBelch("Sp = %p pc = %d ", Sp, bciPtr);
+ disInstr(bco,bciPtr);
+ if (0) { int i;
+ debugBelch("\n");
+ for (i = 8; i >= 0; i--) {
+ debugBelch("%d %p\n", i, (StgPtr)(*(Sp+i)));
+ }
+ debugBelch("\n");
+ }
+ //if (do_print_stack) checkStack(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size,iSu);
+ );
+
+ INTERP_TICK(it_insns);
+
+#ifdef INTERP_STATS
+ ASSERT( (int)instrs[bciPtr] >= 0 && (int)instrs[bciPtr] < 27 );
+ it_ofreq[ (int)instrs[bciPtr] ] ++;
+ it_oofreq[ it_lastopc ][ (int)instrs[bciPtr] ] ++;
+ it_lastopc = (int)instrs[bciPtr];
+#endif
+
+ switch (BCO_NEXT) {
+
+ case bci_STKCHECK: {
+ // Explicit stack check at the beginning of a function
+ // *only* (stack checks in case alternatives are
+ // propagated to the enclosing function).
+ int stk_words_reqd = BCO_NEXT + 1;
+ if (Sp - stk_words_reqd < SpLim) {
+ Sp -= 2;
+ Sp[1] = (W_)obj;
+ Sp[0] = (W_)&stg_apply_interp_info;
+ RETURN_TO_SCHEDULER(ThreadInterpret, StackOverflow);
+ } else {
+ goto nextInsn;
+ }
+ }
+
+ case bci_PUSH_L: {
+ int o1 = BCO_NEXT;
+ Sp[-1] = Sp[o1];
+ Sp--;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_LL: {
+ int o1 = BCO_NEXT;
+ int o2 = BCO_NEXT;
+ Sp[-1] = Sp[o1];
+ Sp[-2] = Sp[o2];
+ Sp -= 2;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_LLL: {
+ int o1 = BCO_NEXT;
+ int o2 = BCO_NEXT;
+ int o3 = BCO_NEXT;
+ Sp[-1] = Sp[o1];
+ Sp[-2] = Sp[o2];
+ Sp[-3] = Sp[o3];
+ Sp -= 3;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_G: {
+ int o1 = BCO_NEXT;
+ Sp[-1] = BCO_PTR(o1);
+ Sp -= 1;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_ALTS: {
+ int o_bco = BCO_NEXT;
+ Sp[-2] = (W_)&stg_ctoi_R1p_info;
+ Sp[-1] = BCO_PTR(o_bco);
+ Sp -= 2;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_ALTS_P: {
+ int o_bco = BCO_NEXT;
+ Sp[-2] = (W_)&stg_ctoi_R1unpt_info;
+ Sp[-1] = BCO_PTR(o_bco);
+ Sp -= 2;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_ALTS_N: {
+ int o_bco = BCO_NEXT;
+ Sp[-2] = (W_)&stg_ctoi_R1n_info;
+ Sp[-1] = BCO_PTR(o_bco);
+ Sp -= 2;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_ALTS_F: {
+ int o_bco = BCO_NEXT;
+ Sp[-2] = (W_)&stg_ctoi_F1_info;
+ Sp[-1] = BCO_PTR(o_bco);
+ Sp -= 2;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_ALTS_D: {
+ int o_bco = BCO_NEXT;
+ Sp[-2] = (W_)&stg_ctoi_D1_info;
+ Sp[-1] = BCO_PTR(o_bco);
+ Sp -= 2;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_ALTS_L: {
+ int o_bco = BCO_NEXT;
+ Sp[-2] = (W_)&stg_ctoi_L1_info;
+ Sp[-1] = BCO_PTR(o_bco);
+ Sp -= 2;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_ALTS_V: {
+ int o_bco = BCO_NEXT;
+ Sp[-2] = (W_)&stg_ctoi_V_info;
+ Sp[-1] = BCO_PTR(o_bco);
+ Sp -= 2;
+ goto nextInsn;
+ }
+
+ case bci_PUSH_APPLY_N:
+ Sp--; Sp[0] = (W_)&stg_ap_n_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_V:
+ Sp--; Sp[0] = (W_)&stg_ap_v_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_F:
+ Sp--; Sp[0] = (W_)&stg_ap_f_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_D:
+ Sp--; Sp[0] = (W_)&stg_ap_d_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_L:
+ Sp--; Sp[0] = (W_)&stg_ap_l_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_P:
+ Sp--; Sp[0] = (W_)&stg_ap_p_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_PP:
+ Sp--; Sp[0] = (W_)&stg_ap_pp_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_PPP:
+ Sp--; Sp[0] = (W_)&stg_ap_ppp_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_PPPP:
+ Sp--; Sp[0] = (W_)&stg_ap_pppp_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_PPPPP:
+ Sp--; Sp[0] = (W_)&stg_ap_ppppp_info;
+ goto nextInsn;
+ case bci_PUSH_APPLY_PPPPPP:
+ Sp--; Sp[0] = (W_)&stg_ap_pppppp_info;
+ goto nextInsn;
+
+ case bci_PUSH_UBX: {
+ int i;
+ int o_lits = BCO_NEXT;
+ int n_words = BCO_NEXT;
+ Sp -= n_words;
+ for (i = 0; i < n_words; i++) {
+ Sp[i] = (W_)BCO_LIT(o_lits+i);
+ }
+ goto nextInsn;
+ }
+
+ case bci_SLIDE: {
+ int n = BCO_NEXT;
+ int by = BCO_NEXT;
+ /* a_1, .. a_n, b_1, .. b_by, s => a_1, .. a_n, s */
+ while(--n >= 0) {
+ Sp[n+by] = Sp[n];
+ }
+ Sp += by;
+ INTERP_TICK(it_slides);
+ goto nextInsn;
+ }
+
+ case bci_ALLOC_AP: {
+ StgAP* ap;
+ int n_payload = BCO_NEXT;
+ ap = (StgAP*)allocate(AP_sizeW(n_payload));
+ Sp[-1] = (W_)ap;
+ ap->n_args = n_payload;
+ SET_HDR(ap, &stg_AP_info, CCS_SYSTEM/*ToDo*/)
+ Sp --;
+ goto nextInsn;
+ }
+
+ case bci_ALLOC_PAP: {
+ StgPAP* pap;
+ int arity = BCO_NEXT;
+ int n_payload = BCO_NEXT;
+ pap = (StgPAP*)allocate(PAP_sizeW(n_payload));
+ Sp[-1] = (W_)pap;
+ pap->n_args = n_payload;
+ pap->arity = arity;
+ SET_HDR(pap, &stg_PAP_info, CCS_SYSTEM/*ToDo*/)
+ Sp --;
+ goto nextInsn;
+ }
+
+ case bci_MKAP: {
+ int i;
+ int stkoff = BCO_NEXT;
+ int n_payload = BCO_NEXT;
+ StgAP* ap = (StgAP*)Sp[stkoff];
+ ASSERT((int)ap->n_args == n_payload);
+ ap->fun = (StgClosure*)Sp[0];
+
+ // The function should be a BCO, and its bitmap should
+ // cover the payload of the AP correctly.
+ ASSERT(get_itbl(ap->fun)->type == BCO
+ && BCO_BITMAP_SIZE(ap->fun) == ap->n_args);
+
+ for (i = 0; i < n_payload; i++)
+ ap->payload[i] = (StgClosure*)Sp[i+1];
+ Sp += n_payload+1;
+ IF_DEBUG(interpreter,
+ debugBelch("\tBuilt ");
+ printObj((StgClosure*)ap);
+ );
+ goto nextInsn;
+ }
+
+ case bci_MKPAP: {
+ int i;
+ int stkoff = BCO_NEXT;
+ int n_payload = BCO_NEXT;
+ StgPAP* pap = (StgPAP*)Sp[stkoff];
+ ASSERT((int)pap->n_args == n_payload);
+ pap->fun = (StgClosure*)Sp[0];
+
+ // The function should be a BCO
+ ASSERT(get_itbl(pap->fun)->type == BCO);
+
+ for (i = 0; i < n_payload; i++)
+ pap->payload[i] = (StgClosure*)Sp[i+1];
+ Sp += n_payload+1;
+ IF_DEBUG(interpreter,
+ debugBelch("\tBuilt ");
+ printObj((StgClosure*)pap);
+ );
+ goto nextInsn;
+ }
+
+ case bci_UNPACK: {
+ /* Unpack N ptr words from t.o.s constructor */
+ int i;
+ int n_words = BCO_NEXT;
+ StgClosure* con = (StgClosure*)Sp[0];
+ Sp -= n_words;
+ for (i = 0; i < n_words; i++) {
+ Sp[i] = (W_)con->payload[i];
+ }
+ goto nextInsn;
+ }
+
+ case bci_PACK: {
+ int i;
+ int o_itbl = BCO_NEXT;
+ int n_words = BCO_NEXT;
+ StgInfoTable* itbl = INFO_PTR_TO_STRUCT(BCO_ITBL(o_itbl));
+ int request = CONSTR_sizeW( itbl->layout.payload.ptrs,
+ itbl->layout.payload.nptrs );
+ StgClosure* con = (StgClosure*)allocate_NONUPD(request);
+ ASSERT( itbl->layout.payload.ptrs + itbl->layout.payload.nptrs > 0);
+ SET_HDR(con, BCO_ITBL(o_itbl), CCS_SYSTEM/*ToDo*/);
+ for (i = 0; i < n_words; i++) {
+ con->payload[i] = (StgClosure*)Sp[i];
+ }
+ Sp += n_words;
+ Sp --;
+ Sp[0] = (W_)con;
+ IF_DEBUG(interpreter,
+ debugBelch("\tBuilt ");
+ printObj((StgClosure*)con);
+ );
+ goto nextInsn;
+ }
+
+ case bci_TESTLT_P: {
+ unsigned int discr = BCO_NEXT;
+ int failto = BCO_NEXT;
+ StgClosure* con = (StgClosure*)Sp[0];
+ if (GET_TAG(con) >= discr) {
+ bciPtr = failto;
+ }
+ goto nextInsn;
+ }
+
+ case bci_TESTEQ_P: {
+ unsigned int discr = BCO_NEXT;
+ int failto = BCO_NEXT;
+ StgClosure* con = (StgClosure*)Sp[0];
+ if (GET_TAG(con) != discr) {
+ bciPtr = failto;
+ }
+ goto nextInsn;
+ }
+
+ case bci_TESTLT_I: {
+ // There should be an Int at Sp[1], and an info table at Sp[0].
+ int discr = BCO_NEXT;
+ int failto = BCO_NEXT;
+ I_ stackInt = (I_)Sp[1];
+ if (stackInt >= (I_)BCO_LIT(discr))
+ bciPtr = failto;
+ goto nextInsn;
+ }
+
+ case bci_TESTEQ_I: {
+ // There should be an Int at Sp[1], and an info table at Sp[0].
+ int discr = BCO_NEXT;
+ int failto = BCO_NEXT;
+ I_ stackInt = (I_)Sp[1];
+ if (stackInt != (I_)BCO_LIT(discr)) {
+ bciPtr = failto;
+ }
+ goto nextInsn;
+ }
+
+ case bci_TESTLT_D: {
+ // There should be a Double at Sp[1], and an info table at Sp[0].
+ int discr = BCO_NEXT;
+ int failto = BCO_NEXT;
+ StgDouble stackDbl, discrDbl;
+ stackDbl = PK_DBL( & Sp[1] );
+ discrDbl = PK_DBL( & BCO_LIT(discr) );
+ if (stackDbl >= discrDbl) {
+ bciPtr = failto;
+ }
+ goto nextInsn;
+ }
+
+ case bci_TESTEQ_D: {
+ // There should be a Double at Sp[1], and an info table at Sp[0].
+ int discr = BCO_NEXT;
+ int failto = BCO_NEXT;
+ StgDouble stackDbl, discrDbl;
+ stackDbl = PK_DBL( & Sp[1] );
+ discrDbl = PK_DBL( & BCO_LIT(discr) );
+ if (stackDbl != discrDbl) {
+ bciPtr = failto;
+ }
+ goto nextInsn;
+ }
+
+ case bci_TESTLT_F: {
+ // There should be a Float at Sp[1], and an info table at Sp[0].
+ int discr = BCO_NEXT;
+ int failto = BCO_NEXT;
+ StgFloat stackFlt, discrFlt;
+ stackFlt = PK_FLT( & Sp[1] );
+ discrFlt = PK_FLT( & BCO_LIT(discr) );
+ if (stackFlt >= discrFlt) {
+ bciPtr = failto;
+ }
+ goto nextInsn;
+ }
+
+ case bci_TESTEQ_F: {
+ // There should be a Float at Sp[1], and an info table at Sp[0].
+ int discr = BCO_NEXT;
+ int failto = BCO_NEXT;
+ StgFloat stackFlt, discrFlt;
+ stackFlt = PK_FLT( & Sp[1] );
+ discrFlt = PK_FLT( & BCO_LIT(discr) );
+ if (stackFlt != discrFlt) {
+ bciPtr = failto;
+ }
+ goto nextInsn;
+ }
+
+ // Control-flow ish things
+ case bci_ENTER:
+ // Context-switch check. We put it here to ensure that
+ // the interpreter has done at least *some* work before
+ // context switching: sometimes the scheduler can invoke
+ // the interpreter with context_switch == 1, particularly
+ // if the -C0 flag has been given on the cmd line.
+ if (context_switch) {
+ Sp--; Sp[0] = (W_)&stg_enter_info;
+ RETURN_TO_SCHEDULER(ThreadInterpret, ThreadYielding);
+ }
+ goto eval;
+
+ case bci_RETURN:
+ obj = (StgClosure *)Sp[0];
+ Sp++;
+ goto do_return;
+
+ case bci_RETURN_P:
+ Sp--;
+ Sp[0] = (W_)&stg_gc_unpt_r1_info;
+ goto do_return_unboxed;
+ case bci_RETURN_N:
+ Sp--;
+ Sp[0] = (W_)&stg_gc_unbx_r1_info;
+ goto do_return_unboxed;
+ case bci_RETURN_F:
+ Sp--;
+ Sp[0] = (W_)&stg_gc_f1_info;
+ goto do_return_unboxed;
+ case bci_RETURN_D:
+ Sp--;
+ Sp[0] = (W_)&stg_gc_d1_info;
+ goto do_return_unboxed;
+ case bci_RETURN_L:
+ Sp--;
+ Sp[0] = (W_)&stg_gc_l1_info;
+ goto do_return_unboxed;
+ case bci_RETURN_V:
+ Sp--;
+ Sp[0] = (W_)&stg_gc_void_info;
+ goto do_return_unboxed;
+
+ case bci_SWIZZLE: {
+ int stkoff = BCO_NEXT;
+ signed short n = (signed short)(BCO_NEXT);
+ Sp[stkoff] += (W_)n;
+ goto nextInsn;
+ }
+
+ case bci_CCALL: {
+ void *tok;
+ int stk_offset = BCO_NEXT;
+ int o_itbl = BCO_NEXT;
+ void(*marshall_fn)(void*) = (void (*)(void*))BCO_LIT(o_itbl);
+ int ret_dyn_size =
+ RET_DYN_BITMAP_SIZE + RET_DYN_NONPTR_REGS_SIZE
+ + sizeofW(StgRetDyn);
+
+#ifdef THREADED_RTS
+ // Threaded RTS:
+ // Arguments on the TSO stack are not good, because garbage
+ // collection might move the TSO as soon as we call
+ // suspendThread below.
+
+ W_ arguments[stk_offset];
+
+ memcpy(arguments, Sp, sizeof(W_) * stk_offset);
+#endif
+
+ // Restore the Haskell thread's current value of errno
+ errno = cap->r.rCurrentTSO->saved_errno;
+
+ // There are a bunch of non-ptr words on the stack (the
+ // ccall args, the ccall fun address and space for the
+ // result), which we need to cover with an info table
+ // since we might GC during this call.
+ //
+ // We know how many (non-ptr) words there are before the
+ // next valid stack frame: it is the stk_offset arg to the
+ // CCALL instruction. So we build a RET_DYN stack frame
+ // on the stack frame to describe this chunk of stack.
+ //
+ Sp -= ret_dyn_size;
+ ((StgRetDyn *)Sp)->liveness = NO_PTRS | N_NONPTRS(stk_offset);
+ ((StgRetDyn *)Sp)->info = (StgInfoTable *)&stg_gc_gen_info;
+
+ SAVE_STACK_POINTERS;
+ tok = suspendThread(&cap->r);
+
+#ifndef THREADED_RTS
+ // Careful:
+ // suspendThread might have shifted the stack
+ // around (stack squeezing), so we have to grab the real
+ // Sp out of the TSO to find the ccall args again.
+
+ marshall_fn ( (void*)(cap->r.rCurrentTSO->sp + ret_dyn_size) );
+#else
+ // Threaded RTS:
+ // We already made a copy of the arguments above.
+
+ marshall_fn ( arguments );
+#endif
+
+ // And restart the thread again, popping the RET_DYN frame.
+ cap = (Capability *)((void *)((unsigned char*)resumeThread(tok) - sizeof(StgFunTable)));
+ LOAD_STACK_POINTERS;
+ Sp += ret_dyn_size;
+
+ // Save the Haskell thread's current value of errno
+ cap->r.rCurrentTSO->saved_errno = errno;
+
+#ifdef THREADED_RTS
+ // Threaded RTS:
+ // Copy the "arguments", which might include a return value,
+ // back to the TSO stack. It would of course be enough to
+ // just copy the return value, but we don't know the offset.
+ memcpy(Sp, arguments, sizeof(W_) * stk_offset);
+#endif
+
+ goto nextInsn;
+ }
+
+ case bci_JMP: {
+ /* BCO_NEXT modifies bciPtr, so be conservative. */
+ int nextpc = BCO_NEXT;
+ bciPtr = nextpc;
+ goto nextInsn;
+ }
+
+ case bci_CASEFAIL:
+ barf("interpretBCO: hit a CASEFAIL");
+
+ // Errors
+ default:
+ barf("interpretBCO: unknown or unimplemented opcode");
+
+ } /* switch on opcode */
+ }
+ }
+
+ barf("interpretBCO: fell off end of the interpreter");
+}
diff --git a/rts/Interpreter.h b/rts/Interpreter.h
new file mode 100644
index 0000000000..d66e636084
--- /dev/null
+++ b/rts/Interpreter.h
@@ -0,0 +1,14 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2002.
+ *
+ * Prototypes for functions in Interpreter.c
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef INTERPRETER_H
+#define INTERPRETER_H
+
+// Run the byte-code interpreter for the current thread of the given
+// capability; returns the (possibly different) capability in effect when
+// the interpreter hands control back to the scheduler.
+extern Capability *interpretBCO (Capability* cap);
+
+#endif /* INTERPRETER_H */
diff --git a/rts/LdvProfile.c b/rts/LdvProfile.c
new file mode 100644
index 0000000000..19ebe426d3
--- /dev/null
+++ b/rts/LdvProfile.c
@@ -0,0 +1,342 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001
+ * Author: Sungwoo Park
+ *
+ * Lag/Drag/Void profiling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifdef PROFILING
+
+#include "Rts.h"
+#include "LdvProfile.h"
+#include "RtsFlags.h"
+#include "Profiling.h"
+#include "Stats.h"
+#include "Storage.h"
+#include "RtsUtils.h"
+#include "Schedule.h"
+
+/* --------------------------------------------------------------------------
+ * Fills in the slop when a *dynamic* closure changes its type.
+ * First calls LDV_recordDead() to declare the closure is dead, and then
+ * fills in the slop.
+ *
+ * Invoked when:
+ * 1) blackholing, UPD_BH_UPDATABLE() and UPD_BH_SINGLE_ENTRY (in
+ * includes/StgMacros.h), threadLazyBlackHole() and
+ * threadSqueezeStack() (in GC.c).
+ * 2) updating with indirection closures, updateWithIndirection()
+ * and updateWithPermIndirection() (in Storage.h).
+ *
+ * LDV_recordDead_FILL_SLOP_DYNAMIC() is not called on 'inherently used'
+ * closures such as TSO. It is not called on PAP because PAP is not updatable.
+ * ----------------------------------------------------------------------- */
+void
+LDV_recordDead_FILL_SLOP_DYNAMIC( StgClosure *p )
+{
+    nat size, i;
+
+#if defined(__GNUC__) && __GNUC__ < 3 && defined(DEBUG)
+#error Please use gcc 3.0+ to compile this file with DEBUG; gcc < 3.0 miscompiles it
+#endif
+
+    // era == 0 means LDV profiling is switched off, so there is nothing
+    // to record and the slop may be left untouched.
+    if (era > 0) {
+        // very like FILL_SLOP(), except that we call LDV_recordDead().
+        size = closure_sizeW(p);
+
+        LDV_recordDead((StgClosure *)(p), size);
+
+        // Zero every payload word beyond the thunk header.  The census
+        // walkers in this file skip slop by scanning for zero words, so
+        // the slop really must be 0-filled.
+        if (size > sizeofW(StgThunkHeader)) {
+            for (i = 0; i < size - sizeofW(StgThunkHeader); i++) {
+                ((StgThunk *)(p))->payload[i] = 0;
+            }
+        }
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * This function is called eventually on every object destroyed during
+ * a garbage collection, whether it is a major garbage collection or
+ * not. If c is an 'inherently used' closure, nothing happens. If c
+ * is an ordinary closure, LDV_recordDead() is called on c with its
+ * proper size which excludes the profiling header portion in the
+ * closure. Returns the size of the closure, including the profiling
+ * header portion, so that the caller can find the next closure.
+ * ----------------------------------------------------------------------- */
+STATIC_INLINE nat
+processHeapClosureForDead( StgClosure *c )
+{
+    nat size;
+    StgInfoTable *info;
+
+    info = get_itbl(c);
+
+    // Sanity-check the LDV word of any closure that has not been
+    // evacuated: its creation era must be valid (1..era), and it must
+    // either still be in the 'create' state or carry a valid last-use era.
+    if (info->type != EVACUATED) {
+        ASSERT(((LDVW(c) & LDV_CREATE_MASK) >> LDV_SHIFT) <= era &&
+               ((LDVW(c) & LDV_CREATE_MASK) >> LDV_SHIFT) > 0);
+        ASSERT(((LDVW(c) & LDV_STATE_MASK) == LDV_STATE_CREATE) ||
+               (
+                (LDVW(c) & LDV_LAST_MASK) <= era &&
+                (LDVW(c) & LDV_LAST_MASK) > 0
+                ));
+    }
+
+    if (info->type == EVACUATED) {
+        // The size of the evacuated closure is currently stored in
+        // the LDV field.  See SET_EVACUAEE_FOR_LDV() in
+        // includes/StgLdvProf.h.
+        return LDVW(c);
+    }
+
+    size = closure_sizeW(c);
+
+    switch (info->type) {
+        /*
+          'inherently used' cases: do nothing.
+        */
+    case TSO:
+    case MVAR:
+    case MUT_ARR_PTRS_CLEAN:
+    case MUT_ARR_PTRS_DIRTY:
+    case MUT_ARR_PTRS_FROZEN:
+    case MUT_ARR_PTRS_FROZEN0:
+    case ARR_WORDS:
+    case WEAK:
+    case MUT_VAR_CLEAN:
+    case MUT_VAR_DIRTY:
+    case BCO:
+    case STABLE_NAME:
+    case TVAR_WAIT_QUEUE:
+    case TVAR:
+    case TREC_HEADER:
+    case TREC_CHUNK:
+        return size;
+
+        /*
+          ordinary cases: call LDV_recordDead().
+        */
+    case THUNK:
+    case THUNK_1_0:
+    case THUNK_0_1:
+    case THUNK_SELECTOR:
+    case THUNK_2_0:
+    case THUNK_1_1:
+    case THUNK_0_2:
+    case AP:
+    case PAP:
+    case AP_STACK:
+    case CONSTR:
+    case CONSTR_1_0:
+    case CONSTR_0_1:
+    case CONSTR_2_0:
+    case CONSTR_1_1:
+    case CONSTR_0_2:
+    case FUN:
+    case FUN_1_0:
+    case FUN_0_1:
+    case FUN_2_0:
+    case FUN_1_1:
+    case FUN_0_2:
+    case BLACKHOLE:
+    case SE_BLACKHOLE:
+    case CAF_BLACKHOLE:
+    case SE_CAF_BLACKHOLE:
+    case IND_PERM:
+    case IND_OLDGEN_PERM:
+        /*
+          'Ignore' cases
+        */
+        // Why can we ignore IND/IND_OLDGEN closures? We assume that
+        // any census is preceded by a major garbage collection, which
+        // IND/IND_OLDGEN closures cannot survive. Therefore, it is no
+        // use considering IND/IND_OLDGEN closures in the meanwhile
+        // because they will perish before the next census at any
+        // rate.
+    case IND:
+    case IND_OLDGEN:
+        // Found a dead closure: record its size
+        LDV_recordDead(c, size);
+        return size;
+
+        /*
+          Error case
+        */
+        // static objects
+    case IND_STATIC:
+    case CONSTR_STATIC:
+    case FUN_STATIC:
+    case THUNK_STATIC:
+    case CONSTR_INTLIKE:
+    case CONSTR_CHARLIKE:
+    case CONSTR_NOCAF_STATIC:
+        // stack objects
+    case UPDATE_FRAME:
+    case CATCH_FRAME:
+    case STOP_FRAME:
+    case RET_DYN:
+    case RET_BCO:
+    case RET_SMALL:
+    case RET_VEC_SMALL:
+    case RET_BIG:
+    case RET_VEC_BIG:
+        // others
+    case BLOCKED_FETCH:
+    case FETCH_ME:
+    case FETCH_ME_BQ:
+    case RBH:
+    case REMOTE_REF:
+    case INVALID_OBJECT:
+    default:
+        barf("Invalid object in processHeapClosureForDead(): %d", info->type);
+        return 0;
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Calls processHeapClosureForDead() on every *dead* closures in the
+ * heap blocks starting at bd.
+ * ----------------------------------------------------------------------- */
+static void
+processHeapForDead( bdescr *bd )
+{
+    StgPtr scan;
+
+    // Walk the whole block chain; within each block, step from closure to
+    // closure, letting processHeapClosureForDead() account for each one
+    // and tell us its size.
+    for (; bd != NULL; bd = bd->link) {
+        scan = bd->start;
+        while (scan < bd->free) {
+            scan += processHeapClosureForDead((StgClosure *)scan);
+            // step over any 0-filled slop words between closures
+            while (scan < bd->free && !*scan)
+                scan++;
+        }
+        ASSERT(scan == bd->free);
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Calls processHeapClosureForDead() on every *dead* closures in the nursery.
+ * ----------------------------------------------------------------------- */
+static void
+processNurseryForDead( void )
+{
+    StgPtr p, bdLimit;
+    bdescr *bd;
+
+    bd = MainCapability.r.rNursery->blocks;
+    // Stop at the first block that contains no allocated data
+    // (bd->start == bd->free); presumably the remaining nursery blocks
+    // are empty too -- TODO confirm against the allocator.
+    while (bd->start < bd->free) {
+        p = bd->start;
+        // Scan is bounded both by bd->free and by one block's worth of
+        // words from bd->start.
+        bdLimit = bd->start + BLOCK_SIZE_W;
+        while (p < bd->free && p < bdLimit) {
+            p += processHeapClosureForDead((StgClosure *)p);
+            while (p < bd->free && p < bdLimit && !*p) // skip slop
+                p++;
+        }
+        bd = bd->link;
+        if (bd == NULL)
+            break;
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Calls processHeapClosureForDead() on every *dead* closures in the
+ * small object pool.
+ * ----------------------------------------------------------------------- */
+static void
+processSmallObjectPoolForDead( void )
+{
+    bdescr *blk;
+    StgPtr scan;
+
+    blk = small_alloc_list;
+    if (blk == NULL)
+        return;
+
+    // The first block is the one currently being allocated into, so its
+    // live data ends at alloc_Hp rather than at blk->free.
+    scan = blk->start;
+    while (scan < alloc_Hp) {
+        scan += processHeapClosureForDead((StgClosure *)scan);
+        while (scan < alloc_Hp && !*scan)       // skip slop
+            scan++;
+    }
+    ASSERT(scan == alloc_Hp);
+
+    // Subsequent blocks are scanned up to their own free pointer.
+    for (blk = blk->link; blk != NULL; blk = blk->link) {
+        scan = blk->start;
+        while (scan < blk->free) {
+            scan += processHeapClosureForDead((StgClosure *)scan);
+            while (scan < blk->free && !*scan)  // skip slop
+                scan++;
+        }
+        ASSERT(scan == blk->free);
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Calls processHeapClosureForDead() on every *dead* closures in the closure
+ * chain.
+ * ----------------------------------------------------------------------- */
+static void
+processChainForDead( bdescr *bd )
+{
+    // Any object still in the chain is dead!  One closure sits at the
+    // start of each block, so a single call per block suffices.
+    for (; bd != NULL; bd = bd->link)
+        processHeapClosureForDead((StgClosure *)bd->start);
+}
+
+/* --------------------------------------------------------------------------
+ * Starts a census for *dead* closures, calling
+ * processHeapClosureForDead() on every closure which died in the
+ * current garbage collection.  This function is called from a garbage
+ * collector right before tidying up, when all dead closures are still
+ * stored in the heap and easy to identify.  Generations 0 through N
+ * have just been garbage collected.
+ * ----------------------------------------------------------------------- */
+void
+LdvCensusForDead( nat N )
+{
+    nat g, s;
+
+    // era == 0 means that LDV profiling is currently turned off.
+    if (era == 0)
+        return;
+
+    if (RtsFlags.GcFlags.generations == 1) {
+        //
+        // Todo: support LDV for two-space garbage collection.
+        //
+        barf("Lag/Drag/Void profiling not supported with -G1");
+    } else {
+        for (g = 0; g <= N; g++)
+            for (s = 0; s < generations[g].n_steps; s++) {
+                // Generation 0, step 0 is special: its data lives in the
+                // small object pool and the nursery rather than in
+                // old_blocks, so it gets its own walkers.
+                if (g == 0 && s == 0) {
+                    processSmallObjectPoolForDead();
+                    processNurseryForDead();
+                    processChainForDead(generations[g].steps[s].large_objects);
+                } else{
+                    processHeapForDead(generations[g].steps[s].old_blocks);
+                    processChainForDead(generations[g].steps[s].large_objects);
+                }
+            }
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * Regard any closure in the current heap as dead or moribund and update
+ * LDV statistics accordingly.
+ * Called from shutdownHaskell() in RtsStartup.c.
+ * (NB: this function does not itself reset ldvTime/era to 0 -- verify caller.)
+ * ----------------------------------------------------------------------- */
+void
+LdvCensusKillAll( void )
+{
+    // One final dead-object census over every generation, so everything
+    // still in the heap at shutdown is accounted for as dead/moribund.
+    LdvCensusForDead(RtsFlags.GcFlags.generations - 1);
+}
+
+#endif /* PROFILING */
diff --git a/rts/LdvProfile.h b/rts/LdvProfile.h
new file mode 100644
index 0000000000..d85b95cd6a
--- /dev/null
+++ b/rts/LdvProfile.h
@@ -0,0 +1,42 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001
+ * Author: Sungwoo Park
+ *
+ * Lag/Drag/Void profiling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef LDVPROFILE_H
+#define LDVPROFILE_H
+
+#ifdef PROFILING
+
+#include "ProfHeap.h"
+
+extern void LDV_recordDead_FILL_SLOP_DYNAMIC( StgClosure *p );
+extern void LdvCensusForDead ( nat );
+extern void LdvCensusKillAll ( void );
+
+// Creates a 0-filled slop of size 'howManyBackwards' backwards from the
+// address 'from'.
+//
+// Invoked when:
+// 1) Hp is incremented and exceeds HpLim (in Updates.hc).
+// 2) copypart() is called (in GC.c).
+#define LDV_FILL_SLOP(from, howManyBackwards) \
+ if (era > 0) { \
+ int i; \
+ for (i = 0;i < (howManyBackwards); i++) \
+ ((StgWord *)(from))[-i] = 0; \
+ }
+
+// Informs the LDV profiler that closure c has just been evacuated.
+// Evacuated objects are no longer needed, so we just store its original size in
+// the LDV field; processHeapClosureForDead() reads it back via LDVW() for
+// EVACUATED closures.  (The spelling "EVACUAEE" is historical; callers use
+// this exact name, so it must not be "corrected".)
+#define SET_EVACUAEE_FOR_LDV(c, size) \
+    LDVW((c)) = (size)
+
+#endif /* PROFILING */
+
+#endif /* LDVPROFILE_H */
diff --git a/rts/Linker.c b/rts/Linker.c
new file mode 100644
index 0000000000..92d0106def
--- /dev/null
+++ b/rts/Linker.c
@@ -0,0 +1,4315 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2000-2004
+ *
+ * RTS Object Linker
+ *
+ * ---------------------------------------------------------------------------*/
+
+#if 0
+#include "PosixSource.h"
+#endif
+
+/* Linux needs _GNU_SOURCE to get RTLD_DEFAULT from <dlfcn.h> and
+ MREMAP_MAYMOVE from <sys/mman.h>.
+ */
+#ifdef __linux__
+#define _GNU_SOURCE
+#endif
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "HsFFI.h"
+#include "Hash.h"
+#include "Linker.h"
+#include "LinkerInternals.h"
+#include "RtsUtils.h"
+#include "Schedule.h"
+#include "Storage.h"
+#include "Sparks.h"
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+
+#if defined(HAVE_DLFCN_H)
+#include <dlfcn.h>
+#endif
+
+#if defined(cygwin32_HOST_OS)
+#ifdef HAVE_DIRENT_H
+#include <dirent.h>
+#endif
+
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+#include <regex.h>
+#include <sys/fcntl.h>
+#include <sys/termios.h>
+#include <sys/utime.h>
+#include <sys/utsname.h>
+#include <sys/wait.h>
+#endif
+
+#if defined(ia64_HOST_ARCH) || defined(openbsd_HOST_OS) || defined(linux_HOST_OS) || defined(freebsd_HOST_OS)
+#define USE_MMAP
+#include <fcntl.h>
+#include <sys/mman.h>
+
+#if defined(openbsd_HOST_OS) || defined(linux_HOST_OS) || defined(freebsd_HOST_OS)
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#endif
+
+#endif
+
+#if defined(linux_HOST_OS) || defined(solaris2_HOST_OS) || defined(freebsd_HOST_OS) || defined(netbsd_HOST_OS) || defined(openbsd_HOST_OS)
+# define OBJFORMAT_ELF
+#elif defined(cygwin32_HOST_OS) || defined (mingw32_HOST_OS)
+# define OBJFORMAT_PEi386
+# include <windows.h>
+# include <math.h>
+#elif defined(darwin_HOST_OS)
+# define OBJFORMAT_MACHO
+# include <mach-o/loader.h>
+# include <mach-o/nlist.h>
+# include <mach-o/reloc.h>
+# include <mach-o/dyld.h>
+#if defined(powerpc_HOST_ARCH)
+# include <mach-o/ppc/reloc.h>
+#endif
+#endif
+
+/* Hash table mapping symbol names to Symbol */
+static /*Str*/HashTable *symhash;
+
+/* List of currently loaded objects */
+ObjectCode *objects = NULL; /* initially empty */
+
+#if defined(OBJFORMAT_ELF)
+static int ocVerifyImage_ELF ( ObjectCode* oc );
+static int ocGetNames_ELF ( ObjectCode* oc );
+static int ocResolve_ELF ( ObjectCode* oc );
+#if defined(powerpc_HOST_ARCH)
+static int ocAllocateJumpIslands_ELF ( ObjectCode* oc );
+#endif
+#elif defined(OBJFORMAT_PEi386)
+static int ocVerifyImage_PEi386 ( ObjectCode* oc );
+static int ocGetNames_PEi386 ( ObjectCode* oc );
+static int ocResolve_PEi386 ( ObjectCode* oc );
+#elif defined(OBJFORMAT_MACHO)
+static int ocVerifyImage_MachO ( ObjectCode* oc );
+static int ocGetNames_MachO ( ObjectCode* oc );
+static int ocResolve_MachO ( ObjectCode* oc );
+
+static int machoGetMisalignment( FILE * );
+#ifdef powerpc_HOST_ARCH
+static int ocAllocateJumpIslands_MachO ( ObjectCode* oc );
+static void machoInitSymbolsWithoutUnderscore( void );
+#endif
+#endif
+
+#if defined(x86_64_HOST_ARCH)
+static void*x86_64_high_symbol( char *lbl, void *addr );
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Built-in symbols from the RTS
+ */
+
+// One entry in the table of built-in RTS symbols: maps a symbol name, as
+// it appears in object code, to its address inside the running RTS.
+typedef struct _RtsSymbolVal {
+    char *lbl;   /* symbol name as written in object code */
+    void *addr;  /* address the name resolves to in this process */
+} RtsSymbolVal;
+
+
+#if !defined(PAR)
+#define Maybe_Stable_Names SymX(mkWeakzh_fast) \
+ SymX(makeStableNamezh_fast) \
+ SymX(finalizzeWeakzh_fast)
+#else
+/* These are not available in GUM!!! -- HWL */
+#define Maybe_Stable_Names
+#endif
+
+#if !defined (mingw32_HOST_OS)
+#define RTS_POSIX_ONLY_SYMBOLS \
+ SymX(signal_handlers) \
+ SymX(stg_sig_install) \
+ Sym(nocldstop)
+#endif
+
+#if defined (cygwin32_HOST_OS)
+#define RTS_MINGW_ONLY_SYMBOLS /**/
+/* Don't have the ability to read import libs / archives, so
+ * we have to stupidly list a lot of what libcygwin.a
+ * exports; sigh.
+ */
+#define RTS_CYGWIN_ONLY_SYMBOLS \
+ SymX(regfree) \
+ SymX(regexec) \
+ SymX(regerror) \
+ SymX(regcomp) \
+ SymX(__errno) \
+ SymX(access) \
+ SymX(chmod) \
+ SymX(chdir) \
+ SymX(close) \
+ SymX(creat) \
+ SymX(dup) \
+ SymX(dup2) \
+ SymX(fstat) \
+ SymX(fcntl) \
+ SymX(getcwd) \
+ SymX(getenv) \
+ SymX(lseek) \
+ SymX(open) \
+ SymX(fpathconf) \
+ SymX(pathconf) \
+ SymX(stat) \
+ SymX(pow) \
+ SymX(tanh) \
+ SymX(cosh) \
+ SymX(sinh) \
+ SymX(atan) \
+ SymX(acos) \
+ SymX(asin) \
+ SymX(tan) \
+ SymX(cos) \
+ SymX(sin) \
+ SymX(exp) \
+ SymX(log) \
+ SymX(sqrt) \
+ SymX(localtime_r) \
+ SymX(gmtime_r) \
+ SymX(mktime) \
+ Sym(_imp___tzname) \
+ SymX(gettimeofday) \
+ SymX(timezone) \
+ SymX(tcgetattr) \
+ SymX(tcsetattr) \
+ SymX(memcpy) \
+ SymX(memmove) \
+ SymX(realloc) \
+ SymX(malloc) \
+ SymX(free) \
+ SymX(fork) \
+ SymX(lstat) \
+ SymX(isatty) \
+ SymX(mkdir) \
+ SymX(opendir) \
+ SymX(readdir) \
+ SymX(rewinddir) \
+ SymX(closedir) \
+ SymX(link) \
+ SymX(mkfifo) \
+ SymX(pipe) \
+ SymX(read) \
+ SymX(rename) \
+ SymX(rmdir) \
+ SymX(select) \
+ SymX(system) \
+ SymX(write) \
+ SymX(strcmp) \
+ SymX(strcpy) \
+ SymX(strncpy) \
+ SymX(strerror) \
+ SymX(sigaddset) \
+ SymX(sigemptyset) \
+ SymX(sigprocmask) \
+ SymX(umask) \
+ SymX(uname) \
+ SymX(unlink) \
+ SymX(utime) \
+ SymX(waitpid)
+
+#elif !defined(mingw32_HOST_OS)
+#define RTS_MINGW_ONLY_SYMBOLS /**/
+#define RTS_CYGWIN_ONLY_SYMBOLS /**/
+#else /* defined(mingw32_HOST_OS) */
+#define RTS_POSIX_ONLY_SYMBOLS /**/
+#define RTS_CYGWIN_ONLY_SYMBOLS /**/
+
+/* Extra syms gen'ed by mingw-2's gcc-3.2: */
+#if __GNUC__>=3
+#define RTS_MINGW_EXTRA_SYMS \
+ Sym(_imp____mb_cur_max) \
+ Sym(_imp___pctype)
+#else
+#define RTS_MINGW_EXTRA_SYMS
+#endif
+
+/* These are statically linked from the mingw libraries into the ghc
+ executable, so we have to employ this hack. */
+#define RTS_MINGW_ONLY_SYMBOLS \
+ SymX(asyncReadzh_fast) \
+ SymX(asyncWritezh_fast) \
+ SymX(asyncDoProczh_fast) \
+ SymX(memset) \
+ SymX(inet_ntoa) \
+ SymX(inet_addr) \
+ SymX(htonl) \
+ SymX(recvfrom) \
+ SymX(listen) \
+ SymX(bind) \
+ SymX(shutdown) \
+ SymX(connect) \
+ SymX(htons) \
+ SymX(ntohs) \
+ SymX(getservbyname) \
+ SymX(getservbyport) \
+ SymX(getprotobynumber) \
+ SymX(getprotobyname) \
+ SymX(gethostbyname) \
+ SymX(gethostbyaddr) \
+ SymX(gethostname) \
+ SymX(strcpy) \
+ SymX(strncpy) \
+ SymX(abort) \
+ Sym(_alloca) \
+ Sym(isxdigit) \
+ Sym(isupper) \
+ Sym(ispunct) \
+ Sym(islower) \
+ Sym(isspace) \
+ Sym(isprint) \
+ Sym(isdigit) \
+ Sym(iscntrl) \
+ Sym(isalpha) \
+ Sym(isalnum) \
+ SymX(strcmp) \
+ SymX(memmove) \
+ SymX(realloc) \
+ SymX(malloc) \
+ SymX(pow) \
+ SymX(tanh) \
+ SymX(cosh) \
+ SymX(sinh) \
+ SymX(atan) \
+ SymX(acos) \
+ SymX(asin) \
+ SymX(tan) \
+ SymX(cos) \
+ SymX(sin) \
+ SymX(exp) \
+ SymX(log) \
+ SymX(sqrt) \
+ SymX(powf) \
+ SymX(tanhf) \
+ SymX(coshf) \
+ SymX(sinhf) \
+ SymX(atanf) \
+ SymX(acosf) \
+ SymX(asinf) \
+ SymX(tanf) \
+ SymX(cosf) \
+ SymX(sinf) \
+ SymX(expf) \
+ SymX(logf) \
+ SymX(sqrtf) \
+ SymX(memcpy) \
+ SymX(rts_InstallConsoleEvent) \
+ SymX(rts_ConsoleHandlerDone) \
+ Sym(mktime) \
+ Sym(_imp___timezone) \
+ Sym(_imp___tzname) \
+ Sym(_imp___iob) \
+ Sym(_imp___osver) \
+ Sym(localtime) \
+ Sym(gmtime) \
+ Sym(opendir) \
+ Sym(readdir) \
+ Sym(rewinddir) \
+ RTS_MINGW_EXTRA_SYMS \
+ Sym(closedir)
+#endif
+
+#if defined(darwin_TARGET_OS) && HAVE_PRINTF_LDBLSTUB
+#define RTS_DARWIN_ONLY_SYMBOLS \
+ Sym(asprintf$LDBLStub) \
+ Sym(err$LDBLStub) \
+ Sym(errc$LDBLStub) \
+ Sym(errx$LDBLStub) \
+ Sym(fprintf$LDBLStub) \
+ Sym(fscanf$LDBLStub) \
+ Sym(fwprintf$LDBLStub) \
+ Sym(fwscanf$LDBLStub) \
+ Sym(printf$LDBLStub) \
+ Sym(scanf$LDBLStub) \
+ Sym(snprintf$LDBLStub) \
+ Sym(sprintf$LDBLStub) \
+ Sym(sscanf$LDBLStub) \
+ Sym(strtold$LDBLStub) \
+ Sym(swprintf$LDBLStub) \
+ Sym(swscanf$LDBLStub) \
+ Sym(syslog$LDBLStub) \
+ Sym(vasprintf$LDBLStub) \
+ Sym(verr$LDBLStub) \
+ Sym(verrc$LDBLStub) \
+ Sym(verrx$LDBLStub) \
+ Sym(vfprintf$LDBLStub) \
+ Sym(vfscanf$LDBLStub) \
+ Sym(vfwprintf$LDBLStub) \
+ Sym(vfwscanf$LDBLStub) \
+ Sym(vprintf$LDBLStub) \
+ Sym(vscanf$LDBLStub) \
+ Sym(vsnprintf$LDBLStub) \
+ Sym(vsprintf$LDBLStub) \
+ Sym(vsscanf$LDBLStub) \
+ Sym(vswprintf$LDBLStub) \
+ Sym(vswscanf$LDBLStub) \
+ Sym(vsyslog$LDBLStub) \
+ Sym(vwarn$LDBLStub) \
+ Sym(vwarnc$LDBLStub) \
+ Sym(vwarnx$LDBLStub) \
+ Sym(vwprintf$LDBLStub) \
+ Sym(vwscanf$LDBLStub) \
+ Sym(warn$LDBLStub) \
+ Sym(warnc$LDBLStub) \
+ Sym(warnx$LDBLStub) \
+ Sym(wcstold$LDBLStub) \
+ Sym(wprintf$LDBLStub) \
+ Sym(wscanf$LDBLStub)
+#else
+#define RTS_DARWIN_ONLY_SYMBOLS
+#endif
+
+#ifndef SMP
+# define MAIN_CAP_SYM SymX(MainCapability)
+#else
+# define MAIN_CAP_SYM
+#endif
+
+#if !defined(mingw32_HOST_OS)
+#define RTS_USER_SIGNALS_SYMBOLS \
+ SymX(setIOManagerPipe)
+#else
+#define RTS_USER_SIGNALS_SYMBOLS /* nothing */
+#endif
+
+#ifdef TABLES_NEXT_TO_CODE
+#define RTS_RET_SYMBOLS /* nothing */
+#else
+#define RTS_RET_SYMBOLS \
+ SymX(stg_enter_ret) \
+ SymX(stg_gc_fun_ret) \
+ SymX(stg_ap_v_ret) \
+ SymX(stg_ap_f_ret) \
+ SymX(stg_ap_d_ret) \
+ SymX(stg_ap_l_ret) \
+ SymX(stg_ap_n_ret) \
+ SymX(stg_ap_p_ret) \
+ SymX(stg_ap_pv_ret) \
+ SymX(stg_ap_pp_ret) \
+ SymX(stg_ap_ppv_ret) \
+ SymX(stg_ap_ppp_ret) \
+ SymX(stg_ap_pppv_ret) \
+ SymX(stg_ap_pppp_ret) \
+ SymX(stg_ap_ppppp_ret) \
+ SymX(stg_ap_pppppp_ret)
+#endif
+
+#define RTS_SYMBOLS \
+ Maybe_Stable_Names \
+ Sym(StgReturn) \
+ SymX(stg_enter_info) \
+ SymX(stg_gc_void_info) \
+ SymX(__stg_gc_enter_1) \
+ SymX(stg_gc_noregs) \
+ SymX(stg_gc_unpt_r1_info) \
+ SymX(stg_gc_unpt_r1) \
+ SymX(stg_gc_unbx_r1_info) \
+ SymX(stg_gc_unbx_r1) \
+ SymX(stg_gc_f1_info) \
+ SymX(stg_gc_f1) \
+ SymX(stg_gc_d1_info) \
+ SymX(stg_gc_d1) \
+ SymX(stg_gc_l1_info) \
+ SymX(stg_gc_l1) \
+ SymX(__stg_gc_fun) \
+ SymX(stg_gc_fun_info) \
+ SymX(stg_gc_gen) \
+ SymX(stg_gc_gen_info) \
+ SymX(stg_gc_gen_hp) \
+ SymX(stg_gc_ut) \
+ SymX(stg_gen_yield) \
+ SymX(stg_yield_noregs) \
+ SymX(stg_yield_to_interpreter) \
+ SymX(stg_gen_block) \
+ SymX(stg_block_noregs) \
+ SymX(stg_block_1) \
+ SymX(stg_block_takemvar) \
+ SymX(stg_block_putmvar) \
+ SymX(stg_seq_frame_info) \
+ MAIN_CAP_SYM \
+ SymX(MallocFailHook) \
+ SymX(OnExitHook) \
+ SymX(OutOfHeapHook) \
+ SymX(StackOverflowHook) \
+ SymX(__encodeDouble) \
+ SymX(__encodeFloat) \
+ SymX(addDLL) \
+ SymX(__gmpn_gcd_1) \
+ SymX(__gmpz_cmp) \
+ SymX(__gmpz_cmp_si) \
+ SymX(__gmpz_cmp_ui) \
+ SymX(__gmpz_get_si) \
+ SymX(__gmpz_get_ui) \
+ SymX(__int_encodeDouble) \
+ SymX(__int_encodeFloat) \
+ SymX(andIntegerzh_fast) \
+ SymX(atomicallyzh_fast) \
+ SymX(barf) \
+ SymX(debugBelch) \
+ SymX(errorBelch) \
+ SymX(blockAsyncExceptionszh_fast) \
+ SymX(catchzh_fast) \
+ SymX(catchRetryzh_fast) \
+ SymX(catchSTMzh_fast) \
+ SymX(closure_flags) \
+ SymX(cmp_thread) \
+ SymX(cmpIntegerzh_fast) \
+ SymX(cmpIntegerIntzh_fast) \
+ SymX(complementIntegerzh_fast) \
+ SymX(createAdjustor) \
+ SymX(decodeDoublezh_fast) \
+ SymX(decodeFloatzh_fast) \
+ SymX(defaultsHook) \
+ SymX(delayzh_fast) \
+ SymX(deRefWeakzh_fast) \
+ SymX(deRefStablePtrzh_fast) \
+ SymX(dirty_MUT_VAR) \
+ SymX(divExactIntegerzh_fast) \
+ SymX(divModIntegerzh_fast) \
+ SymX(forkzh_fast) \
+ SymX(forkOnzh_fast) \
+ SymX(forkProcess) \
+ SymX(forkOS_createThread) \
+ SymX(freeHaskellFunctionPtr) \
+ SymX(freeStablePtr) \
+ SymX(gcdIntegerzh_fast) \
+ SymX(gcdIntegerIntzh_fast) \
+ SymX(gcdIntzh_fast) \
+ SymX(genSymZh) \
+ SymX(genericRaise) \
+ SymX(getProgArgv) \
+ SymX(getStablePtr) \
+ SymX(hs_init) \
+ SymX(hs_exit) \
+ SymX(hs_set_argv) \
+ SymX(hs_add_root) \
+ SymX(hs_perform_gc) \
+ SymX(hs_free_stable_ptr) \
+ SymX(hs_free_fun_ptr) \
+ SymX(initLinker) \
+ SymX(int2Integerzh_fast) \
+ SymX(integer2Intzh_fast) \
+ SymX(integer2Wordzh_fast) \
+ SymX(isCurrentThreadBoundzh_fast) \
+ SymX(isDoubleDenormalized) \
+ SymX(isDoubleInfinite) \
+ SymX(isDoubleNaN) \
+ SymX(isDoubleNegativeZero) \
+ SymX(isEmptyMVarzh_fast) \
+ SymX(isFloatDenormalized) \
+ SymX(isFloatInfinite) \
+ SymX(isFloatNaN) \
+ SymX(isFloatNegativeZero) \
+ SymX(killThreadzh_fast) \
+ SymX(loadObj) \
+ SymX(lookupSymbol) \
+ SymX(makeStablePtrzh_fast) \
+ SymX(minusIntegerzh_fast) \
+ SymX(mkApUpd0zh_fast) \
+ SymX(myThreadIdzh_fast) \
+ SymX(labelThreadzh_fast) \
+ SymX(newArrayzh_fast) \
+ SymX(newBCOzh_fast) \
+ SymX(newByteArrayzh_fast) \
+ SymX_redirect(newCAF, newDynCAF) \
+ SymX(newMVarzh_fast) \
+ SymX(newMutVarzh_fast) \
+ SymX(newTVarzh_fast) \
+ SymX(atomicModifyMutVarzh_fast) \
+ SymX(newPinnedByteArrayzh_fast) \
+ SymX(newSpark) \
+ SymX(orIntegerzh_fast) \
+ SymX(performGC) \
+ SymX(performMajorGC) \
+ SymX(plusIntegerzh_fast) \
+ SymX(prog_argc) \
+ SymX(prog_argv) \
+ SymX(putMVarzh_fast) \
+ SymX(quotIntegerzh_fast) \
+ SymX(quotRemIntegerzh_fast) \
+ SymX(raisezh_fast) \
+ SymX(raiseIOzh_fast) \
+ SymX(readTVarzh_fast) \
+ SymX(remIntegerzh_fast) \
+ SymX(resetNonBlockingFd) \
+ SymX(resumeThread) \
+ SymX(resolveObjs) \
+ SymX(retryzh_fast) \
+ SymX(rts_apply) \
+ SymX(rts_checkSchedStatus) \
+ SymX(rts_eval) \
+ SymX(rts_evalIO) \
+ SymX(rts_evalLazyIO) \
+ SymX(rts_evalStableIO) \
+ SymX(rts_eval_) \
+ SymX(rts_getBool) \
+ SymX(rts_getChar) \
+ SymX(rts_getDouble) \
+ SymX(rts_getFloat) \
+ SymX(rts_getInt) \
+ SymX(rts_getInt32) \
+ SymX(rts_getPtr) \
+ SymX(rts_getFunPtr) \
+ SymX(rts_getStablePtr) \
+ SymX(rts_getThreadId) \
+ SymX(rts_getWord) \
+ SymX(rts_getWord32) \
+ SymX(rts_lock) \
+ SymX(rts_mkBool) \
+ SymX(rts_mkChar) \
+ SymX(rts_mkDouble) \
+ SymX(rts_mkFloat) \
+ SymX(rts_mkInt) \
+ SymX(rts_mkInt16) \
+ SymX(rts_mkInt32) \
+ SymX(rts_mkInt64) \
+ SymX(rts_mkInt8) \
+ SymX(rts_mkPtr) \
+ SymX(rts_mkFunPtr) \
+ SymX(rts_mkStablePtr) \
+ SymX(rts_mkString) \
+ SymX(rts_mkWord) \
+ SymX(rts_mkWord16) \
+ SymX(rts_mkWord32) \
+ SymX(rts_mkWord64) \
+ SymX(rts_mkWord8) \
+ SymX(rts_unlock) \
+ SymX(rtsSupportsBoundThreads) \
+ SymX(__hscore_get_saved_termios) \
+ SymX(__hscore_set_saved_termios) \
+ SymX(setProgArgv) \
+ SymX(startupHaskell) \
+ SymX(shutdownHaskell) \
+ SymX(shutdownHaskellAndExit) \
+ SymX(stable_ptr_table) \
+ SymX(stackOverflow) \
+ SymX(stg_CAF_BLACKHOLE_info) \
+ SymX(awakenBlockedQueue) \
+ SymX(stg_CHARLIKE_closure) \
+ SymX(stg_EMPTY_MVAR_info) \
+ SymX(stg_IND_STATIC_info) \
+ SymX(stg_INTLIKE_closure) \
+ SymX(stg_MUT_ARR_PTRS_DIRTY_info) \
+ SymX(stg_MUT_ARR_PTRS_FROZEN_info) \
+ SymX(stg_MUT_ARR_PTRS_FROZEN0_info) \
+ SymX(stg_WEAK_info) \
+ SymX(stg_ap_v_info) \
+ SymX(stg_ap_f_info) \
+ SymX(stg_ap_d_info) \
+ SymX(stg_ap_l_info) \
+ SymX(stg_ap_n_info) \
+ SymX(stg_ap_p_info) \
+ SymX(stg_ap_pv_info) \
+ SymX(stg_ap_pp_info) \
+ SymX(stg_ap_ppv_info) \
+ SymX(stg_ap_ppp_info) \
+ SymX(stg_ap_pppv_info) \
+ SymX(stg_ap_pppp_info) \
+ SymX(stg_ap_ppppp_info) \
+ SymX(stg_ap_pppppp_info) \
+ SymX(stg_ap_0_fast) \
+ SymX(stg_ap_v_fast) \
+ SymX(stg_ap_f_fast) \
+ SymX(stg_ap_d_fast) \
+ SymX(stg_ap_l_fast) \
+ SymX(stg_ap_n_fast) \
+ SymX(stg_ap_p_fast) \
+ SymX(stg_ap_pv_fast) \
+ SymX(stg_ap_pp_fast) \
+ SymX(stg_ap_ppv_fast) \
+ SymX(stg_ap_ppp_fast) \
+ SymX(stg_ap_pppv_fast) \
+ SymX(stg_ap_pppp_fast) \
+ SymX(stg_ap_ppppp_fast) \
+ SymX(stg_ap_pppppp_fast) \
+ SymX(stg_ap_1_upd_info) \
+ SymX(stg_ap_2_upd_info) \
+ SymX(stg_ap_3_upd_info) \
+ SymX(stg_ap_4_upd_info) \
+ SymX(stg_ap_5_upd_info) \
+ SymX(stg_ap_6_upd_info) \
+ SymX(stg_ap_7_upd_info) \
+ SymX(stg_exit) \
+ SymX(stg_sel_0_upd_info) \
+ SymX(stg_sel_10_upd_info) \
+ SymX(stg_sel_11_upd_info) \
+ SymX(stg_sel_12_upd_info) \
+ SymX(stg_sel_13_upd_info) \
+ SymX(stg_sel_14_upd_info) \
+ SymX(stg_sel_15_upd_info) \
+ SymX(stg_sel_1_upd_info) \
+ SymX(stg_sel_2_upd_info) \
+ SymX(stg_sel_3_upd_info) \
+ SymX(stg_sel_4_upd_info) \
+ SymX(stg_sel_5_upd_info) \
+ SymX(stg_sel_6_upd_info) \
+ SymX(stg_sel_7_upd_info) \
+ SymX(stg_sel_8_upd_info) \
+ SymX(stg_sel_9_upd_info) \
+ SymX(stg_upd_frame_info) \
+ SymX(suspendThread) \
+ SymX(takeMVarzh_fast) \
+ SymX(timesIntegerzh_fast) \
+ SymX(tryPutMVarzh_fast) \
+ SymX(tryTakeMVarzh_fast) \
+ SymX(unblockAsyncExceptionszh_fast) \
+ SymX(unloadObj) \
+ SymX(unsafeThawArrayzh_fast) \
+ SymX(waitReadzh_fast) \
+ SymX(waitWritezh_fast) \
+ SymX(word2Integerzh_fast) \
+ SymX(writeTVarzh_fast) \
+ SymX(xorIntegerzh_fast) \
+ SymX(yieldzh_fast) \
+ SymX(stg_interp_constr_entry) \
+ SymX(stg_interp_constr1_entry) \
+ SymX(stg_interp_constr2_entry) \
+ SymX(stg_interp_constr3_entry) \
+ SymX(stg_interp_constr4_entry) \
+ SymX(stg_interp_constr5_entry) \
+ SymX(stg_interp_constr6_entry) \
+ SymX(stg_interp_constr7_entry) \
+ SymX(stg_interp_constr8_entry) \
+ SymX(stgMallocBytesRWX) \
+ SymX(getAllocations) \
+ SymX(revertCAFs) \
+ SymX(RtsFlags) \
+ RTS_USER_SIGNALS_SYMBOLS
+
+#ifdef SUPPORT_LONG_LONGS
+#define RTS_LONG_LONG_SYMS \
+ SymX(int64ToIntegerzh_fast) \
+ SymX(word64ToIntegerzh_fast)
+#else
+#define RTS_LONG_LONG_SYMS /* nothing */
+#endif
+
+// 64-bit support functions in libgcc.a
+#if defined(__GNUC__) && SIZEOF_VOID_P <= 4
+#define RTS_LIBGCC_SYMBOLS \
+ Sym(__divdi3) \
+ Sym(__udivdi3) \
+ Sym(__moddi3) \
+ Sym(__umoddi3) \
+ Sym(__muldi3) \
+ Sym(__ashldi3) \
+ Sym(__ashrdi3) \
+ Sym(__lshrdi3) \
+ Sym(__eprintf)
+#elif defined(ia64_HOST_ARCH)
+#define RTS_LIBGCC_SYMBOLS \
+ Sym(__divdi3) \
+ Sym(__udivdi3) \
+ Sym(__moddi3) \
+ Sym(__umoddi3) \
+ Sym(__divsf3) \
+ Sym(__divdf3)
+#else
+#define RTS_LIBGCC_SYMBOLS
+#endif
+
+#if defined(darwin_HOST_OS) && defined(powerpc_HOST_ARCH)
+ // Symbols that don't have a leading underscore
+ // on Mac OS X. They have to receive special treatment,
+ // see machoInitSymbolsWithoutUnderscore()
+#define RTS_MACHO_NOUNDERLINE_SYMBOLS \
+ Sym(saveFP) \
+ Sym(restFP)
+#endif
+
+/* entirely bogus claims about types of these symbols */
+#define Sym(vvv) extern void vvv(void);
+#define SymX(vvv) /**/
+#define SymX_redirect(vvv,xxx) /**/
+RTS_SYMBOLS
+RTS_RET_SYMBOLS
+RTS_LONG_LONG_SYMS
+RTS_POSIX_ONLY_SYMBOLS
+RTS_MINGW_ONLY_SYMBOLS
+RTS_CYGWIN_ONLY_SYMBOLS
+RTS_DARWIN_ONLY_SYMBOLS
+RTS_LIBGCC_SYMBOLS
+#undef Sym
+#undef SymX
+#undef SymX_redirect
+
+#ifdef LEADING_UNDERSCORE
+#define MAYBE_LEADING_UNDERSCORE_STR(s) ("_" s)
+#else
+#define MAYBE_LEADING_UNDERSCORE_STR(s) (s)
+#endif
+
+#define Sym(vvv) { MAYBE_LEADING_UNDERSCORE_STR(#vvv), \
+ (void*)(&(vvv)) },
+#define SymX(vvv) Sym(vvv)
+
+// SymX_redirect allows us to redirect references to one symbol to
+// another symbol. See newCAF/newDynCAF for an example.
+#define SymX_redirect(vvv,xxx) \
+ { MAYBE_LEADING_UNDERSCORE_STR(#vvv), \
+ (void*)(&(xxx)) },
+
+/* Table of every symbol the RTS itself exports to dynamically loaded
+ objects, expanded from the Sym/SymX macro lists above (each entry is
+ a { name, address } pair). Terminated by a {0,0} sentinel; walked by
+ initLinker() to pre-populate symhash. */
+static RtsSymbolVal rtsSyms[] = {
+ RTS_SYMBOLS
+ RTS_RET_SYMBOLS
+ RTS_LONG_LONG_SYMS
+ RTS_POSIX_ONLY_SYMBOLS
+ RTS_MINGW_ONLY_SYMBOLS
+ RTS_CYGWIN_ONLY_SYMBOLS
+ RTS_LIBGCC_SYMBOLS
+#if defined(darwin_HOST_OS) && defined(i386_HOST_ARCH)
+ // dyld stub code contains references to this,
+ // but it should never be called because we treat
+ // lazy pointers as nonlazy.
+ { "dyld_stub_binding_helper", (void*)0xDEADBEEF },
+#endif
+ { 0, 0 } /* sentinel */
+};
+
+/* -----------------------------------------------------------------------------
+ * Insert symbols into hash tables, checking for duplicates.
+ */
+/* Insert (key -> data) into 'table', treating a duplicate key as fatal.
+ 'obj_name' names the object file being processed and is used only in
+ the error message. Duplicates abort the process with exit(1): the
+ linker has no way to choose between two definitions of one symbol,
+ so continuing would corrupt the symbol table. */
+static void ghciInsertStrHashTable ( char* obj_name,
+ HashTable *table,
+ char* key,
+ void *data
+ )
+{
+ /* lookupHashTable on a string table uses the table's own string
+ hash/compare functions, so this is a genuine duplicate check. */
+ if (lookupHashTable(table, (StgWord)key) == NULL)
+ {
+ insertStrHashTable(table, (StgWord)key, data);
+ return;
+ }
+ debugBelch(
+ "\n\n"
+ "GHCi runtime linker: fatal error: I found a duplicate definition for symbol\n"
+ " %s\n"
+ "whilst processing object file\n"
+ " %s\n"
+ "This could be caused by:\n"
+ " * Loading two different object files which export the same symbol\n"
+ " * Specifying the same object file twice on the GHCi command line\n"
+ " * An incorrect `package.conf' entry, causing some object to be\n"
+ " loaded twice.\n"
+ "GHCi cannot safely continue in this situation. Exiting now. Sorry.\n"
+ "\n",
+ (char*)key,
+ obj_name
+ );
+ exit(1);
+}
+
+
+/* -----------------------------------------------------------------------------
+ * initialize the object linker
+ */
+
+
+static int linker_init_done = 0 ;
+
+#if defined(OBJFORMAT_ELF) || defined(OBJFORMAT_MACHO)
+static void *dl_prog_handle;
+#endif
+
+/* dlopen(NULL,..) doesn't work so we grab libc explicitly */
+#if defined(openbsd_HOST_OS)
+static void *dl_libc_handle;
+#endif
+
+/* Initialise the runtime linker: allocate the global symbol hash table,
+ seed it with the RTS's own exported symbols (rtsSyms), and obtain a
+ dl handle for looking up symbols in the running program / libc.
+ Idempotent -- safe to call before every linker operation.
+ NOTE(review): not thread-safe; the check-then-set on linker_init_done
+ assumes single-threaded callers -- confirm against call sites. */
+void
+initLinker( void )
+{
+ RtsSymbolVal *sym;
+
+ /* Make initLinker idempotent, so we can call it
+ before every relevant operation; that means we
+ don't need to initialise the linker separately */
+ if (linker_init_done == 1) { return; } else {
+ linker_init_done = 1;
+ }
+
+ symhash = allocStrHashTable();
+
+ /* populate the symbol table with stuff from the RTS */
+ for (sym = rtsSyms; sym->lbl != NULL; sym++) {
+ ghciInsertStrHashTable("(GHCi built-in symbols)",
+ symhash, sym->lbl, sym->addr);
+ }
+# if defined(OBJFORMAT_MACHO) && defined(powerpc_HOST_ARCH)
+ machoInitSymbolsWithoutUnderscore();
+# endif
+
+# if defined(OBJFORMAT_ELF) || defined(OBJFORMAT_MACHO)
+# if defined(RTLD_DEFAULT)
+ /* RTLD_DEFAULT searches the whole program's global scope directly. */
+ dl_prog_handle = RTLD_DEFAULT;
+# else
+ dl_prog_handle = dlopen(NULL, RTLD_LAZY);
+# if defined(openbsd_HOST_OS)
+ /* dlopen(NULL,..) doesn't cover libc on OpenBSD; grab it explicitly. */
+ dl_libc_handle = dlopen("libc.so", RTLD_LAZY);
+# endif
+# endif /* RTLD_DEFAULT */
+# endif
+}
+
+/* -----------------------------------------------------------------------------
+ * Loading DLL or .so dynamic libraries
+ * -----------------------------------------------------------------------------
+ *
+ * Add a DLL from which symbols may be found. In the ELF case, just
+ * do RTLD_GLOBAL-style add, so no further messing around needs to
+ * happen in order that symbols in the loaded .so are findable --
+ * lookupSymbol() will subsequently see them by dlsym on the program's
+ * dl-handle. Returns NULL if success, otherwise ptr to an err msg.
+ *
+ * In the PEi386 case, open the DLLs and put handles to them in a
+ * linked list. When looking for a symbol, try all handles in the
+ * list. This means that we need to load even DLLs that are guaranteed
+ * to be in the ghc.exe image already, just so we can get a handle
+ * to give to loadSymbol, so that we can find the symbols. For such
+ * libraries, the LoadLibrary call should be a no-op except for returning
+ * the handle.
+ *
+ */
+
+#if defined(OBJFORMAT_PEi386)
+/* A record for storing handles into DLLs. */
+
+typedef
+ struct _OpenedDLL {
+ char* name;
+ struct _OpenedDLL* next;
+ HINSTANCE instance;
+ }
+ OpenedDLL;
+
+/* A list thereof. */
+static OpenedDLL* opened_dlls = NULL;
+#endif
+
+/* Add a dynamic library from which symbols may be resolved.
+ dll_name is given WITHOUT extension (and, on ELF/Mach-O, is passed to
+ dlopen verbatim, so there it should be a full soname/path).
+ Returns NULL on success, otherwise a pointer to an error message
+ (static or dlerror-owned -- caller must not free it). */
+char *
+addDLL( char *dll_name )
+{
+# if defined(OBJFORMAT_ELF) || defined(OBJFORMAT_MACHO)
+ /* ------------------- ELF DLL loader ------------------- */
+ void *hdl;
+ char *errmsg;
+
+ initLinker();
+
+ /* RTLD_GLOBAL makes the .so's symbols visible to later dlsym() on
+ dl_prog_handle, so lookupSymbol needs no per-library handle. */
+ hdl= dlopen(dll_name, RTLD_NOW | RTLD_GLOBAL);
+
+ if (hdl == NULL) {
+ /* dlopen failed; return a ptr to the error msg. */
+ errmsg = dlerror();
+ if (errmsg == NULL) errmsg = "addDLL: unknown error";
+ return errmsg;
+ } else {
+ return NULL;
+ }
+ /*NOTREACHED*/
+
+# elif defined(OBJFORMAT_PEi386)
+ /* ------------------- Win32 DLL loader ------------------- */
+
+ char* buf;
+ OpenedDLL* o_dll;
+ HINSTANCE instance;
+
+ initLinker();
+
+ /* debugBelch("\naddDLL; dll_name = `%s'\n", dll_name); */
+
+ /* See if we've already got it, and ignore if so. */
+ for (o_dll = opened_dlls; o_dll != NULL; o_dll = o_dll->next) {
+ if (0 == strcmp(o_dll->name, dll_name))
+ return NULL;
+ }
+
+ /* The file name has no suffix (yet) so that we can try
+ both foo.dll and foo.drv
+
+ The documentation for LoadLibrary says:
+ If no file name extension is specified in the lpFileName
+ parameter, the default library extension .dll is
+ appended. However, the file name string can include a trailing
+ point character (.) to indicate that the module name has no
+ extension. */
+
+ /* +10 leaves room for ".DLL"/".DRV" plus NUL (with slack). */
+ buf = stgMallocBytes(strlen(dll_name) + 10, "addDLL");
+ sprintf(buf, "%s.DLL", dll_name);
+ instance = LoadLibrary(buf);
+ if (instance == NULL) {
+ sprintf(buf, "%s.DRV", dll_name); // KAA: allow loading of drivers (like winspool.drv)
+ instance = LoadLibrary(buf);
+ if (instance == NULL) {
+ stgFree(buf);
+
+ /* LoadLibrary failed; return a ptr to the error msg. */
+ return "addDLL: unknown error";
+ }
+ }
+ stgFree(buf);
+
+ /* Add this DLL to the list of DLLs in which to search for symbols. */
+ o_dll = stgMallocBytes( sizeof(OpenedDLL), "addDLL" );
+ o_dll->name = stgMallocBytes(1+strlen(dll_name), "addDLL");
+ strcpy(o_dll->name, dll_name);
+ o_dll->instance = instance;
+ o_dll->next = opened_dlls;
+ opened_dlls = o_dll;
+
+ return NULL;
+# else
+ barf("addDLL: not implemented on this platform");
+# endif
+}
+
+/* -----------------------------------------------------------------------------
+ * lookup a symbol in the hash table
+ */
+/* Look up 'lbl' first in the GHCi symbol hash table, then fall back to
+ the system loader (dlsym / NSLookupAndBindSymbol / GetProcAddress on
+ every opened DLL). Returns the symbol's address, or NULL if it is
+ not found anywhere. */
+void *
+lookupSymbol( char *lbl )
+{
+ void *val;
+ initLinker() ;
+ ASSERT(symhash != NULL);
+ val = lookupStrHashTable(symhash, lbl);
+
+ if (val == NULL) {
+# if defined(OBJFORMAT_ELF)
+# if defined(openbsd_HOST_OS)
+ /* OpenBSD: also try the explicitly opened libc handle. */
+ val = dlsym(dl_prog_handle, lbl);
+ return (val != NULL) ? val : dlsym(dl_libc_handle,lbl);
+# elif defined(x86_64_HOST_ARCH)
+ /* GHC assumes the small memory model on x86_64; symbols that
+ dlsym resolves above 2GB must be redirected into low memory. */
+ val = dlsym(dl_prog_handle, lbl);
+ if (val >= (void *)0x80000000) {
+ void *new_val;
+ new_val = x86_64_high_symbol(lbl, val);
+ IF_DEBUG(linker,debugBelch("lookupSymbol: relocating out of range symbol: %s = %p, now %p\n", lbl, val, new_val));
+ return new_val;
+ } else {
+ return val;
+ }
+# else /* not openbsd */
+ return dlsym(dl_prog_handle, lbl);
+# endif
+# elif defined(OBJFORMAT_MACHO)
+ if(NSIsSymbolNameDefined(lbl)) {
+ NSSymbol symbol = NSLookupAndBindSymbol(lbl);
+ return NSAddressOfSymbol(symbol);
+ } else {
+ return NULL;
+ }
+# elif defined(OBJFORMAT_PEi386)
+ OpenedDLL* o_dll;
+ void* sym;
+ for (o_dll = opened_dlls; o_dll != NULL; o_dll = o_dll->next) {
+ /* debugBelch("look in %s for %s\n", o_dll->name, lbl); */
+ if (lbl[0] == '_') {
+ /* HACK: if the name has an initial underscore, try stripping
+ it off & look that up first. I've yet to verify whether there's
+ a Rule that governs whether an initial '_' *should always* be
+ stripped off when mapping from import lib name to the DLL name.
+ */
+ sym = GetProcAddress(o_dll->instance, (lbl+1));
+ if (sym != NULL) {
+ /*debugBelch("found %s in %s\n", lbl+1,o_dll->name);*/
+ return sym;
+ }
+ }
+ sym = GetProcAddress(o_dll->instance, lbl);
+ if (sym != NULL) {
+ /*debugBelch("found %s in %s\n", lbl,o_dll->name);*/
+ return sym;
+ }
+ }
+ return NULL;
+# else
+ /* Unsupported object format: fail the assertion in DEBUG builds,
+ quietly return NULL otherwise. */
+ ASSERT(2+2 == 5);
+ return NULL;
+# endif
+ } else {
+ return val;
+ }
+}
+
+/* Look up 'lbl' in the per-object local symbol table only (symbols not
+ exported globally). Returns the address or NULL if absent.
+ Marked unused: only some object-format backends reference it.
+ NOTE(review): `__attribute` (single trailing underscore pair omitted)
+ is a GCC-accepted spelling of `__attribute__` -- confirm intended. */
+static
+__attribute((unused))
+void *
+lookupLocalSymbol( ObjectCode* oc, char *lbl )
+{
+ void *val;
+ initLinker() ;
+ val = lookupStrHashTable(oc->lochash, lbl);
+
+ if (val == NULL) {
+ return NULL;
+ } else {
+ return val;
+ }
+}
+
+
+/* -----------------------------------------------------------------------------
+ * Debugging aid: look in GHCi's object symbol tables for symbols
+ * within DELTA bytes of the specified address, and show their names.
+ */
+#ifdef DEBUG
+void ghci_enquire ( char* addr );
+
+/* Debugging aid (DEBUG builds only): print the name of every symbol,
+ in any loaded object, whose address lies within DELTA (64) bytes of
+ 'addr'. Checks each object's local table first, then the global one. */
+void ghci_enquire ( char* addr )
+{
+ int i;
+ char* sym;
+ char* a;
+ const int DELTA = 64;
+ ObjectCode* oc;
+
+ initLinker();
+
+ for (oc = objects; oc; oc = oc->next) {
+ for (i = 0; i < oc->n_symbols; i++) {
+ sym = oc->symbols[i];
+ if (sym == NULL) continue;
+ // debugBelch("enquire %p %p\n", sym, oc->lochash);
+ a = NULL;
+ if (oc->lochash != NULL) {
+ a = lookupStrHashTable(oc->lochash, sym);
+ }
+ if (a == NULL) {
+ a = lookupStrHashTable(symhash, sym);
+ }
+ if (a == NULL) {
+ // debugBelch("ghci_enquire: can't find %s\n", sym);
+ }
+ else if (addr-DELTA <= a && a <= addr+DELTA) {
+ debugBelch("%p + %3d == `%s'\n", addr, (int)(a - addr), sym);
+ }
+ }
+ }
+}
+#endif
+
+#ifdef ia64_HOST_ARCH
+static unsigned int PLTSize(void);
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Load an obj (populate the global symbol table, but don't resolve yet)
+ *
+ * Returns: 1 if ok, 0 on error.
+ */
+/* Load an object file: read (or mmap) its image into memory, verify it,
+ and populate the global symbol table with its exports. Relocation is
+ deferred to resolveObjs(). Duplicate loads of the same path are
+ silently ignored. Returns 1 on success, 0 on error (some hard errors
+ barf() instead).
+ FIX: the stat() failure path previously returned without freeing the
+ ObjectCode allocated just above it, leaking it on every attempt to
+ load a nonexistent file. */
+HsInt
+loadObj( char *path )
+{
+ ObjectCode* oc;
+ struct stat st;
+ int r, n;
+#ifdef USE_MMAP
+ int fd, pagesize;
+ void *map_addr = NULL;
+#else
+ FILE *f;
+ int misalignment;
+#endif
+ initLinker();
+
+ /* debugBelch("loadObj %s\n", path ); */
+
+ /* Check that we haven't already loaded this object.
+ Ignore requests to load multiple times */
+ {
+ ObjectCode *o;
+ int is_dup = 0;
+ for (o = objects; o; o = o->next) {
+ if (0 == strcmp(o->fileName, path)) {
+ is_dup = 1;
+ break; /* don't need to search further */
+ }
+ }
+ if (is_dup) {
+ IF_DEBUG(linker, debugBelch(
+ "GHCi runtime linker: warning: looks like you're trying to load the\n"
+ "same object file twice:\n"
+ " %s\n"
+ "GHCi will ignore this, but be warned.\n"
+ , path));
+ return 1; /* success */
+ }
+ }
+
+ oc = stgMallocBytes(sizeof(ObjectCode), "loadObj(oc)");
+
+# if defined(OBJFORMAT_ELF)
+ oc->formatName = "ELF";
+# elif defined(OBJFORMAT_PEi386)
+ oc->formatName = "PEi386";
+# elif defined(OBJFORMAT_MACHO)
+ oc->formatName = "Mach-O";
+# else
+ stgFree(oc);
+ barf("loadObj: not implemented on this platform");
+# endif
+
+ r = stat(path, &st);
+ if (r == -1) {
+ stgFree(oc); /* was leaked here before */
+ return 0;
+ }
+
+ /* sigh, strdup() isn't a POSIX function, so do it the long way */
+ oc->fileName = stgMallocBytes( strlen(path)+1, "loadObj" );
+ strcpy(oc->fileName, path);
+
+ oc->fileSize = st.st_size;
+ oc->symbols = NULL;
+ oc->sections = NULL;
+ oc->lochash = allocStrHashTable();
+ oc->proddables = NULL;
+
+ /* chain it onto the list of objects */
+ oc->next = objects;
+ objects = oc;
+
+#ifdef USE_MMAP
+#define ROUND_UP(x,size) ((x + size - 1) & ~(size - 1))
+
+ /* On many architectures malloc'd memory isn't executable, so we need to use mmap. */
+
+#if defined(openbsd_HOST_OS)
+ fd = open(path, O_RDONLY, S_IRUSR);
+#else
+ fd = open(path, O_RDONLY);
+#endif
+ if (fd == -1)
+ barf("loadObj: can't open `%s'", path);
+
+ pagesize = getpagesize();
+
+#ifdef ia64_HOST_ARCH
+ /* The PLT needs to be right before the object */
+ n = ROUND_UP(PLTSize(), pagesize);
+ oc->plt = mmap(NULL, n, PROT_EXEC|PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+ if (oc->plt == MAP_FAILED)
+ barf("loadObj: can't allocate PLT");
+
+ oc->pltIndex = 0;
+ map_addr = oc->plt + n;
+#endif
+
+ n = ROUND_UP(oc->fileSize, pagesize);
+
+ /* Link objects into the lower 2Gb on x86_64. GHC assumes the
+ * small memory model on this architecture (see gcc docs,
+ * -mcmodel=small).
+ */
+#ifdef x86_64_HOST_ARCH
+#define EXTRA_MAP_FLAGS MAP_32BIT
+#else
+#define EXTRA_MAP_FLAGS 0
+#endif
+
+ oc->image = mmap(map_addr, n, PROT_EXEC|PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|EXTRA_MAP_FLAGS, fd, 0);
+ if (oc->image == MAP_FAILED)
+ barf("loadObj: can't map `%s'", path);
+
+ close(fd);
+
+#else /* !USE_MMAP */
+
+ /* load the image into memory */
+ f = fopen(path, "rb");
+ if (!f)
+ barf("loadObj: can't read `%s'", path);
+
+#ifdef darwin_HOST_OS
+ // In a Mach-O .o file, all sections can and will be misaligned
+ // if the total size of the headers is not a multiple of the
+ // desired alignment. This is fine for .o files that only serve
+ // as input for the static linker, but it's not fine for us,
+ // as SSE (used by gcc for floating point) and Altivec require
+ // 16-byte alignment.
+ // We calculate the correct alignment from the header before
+ // reading the file, and then we misalign oc->image on purpose so
+ // that the actual sections end up aligned again.
+ misalignment = machoGetMisalignment(f);
+ oc->misalignment = misalignment;
+#else
+ misalignment = 0;
+#endif
+
+ oc->image = stgMallocBytes(oc->fileSize + misalignment, "loadObj(image)");
+ oc->image += misalignment;
+
+ n = fread ( oc->image, 1, oc->fileSize, f );
+ if (n != oc->fileSize)
+ barf("loadObj: error whilst reading `%s'", path);
+
+ fclose(f);
+
+#endif /* USE_MMAP */
+
+# if defined(OBJFORMAT_MACHO) && defined(powerpc_HOST_ARCH)
+ r = ocAllocateJumpIslands_MachO ( oc );
+ if (!r) { return r; }
+# elif defined(OBJFORMAT_ELF) && defined(powerpc_HOST_ARCH)
+ r = ocAllocateJumpIslands_ELF ( oc );
+ if (!r) { return r; }
+#endif
+
+ /* NOTE(review): on the failure returns below, oc is already chained
+ onto 'objects' and stays there part-loaded -- confirm callers treat
+ a 0 return as fatal. */
+
+ /* verify the in-memory image */
+# if defined(OBJFORMAT_ELF)
+ r = ocVerifyImage_ELF ( oc );
+# elif defined(OBJFORMAT_PEi386)
+ r = ocVerifyImage_PEi386 ( oc );
+# elif defined(OBJFORMAT_MACHO)
+ r = ocVerifyImage_MachO ( oc );
+# else
+ barf("loadObj: no verify method");
+# endif
+ if (!r) { return r; }
+
+ /* build the symbol list for this image */
+# if defined(OBJFORMAT_ELF)
+ r = ocGetNames_ELF ( oc );
+# elif defined(OBJFORMAT_PEi386)
+ r = ocGetNames_PEi386 ( oc );
+# elif defined(OBJFORMAT_MACHO)
+ r = ocGetNames_MachO ( oc );
+# else
+ barf("loadObj: no getNames method");
+# endif
+ if (!r) { return r; }
+
+ /* loaded, but not resolved yet */
+ oc->status = OBJECT_LOADED;
+
+ return 1;
+}
+
+/* -----------------------------------------------------------------------------
+ * resolve all the currently unlinked objects in memory
+ *
+ * Returns: 1 if ok, 0 on error.
+ */
+/* Run the format-specific relocation pass over every loaded-but-not-yet
+ resolved object. Returns 1 if all objects resolved, 0 on the first
+ failure (remaining objects are left untouched). */
+HsInt
+resolveObjs( void )
+{
+ ObjectCode *oc;
+ int r;
+
+ initLinker();
+
+ for (oc = objects; oc; oc = oc->next) {
+ if (oc->status != OBJECT_RESOLVED) {
+# if defined(OBJFORMAT_ELF)
+ r = ocResolve_ELF ( oc );
+# elif defined(OBJFORMAT_PEi386)
+ r = ocResolve_PEi386 ( oc );
+# elif defined(OBJFORMAT_MACHO)
+ r = ocResolve_MachO ( oc );
+# else
+ barf("resolveObjs: not implemented on this platform");
+# endif
+ if (!r) { return r; }
+ oc->status = OBJECT_RESOLVED;
+ }
+ }
+ return 1;
+}
+
+/* -----------------------------------------------------------------------------
+ * delete an object from the pool
+ */
+/* Remove the object loaded from 'path' from the linked list of objects
+ and delete its symbols from the global symbol table. The image itself
+ is deliberately NOT freed (the heap may still point into it).
+ Returns 1 on success, 0 if no object with that path is loaded. */
+HsInt
+unloadObj( char *path )
+{
+ ObjectCode *oc, *prev;
+
+ ASSERT(symhash != NULL);
+ ASSERT(objects != NULL);
+
+ initLinker();
+
+ prev = NULL;
+ for (oc = objects; oc; prev = oc, oc = oc->next) {
+ if (!strcmp(oc->fileName,path)) {
+
+ /* Remove all the mappings for the symbols within this
+ * object..
+ */
+ {
+ int i;
+ for (i = 0; i < oc->n_symbols; i++) {
+ if (oc->symbols[i] != NULL) {
+ removeStrHashTable(symhash, oc->symbols[i], NULL);
+ }
+ }
+ }
+
+ /* unlink oc from the list (head or interior). */
+ if (prev == NULL) {
+ objects = oc->next;
+ } else {
+ prev->next = oc->next;
+ }
+
+ /* We're going to leave this in place, in case there are
+ any pointers from the heap into it: */
+ /* stgFree(oc->image); */
+ stgFree(oc->fileName);
+ stgFree(oc->symbols);
+ stgFree(oc->sections);
+ /* The local hash table should have been freed at the end
+ of the ocResolve_ call on it. */
+ ASSERT(oc->lochash == NULL);
+ stgFree(oc);
+ return 1;
+ }
+ }
+
+ errorBelch("unloadObj: can't find `%s' to unload", path);
+ return 0;
+}
+
+/* -----------------------------------------------------------------------------
+ * Sanity checking. For each ObjectCode, maintain a list of address ranges
+ * which may be prodded during relocation, and abort if we try and write
+ * outside any of these.
+ */
+/* Record [start, start+size) as a range of 'oc' that relocation is
+ allowed to write to. Prepended to oc->proddables; consulted by
+ checkProddableBlock below. */
+static void addProddableBlock ( ObjectCode* oc, void* start, int size )
+{
+ ProddableBlock* pb
+ = stgMallocBytes(sizeof(ProddableBlock), "addProddableBlock");
+ /* debugBelch("aPB %p %p %d\n", oc, start, size); */
+ ASSERT(size > 0);
+ pb->start = start;
+ pb->size = size;
+ pb->next = oc->proddables;
+ oc->proddables = pb;
+}
+
+/* Sanity check: barf unless a 4-byte write at 'addr' falls entirely
+ inside one of oc's registered proddable blocks. */
+static void checkProddableBlock ( ObjectCode* oc, void* addr )
+{
+ ProddableBlock* pb;
+ for (pb = oc->proddables; pb != NULL; pb = pb->next) {
+ char* s = (char*)(pb->start);
+ char* e = s + pb->size - 1;
+ char* a = (char*)addr;
+ /* Assumes that the biggest fixup involves a 4-byte write. This
+ probably needs to be changed to 8 (ie, +7) on 64-bit
+ plats. */
+ if (a >= s && (a+3) <= e) return;
+ }
+ barf("checkProddableBlock: invalid fixup in runtime linker");
+}
+
+/* -----------------------------------------------------------------------------
+ * Section management.
+ */
+/* Prepend a section record covering [start, end] (end is the address of
+ the section's LAST byte, per the commented-out trace below) of the
+ given kind onto oc->sections. */
+static void addSection ( ObjectCode* oc, SectionKind kind,
+ void* start, void* end )
+{
+ Section* s = stgMallocBytes(sizeof(Section), "addSection");
+ s->start = start;
+ s->end = end;
+ s->kind = kind;
+ s->next = oc->sections;
+ oc->sections = s;
+ /*
+ debugBelch("addSection: %p-%p (size %d), kind %d\n",
+ start, ((char*)end)-1, end - start + 1, kind );
+ */
+}
+
+
+/* --------------------------------------------------------------------------
+ * PowerPC specifics (jump islands)
+ * ------------------------------------------------------------------------*/
+
+#if defined(powerpc_HOST_ARCH)
+
+/*
+ ocAllocateJumpIslands
+
+ Allocate additional space at the end of the object file image to make room
+ for jump islands.
+
+ PowerPC relative branch instructions have a 24 bit displacement field.
+ As PPC code is always 4-byte-aligned, this yields a +-32MB range.
+ If a particular imported symbol is outside this range, we have to redirect
+ the jump to a short piece of new code that just loads the 32bit absolute
+ address and jumps there.
+ This function just allocates space for one 16 byte ppcJumpIsland for every
+ undefined symbol in the object file. The code for the islands is filled in by
+ makeJumpIsland below.
+*/
+
+/* Grow oc's image by 'count' ppcJumpIsland slots, placed right after the
+ (4-byte-aligned) end of the file image, and zero them. 'first' is the
+ symbol number corresponding to island 0. Returns 1 on success, 0 on
+ mmap/mremap failure. See the comment block above for why jump islands
+ exist on PowerPC. */
+static int ocAllocateJumpIslands( ObjectCode* oc, int count, int first )
+{
+#ifdef USE_MMAP
+ int pagesize, n, m;
+#endif
+ int aligned;
+ int misalignment = 0;
+#if darwin_HOST_OS
+ misalignment = oc->misalignment;
+#endif
+
+ if( count > 0 )
+ {
+ // round up to the nearest 4
+ aligned = (oc->fileSize + 3) & ~3;
+
+#ifdef USE_MMAP
+ #ifndef linux_HOST_OS /* mremap is a linux extension */
+ #error ocAllocateJumpIslands doesnt want USE_MMAP to be defined
+ #endif
+
+ pagesize = getpagesize();
+ n = ROUND_UP( oc->fileSize, pagesize );
+ m = ROUND_UP( aligned + sizeof (ppcJumpIsland) * count, pagesize );
+
+ /* If we have a half-page-size file and map one page of it then
+ * the part of the page after the size of the file remains accessible.
+ * If, however, we map in 2 pages, the 2nd page is not accessible
+ * and will give a "Bus Error" on access. To get around this, we check
+ * if we need any extra pages for the jump islands and map them in
+ * anonymously. We must check that we actually require extra pages
+ * otherwise the attempt to mmap 0 pages of anonymous memory will
+ * fail -EINVAL.
+ */
+
+ if( m > n )
+ {
+ /* The effect of this mremap() call is only the ensure that we have
+ * a sufficient number of virtually contiguous pages. As returned from
+ * mremap, the pages past the end of the file are not backed. We give
+ * them a backing by using MAP_FIXED to map in anonymous pages.
+ */
+ oc->image = mremap( oc->image, n, m, MREMAP_MAYMOVE );
+
+ if( oc->image == MAP_FAILED )
+ {
+ errorBelch( "Unable to mremap for Jump Islands\n" );
+ return 0;
+ }
+
+ if( mmap( oc->image + n, m - n, PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, 0, 0 ) == MAP_FAILED )
+ {
+ errorBelch( "Unable to mmap( MAP_FIXED ) for Jump Islands\n" );
+ return 0;
+ }
+ }
+
+#else
+ /* Non-mmap path: realloc the image. On Darwin the pointer was
+ deliberately offset by 'misalignment' (see loadObj), so undo the
+ offset before realloc and reapply it after. */
+ oc->image -= misalignment;
+ oc->image = stgReallocBytes( oc->image,
+ misalignment +
+ aligned + sizeof (ppcJumpIsland) * count,
+ "ocAllocateJumpIslands" );
+ oc->image += misalignment;
+#endif /* USE_MMAP */
+
+ oc->jump_islands = (ppcJumpIsland *) (oc->image + aligned);
+ memset( oc->jump_islands, 0, sizeof (ppcJumpIsland) * count );
+ }
+ else
+ oc->jump_islands = NULL;
+
+ oc->island_start_symbol = first;
+ oc->n_islands = count;
+
+ return 1;
+}
+
+/* Fill in the jump island for 'symbolNumber' so that branching to it
+ loads 'target' into r12 and jumps there (lis/ori/mtctr/bctr).
+ Returns the island's address, or 0 if symbolNumber is outside the
+ range covered by this object's islands.
+ FIX: valid island indices are 0 .. n_islands-1, so the upper-bound
+ test must reject a difference EQUAL to n_islands as well; the old
+ `>` allowed a write one ppcJumpIsland past the allocation. */
+static unsigned long makeJumpIsland( ObjectCode* oc,
+ unsigned long symbolNumber,
+ unsigned long target )
+{
+ ppcJumpIsland *island;
+
+ if( symbolNumber < oc->island_start_symbol ||
+ symbolNumber - oc->island_start_symbol >= oc->n_islands)
+ return 0;
+
+ island = &oc->jump_islands[symbolNumber - oc->island_start_symbol];
+
+ // lis r12, hi16(target)
+ island->lis_r12 = 0x3d80;
+ island->hi_addr = target >> 16;
+
+ // ori r12, r12, lo16(target)
+ island->ori_r12_r12 = 0x618c;
+ island->lo_addr = target & 0xffff;
+
+ // mtctr r12
+ island->mtctr_r12 = 0x7d8903a6;
+
+ // bctr
+ island->bctr = 0x4e800420;
+
+ return (unsigned long) island;
+}
+
+/*
+ ocFlushInstructionCache
+
+ Flush the data & instruction caches.
+ Because the PPC has split data/instruction caches, we have to
+ do that whenever we modify code at runtime.
+ */
+
+/* Flush the data cache and invalidate the instruction cache over the
+ whole image (file plus jump islands), one word at a time, then sync.
+ Required on PPC after patching code because the D- and I-caches are
+ split. The +3)/4 rounds the byte count up to whole 4-byte words. */
+static void ocFlushInstructionCache( ObjectCode *oc )
+{
+ int n = (oc->fileSize + sizeof( ppcJumpIsland ) * oc->n_islands + 3) / 4;
+ unsigned long *p = (unsigned long *) oc->image;
+
+ while( n-- )
+ {
+ /* dcbf: flush data-cache block; icbi: invalidate i-cache block. */
+ __asm__ volatile ( "dcbf 0,%0\n\t"
+ "sync\n\t"
+ "icbi 0,%0"
+ :
+ : "r" (p)
+ );
+ p++;
+ }
+ __asm__ volatile ( "sync\n\t"
+ "isync"
+ );
+}
+#endif
+
+/* --------------------------------------------------------------------------
+ * PEi386 specifics (Win32 targets)
+ * ------------------------------------------------------------------------*/
+
+/* The information for this linker comes from
+ Microsoft Portable Executable
+ and Common Object File Format Specification
+ revision 5.1 January 1998
+ which SimonM says comes from the MS Developer Network CDs.
+
+ It can be found there (on older CDs), but can also be found
+ online at:
+
+ http://www.microsoft.com/hwdev/hardware/PECOFF.asp
+
+ (this is Rev 6.0 from February 1999).
+
+ Things move, so if that fails, try searching for it via
+
+ http://www.google.com/search?q=PE+COFF+specification
+
+ The ultimate reference for the PE format is the Winnt.h
+ header file that comes with the Platform SDKs; as always,
+ implementations will drift wrt their documentation.
+
+ A good background article on the PE format is Matt Pietrek's
+ March 1994 article in Microsoft System Journal (MSJ)
+ (Vol.9, No. 3): "Peering Inside the PE: A Tour of the
+ Win32 Portable Executable File Format." The info in there
+ has recently been updated in a two part article in
+ MSDN magazine, issues Feb and March 2002,
+ "Inside Windows: An In-Depth Look into the Win32 Portable
+ Executable File Format"
+
+ John Levine's book "Linkers and Loaders" contains useful
+ info on PE too.
+*/
+
+
+#if defined(OBJFORMAT_PEi386)
+
+
+
+typedef unsigned char UChar;
+typedef unsigned short UInt16;
+typedef unsigned int UInt32;
+typedef int Int32;
+
+
+typedef
+ struct {
+ UInt16 Machine;
+ UInt16 NumberOfSections;
+ UInt32 TimeDateStamp;
+ UInt32 PointerToSymbolTable;
+ UInt32 NumberOfSymbols;
+ UInt16 SizeOfOptionalHeader;
+ UInt16 Characteristics;
+ }
+ COFF_header;
+
+#define sizeof_COFF_header 20
+
+
+typedef
+ struct {
+ UChar Name[8];
+ UInt32 VirtualSize;
+ UInt32 VirtualAddress;
+ UInt32 SizeOfRawData;
+ UInt32 PointerToRawData;
+ UInt32 PointerToRelocations;
+ UInt32 PointerToLinenumbers;
+ UInt16 NumberOfRelocations;
+ UInt16 NumberOfLineNumbers;
+ UInt32 Characteristics;
+ }
+ COFF_section;
+
+#define sizeof_COFF_section 40
+
+
+typedef
+ struct {
+ UChar Name[8];
+ UInt32 Value;
+ UInt16 SectionNumber;
+ UInt16 Type;
+ UChar StorageClass;
+ UChar NumberOfAuxSymbols;
+ }
+ COFF_symbol;
+
+#define sizeof_COFF_symbol 18
+
+
+typedef
+ struct {
+ UInt32 VirtualAddress;
+ UInt32 SymbolTableIndex;
+ UInt16 Type;
+ }
+ COFF_reloc;
+
+#define sizeof_COFF_reloc 10
+
+
+/* From PE spec doc, section 3.3.2 */
+/* Note use of MYIMAGE_* since IMAGE_* are already defined in
+ windows.h -- for the same purpose, but I want to know what I'm
+ getting, here. */
+#define MYIMAGE_FILE_RELOCS_STRIPPED 0x0001
+#define MYIMAGE_FILE_EXECUTABLE_IMAGE 0x0002
+#define MYIMAGE_FILE_DLL 0x2000
+#define MYIMAGE_FILE_SYSTEM 0x1000
+#define MYIMAGE_FILE_BYTES_REVERSED_HI 0x8000
+#define MYIMAGE_FILE_BYTES_REVERSED_LO 0x0080
+#define MYIMAGE_FILE_32BIT_MACHINE 0x0100
+
+/* From PE spec doc, section 5.4.2 and 5.4.4 */
+#define MYIMAGE_SYM_CLASS_EXTERNAL 2
+#define MYIMAGE_SYM_CLASS_STATIC 3
+#define MYIMAGE_SYM_UNDEFINED 0
+
+/* From PE spec doc, section 4.1 */
+#define MYIMAGE_SCN_CNT_CODE 0x00000020
+#define MYIMAGE_SCN_CNT_INITIALIZED_DATA 0x00000040
+#define MYIMAGE_SCN_LNK_NRELOC_OVFL 0x01000000
+
+/* From PE spec doc, section 5.2.1 */
+#define MYIMAGE_REL_I386_DIR32 0x0006
+#define MYIMAGE_REL_I386_REL32 0x0014
+
+
+/* We use myindex to calculate array addresses, rather than
+ simply doing the normal subscript thing. That's because
+ some of the above structs have sizes which are not
+ a whole number of words. GCC rounds their sizes up to a
+ whole number of words, which means that the address calcs
+ arising from using normal C indexing or pointer arithmetic
+ are just plain wrong. Sigh.
+*/
+/* Byte-exact array indexing: base + scale*index. Used instead of C
+ subscripting because several COFF structs (see the comment above)
+ have sizes that are not a multiple of the word size, so normal
+ pointer arithmetic on them would be wrong. */
+static UChar *
+myindex ( int scale, void* base, int index )
+{
+ return
+ ((UChar*)base) + scale * index;
+}
+
+
+/* Debug-print a COFF symbol name. A name whose first 4 bytes are zero
+ is a string-table reference (offset in bytes 4..7); otherwise it is
+ an inline name of up to 8 bytes, NOT necessarily NUL-terminated. */
+static void
+printName ( UChar* name, UChar* strtab )
+{
+ if (name[0]==0 && name[1]==0 && name[2]==0 && name[3]==0) {
+ UInt32 strtab_offset = * (UInt32*)(name+4);
+ debugBelch("%s", strtab + strtab_offset );
+ } else {
+ int i;
+ for (i = 0; i < 8; i++) {
+ if (name[i] == 0) break;
+ debugBelch("%c", name[i] );
+ }
+ }
+}
+
+
+/* Copy a COFF symbol name (inline 8-byte form or string-table form --
+ see printName) into dst, always NUL-terminating. At most dstSize-1
+ characters are kept; longer string-table names are truncated. */
+static void
+copyName ( UChar* name, UChar* strtab, UChar* dst, int dstSize )
+{
+ if (name[0]==0 && name[1]==0 && name[2]==0 && name[3]==0) {
+ UInt32 strtab_offset = * (UInt32*)(name+4);
+ strncpy ( dst, strtab+strtab_offset, dstSize );
+ dst[dstSize-1] = 0; /* strncpy may not terminate; force it */
+ } else {
+ int i = 0;
+ while (1) {
+ if (i >= 8) break;
+ if (name[i] == 0) break;
+ dst[i] = name[i];
+ i++;
+ }
+ dst[i] = 0;
+ }
+}
+
+
+/* Return a NUL-terminated C string for a COFF symbol name. Three cases:
+ string-table reference (already terminated), inline name of < 8 bytes
+ (terminated by definition), or exactly 8 bytes -- the last case is
+ copied into a fresh 9-byte buffer which is intentionally never freed. */
+static UChar *
+cstring_from_COFF_symbol_name ( UChar* name, UChar* strtab )
+{
+ UChar* newstr;
+ /* If the string is longer than 8 bytes, look in the
+ string table for it -- this will be correctly zero terminated.
+ */
+ if (name[0]==0 && name[1]==0 && name[2]==0 && name[3]==0) {
+ UInt32 strtab_offset = * (UInt32*)(name+4);
+ return ((UChar*)strtab) + strtab_offset;
+ }
+ /* Otherwise, if shorter than 8 bytes, return the original,
+ which by defn is correctly terminated.
+ */
+ if (name[7]==0) return name;
+ /* The annoying case: 8 bytes. Copy into a temporary
+ (which is never freed ...)
+ */
+ newstr = stgMallocBytes(9, "cstring_from_COFF_symbol_name");
+ ASSERT(newstr);
+ strncpy(newstr,name,8);
+ newstr[8] = 0;
+ return newstr;
+}
+
+
+/* Just compares the short names (first 8 chars) */
+static COFF_section *
+findPEi386SectionCalled ( ObjectCode* oc, char* name )
+{
+ int i;
+ COFF_header* hdr
+ = (COFF_header*)(oc->image);
+ COFF_section* sectab
+ = (COFF_section*) (
+ ((UChar*)(oc->image))
+ + sizeof_COFF_header + hdr->SizeOfOptionalHeader
+ );
+ for (i = 0; i < hdr->NumberOfSections; i++) {
+ UChar* n1;
+ UChar* n2;
+ COFF_section* section_i
+ = (COFF_section*)
+ myindex ( sizeof_COFF_section, sectab, i );
+ n1 = (UChar*) &(section_i->Name);
+ n2 = name;
+ if (n1[0]==n2[0] && n1[1]==n2[1] && n1[2]==n2[2] &&
+ n1[3]==n2[3] && n1[4]==n2[4] && n1[5]==n2[5] &&
+ n1[6]==n2[6] && n1[7]==n2[7])
+ return section_i;
+ }
+
+ return NULL;
+}
+
+
+/* Strip a stdcall-style "@NN" suffix from a symbol name in place:
+ if the name ends in '@' followed by one or more digits (and the '@'
+ is neither the first nor the last character), truncate at the '@'. */
+static void
+zapTrailingAtSign ( UChar* sym )
+{
+# define my_isdigit(c) ((c) >= '0' && (c) <= '9')
+ int i, j;
+ if (sym[0] == 0) return;
+ i = 0;
+ while (sym[i] != 0) i++;
+ i--; /* i = index of last character */
+ j = i;
+ while (j > 0 && my_isdigit(sym[j])) j--;
+ /* j != i ensures at least one digit followed the '@'. */
+ if (j > 0 && sym[j] == '@' && j != i) sym[j] = 0;
+# undef my_isdigit
+}
+
+
/* Sanity-check a PEi386 (COFF) object image before loading: verify
   the machine type, optional-header size and file characteristics.
   Returns 1 if the image looks like a loadable x86 relocatable
   object, 0 otherwise.  When the 'linker' debug flag is on, also
   dumps the section table, relocations, string table and symbol
   table (debug output only; no further verification happens after
   the characteristics checks). */
static int
ocVerifyImage_PEi386 ( ObjectCode* oc )
{
   int i;
   UInt32 j, noRelocs;
   COFF_header*  hdr;
   COFF_section* sectab;
   COFF_symbol*  symtab;
   UChar*        strtab;
   /* debugBelch("\nLOADING %s\n", oc->fileName); */
   hdr = (COFF_header*)(oc->image);
   /* section table follows the (empty, see below) optional header */
   sectab = (COFF_section*) (
               ((UChar*)(oc->image))
               + sizeof_COFF_header + hdr->SizeOfOptionalHeader
            );
   symtab = (COFF_symbol*) (
               ((UChar*)(oc->image))
               + hdr->PointerToSymbolTable
            );
   /* the string table sits immediately after the symbol table */
   strtab = ((UChar*)symtab)
            + hdr->NumberOfSymbols * sizeof_COFF_symbol;

   /* 0x14c is the PE machine id for Intel 386. */
   if (hdr->Machine != 0x14c) {
      errorBelch("%s: Not x86 PEi386", oc->fileName);
      return 0;
   }
   if (hdr->SizeOfOptionalHeader != 0) {
      errorBelch("%s: PEi386 with nonempty optional header", oc->fileName);
      return 0;
   }
   /* reject executables, DLLs and system files: we only load
      relocatable objects */
   if ( /* (hdr->Characteristics & MYIMAGE_FILE_RELOCS_STRIPPED) || */
        (hdr->Characteristics & MYIMAGE_FILE_EXECUTABLE_IMAGE) ||
        (hdr->Characteristics & MYIMAGE_FILE_DLL) ||
        (hdr->Characteristics & MYIMAGE_FILE_SYSTEM) ) {
      errorBelch("%s: Not a PEi386 object file", oc->fileName);
      return 0;
   }
   if ( (hdr->Characteristics & MYIMAGE_FILE_BYTES_REVERSED_HI)
        /* || !(hdr->Characteristics & MYIMAGE_FILE_32BIT_MACHINE) */ ) {
      errorBelch("%s: Invalid PEi386 word size or endiannness: %d",
                 oc->fileName,
                 (int)(hdr->Characteristics));
      return 0;
   }
   /* If the string table size is way crazy, this might indicate that
      there are more than 64k relocations, despite claims to the
      contrary.  Hence this test. */
   /* debugBelch("strtab size %d\n", * (UInt32*)strtab); */
#if 0
   if ( (*(UInt32*)strtab) > 600000 ) {
      /* Note that 600k has no special significance other than being
         big enough to handle the almost-2MB-sized lumps that
         constitute HSwin32*.o. */
      debugBelch("PEi386 object has suspiciously large string table; > 64k relocs?");
      return 0;
   }
#endif

   /* No further verification after this point; only debug printing. */
   i = 0;
   IF_DEBUG(linker, i=1);
   if (i == 0) return 1;

   debugBelch( "sectab offset = %d\n", ((UChar*)sectab) - ((UChar*)hdr) );
   debugBelch( "symtab offset = %d\n", ((UChar*)symtab) - ((UChar*)hdr) );
   debugBelch( "strtab offset = %d\n", ((UChar*)strtab) - ((UChar*)hdr) );

   debugBelch("\n" );
   debugBelch( "Machine:           0x%x\n", (UInt32)(hdr->Machine) );
   debugBelch( "# sections:        %d\n",   (UInt32)(hdr->NumberOfSections) );
   debugBelch( "time/date:         0x%x\n", (UInt32)(hdr->TimeDateStamp) );
   debugBelch( "symtab offset:     %d\n",   (UInt32)(hdr->PointerToSymbolTable) );
   debugBelch( "# symbols:         %d\n",   (UInt32)(hdr->NumberOfSymbols) );
   debugBelch( "sz of opt hdr:     %d\n",   (UInt32)(hdr->SizeOfOptionalHeader) );
   debugBelch( "characteristics:   0x%x\n", (UInt32)(hdr->Characteristics) );

   /* Print the section table. */
   debugBelch("\n" );
   for (i = 0; i < hdr->NumberOfSections; i++) {
      COFF_reloc* reltab;
      COFF_section* sectab_i
         = (COFF_section*)
           myindex ( sizeof_COFF_section, sectab, i );
      debugBelch(
                "\n"
                "section %d\n"
                "     name `",
                i
              );
      printName ( sectab_i->Name, strtab );
      debugBelch(
                "'\n"
                "    vsize %d\n"
                "    vaddr %d\n"
                "  data sz %d\n"
                " data off %d\n"
                "  num rel %d\n"
                "  off rel %d\n"
                "  ptr raw 0x%x\n",
                sectab_i->VirtualSize,
                sectab_i->VirtualAddress,
                sectab_i->SizeOfRawData,
                sectab_i->PointerToRawData,
                sectab_i->NumberOfRelocations,
                sectab_i->PointerToRelocations,
                sectab_i->PointerToRawData
              );
      reltab = (COFF_reloc*) (
                  ((UChar*)(oc->image)) + sectab_i->PointerToRelocations
               );

      if ( sectab_i->Characteristics & MYIMAGE_SCN_LNK_NRELOC_OVFL ) {
        /* If the relocation field (a short) has overflowed, the
         * real count can be found in the first reloc entry.
         *
         * See Section 4.1 (last para) of the PE spec (rev6.0).
         */
        COFF_reloc* rel = (COFF_reloc*)
                           myindex ( sizeof_COFF_reloc, reltab, 0 );
        noRelocs = rel->VirtualAddress;
        j = 1;
      } else {
        noRelocs = sectab_i->NumberOfRelocations;
        j = 0;
      }

      for (; j < noRelocs; j++) {
         COFF_symbol* sym;
         COFF_reloc* rel = (COFF_reloc*)
                           myindex ( sizeof_COFF_reloc, reltab, j );
         debugBelch(
                   "        type 0x%-4x   vaddr 0x%-8x   name `",
                   (UInt32)rel->Type,
                   rel->VirtualAddress );
         sym = (COFF_symbol*)
               myindex ( sizeof_COFF_symbol, symtab, rel->SymbolTableIndex );
         /* Hmm..mysterious looking offset - what's it for? SOF */
         printName ( sym->Name, strtab -10 );
         debugBelch("'\n" );
      }

      debugBelch("\n" );
   }
   debugBelch("\n" );
   debugBelch("string table has size 0x%x\n", * (UInt32*)strtab );
   debugBelch("---START of string table---\n");
   /* first 4 bytes of the string table hold its own size, so start at 4 */
   for (i = 4; i < *(Int32*)strtab; i++) {
      if (strtab[i] == 0)
         debugBelch("\n"); else
         debugBelch("%c", strtab[i] );
   }
   debugBelch("--- END  of string table---\n");

   debugBelch("\n" );
   i = 0;
   while (1) {
      COFF_symbol* symtab_i;
      if (i >= (Int32)(hdr->NumberOfSymbols)) break;
      symtab_i = (COFF_symbol*)
                 myindex ( sizeof_COFF_symbol, symtab, i );
      debugBelch(
                "symbol %d\n"
                "     name `",
                i
              );
      printName ( symtab_i->Name, strtab );
      debugBelch(
                "'\n"
                "    value 0x%x\n"
                "   1+sec# %d\n"
                "     type 0x%x\n"
                "   sclass 0x%x\n"
                "     nAux %d\n",
                symtab_i->Value,
                (Int32)(symtab_i->SectionNumber),
                (UInt32)symtab_i->Type,
                (UInt32)symtab_i->StorageClass,
                (UInt32)symtab_i->NumberOfAuxSymbols
              );
      /* skip over this symbol's auxiliary records */
      i += symtab_i->NumberOfAuxSymbols;
      i++;
   }

   debugBelch("\n" );
   return 1;
}
+
+
/* Walk a PEi386 object's section and symbol tables: allocate zeroed
   space for .bss-style data (patching section headers so they point
   at it), record each recognised section in the ObjectCode, and
   enter every defined external symbol into the linker's symbol hash
   table.  Returns 1 on success, 0 on failure. */
static int
ocGetNames_PEi386 ( ObjectCode* oc )
{
   COFF_header*  hdr;
   COFF_section* sectab;
   COFF_symbol*  symtab;
   UChar*        strtab;

   UChar* sname;
   void*  addr;
   int    i;

   hdr = (COFF_header*)(oc->image);
   sectab = (COFF_section*) (
               ((UChar*)(oc->image))
               + sizeof_COFF_header + hdr->SizeOfOptionalHeader
            );
   symtab = (COFF_symbol*) (
               ((UChar*)(oc->image))
               + hdr->PointerToSymbolTable
            );
   /* the string table sits immediately after the symbol table */
   strtab = ((UChar*)(oc->image))
            + hdr->PointerToSymbolTable
            + hdr->NumberOfSymbols * sizeof_COFF_symbol;

   /* Allocate space for any (local, anonymous) .bss sections. */

   for (i = 0; i < hdr->NumberOfSections; i++) {
      UInt32 bss_sz;
      UChar* zspace;
      COFF_section* sectab_i
         = (COFF_section*)
           myindex ( sizeof_COFF_section, sectab, i );
      /* NOTE(review): strcmp on the 8-byte Name field relies on names
         shorter than 8 chars being NUL-padded, as the COFF short-name
         convention used elsewhere in this file assumes. */
      if (0 != strcmp(sectab_i->Name, ".bss")) continue;
      /* sof 10/05: the PE spec text isn't too clear regarding what
       * the SizeOfRawData field is supposed to hold for object
       * file sections containing just uninitialized data -- for executables,
       * it is supposed to be zero; unclear what it's supposed to be
       * for object files. However, VirtualSize is guaranteed to be
       * zero for object files, which definitely suggests that SizeOfRawData
       * will be non-zero (where else would the size of this .bss section be
       * stored?) Looking at the COFF_section info for incoming object files,
       * this certainly appears to be the case.
       *
       * => I suspect we've been incorrectly handling .bss sections in (relocatable)
       * object files up until now. This turned out to bite us with ghc-6.4.1's use
       * of gcc-3.4.x, which has started to emit initially-zeroed-out local 'static'
       * variable decls into to the .bss section. (The specific function in Q which
       * triggered this is libraries/base/cbits/dirUtils.c:__hscore_getFolderPath())
       */
      if (sectab_i->VirtualSize == 0 && sectab_i->SizeOfRawData == 0) continue;
      /* This is a non-empty .bss section.  Allocate zeroed space for
         it, and set its PointerToRawData field such that oc->image +
         PointerToRawData == addr_of_zeroed_space.  */
      bss_sz = sectab_i->VirtualSize;
      if ( bss_sz < sectab_i->SizeOfRawData) { bss_sz = sectab_i->SizeOfRawData; }
      zspace = stgCallocBytes(1, bss_sz, "ocGetNames_PEi386(anonymous bss)");
      /* note: this offset may be "negative" as a UInt32, since zspace
         is a separate allocation; it round-trips when added back */
      sectab_i->PointerToRawData = ((UChar*)zspace) - ((UChar*)(oc->image));
      addProddableBlock(oc, zspace, bss_sz);
      /* debugBelch("BSS anon section at 0x%x\n", zspace); */
   }

   /* Copy section information into the ObjectCode. */

   for (i = 0; i < hdr->NumberOfSections; i++) {
      UChar* start;
      UChar* end;
      UInt32 sz;

      SectionKind kind
         = SECTIONKIND_OTHER;
      COFF_section* sectab_i
         = (COFF_section*)
           myindex ( sizeof_COFF_section, sectab, i );
      IF_DEBUG(linker, debugBelch("section name = %s\n", sectab_i->Name ));

#     if 0
      /* I'm sure this is the Right Way to do it.  However, the
         alternative of testing the sectab_i->Name field seems to
         work ok with Cygwin.
      */
      if (sectab_i->Characteristics & MYIMAGE_SCN_CNT_CODE ||
          sectab_i->Characteristics & MYIMAGE_SCN_CNT_INITIALIZED_DATA)
         kind = SECTIONKIND_CODE_OR_RODATA;
#     endif

      /* classify by well-known section name */
      if (0==strcmp(".text",sectab_i->Name) ||
          0==strcmp(".rdata",sectab_i->Name)||
          0==strcmp(".rodata",sectab_i->Name))
         kind = SECTIONKIND_CODE_OR_RODATA;
      if (0==strcmp(".data",sectab_i->Name) ||
          0==strcmp(".bss",sectab_i->Name))
         kind = SECTIONKIND_RWDATA;

      /* exactly one of the two size fields should be in use */
      ASSERT(sectab_i->SizeOfRawData == 0 || sectab_i->VirtualSize == 0);
      sz = sectab_i->SizeOfRawData;
      if (sz < sectab_i->VirtualSize) sz = sectab_i->VirtualSize;

      start = ((UChar*)(oc->image)) + sectab_i->PointerToRawData;
      end   = start + sz - 1;

      if (kind == SECTIONKIND_OTHER
          /* Ignore sections called which contain stabs debugging
             information. */
          && 0 != strcmp(".stab", sectab_i->Name)
          && 0 != strcmp(".stabstr", sectab_i->Name)
          /* ignore constructor section for now */
          && 0 != strcmp(".ctors", sectab_i->Name)
         ) {
         errorBelch("Unknown PEi386 section name `%s' (while processing: %s)", sectab_i->Name, oc->fileName);
         return 0;
      }

      if (kind != SECTIONKIND_OTHER && end >= start) {
         addSection(oc, kind, start, end);
         addProddableBlock(oc, start, end - start + 1);
      }
   }

   /* Copy exported symbols into the ObjectCode. */

   oc->n_symbols = hdr->NumberOfSymbols;
   oc->symbols   = stgMallocBytes(oc->n_symbols * sizeof(char*),
                                  "ocGetNames_PEi386(oc->symbols)");
   /* Call me paranoid; I don't care. */
   for (i = 0; i < oc->n_symbols; i++)
      oc->symbols[i] = NULL;

   i = 0;
   while (1) {
      COFF_symbol* symtab_i;
      if (i >= (Int32)(hdr->NumberOfSymbols)) break;
      symtab_i = (COFF_symbol*)
                 myindex ( sizeof_COFF_symbol, symtab, i );

      addr = NULL;

      if (symtab_i->StorageClass == MYIMAGE_SYM_CLASS_EXTERNAL
          && symtab_i->SectionNumber != MYIMAGE_SYM_UNDEFINED) {
         /* This symbol is global and defined, viz, exported */
         /* for MYIMAGE_SYMCLASS_EXTERNAL
               && !MYIMAGE_SYM_UNDEFINED,
            the address of the symbol is:
                address of relevant section + offset in section
         */
         COFF_section* sectabent
            = (COFF_section*) myindex ( sizeof_COFF_section,
                                        sectab,
                                        symtab_i->SectionNumber-1 );
         addr = ((UChar*)(oc->image))
                + (sectabent->PointerToRawData
                   + symtab_i->Value);
      }
      else
      if (symtab_i->SectionNumber == MYIMAGE_SYM_UNDEFINED
          && symtab_i->Value > 0) {
         /* This symbol isn't in any section at all, ie, global bss.
            Allocate zeroed space for it. */
         addr = stgCallocBytes(1, symtab_i->Value,
                               "ocGetNames_PEi386(non-anonymous bss)");
         addSection(oc, SECTIONKIND_RWDATA, addr,
                        ((UChar*)addr) + symtab_i->Value - 1);
         addProddableBlock(oc, addr, symtab_i->Value);
         /* debugBelch("BSS      section at 0x%x\n", addr); */
      }

      if (addr != NULL ) {
         sname = cstring_from_COFF_symbol_name ( symtab_i->Name, strtab );
         /* debugBelch("addSymbol %p `%s \n", addr,sname);  */
         IF_DEBUG(linker, debugBelch("addSymbol %p `%s'\n", addr,sname);)
         ASSERT(i >= 0 && i < oc->n_symbols);
         /* cstring_from_COFF_symbol_name always succeeds. */
         oc->symbols[i] = sname;
         ghciInsertStrHashTable(oc->fileName, symhash, sname, addr);
      } else {
#        if 0
         debugBelch(
                   "IGNORING symbol %d\n"
                   "     name `",
                   i
                 );
         printName ( symtab_i->Name, strtab );
         debugBelch(
                   "'\n"
                   "    value 0x%x\n"
                   "   1+sec# %d\n"
                   "     type 0x%x\n"
                   "   sclass 0x%x\n"
                   "     nAux %d\n",
                   symtab_i->Value,
                   (Int32)(symtab_i->SectionNumber),
                   (UInt32)symtab_i->Type,
                   (UInt32)symtab_i->StorageClass,
                   (UInt32)symtab_i->NumberOfAuxSymbols
                 );
#        endif
      }

      /* skip over this symbol's auxiliary records */
      i += symtab_i->NumberOfAuxSymbols;
      i++;
   }

   return 1;
}
+
+
+static int
+ocResolve_PEi386 ( ObjectCode* oc )
+{
+ COFF_header* hdr;
+ COFF_section* sectab;
+ COFF_symbol* symtab;
+ UChar* strtab;
+
+ UInt32 A;
+ UInt32 S;
+ UInt32* pP;
+
+ int i;
+ UInt32 j, noRelocs;
+
+ /* ToDo: should be variable-sized? But is at least safe in the
+ sense of buffer-overrun-proof. */
+ char symbol[1000];
+ /* debugBelch("resolving for %s\n", oc->fileName); */
+
+ hdr = (COFF_header*)(oc->image);
+ sectab = (COFF_section*) (
+ ((UChar*)(oc->image))
+ + sizeof_COFF_header + hdr->SizeOfOptionalHeader
+ );
+ symtab = (COFF_symbol*) (
+ ((UChar*)(oc->image))
+ + hdr->PointerToSymbolTable
+ );
+ strtab = ((UChar*)(oc->image))
+ + hdr->PointerToSymbolTable
+ + hdr->NumberOfSymbols * sizeof_COFF_symbol;
+
+ for (i = 0; i < hdr->NumberOfSections; i++) {
+ COFF_section* sectab_i
+ = (COFF_section*)
+ myindex ( sizeof_COFF_section, sectab, i );
+ COFF_reloc* reltab
+ = (COFF_reloc*) (
+ ((UChar*)(oc->image)) + sectab_i->PointerToRelocations
+ );
+
+ /* Ignore sections called which contain stabs debugging
+ information. */
+ if (0 == strcmp(".stab", sectab_i->Name)
+ || 0 == strcmp(".stabstr", sectab_i->Name)
+ || 0 == strcmp(".ctors", sectab_i->Name))
+ continue;
+
+ if ( sectab_i->Characteristics & MYIMAGE_SCN_LNK_NRELOC_OVFL ) {
+ /* If the relocation field (a short) has overflowed, the
+ * real count can be found in the first reloc entry.
+ *
+ * See Section 4.1 (last para) of the PE spec (rev6.0).
+ *
+ * Nov2003 update: the GNU linker still doesn't correctly
+ * handle the generation of relocatable object files with
+ * overflown relocations. Hence the output to warn of potential
+ * troubles.
+ */
+ COFF_reloc* rel = (COFF_reloc*)
+ myindex ( sizeof_COFF_reloc, reltab, 0 );
+ noRelocs = rel->VirtualAddress;
+
+ /* 10/05: we now assume (and check for) a GNU ld that is capable
+ * of handling object files with (>2^16) of relocs.
+ */
+#if 0
+ debugBelch("WARNING: Overflown relocation field (# relocs found: %u)\n",
+ noRelocs);
+#endif
+ j = 1;
+ } else {
+ noRelocs = sectab_i->NumberOfRelocations;
+ j = 0;
+ }
+
+
+ for (; j < noRelocs; j++) {
+ COFF_symbol* sym;
+ COFF_reloc* reltab_j
+ = (COFF_reloc*)
+ myindex ( sizeof_COFF_reloc, reltab, j );
+
+ /* the location to patch */
+ pP = (UInt32*)(
+ ((UChar*)(oc->image))
+ + (sectab_i->PointerToRawData
+ + reltab_j->VirtualAddress
+ - sectab_i->VirtualAddress )
+ );
+ /* the existing contents of pP */
+ A = *pP;
+ /* the symbol to connect to */
+ sym = (COFF_symbol*)
+ myindex ( sizeof_COFF_symbol,
+ symtab, reltab_j->SymbolTableIndex );
+ IF_DEBUG(linker,
+ debugBelch(
+ "reloc sec %2d num %3d: type 0x%-4x "
+ "vaddr 0x%-8x name `",
+ i, j,
+ (UInt32)reltab_j->Type,
+ reltab_j->VirtualAddress );
+ printName ( sym->Name, strtab );
+ debugBelch("'\n" ));
+
+ if (sym->StorageClass == MYIMAGE_SYM_CLASS_STATIC) {
+ COFF_section* section_sym
+ = findPEi386SectionCalled ( oc, sym->Name );
+ if (!section_sym) {
+ errorBelch("%s: can't find section `%s'", oc->fileName, sym->Name);
+ return 0;
+ }
+ S = ((UInt32)(oc->image))
+ + (section_sym->PointerToRawData
+ + sym->Value);
+ } else {
+ copyName ( sym->Name, strtab, symbol, 1000-1 );
+ (void*)S = lookupLocalSymbol( oc, symbol );
+ if ((void*)S != NULL) goto foundit;
+ (void*)S = lookupSymbol( symbol );
+ if ((void*)S != NULL) goto foundit;
+ zapTrailingAtSign ( symbol );
+ (void*)S = lookupLocalSymbol( oc, symbol );
+ if ((void*)S != NULL) goto foundit;
+ (void*)S = lookupSymbol( symbol );
+ if ((void*)S != NULL) goto foundit;
+ /* Newline first because the interactive linker has printed "linking..." */
+ errorBelch("\n%s: unknown symbol `%s'", oc->fileName, symbol);
+ return 0;
+ foundit:;
+ }
+ checkProddableBlock(oc, pP);
+ switch (reltab_j->Type) {
+ case MYIMAGE_REL_I386_DIR32:
+ *pP = A + S;
+ break;
+ case MYIMAGE_REL_I386_REL32:
+ /* Tricky. We have to insert a displacement at
+ pP which, when added to the PC for the _next_
+ insn, gives the address of the target (S).
+ Problem is to know the address of the next insn
+ when we only know pP. We assume that this
+ literal field is always the last in the insn,
+ so that the address of the next insn is pP+4
+ -- hence the constant 4.
+ Also I don't know if A should be added, but so
+ far it has always been zero.
+
+ SOF 05/2005: 'A' (old contents of *pP) have been observed
+ to contain values other than zero (the 'wx' object file
+ that came with wxhaskell-0.9.4; dunno how it was compiled..).
+ So, add displacement to old value instead of asserting
+ A to be zero. Fixes wxhaskell-related crashes, and no other
+ ill effects have been observed.
+
+ Update: the reason why we're seeing these more elaborate
+ relocations is due to a switch in how the NCG compiles SRTs
+ and offsets to them from info tables. SRTs live in .(ro)data,
+ while info tables live in .text, causing GAS to emit REL32/DISP32
+ relocations with non-zero values. Adding the displacement is
+ the right thing to do.
+ */
+ *pP = S - ((UInt32)pP) - 4 + A;
+ break;
+ default:
+ debugBelch("%s: unhandled PEi386 relocation type %d",
+ oc->fileName, reltab_j->Type);
+ return 0;
+ }
+
+ }
+ }
+
+ IF_DEBUG(linker, debugBelch("completed %s", oc->fileName));
+ return 1;
+}
+
+#endif /* defined(OBJFORMAT_PEi386) */
+
+
+/* --------------------------------------------------------------------------
+ * ELF specifics
+ * ------------------------------------------------------------------------*/
+
+#if defined(OBJFORMAT_ELF)
+
+#define FALSE 0
+#define TRUE 1
+
+#if defined(sparc_HOST_ARCH)
+# define ELF_TARGET_SPARC /* Used inside <elf.h> */
+#elif defined(i386_HOST_ARCH)
+# define ELF_TARGET_386 /* Used inside <elf.h> */
+#elif defined(x86_64_HOST_ARCH)
+# define ELF_TARGET_X64_64
+# define ELF_64BIT
+#elif defined (ia64_HOST_ARCH)
+# define ELF_TARGET_IA64 /* Used inside <elf.h> */
+# define ELF_64BIT
+# define ELF_FUNCTION_DESC /* calling convention uses function descriptors */
+# define ELF_NEED_GOT /* needs Global Offset Table */
+# define ELF_NEED_PLT /* needs Procedure Linkage Tables */
+#endif
+
+#if !defined(openbsd_HOST_OS)
+#include <elf.h>
+#else
+/* openbsd elf has things in different places, with diff names */
+#include <elf_abi.h>
+#include <machine/reloc.h>
+#define R_386_32 RELOC_32
+#define R_386_PC32 RELOC_PC32
+#endif
+
+/*
+ * Define a set of types which can be used for both ELF32 and ELF64
+ */
+
+#ifdef ELF_64BIT
+#define ELFCLASS ELFCLASS64
+#define Elf_Addr Elf64_Addr
+#define Elf_Word Elf64_Word
+#define Elf_Sword Elf64_Sword
+#define Elf_Ehdr Elf64_Ehdr
+#define Elf_Phdr Elf64_Phdr
+#define Elf_Shdr Elf64_Shdr
+#define Elf_Sym Elf64_Sym
+#define Elf_Rel Elf64_Rel
+#define Elf_Rela Elf64_Rela
+#define ELF_ST_TYPE ELF64_ST_TYPE
+#define ELF_ST_BIND ELF64_ST_BIND
+#define ELF_R_TYPE ELF64_R_TYPE
+#define ELF_R_SYM ELF64_R_SYM
+#else
+#define ELFCLASS ELFCLASS32
+#define Elf_Addr Elf32_Addr
+#define Elf_Word Elf32_Word
+#define Elf_Sword Elf32_Sword
+#define Elf_Ehdr Elf32_Ehdr
+#define Elf_Phdr Elf32_Phdr
+#define Elf_Shdr Elf32_Shdr
+#define Elf_Sym Elf32_Sym
+#define Elf_Rel Elf32_Rel
+#define Elf_Rela Elf32_Rela
+#ifndef ELF_ST_TYPE
+#define ELF_ST_TYPE ELF32_ST_TYPE
+#endif
+#ifndef ELF_ST_BIND
+#define ELF_ST_BIND ELF32_ST_BIND
+#endif
+#ifndef ELF_R_TYPE
+#define ELF_R_TYPE ELF32_R_TYPE
+#endif
+#ifndef ELF_R_SYM
+#define ELF_R_SYM ELF32_R_SYM
+#endif
+#endif
+
+
+/*
+ * Functions to allocate entries in dynamic sections. Currently we simply
+ * preallocate a large number, and we don't check if a entry for the given
+ * target already exists (a linear search is too slow). Ideally these
+ * entries would be associated with symbols.
+ */
+
+/* These sizes sufficient to load HSbase + HShaskell98 + a few modules */
+#define GOT_SIZE 0x20000
+#define FUNCTION_TABLE_SIZE 0x10000
+#define PLT_SIZE 0x08000
+
+#ifdef ELF_NEED_GOT
+static Elf_Addr got[GOT_SIZE];
+static unsigned int gotIndex;
+static Elf_Addr gp_val = (Elf_Addr)got;
+
+static Elf_Addr
+allocateGOTEntry(Elf_Addr target)
+{
+ Elf_Addr *entry;
+
+ if (gotIndex >= GOT_SIZE)
+ barf("Global offset table overflow");
+
+ entry = &got[gotIndex++];
+ *entry = target;
+ return (Elf_Addr)entry;
+}
+#endif
+
+#ifdef ELF_FUNCTION_DESC
+typedef struct {
+ Elf_Addr ip;
+ Elf_Addr gp;
+} FunctionDesc;
+
+static FunctionDesc functionTable[FUNCTION_TABLE_SIZE];
+static unsigned int functionTableIndex;
+
+static Elf_Addr
+allocateFunctionDesc(Elf_Addr target)
+{
+ FunctionDesc *entry;
+
+ if (functionTableIndex >= FUNCTION_TABLE_SIZE)
+ barf("Function table overflow");
+
+ entry = &functionTable[functionTableIndex++];
+ entry->ip = target;
+ entry->gp = (Elf_Addr)gp_val;
+ return (Elf_Addr)entry;
+}
+
+static Elf_Addr
+copyFunctionDesc(Elf_Addr target)
+{
+ FunctionDesc *olddesc = (FunctionDesc *)target;
+ FunctionDesc *newdesc;
+
+ newdesc = (FunctionDesc *)allocateFunctionDesc(olddesc->ip);
+ newdesc->gp = olddesc->gp;
+ return (Elf_Addr)newdesc;
+}
+#endif
+
+#ifdef ELF_NEED_PLT
+#ifdef ia64_HOST_ARCH
+static void ia64_reloc_gprel22(Elf_Addr target, Elf_Addr value);
+static void ia64_reloc_pcrel21(Elf_Addr target, Elf_Addr value, ObjectCode *oc);
+
/* Machine code for one ia64 PLT stub: loads the target's function
   descriptor through gp (the gprel offset is patched in afterwards
   by PLT_RELOC) and branches to it. */
static unsigned char plt_code[] =
{
    /* taken from binutils bfd/elfxx-ia64.c */
    0x0b, 0x78, 0x00, 0x02, 0x00, 0x24, /* [MMI] addl r15=0,r1;; */
    0x00, 0x41, 0x3c, 0x30, 0x28, 0xc0, /* ld8 r16=[r15],8 */
    0x01, 0x08, 0x00, 0x84, /* mov r14=r1;; */
    0x11, 0x08, 0x00, 0x1e, 0x18, 0x10, /* [MIB] ld8 r1=[r15] */
    0x60, 0x80, 0x04, 0x80, 0x03, 0x00, /* mov b6=r16 */
    0x60, 0x00, 0x80, 0x00 /* br.few b6;; */
};

/* If we can't get to the function descriptor via gp, take a local copy of it */
/* (copyFunctionDesc puts the copy in our own table, which *is* within
   gprel22 range of our gp). */
#define PLT_RELOC(code, target) { \
   Elf64_Sxword rel_value = target - gp_val; \
   if ((rel_value > 0x1fffff) || (rel_value < -0x1fffff)) \
      ia64_reloc_gprel22((Elf_Addr)code, copyFunctionDesc(target)); \
   else \
      ia64_reloc_gprel22((Elf_Addr)code, target); \
   }
#endif

/* One PLT entry is exactly one copy of the stub code above. */
typedef struct {
   unsigned char code[sizeof(plt_code)];
} PLTEntry;
+
+static Elf_Addr
+allocatePLTEntry(Elf_Addr target, ObjectCode *oc)
+{
+ PLTEntry *plt = (PLTEntry *)oc->plt;
+ PLTEntry *entry;
+
+ if (oc->pltIndex >= PLT_SIZE)
+ barf("Procedure table overflow");
+
+ entry = &plt[oc->pltIndex++];
+ memcpy(entry->code, plt_code, sizeof(entry->code));
+ PLT_RELOC(entry->code, target);
+ return (Elf_Addr)entry;
+}
+
+static unsigned int
+PLTSize(void)
+{
+ return (PLT_SIZE * sizeof(PLTEntry));
+}
+#endif
+
+
+#if x86_64_HOST_ARCH
+// On x86_64, 32-bit relocations are often used, which requires that
+// we can resolve a symbol to a 32-bit offset. However, shared
+// libraries are placed outside the 2Gb area, which leaves us with a
+// problem when we need to give a 32-bit offset to a symbol in a
+// shared library.
+//
+// For a function symbol, we can allocate a bounce sequence inside the
+// 2Gb area and resolve the symbol to this. The bounce sequence is
+// simply a long jump instruction to the real location of the symbol.
+//
+// For data references, we're screwed.
+//
/* One bounce-buffer entry: a rip-relative indirect jump whose target
   pointer ('addr') sits immediately after it.  The instruction is 6
   bytes; jmp[] is padded to 8 so that 'addr' lies at offset 8, which
   is what the 0x00000002(%rip) displacement reaches (rip points past
   the 6-byte instruction). */
typedef struct {
    unsigned char jmp[8];  /* 6 byte instruction: jmpq *0x00000002(%rip) */
    void *addr;
} x86_64_bounce;

/* number of entries per mmap'd bounce buffer */
#define X86_64_BB_SIZE 1024

static x86_64_bounce *x86_64_bounce_buffer = NULL;
static nat x86_64_bb_next_off;   /* index of next free entry in the buffer */
+
/* Make 'addr' (which may live above the 2Gb boundary, e.g. in a
   shared library) reachable from 32-bit relocations: allocate a
   bounce entry in memory mmap'd with MAP_32BIT containing an
   indirect jump to 'addr', register the bounce under 'lbl' in the
   symbol hash, and return the bounce's (low) address.  Only works
   for code symbols -- a data reference cannot be bounced (see the
   comment block above). */
static void*
x86_64_high_symbol( char *lbl, void *addr )
{
    x86_64_bounce *bounce;

    if ( x86_64_bounce_buffer == NULL ||
         x86_64_bb_next_off >= X86_64_BB_SIZE ) {
        /* Grab a fresh buffer in the low 2Gb.  A full previous buffer
           is deliberately not unmapped: its entries have already been
           handed out and must stay executable. */
        x86_64_bounce_buffer =
            mmap(NULL, X86_64_BB_SIZE * sizeof(x86_64_bounce),
                 PROT_EXEC|PROT_READ|PROT_WRITE,
                 MAP_PRIVATE|MAP_32BIT|MAP_ANONYMOUS, -1, 0);
        if (x86_64_bounce_buffer == MAP_FAILED) {
            barf("x86_64_high_symbol: mmap failed");
        }
        x86_64_bb_next_off = 0;
    }
    bounce = &x86_64_bounce_buffer[x86_64_bb_next_off];
    /* jmpq *0x00000002(%rip): jump via the 'addr' field 8 bytes into
       this entry (rip points just past the 6-byte instruction) */
    bounce->jmp[0] = 0xff;
    bounce->jmp[1] = 0x25;
    bounce->jmp[2] = 0x02;
    bounce->jmp[3] = 0x00;
    bounce->jmp[4] = 0x00;
    bounce->jmp[5] = 0x00;
    bounce->addr = addr;
    x86_64_bb_next_off++;

    IF_DEBUG(linker, debugBelch("x86_64: allocated bounce entry for %s->%p at %p\n",
                                lbl, addr, bounce));

    insertStrHashTable(symhash, lbl, bounce);
    return bounce;
}
+#endif
+
+
+/*
+ * Generic ELF functions
+ */
+
+static char *
+findElfSection ( void* objImage, Elf_Word sh_type )
+{
+ char* ehdrC = (char*)objImage;
+ Elf_Ehdr* ehdr = (Elf_Ehdr*)ehdrC;
+ Elf_Shdr* shdr = (Elf_Shdr*)(ehdrC + ehdr->e_shoff);
+ char* sh_strtab = ehdrC + shdr[ehdr->e_shstrndx].sh_offset;
+ char* ptr = NULL;
+ int i;
+
+ for (i = 0; i < ehdr->e_shnum; i++) {
+ if (shdr[i].sh_type == sh_type
+ /* Ignore the section header's string table. */
+ && i != ehdr->e_shstrndx
+ /* Ignore string tables named .stabstr, as they contain
+ debugging info. */
+ && 0 != memcmp(".stabstr", sh_strtab + shdr[i].sh_name, 8)
+ ) {
+ ptr = ehdrC + shdr[i].sh_offset;
+ break;
+ }
+ }
+ return ptr;
+}
+
+#if defined(ia64_HOST_ARCH)
/* Return the base virtual address of the program segment containing
   'vaddr'.  NOTE(review): if no segment contains 'vaddr', the loop
   falls through and the function returns the *last* segment's
   p_vaddr (or 0 if there are no segments) rather than signalling
   failure -- confirm callers never pass an unmapped address. */
static Elf_Addr
findElfSegment ( void* objImage, Elf_Addr vaddr )
{
   char* ehdrC = (char*)objImage;
   Elf_Ehdr* ehdr = (Elf_Ehdr*)ehdrC;
   Elf_Phdr* phdr = (Elf_Phdr*)(ehdrC + ehdr->e_phoff);
   Elf_Addr segaddr = 0;
   int i;

   for (i = 0; i < ehdr->e_phnum; i++) {
      segaddr = phdr[i].p_vaddr;
      if ((vaddr >= segaddr) && (vaddr < segaddr + phdr[i].p_memsz))
         break;
   }
   return segaddr;
}
+#endif
+
/* Verify that an ELF image is something this linker can load: right
   magic, word size (matches ELFCLASS for this build), endianness,
   relocatable type, known machine, exactly one normal string table,
   and at least one symbol table.  Returns 1 if acceptable, 0
   otherwise.  With the 'linker' debug flag set, also dumps section
   headers, string tables and symbol tables along the way. */
static int
ocVerifyImage_ELF ( ObjectCode* oc )
{
   Elf_Shdr* shdr;
   Elf_Sym*  stab;
   int i, j, nent, nstrtab, nsymtabs;
   char* sh_strtab;
   char* strtab;

   char*     ehdrC = (char*)(oc->image);
   Elf_Ehdr* ehdr  = (Elf_Ehdr*)ehdrC;

   if (ehdr->e_ident[EI_MAG0] != ELFMAG0 ||
       ehdr->e_ident[EI_MAG1] != ELFMAG1 ||
       ehdr->e_ident[EI_MAG2] != ELFMAG2 ||
       ehdr->e_ident[EI_MAG3] != ELFMAG3) {
      errorBelch("%s: not an ELF object", oc->fileName);
      return 0;
   }

   /* ELFCLASS is ELFCLASS32 or ELFCLASS64 depending on this build */
   if (ehdr->e_ident[EI_CLASS] != ELFCLASS) {
      errorBelch("%s: unsupported ELF format", oc->fileName);
      return 0;
   }

   if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) {
       IF_DEBUG(linker,debugBelch( "Is little-endian\n" ));
   } else
   if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) {
       IF_DEBUG(linker,debugBelch( "Is big-endian\n" ));
   } else {
       errorBelch("%s: unknown endiannness", oc->fileName);
       return 0;
   }

   if (ehdr->e_type != ET_REL) {
      errorBelch("%s: not a relocatable object (.o) file", oc->fileName);
      return 0;
   }
   IF_DEBUG(linker, debugBelch( "Is a relocatable object (.o) file\n" ));

   IF_DEBUG(linker,debugBelch( "Architecture is " ));
   switch (ehdr->e_machine) {
      case EM_386:   IF_DEBUG(linker,debugBelch( "x86" )); break;
      case EM_SPARC: IF_DEBUG(linker,debugBelch( "sparc" )); break;
#ifdef EM_IA_64
      case EM_IA_64: IF_DEBUG(linker,debugBelch( "ia64" )); break;
#endif
      case EM_PPC:   IF_DEBUG(linker,debugBelch( "powerpc32" )); break;
#ifdef EM_X86_64
      case EM_X86_64: IF_DEBUG(linker,debugBelch( "x86_64" )); break;
#endif
      default:       IF_DEBUG(linker,debugBelch( "unknown" ));
                     errorBelch("%s: unknown architecture", oc->fileName);
                     return 0;
   }

   IF_DEBUG(linker,debugBelch(
             "\nSection header table: start %ld, n_entries %d, ent_size %d\n",
             (long)ehdr->e_shoff, ehdr->e_shnum, ehdr->e_shentsize  ));

   ASSERT (ehdr->e_shentsize == sizeof(Elf_Shdr));

   shdr = (Elf_Shdr*) (ehdrC + ehdr->e_shoff);

   if (ehdr->e_shstrndx == SHN_UNDEF) {
      errorBelch("%s: no section header string table", oc->fileName);
      return 0;
   } else {
      IF_DEBUG(linker,debugBelch( "Section header string table is section %d\n",
                          ehdr->e_shstrndx));
      sh_strtab = ehdrC + shdr[ehdr->e_shstrndx].sh_offset;
   }

   /* dump every section header */
   for (i = 0; i < ehdr->e_shnum; i++) {
      IF_DEBUG(linker,debugBelch("%2d:  ", i ));
      IF_DEBUG(linker,debugBelch("type=%2d  ", (int)shdr[i].sh_type ));
      IF_DEBUG(linker,debugBelch("size=%4d  ", (int)shdr[i].sh_size ));
      IF_DEBUG(linker,debugBelch("offs=%4d  ", (int)shdr[i].sh_offset ));
      IF_DEBUG(linker,debugBelch("  (%p .. %p)  ",
               ehdrC + shdr[i].sh_offset,
                      ehdrC + shdr[i].sh_offset + shdr[i].sh_size - 1));

      if (shdr[i].sh_type == SHT_REL) {
          IF_DEBUG(linker,debugBelch("Rel  " ));
      } else if (shdr[i].sh_type == SHT_RELA) {
          IF_DEBUG(linker,debugBelch("RelA " ));
      } else {
          IF_DEBUG(linker,debugBelch("     "));
      }
      if (sh_strtab) {
          IF_DEBUG(linker,debugBelch("sname=%s\n", sh_strtab + shdr[i].sh_name ));
      }
   }

   /* there must be exactly one "normal" string table (not the
      section-header strtab, not .stabstr) */
   IF_DEBUG(linker,debugBelch( "\nString tables" ));
   strtab = NULL;
   nstrtab = 0;
   for (i = 0; i < ehdr->e_shnum; i++) {
      if (shdr[i].sh_type == SHT_STRTAB
          /* Ignore the section header's string table. */
          && i != ehdr->e_shstrndx
          /* Ignore string tables named .stabstr, as they contain
             debugging info. */
          && 0 != memcmp(".stabstr", sh_strtab + shdr[i].sh_name, 8)
         ) {
         IF_DEBUG(linker,debugBelch("   section %d is a normal string table", i ));
         strtab = ehdrC + shdr[i].sh_offset;
         nstrtab++;
      }
   }
   if (nstrtab != 1) {
      errorBelch("%s: no string tables, or too many", oc->fileName);
      return 0;
   }

   nsymtabs = 0;
   IF_DEBUG(linker,debugBelch( "\nSymbol tables" ));
   for (i = 0; i < ehdr->e_shnum; i++) {
      if (shdr[i].sh_type != SHT_SYMTAB) continue;
      IF_DEBUG(linker,debugBelch( "section %d is a symbol table\n", i ));
      nsymtabs++;
      stab = (Elf_Sym*) (ehdrC + shdr[i].sh_offset);
      nent = shdr[i].sh_size / sizeof(Elf_Sym);
      IF_DEBUG(linker,debugBelch( "   number of entries is apparently %d (%ld rem)\n",
               nent,
               (long)shdr[i].sh_size % sizeof(Elf_Sym)
             ));
      if (0 != shdr[i].sh_size % sizeof(Elf_Sym)) {
         errorBelch("%s: non-integral number of symbol table entries", oc->fileName);
         return 0;
      }
      for (j = 0; j < nent; j++) {
         IF_DEBUG(linker,debugBelch("   %2d  ", j ));
         IF_DEBUG(linker,debugBelch("  sec=%-5d  size=%-3d  val=%5p  ",
                             (int)stab[j].st_shndx,
                             (int)stab[j].st_size,
                             (char*)stab[j].st_value ));

         IF_DEBUG(linker,debugBelch("type=" ));
         switch (ELF_ST_TYPE(stab[j].st_info)) {
            case STT_NOTYPE:  IF_DEBUG(linker,debugBelch("notype " )); break;
            case STT_OBJECT:  IF_DEBUG(linker,debugBelch("object " )); break;
            case STT_FUNC  :  IF_DEBUG(linker,debugBelch("func   " )); break;
            case STT_SECTION: IF_DEBUG(linker,debugBelch("section" )); break;
            case STT_FILE:    IF_DEBUG(linker,debugBelch("file   " )); break;
            default:          IF_DEBUG(linker,debugBelch("?      " )); break;
         }
         IF_DEBUG(linker,debugBelch("  " ));

         IF_DEBUG(linker,debugBelch("bind=" ));
         switch (ELF_ST_BIND(stab[j].st_info)) {
            case STB_LOCAL :  IF_DEBUG(linker,debugBelch("local " )); break;
            case STB_GLOBAL:  IF_DEBUG(linker,debugBelch("global" )); break;
            case STB_WEAK  :  IF_DEBUG(linker,debugBelch("weak  " )); break;
            default:          IF_DEBUG(linker,debugBelch("?     " )); break;
         }
         IF_DEBUG(linker,debugBelch("  " ));

         IF_DEBUG(linker,debugBelch("name=%s\n", strtab + stab[j].st_name ));
      }
   }

   if (nsymtabs == 0) {
      errorBelch("%s: didn't find any symbol tables", oc->fileName);
      return 0;
   }

   return 1;
}
+
+static int getSectionKind_ELF( Elf_Shdr *hdr, int *is_bss )
+{
+ *is_bss = FALSE;
+
+ if (hdr->sh_type == SHT_PROGBITS
+ && (hdr->sh_flags & SHF_ALLOC) && (hdr->sh_flags & SHF_EXECINSTR)) {
+ /* .text-style section */
+ return SECTIONKIND_CODE_OR_RODATA;
+ }
+
+ if (hdr->sh_type == SHT_PROGBITS
+ && (hdr->sh_flags & SHF_ALLOC) && (hdr->sh_flags & SHF_WRITE)) {
+ /* .data-style section */
+ return SECTIONKIND_RWDATA;
+ }
+
+ if (hdr->sh_type == SHT_PROGBITS
+ && (hdr->sh_flags & SHF_ALLOC) && !(hdr->sh_flags & SHF_WRITE)) {
+ /* .rodata-style section */
+ return SECTIONKIND_CODE_OR_RODATA;
+ }
+
+ if (hdr->sh_type == SHT_NOBITS
+ && (hdr->sh_flags & SHF_ALLOC) && (hdr->sh_flags & SHF_WRITE)) {
+ /* .bss-style section */
+ *is_bss = TRUE;
+ return SECTIONKIND_RWDATA;
+ }
+
+ return SECTIONKIND_OTHER;
+}
+
+
+/* Collect the symbols defined by this ELF object module.
+
+   Allocates zeroed memory for non-empty .bss sections (repointing
+   their sh_offset at the fresh storage) and for SHN_COMMON symbols,
+   records every loadable section via addSection/addProddableBlock,
+   fills in oc->symbols, and enters each defined global symbol into the
+   global 'symhash' table.  Local symbols are recorded but not entered
+   in the hash (relocation reads them straight from the ELF symtab).
+   Returns 1 on success, 0 on failure. */
+static int
+ocGetNames_ELF ( ObjectCode* oc )
+{
+   int i, j, k, nent;
+   Elf_Sym* stab;
+
+   char*     ehdrC  = (char*)(oc->image);
+   Elf_Ehdr* ehdr   = (Elf_Ehdr*)ehdrC;
+   char*     strtab = findElfSection ( ehdrC, SHT_STRTAB );
+   Elf_Shdr* shdr   = (Elf_Shdr*) (ehdrC + ehdr->e_shoff);
+
+   ASSERT(symhash != NULL);
+
+   if (!strtab) {
+      errorBelch("%s: no strtab", oc->fileName);
+      return 0;
+   }
+
+   k = 0;
+   for (i = 0; i < ehdr->e_shnum; i++) {
+      /* Figure out what kind of section it is.  Logic derived from
+         Figure 1.14 ("Special Sections") of the ELF document
+         ("Portable Formats Specification, Version 1.1"). */
+      int is_bss = FALSE;
+      SectionKind kind = getSectionKind_ELF(&shdr[i], &is_bss);
+
+      if (is_bss && shdr[i].sh_size > 0) {
+         /* This is a non-empty .bss section.  Allocate zeroed space for
+            it, and set its .sh_offset field such that
+            ehdrC + .sh_offset == addr_of_zeroed_space.  */
+         char* zspace = stgCallocBytes(1, shdr[i].sh_size,
+                                       "ocGetNames_ELF(BSS)");
+         shdr[i].sh_offset = ((char*)zspace) - ((char*)ehdrC);
+         /*
+         debugBelch("BSS section at 0x%x, size %d\n",
+                         zspace, shdr[i].sh_size);
+         */
+      }
+
+      /* fill in the section info */
+      if (kind != SECTIONKIND_OTHER && shdr[i].sh_size > 0) {
+         addProddableBlock(oc, ehdrC + shdr[i].sh_offset, shdr[i].sh_size);
+         addSection(oc, kind, ehdrC + shdr[i].sh_offset,
+                        ehdrC + shdr[i].sh_offset + shdr[i].sh_size - 1);
+      }
+
+      if (shdr[i].sh_type != SHT_SYMTAB) continue;
+
+      /* copy stuff into this module's object symbol table */
+      stab = (Elf_Sym*) (ehdrC + shdr[i].sh_offset);
+      nent = shdr[i].sh_size / sizeof(Elf_Sym);
+
+      /* NOTE(review): if an object carried more than one SHT_SYMTAB
+         section, each iteration would overwrite oc->n_symbols and leak
+         the previous oc->symbols array — presumably objects only ever
+         have one symtab; confirm. */
+      oc->n_symbols = nent;
+      oc->symbols = stgMallocBytes(oc->n_symbols * sizeof(char*),
+                                   "ocGetNames_ELF(oc->symbols)");
+
+      for (j = 0; j < nent; j++) {
+
+         char  isLocal = FALSE; /* avoids uninit-var warning */
+         char* ad      = NULL;
+         char* nm      = strtab + stab[j].st_name;
+         int   secno   = stab[j].st_shndx;
+
+         /* Figure out if we want to add it; if so, set ad to its
+            address.  Otherwise leave ad == NULL. */
+
+         if (secno == SHN_COMMON) {
+            /* Common symbol: allocate zeroed storage for it here and
+               now. */
+            isLocal = FALSE;
+            ad = stgCallocBytes(1, stab[j].st_size, "ocGetNames_ELF(COMMON)");
+            /*
+            debugBelch("COMMON symbol, size %d name %s\n",
+                            stab[j].st_size, nm);
+            */
+            /* Pointless to do addProddableBlock() for this area,
+               since the linker should never poke around in it. */
+         }
+         else
+         if ( ( ELF_ST_BIND(stab[j].st_info)==STB_GLOBAL
+                || ELF_ST_BIND(stab[j].st_info)==STB_LOCAL
+              )
+              /* and not an undefined symbol */
+              && stab[j].st_shndx != SHN_UNDEF
+              /* and not in a "special section" */
+              && stab[j].st_shndx < SHN_LORESERVE
+              &&
+              /* and it's a not a section or string table or anything silly */
+              ( ELF_ST_TYPE(stab[j].st_info)==STT_FUNC ||
+                ELF_ST_TYPE(stab[j].st_info)==STT_OBJECT ||
+                ELF_ST_TYPE(stab[j].st_info)==STT_NOTYPE
+              )
+            ) {
+            /* Section 0 is the undefined section, hence > and not >=. */
+            ASSERT(secno > 0 && secno < ehdr->e_shnum);
+            /*
+            if (shdr[secno].sh_type == SHT_NOBITS) {
+               debugBelch("   BSS symbol, size %d off %d name %s\n",
+                               stab[j].st_size, stab[j].st_value, nm);
+            }
+            */
+            ad = ehdrC + shdr[ secno ].sh_offset + stab[j].st_value;
+            if (ELF_ST_BIND(stab[j].st_info)==STB_LOCAL) {
+               isLocal = TRUE;
+            } else {
+#ifdef ELF_FUNCTION_DESC
+               /* dlsym() and the initialisation table both give us function
+                * descriptors, so to be consistent we store function descriptors
+                * in the symbol table */
+               if (ELF_ST_TYPE(stab[j].st_info) == STT_FUNC)
+                   ad = (char *)allocateFunctionDesc((Elf_Addr)ad);
+#endif
+               IF_DEBUG(linker,debugBelch( "addOTabName(GLOB): %10p  %s %s",
+                                      ad, oc->fileName, nm ));
+               isLocal = FALSE;
+            }
+         }
+
+         /* And the decision is ... */
+
+         if (ad != NULL) {
+            ASSERT(nm != NULL);
+            oc->symbols[j] = nm;
+            /* Acquire! */
+            if (isLocal) {
+               /* Ignore entirely.  (Locals are resolved directly from
+                  the ELF symtab during relocation, not via symhash.) */
+            } else {
+               ghciInsertStrHashTable(oc->fileName, symhash, nm, ad);
+            }
+         } else {
+            /* Skip. */
+            IF_DEBUG(linker,debugBelch( "skipping `%s'\n",
+                                   strtab + stab[j].st_name ));
+            /*
+            debugBelch(
+                    "skipping   bind = %d,  type = %d,  shndx = %d   `%s'\n",
+                    (int)ELF_ST_BIND(stab[j].st_info),
+                    (int)ELF_ST_TYPE(stab[j].st_info),
+                    (int)stab[j].st_shndx,
+                    strtab + stab[j].st_name
+                   );
+            */
+            oc->symbols[j] = NULL;
+         }
+
+      }
+   }
+
+   return 1;
+}
+
+/* Do ELF relocations which lack an explicit addend (SHT_REL sections;
+   the addend is taken from the word already stored at the relocation
+   target).  All x86-linux relocations appear to be of this form.
+   Returns 1 on success, 0 on failure. */
+static int
+do_Elf_Rel_relocations ( ObjectCode* oc, char* ehdrC,
+                         Elf_Shdr* shdr, int shnum,
+                         Elf_Sym*  stab, char* strtab )
+{
+   int j;
+   char *symbol;
+   Elf_Word* targ;
+   Elf_Rel*  rtab = (Elf_Rel*) (ehdrC + shdr[shnum].sh_offset);
+   int         nent = shdr[shnum].sh_size / sizeof(Elf_Rel);
+   int target_shndx = shdr[shnum].sh_info;
+   int symtab_shndx = shdr[shnum].sh_link;
+
+   /* NOTE(review): the incoming 'stab' argument is discarded; the
+      symbol table actually used is the one named by this section's
+      sh_link field. */
+   stab = (Elf_Sym*) (ehdrC + shdr[ symtab_shndx ].sh_offset);
+   targ = (Elf_Word*)(ehdrC + shdr[ target_shndx ].sh_offset);
+   IF_DEBUG(linker,debugBelch( "relocations for section %d using symtab %d\n",
+                          target_shndx, symtab_shndx ));
+
+   /* Skip sections that we're not interested in. */
+   {
+       int is_bss;
+       SectionKind kind = getSectionKind_ELF(&shdr[target_shndx], &is_bss);
+       if (kind == SECTIONKIND_OTHER) {
+           IF_DEBUG(linker,debugBelch( "skipping (target section not loaded)"));
+           return 1;
+       }
+   }
+
+   for (j = 0; j < nent; j++) {
+      Elf_Addr offset = rtab[j].r_offset;
+      Elf_Addr info   = rtab[j].r_info;
+
+      Elf_Addr  P  = ((Elf_Addr)targ) + offset;
+      Elf_Word* pP = (Elf_Word*)P;
+      Elf_Addr  A  = *pP;   /* implicit addend: current word at target */
+      Elf_Addr  S;
+      void*     S_tmp;
+      Elf_Addr  value;
+
+      IF_DEBUG(linker,debugBelch( "Rel entry %3d is raw(%6p %6p)",
+                             j, (void*)offset, (void*)info ));
+      if (!info) {
+         IF_DEBUG(linker,debugBelch( " ZERO" ));
+         S = 0;
+      } else {
+         Elf_Sym sym = stab[ELF_R_SYM(info)];
+         /* First see if it is a local symbol. */
+         if (ELF_ST_BIND(sym.st_info) == STB_LOCAL) {
+            /* Yes, so we can get the address directly from the ELF symbol
+               table. */
+            symbol = sym.st_name==0 ? "(noname)" : strtab+sym.st_name;
+            S = (Elf_Addr)
+                (ehdrC + shdr[ sym.st_shndx ].sh_offset
+                       + stab[ELF_R_SYM(info)].st_value);
+
+         } else {
+            /* No, so look up the name in our global table. */
+            symbol = strtab + sym.st_name;
+            S_tmp = lookupSymbol( symbol );
+            S = (Elf_Addr)S_tmp;
+         }
+         if (!S) {
+            errorBelch("%s: unknown symbol `%s'", oc->fileName, symbol);
+            return 0;
+         }
+         IF_DEBUG(linker,debugBelch( "`%s' resolves to %p\n", symbol, (void*)S ));
+      }
+
+      IF_DEBUG(linker,debugBelch( "Reloc: P = %p   S = %p   A = %p\n",
+                             (void*)P, (void*)S, (void*)A ));
+      checkProddableBlock ( oc, pP );
+
+      value = S + A;
+
+      switch (ELF_R_TYPE(info)) {
+#        ifdef i386_HOST_ARCH
+         case R_386_32:   *pP = value;     break;   /* absolute 32-bit */
+         case R_386_PC32: *pP = value - P; break;   /* PC-relative 32-bit */
+#        endif
+         default:
+            errorBelch("%s: unhandled ELF relocation(Rel) type %lu\n",
+                  oc->fileName, (lnat)ELF_R_TYPE(info));
+            return 0;
+      }
+
+   }
+   return 1;
+}
+
+/* Do ELF relocations for which explicit addends are supplied
+   (SHT_RELA sections).  sparc-solaris relocations appear to be of this
+   form.  Returns 1 on success, 0 on failure. */
+static int
+do_Elf_Rela_relocations ( ObjectCode* oc, char* ehdrC,
+                          Elf_Shdr* shdr, int shnum,
+                          Elf_Sym*  stab, char* strtab )
+{
+   int j;
+   char *symbol = NULL;
+   Elf_Addr targ;
+   Elf_Rela* rtab = (Elf_Rela*) (ehdrC + shdr[shnum].sh_offset);
+   int         nent = shdr[shnum].sh_size / sizeof(Elf_Rela);
+   int target_shndx = shdr[shnum].sh_info;
+   int symtab_shndx = shdr[shnum].sh_link;
+
+   /* The incoming 'stab' argument is discarded; the symbol table
+      actually used is the one named by this section's sh_link field. */
+   stab = (Elf_Sym*) (ehdrC + shdr[ symtab_shndx ].sh_offset);
+   targ = (Elf_Addr) (ehdrC + shdr[ target_shndx ].sh_offset);
+   IF_DEBUG(linker,debugBelch( "relocations for section %d using symtab %d\n",
+                          target_shndx, symtab_shndx ));
+
+   for (j = 0; j < nent; j++) {
+#if defined(DEBUG) || defined(sparc_HOST_ARCH) || defined(ia64_HOST_ARCH) || defined(powerpc_HOST_ARCH) || defined(x86_64_HOST_ARCH)
+      /* This #ifdef only serves to avoid unused-var warnings. */
+      Elf_Addr  offset = rtab[j].r_offset;
+      Elf_Addr  P      = targ + offset;
+#endif
+      Elf_Addr  info   = rtab[j].r_info;
+      Elf_Addr  A      = rtab[j].r_addend;
+      Elf_Addr  S;
+      void*     S_tmp;
+      Elf_Addr  value;
+#     if defined(sparc_HOST_ARCH)
+      Elf_Word* pP = (Elf_Word*)P;
+      Elf_Word  w1, w2;
+#     elif defined(ia64_HOST_ARCH)
+      Elf64_Xword *pP = (Elf64_Xword *)P;
+      Elf_Addr addr;
+#     elif defined(powerpc_HOST_ARCH)
+      Elf_Sword delta;
+#     endif
+
+      IF_DEBUG(linker,debugBelch( "Rel entry %3d is raw(%6p %6p %6p)   ",
+                             j, (void*)offset, (void*)info,
+                                (void*)A ));
+      if (!info) {
+         IF_DEBUG(linker,debugBelch( " ZERO" ));
+         S = 0;
+      } else {
+         Elf_Sym sym = stab[ELF_R_SYM(info)];
+         /* First see if it is a local symbol. */
+         if (ELF_ST_BIND(sym.st_info) == STB_LOCAL) {
+            /* Yes, so we can get the address directly from the ELF symbol
+               table. */
+            symbol = sym.st_name==0 ? "(noname)" : strtab+sym.st_name;
+            S = (Elf_Addr)
+                (ehdrC + shdr[ sym.st_shndx ].sh_offset
+                       + stab[ELF_R_SYM(info)].st_value);
+#ifdef ELF_FUNCTION_DESC
+            /* Make a function descriptor for this function */
+            if (S && ELF_ST_TYPE(sym.st_info) == STT_FUNC) {
+               S = allocateFunctionDesc(S + A);
+               A = 0;
+            }
+#endif
+         } else {
+            /* No, so look up the name in our global table. */
+            symbol = strtab + sym.st_name;
+            S_tmp = lookupSymbol( symbol );
+            S = (Elf_Addr)S_tmp;
+
+#ifdef ELF_FUNCTION_DESC
+            /* If a function, already a function descriptor - we would
+               have to copy it to add an offset. */
+            if (S && (ELF_ST_TYPE(sym.st_info) == STT_FUNC) && (A != 0))
+               errorBelch("%s: function %s with addend %p", oc->fileName, symbol, (void *)A);
+#endif
+         }
+         if (!S) {
+            errorBelch("%s: unknown symbol `%s'", oc->fileName, symbol);
+            return 0;
+         }
+         IF_DEBUG(linker,debugBelch( "`%s' resolves to %p", symbol, (void*)S ));
+      }
+
+      IF_DEBUG(linker,debugBelch("Reloc: P = %p   S = %p   A = %p\n",
+                                        (void*)P, (void*)S, (void*)A ));
+      /* checkProddableBlock ( oc, (void*)P ); */
+
+      value = S + A;
+
+      switch (ELF_R_TYPE(info)) {
+#        if defined(sparc_HOST_ARCH)
+         case R_SPARC_WDISP30:
+            w1 = *pP & 0xC0000000;
+            w2 = (Elf_Word)((value - P) >> 2);
+            ASSERT((w2 & 0xC0000000) == 0);
+            w1 |= w2;
+            *pP = w1;
+            break;
+         case R_SPARC_HI22:
+            w1 = *pP & 0xFFC00000;
+            w2 = (Elf_Word)(value >> 10);
+            ASSERT((w2 & 0xFFC00000) == 0);
+            w1 |= w2;
+            *pP = w1;
+            break;
+         case R_SPARC_LO10:
+            w1 = *pP & ~0x3FF;
+            w2 = (Elf_Word)(value & 0x3FF);
+            ASSERT((w2 & ~0x3FF) == 0);
+            w1 |= w2;
+            *pP = w1;
+            break;
+         /* According to the Sun documentation:
+            R_SPARC_UA32
+            This relocation type resembles R_SPARC_32, except it refers to an
+            unaligned word. That is, the word to be relocated must be treated
+            as four separate bytes with arbitrary alignment, not as a word
+            aligned according to the architecture requirements.
+
+            (JRS: which means that freeloading on the R_SPARC_32 case
+            is probably wrong, but hey ...)
+         */
+         case R_SPARC_UA32:
+         case R_SPARC_32:
+            w2 = (Elf_Word)value;
+            *pP = w2;
+            break;
+#        elif defined(ia64_HOST_ARCH)
+         case R_IA64_DIR64LSB:
+         case R_IA64_FPTR64LSB:
+            *pP = value;
+            break;
+         case R_IA64_PCREL64LSB:
+            *pP = value - P;
+            break;
+         case R_IA64_SEGREL64LSB:
+            addr = findElfSegment(ehdrC, value);
+            *pP = value - addr;
+            break;
+         case R_IA64_GPREL22:
+            ia64_reloc_gprel22(P, value);
+            break;
+         case R_IA64_LTOFF22:
+         case R_IA64_LTOFF22X:
+         case R_IA64_LTOFF_FPTR22:
+            addr = allocateGOTEntry(value);
+            ia64_reloc_gprel22(P, addr);
+            break;
+         case R_IA64_PCREL21B:
+            ia64_reloc_pcrel21(P, S, oc);
+            break;
+         case R_IA64_LDXMOV:
+            /* This goes with R_IA64_LTOFF22X and points to the load to
+             * convert into a move.  We don't implement relaxation. */
+            break;
+#        elif defined(powerpc_HOST_ARCH)
+         case R_PPC_ADDR16_LO:
+            *(Elf32_Half*) P = value;
+            break;
+
+         case R_PPC_ADDR16_HI:
+            *(Elf32_Half*) P = value >> 16;
+            break;
+ 
+         case R_PPC_ADDR16_HA:
+            *(Elf32_Half*) P = (value + 0x8000) >> 16;
+            break;
+
+         case R_PPC_ADDR32:
+            *(Elf32_Word *) P = value;
+            break;
+
+         case R_PPC_REL32:
+            *(Elf32_Word *) P = value - P;
+            break;
+
+         case R_PPC_REL24:
+            delta = value - P;
+
+            if( delta << 6 >> 6 != delta )
+            {
+               /* Branch displacement doesn't fit in 26 bits: route the
+                  call through a jump island instead. */
+               value = makeJumpIsland( oc, ELF_R_SYM(info), value );
+               delta = value - P;
+
+               if( value == 0 || delta << 6 >> 6 != delta )
+               {
+                  barf( "Unable to make ppcJumpIsland for #%d",
+                        ELF_R_SYM(info) );
+                  return 0;
+               }
+            }
+
+            *(Elf_Word *) P = (*(Elf_Word *) P & 0xfc000003)
+                                          | (delta & 0x3fffffc);
+            break;
+#        endif
+
+#if x86_64_HOST_ARCH
+      case R_X86_64_64:
+          *(Elf64_Xword *)P = value;
+          break;
+
+      case R_X86_64_PC32:
+      {
+          /* The relocated field is a signed 32-bit quantity, so the
+             admissible range is [-0x80000000, 0x7fffffff] inclusive.
+             (The old '>=' test wrongly rejected 0x7fffffff.) */
+          StgInt64 off = value - P;
+          if (off > 0x7fffffffL || off < -0x80000000L) {
+              barf("R_X86_64_PC32 relocation out of range: %s = %p",
+                   symbol, (void*)off);
+          }
+          *(Elf64_Word *)P = (Elf64_Word)off;
+          break;
+      }
+
+      case R_X86_64_32:
+          /* Per the AMD64 psABI, R_X86_64_32 must zero-extend back to
+             the original 64-bit value, i.e. 'value' must fit in an
+             unsigned 32-bit field (the old test used the signed limit
+             and rejected valid values in [0x7fffffff, 0xffffffff]). */
+          if (value > 0xffffffffL) {
+              barf("R_X86_64_32 relocation out of range: %s = %p\n",
+                   symbol, (void*)value);
+          }
+          *(Elf64_Word *)P = (Elf64_Word)value;
+          break;
+
+      case R_X86_64_32S:
+          /* Must sign-extend: admissible signed 32-bit range. */
+          if ((StgInt64)value > 0x7fffffffL || (StgInt64)value < -0x80000000L) {
+              barf("R_X86_64_32S relocation out of range: %s = %p\n",
+                   symbol, (void*)value);
+          }
+          *(Elf64_Sword *)P = (Elf64_Sword)value;
+          break;
+#endif
+
+         default:
+            errorBelch("%s: unhandled ELF relocation(RelA) type %lu\n",
+                  oc->fileName, (lnat)ELF_R_TYPE(info));
+            return 0;
+      }
+
+   }
+   return 1;
+}
+
+/* Resolve all relocations of this ELF object, dispatching each SHT_REL
+   and SHT_RELA section to the appropriate worker.  Frees the
+   per-object local-symbol hash afterwards since it is no longer
+   needed.  Returns 1 on success, 0 on failure. */
+static int
+ocResolve_ELF ( ObjectCode* oc )
+{
+   char *strtab;
+   int   shnum, ok;
+   Elf_Sym*  stab  = NULL;
+   char*     ehdrC = (char*)(oc->image);
+   Elf_Ehdr* ehdr  = (Elf_Ehdr*) ehdrC;
+   Elf_Shdr* shdr  = (Elf_Shdr*) (ehdrC + ehdr->e_shoff);
+
+   /* first find "the" symbol table */
+   stab = (Elf_Sym*) findElfSection ( ehdrC, SHT_SYMTAB );
+
+   /* also go find the string table */
+   strtab = findElfSection ( ehdrC, SHT_STRTAB );
+
+   if (stab == NULL || strtab == NULL) {
+      errorBelch("%s: can't find string or symbol table", oc->fileName);
+      return 0;
+   }
+
+   /* Process the relocation sections. */
+   for (shnum = 0; shnum < ehdr->e_shnum; shnum++) {
+      if (shdr[shnum].sh_type == SHT_REL) {
+         ok = do_Elf_Rel_relocations ( oc, ehdrC, shdr,
+                                       shnum, stab, strtab );
+         if (!ok) return ok;
+      }
+      else
+      if (shdr[shnum].sh_type == SHT_RELA) {
+         ok = do_Elf_Rela_relocations ( oc, ehdrC, shdr,
+                                        shnum, stab, strtab );
+         if (!ok) return ok;
+      }
+   }
+
+   /* Free the local symbol table; we won't need it again. */
+   freeHashTable(oc->lochash, NULL);
+   oc->lochash = NULL;
+
+#if defined(powerpc_HOST_ARCH)
+   /* Make the freshly written code visible to the instruction fetch
+      unit. */
+   ocFlushInstructionCache( oc );
+#endif
+
+   return 1;
+}
+
+/*
+ * IA64 specifics
+ * Instructions are 41 bits long, packed into 128 bit bundles with a 5-bit template
+ * at the front. The following utility functions pack and unpack instructions, and
+ * take care of the most common relocations.
+ */
+
+#ifdef ia64_HOST_ARCH
+
+/* Extract the 41-bit instruction in the given slot of an IA64 bundle.
+   The low two bits of 'target' encode the slot number (0-2); the
+   remaining bits address the 128-bit bundle itself. */
+static Elf64_Xword
+ia64_extract_instruction(Elf64_Xword *target)
+{
+   Elf64_Xword w1, w2;
+   int slot = (Elf_Addr)target & 3;
+   /* Strip the slot number to get the bundle address.  (The original
+      code assigned the integer back to the pointer without a cast,
+      which is a constraint violation in C.) */
+   target = (Elf64_Xword *)((Elf_Addr)target & ~3);
+
+   w1 = *target;
+   w2 = *(target+1);
+
+   switch (slot)
+   {
+      case 0:  /* bits 5-45 of the first word */
+         return ((w1 >> 5) & 0x1ffffffffff);
+      case 1:  /* bits 46-63 of w1, bits 0-22 of w2 */
+         return (w1 >> 46) | ((w2 & 0x7fffff) << 18);
+      case 2:  /* bits 23-63 of the second word */
+         return (w2 >> 23);
+      default:
+         barf("ia64_extract_instruction: invalid slot %p", target);
+   }
+}
+
+/* Write a 41-bit instruction back into the given slot of an IA64
+   bundle (inverse of ia64_extract_instruction).  Bits are OR-ed in, so
+   the target field is assumed to be clear beforehand. */
+static void
+ia64_deposit_instruction(Elf64_Xword *target, Elf64_Xword value)
+{
+   int slot = (Elf_Addr)target & 3;
+   /* Strip the slot number to get the bundle address.  (The original
+      code assigned the integer back to the pointer without a cast,
+      which is a constraint violation in C.) */
+   target = (Elf64_Xword *)((Elf_Addr)target & ~3);
+
+   switch (slot)
+   {
+      case 0:
+         *target |= value << 5;
+         break;
+      case 1:
+         *target |= value << 46;
+         *(target+1) |= value >> 18;
+         break;
+      case 2:
+         *(target+1) |= value << 23;
+         break;
+   }
+}
+
+/* Patch a 22-bit GP-relative immediate at 'target' so it addresses
+   'value' relative to the global pointer gp_val.  Barfs if the
+   displacement doesn't fit in 22 bits. */
+static void
+ia64_reloc_gprel22(Elf_Addr target, Elf_Addr value)
+{
+   Elf64_Xword instruction;
+   Elf64_Sxword rel_value;
+
+   rel_value = value - gp_val;
+   if ((rel_value > 0x1fffff) || (rel_value < -0x1fffff))
+      barf("GP-relative data out of range (address = 0x%lx, gp = 0x%lx)", value, gp_val);
+
+   /* Scatter the displacement into the imm7b/imm9d/imm5c/s fields. */
+   instruction = ia64_extract_instruction((Elf64_Xword *)target);
+   instruction |= (((rel_value >> 0) & 0x07f) << 13)            /* imm7b */
+                    | (((rel_value >> 7) & 0x1ff) << 27)        /* imm9d */
+                    | (((rel_value >> 16) & 0x01f) << 22)       /* imm5c */
+                    | ((Elf64_Xword)(rel_value < 0) << 36);     /* s */
+   ia64_deposit_instruction((Elf64_Xword *)target, instruction);
+}
+
+/* Patch a 21-bit PC-relative branch at 'target' to jump to a freshly
+   allocated PLT entry for 'value'.  Barfs if even the PLT entry is out
+   of branch range. */
+static void
+ia64_reloc_pcrel21(Elf_Addr target, Elf_Addr value, ObjectCode *oc)
+{
+   Elf64_Xword instruction;
+   Elf64_Sxword rel_value;
+   Elf_Addr entry;
+
+   entry = allocatePLTEntry(value, oc);
+
+   /* Branch displacements count 16-byte bundles, hence the >> 4. */
+   rel_value = (entry >> 4) - (target >> 4);
+   if ((rel_value > 0xfffff) || (rel_value < -0xfffff))
+      barf("PLT entry too far away (entry = 0x%lx, target = 0x%lx)", entry, target);
+
+   instruction = ia64_extract_instruction((Elf64_Xword *)target);
+   instruction |= ((rel_value & 0xfffff) << 13)                 /* imm20b */
+                    | ((Elf64_Xword)(rel_value < 0) << 36);     /* s */
+   ia64_deposit_instruction((Elf64_Xword *)target, instruction);
+}
+
+#endif /* ia64 */
+
+/*
+ * PowerPC ELF specifics
+ */
+
+#ifdef powerpc_HOST_ARCH
+
+/* Find the object's (single) SHT_SYMTAB section and pre-allocate one
+   PowerPC jump island per symbol-table entry.  Returns 1 on success,
+   0 on failure. */
+static int ocAllocateJumpIslands_ELF( ObjectCode *oc )
+{
+  Elf_Ehdr *ehdr;
+  Elf_Shdr* shdr;
+  int i;
+
+  ehdr = (Elf_Ehdr *) oc->image;
+  shdr = (Elf_Shdr *) ( ((char *)oc->image) + ehdr->e_shoff );
+
+  for( i = 0; i < ehdr->e_shnum; i++ )
+    if( shdr[i].sh_type == SHT_SYMTAB )
+      break;
+
+  if( i == ehdr->e_shnum )
+  {
+    errorBelch( "This ELF file contains no symtab" );
+    return 0;
+  }
+
+  if( shdr[i].sh_entsize != sizeof( Elf_Sym ) )
+  {
+    /* Cast both operands to int: sh_entsize is an ELF word and
+       sizeof() yields size_t, neither of which is guaranteed to match
+       the %d conversions. */
+    errorBelch( "The entry size (%d) of the symtab isn't %d\n",
+      (int) shdr[i].sh_entsize, (int) sizeof( Elf_Sym ) );
+
+    return 0;
+  }
+
+  return ocAllocateJumpIslands( oc, shdr[i].sh_size / sizeof( Elf_Sym ), 0 );
+}
+
+#endif /* powerpc */
+
+#endif /* ELF */
+
+/* --------------------------------------------------------------------------
+ * Mach-O specifics
+ * ------------------------------------------------------------------------*/
+
+#if defined(OBJFORMAT_MACHO)
+
+/*
+ Support for MachO linking on Darwin/MacOS X
+ by Wolfgang Thaller (wolfgang.thaller@gmx.net)
+
+ I hereby formally apologize for the hackish nature of this code.
+ Things that need to be done:
+ *) implement ocVerifyImage_MachO
+ *) add still more sanity checks.
+*/
+
+#ifdef powerpc_HOST_ARCH
+/* Pre-allocate PowerPC jump islands for a Mach-O object.  Scans the
+   symbol table for the first and last undefined external symbol so
+   only that index range needs islands.  Returns the result of
+   ocAllocateJumpIslands (1 on success, 0 on failure). */
+static int ocAllocateJumpIslands_MachO(ObjectCode* oc)
+{
+    struct mach_header *header = (struct mach_header *) oc->image;
+    struct load_command *lc = (struct load_command *) (header + 1);
+    unsigned i;
+
+    for( i = 0; i < header->ncmds; i++ )
+    {
+        if( lc->cmd == LC_SYMTAB )
+        {
+                // Find out the first and last undefined external
+                // symbol, so we don't have to allocate too many
+                // jump islands.
+            struct symtab_command *symLC = (struct symtab_command *) lc;
+            unsigned min = symLC->nsyms, max = 0;
+            /* NOTE(review): symLC was just cast from lc and can never
+               be NULL here, so the conditional below is redundant. */
+            struct nlist *nlist =
+                symLC ? (struct nlist*) ((char*) oc->image + symLC->symoff)
+                      : NULL;
+            /* NOTE(review): this inner loop reuses the outer loop
+               index 'i'; harmless only because this branch always
+               returns or breaks out of the outer loop below. */
+            for(i=0;i<symLC->nsyms;i++)
+            {
+                if(nlist[i].n_type & N_STAB)
+                    ;
+                else if(nlist[i].n_type & N_EXT)
+                {
+                    if((nlist[i].n_type & N_TYPE) == N_UNDF
+                        && (nlist[i].n_value == 0))
+                    {
+                        if(i < min)
+                            min = i;
+                        if(i > max)
+                            max = i;
+                    }
+                }
+            }
+            if(max >= min)
+                return ocAllocateJumpIslands(oc, max - min + 1, min);
+
+            break;
+        }
+
+        lc = (struct load_command *) ( ((char *)lc) + lc->cmdsize );
+    }
+    /* No symtab, or no undefined externals: allocate nothing. */
+    return ocAllocateJumpIslands(oc,0,0);
+}
+#endif
+
+/* Sanity-check a Mach-O image before loading.  Currently a stub that
+   always reports success. */
+static int ocVerifyImage_MachO(ObjectCode* oc STG_UNUSED)
+{
+    // FIXME: do some verifying here
+    return 1;
+}
+
+/* Fill in a lazy/non-lazy symbol-pointer section (or, on i386, a
+   __jump_table section of 5-byte jmp stubs) with the resolved
+   addresses of the imported symbols it names via the indirect symbol
+   table.  Returns 1 on success, 0 if any symbol cannot be resolved. */
+static int resolveImports(
+    ObjectCode* oc,
+    char *image,
+    struct symtab_command *symLC,
+    struct section *sect,    // ptr to lazy or non-lazy symbol pointer section
+    unsigned long *indirectSyms,
+    struct nlist *nlist)
+{
+    unsigned i;
+    size_t itemSize = 4;
+
+#if i386_HOST_ARCH
+    int isJumpTable = 0;
+    if(!strcmp(sect->sectname,"__jump_table"))
+    {
+        isJumpTable = 1;
+        itemSize = 5;   /* each entry is a 5-byte "jmp rel32" */
+        ASSERT(sect->reserved2 == itemSize);
+    }
+#endif
+
+    for(i=0; i*itemSize < sect->size;i++)
+    {
+        // according to otool, reserved1 contains the first index into the indirect symbol table
+        struct nlist *symbol = &nlist[indirectSyms[sect->reserved1+i]];
+        char *nm = image + symLC->stroff + symbol->n_un.n_strx;
+        void *addr = NULL;
+
+        /* Common symbols had their n_value repointed at allocated
+           storage by ocGetNames_MachO; use that directly.  Otherwise
+           try the object-local table, then the global table. */
+        if((symbol->n_type & N_TYPE) == N_UNDF
+            && (symbol->n_type & N_EXT) && (symbol->n_value != 0))
+            addr = (void*) (symbol->n_value);
+        else if((addr = lookupLocalSymbol(oc,nm)) != NULL)
+            ;
+        else
+            addr = lookupSymbol(nm);
+        if(!addr)
+        {
+            errorBelch("\n%s: unknown symbol `%s'", oc->fileName, nm);
+            return 0;
+        }
+        ASSERT(addr);
+
+#if i386_HOST_ARCH
+        if(isJumpTable)
+        {
+            /* Write a direct jmp to the resolved address. */
+            checkProddableBlock(oc,image + sect->offset + i*itemSize);
+            *(image + sect->offset + i*itemSize) = 0xe9; // jmp
+            *(unsigned*)(image + sect->offset + i*itemSize + 1)
+                = (char*)addr - (image + sect->offset + i*itemSize + 5);
+        }
+        else
+#endif
+        {
+            /* Plain symbol-pointer entry: store the address. */
+            checkProddableBlock(oc,((void**)(image + sect->offset)) + i);
+            ((void**)(image + sect->offset))[i] = addr;
+        }
+    }
+
+    return 1;
+}
+
+/* Translate an address from the object file's (unrelocated) address
+   space into the address where the containing section actually lives
+   inside the loaded image.  Barfs if the address lies in no section. */
+static unsigned long relocateAddress(
+    ObjectCode* oc,
+    int nSections,
+    struct section* sections,
+    unsigned long address)
+{
+    int n;
+    for(n = 0; n < nSections; n++)
+    {
+        struct section *s = &sections[n];
+        if(s->addr <= address && address < s->addr + s->size)
+        {
+            /* offset of 'address' within the section, rebased onto the
+               section's location in the loaded image */
+            return (unsigned long) oc->image
+                    + s->offset + (address - s->addr);
+        }
+    }
+    barf("Invalid Mach-O file:"
+         "Address out of bounds while relocating object file");
+    return 0;
+}
+
+/* Apply every relocation recorded for one Mach-O section.
+   Symbol-pointer sections are skipped here (resolveImports already
+   filled them in).  Handles both scattered and ordinary relocation
+   entries; paired kinds (SECTDIFF, HI16/LO16/HA16) also consume the
+   following *_RELOC_PAIR entry.  Returns 1 on success, 0 on failure. */
+static int relocateSection(
+    ObjectCode* oc,
+    char *image,
+    struct symtab_command *symLC, struct nlist *nlist,
+    int nSections, struct section* sections, struct section *sect)
+{
+    struct relocation_info *relocs;
+    int i,n;
+
+    /* Symbol-pointer sections were handled by resolveImports. */
+    if(!strcmp(sect->sectname,"__la_symbol_ptr"))
+        return 1;
+    else if(!strcmp(sect->sectname,"__nl_symbol_ptr"))
+        return 1;
+    else if(!strcmp(sect->sectname,"__la_sym_ptr2"))
+        return 1;
+    else if(!strcmp(sect->sectname,"__la_sym_ptr3"))
+        return 1;
+
+    n = sect->nreloc;
+    relocs = (struct relocation_info*) (image + sect->reloff);
+
+    for(i=0;i<n;i++)
+    {
+        if(relocs[i].r_address & R_SCATTERED)
+        {
+            struct scattered_relocation_info *scat =
+                (struct scattered_relocation_info*) &relocs[i];
+
+            if(!scat->r_pcrel)
+            {
+                if(scat->r_length == 2)
+                {
+                    unsigned long word = 0;
+                    unsigned long* wordPtr = (unsigned long*) (image + sect->offset + scat->r_address);
+                    checkProddableBlock(oc,wordPtr);
+
+                    // Note on relocation types:
+                    // i386 uses the GENERIC_RELOC_* types,
+                    // while ppc uses special PPC_RELOC_* types.
+                    // *_RELOC_VANILLA and *_RELOC_PAIR have the same value
+                    // in both cases, all others are different.
+                    // Therefore, we use GENERIC_RELOC_VANILLA
+                    // and GENERIC_RELOC_PAIR instead of the PPC variants,
+                    // and use #ifdefs for the other types.
+
+                    // Step 1: Figure out what the relocated value should be
+                    if(scat->r_type == GENERIC_RELOC_VANILLA)
+                    {
+                        word = *wordPtr + (unsigned long) relocateAddress(
+                                                                oc,
+                                                                nSections,
+                                                                sections,
+                                                                scat->r_value)
+                                        - scat->r_value;
+                    }
+#ifdef powerpc_HOST_ARCH
+                    else if(scat->r_type == PPC_RELOC_SECTDIFF
+                        || scat->r_type == PPC_RELOC_LO16_SECTDIFF
+                        || scat->r_type == PPC_RELOC_HI16_SECTDIFF
+                        || scat->r_type == PPC_RELOC_HA16_SECTDIFF)
+#else
+                    else if(scat->r_type == GENERIC_RELOC_SECTDIFF)
+#endif
+                    {
+                        /* SECTDIFF: the value is the difference of two
+                           relocated addresses; the second operand comes
+                           from the mandatory PAIR entry that follows. */
+                        struct scattered_relocation_info *pair =
+                                (struct scattered_relocation_info*) &relocs[i+1];
+
+                        if(!pair->r_scattered || pair->r_type != GENERIC_RELOC_PAIR)
+                            barf("Invalid Mach-O file: "
+                                 "RELOC_*_SECTDIFF not followed by RELOC_PAIR");
+
+                        word = (unsigned long)
+                               (relocateAddress(oc, nSections, sections, scat->r_value)
+                              - relocateAddress(oc, nSections, sections, pair->r_value));
+                        i++;
+                    }
+#ifdef powerpc_HOST_ARCH
+                    else if(scat->r_type == PPC_RELOC_HI16
+                         || scat->r_type == PPC_RELOC_LO16
+                         || scat->r_type == PPC_RELOC_HA16
+                         || scat->r_type == PPC_RELOC_LO14)
+                    {   // these are generated by label+offset things
+                        struct relocation_info *pair = &relocs[i+1];
+                        if((pair->r_address & R_SCATTERED) || pair->r_type != PPC_RELOC_PAIR)
+                            barf("Invalid Mach-O file: "
+                                 "PPC_RELOC_* not followed by PPC_RELOC_PAIR");
+
+                        /* Reassemble the full 32-bit value from the
+                           half stored in the instruction plus the half
+                           carried in the PAIR entry's r_address. */
+                        if(scat->r_type == PPC_RELOC_LO16)
+                        {
+                            word = ((unsigned short*) wordPtr)[1];
+                            word |= ((unsigned long) relocs[i+1].r_address & 0xFFFF) << 16;
+                        }
+                        else if(scat->r_type == PPC_RELOC_LO14)
+                        {
+                            barf("Unsupported Relocation: PPC_RELOC_LO14");
+                            word = ((unsigned short*) wordPtr)[1] & 0xFFFC;
+                            word |= ((unsigned long) relocs[i+1].r_address & 0xFFFF) << 16;
+                        }
+                        else if(scat->r_type == PPC_RELOC_HI16)
+                        {
+                            word = ((unsigned short*) wordPtr)[1] << 16;
+                            word |= ((unsigned long) relocs[i+1].r_address & 0xFFFF);
+                        }
+                        else if(scat->r_type == PPC_RELOC_HA16)
+                        {
+                            word = ((unsigned short*) wordPtr)[1] << 16;
+                            word += ((short)relocs[i+1].r_address & (short)0xFFFF);
+                        }
+
+
+                        word += (unsigned long) relocateAddress(oc, nSections, sections, scat->r_value)
+                                                - scat->r_value;
+
+                        i++;
+                    }
+ #endif
+                    else
+                        continue;  // ignore the others
+
+                    /* Step 2: store the relocated value back, in the
+                       representation appropriate for the type. */
+#ifdef powerpc_HOST_ARCH
+                    if(scat->r_type == GENERIC_RELOC_VANILLA
+                        || scat->r_type == PPC_RELOC_SECTDIFF)
+#else
+                    if(scat->r_type == GENERIC_RELOC_VANILLA
+                        || scat->r_type == GENERIC_RELOC_SECTDIFF)
+#endif
+                    {
+                        *wordPtr = word;
+                    }
+#ifdef powerpc_HOST_ARCH
+                    else if(scat->r_type == PPC_RELOC_LO16_SECTDIFF || scat->r_type == PPC_RELOC_LO16)
+                    {
+                        ((unsigned short*) wordPtr)[1] = word & 0xFFFF;
+                    }
+                    else if(scat->r_type == PPC_RELOC_HI16_SECTDIFF || scat->r_type == PPC_RELOC_HI16)
+                    {
+                        ((unsigned short*) wordPtr)[1] = (word >> 16) & 0xFFFF;
+                    }
+                    else if(scat->r_type == PPC_RELOC_HA16_SECTDIFF || scat->r_type == PPC_RELOC_HA16)
+                    {
+                        ((unsigned short*) wordPtr)[1] = ((word >> 16) & 0xFFFF)
+                            + ((word & (1<<15)) ? 1 : 0);
+                    }
+#endif
+                }
+            }
+
+            continue; // FIXME: I hope it's OK to ignore all the others.
+        }
+        else
+        {
+            /* Ordinary (non-scattered) relocation entry. */
+            struct relocation_info *reloc = &relocs[i];
+            if(reloc->r_pcrel && !reloc->r_extern)
+                continue;
+
+            if(reloc->r_length == 2)
+            {
+                unsigned long word = 0;
+#ifdef powerpc_HOST_ARCH
+                unsigned long jumpIsland = 0;
+                long offsetToJumpIsland = 0xBADBAD42; // initialise to bad value
+                                                      // to avoid warning and to catch
+                                                      // bugs.
+#endif
+
+                unsigned long* wordPtr = (unsigned long*) (image + sect->offset + reloc->r_address);
+                checkProddableBlock(oc,wordPtr);
+
+                /* Step 1: recover the (partial) value currently stored
+                   at the relocation target. */
+                if(reloc->r_type == GENERIC_RELOC_VANILLA)
+                {
+                    word = *wordPtr;
+                }
+#ifdef powerpc_HOST_ARCH
+                else if(reloc->r_type == PPC_RELOC_LO16)
+                {
+                    word = ((unsigned short*) wordPtr)[1];
+                    word |= ((unsigned long) relocs[i+1].r_address & 0xFFFF) << 16;
+                }
+                else if(reloc->r_type == PPC_RELOC_HI16)
+                {
+                    word = ((unsigned short*) wordPtr)[1] << 16;
+                    word |= ((unsigned long) relocs[i+1].r_address & 0xFFFF);
+                }
+                else if(reloc->r_type == PPC_RELOC_HA16)
+                {
+                    word = ((unsigned short*) wordPtr)[1] << 16;
+                    word += ((short)relocs[i+1].r_address & (short)0xFFFF);
+                }
+                else if(reloc->r_type == PPC_RELOC_BR24)
+                {
+                    /* Sign-extend the 24-bit branch displacement. */
+                    word = *wordPtr;
+                    word = (word & 0x03FFFFFC) | ((word & 0x02000000) ? 0xFC000000 : 0);
+                }
+#endif
+
+                /* Step 2: add in the symbol's (or section's) address. */
+                if(!reloc->r_extern)
+                {
+                    long delta =
+                        sections[reloc->r_symbolnum-1].offset
+                        - sections[reloc->r_symbolnum-1].addr
+                        + ((long) image);
+
+                    word += delta;
+                }
+                else
+                {
+                    struct nlist *symbol = &nlist[reloc->r_symbolnum];
+                    char *nm = image + symLC->stroff + symbol->n_un.n_strx;
+                    void *symbolAddress = lookupSymbol(nm);
+                    if(!symbolAddress)
+                    {
+                        errorBelch("\nunknown symbol `%s'", nm);
+                        return 0;
+                    }
+
+                    if(reloc->r_pcrel)
+                    {
+#ifdef powerpc_HOST_ARCH
+                            // In the .o file, this should be a relative jump to NULL
+                            // and we'll change it to a relative jump to the symbol
+                        ASSERT(-word == reloc->r_address);
+                        jumpIsland = makeJumpIsland(oc,reloc->r_symbolnum,(unsigned long) symbolAddress);
+                        if(jumpIsland != 0)
+                        {
+                            offsetToJumpIsland = word + jumpIsland
+                                - (((long)image) + sect->offset - sect->addr);
+                        }
+#endif
+                        word += (unsigned long) symbolAddress
+                                - (((long)image) + sect->offset - sect->addr);
+                    }
+                    else
+                    {
+                        word += (unsigned long) symbolAddress;
+                    }
+                }
+
+                /* Step 3: store the result, again per type. */
+                if(reloc->r_type == GENERIC_RELOC_VANILLA)
+                {
+                    *wordPtr = word;
+                    continue;
+                }
+#ifdef powerpc_HOST_ARCH
+                else if(reloc->r_type == PPC_RELOC_LO16)
+                {
+                    ((unsigned short*) wordPtr)[1] = word & 0xFFFF;
+                    i++; continue;
+                }
+                else if(reloc->r_type == PPC_RELOC_HI16)
+                {
+                    ((unsigned short*) wordPtr)[1] = (word >> 16) & 0xFFFF;
+                    i++; continue;
+                }
+                else if(reloc->r_type == PPC_RELOC_HA16)
+                {
+                    ((unsigned short*) wordPtr)[1] = ((word >> 16) & 0xFFFF)
+                        + ((word & (1<<15)) ? 1 : 0);
+                    i++; continue;
+                }
+                else if(reloc->r_type == PPC_RELOC_BR24)
+                {
+                    if((long)word > (long)0x01FFFFFF || (long)word < (long)0xFFE00000)
+                    {
+                        // The branch offset is too large.
+                        // Therefore, we try to use a jump island.
+                        if(jumpIsland == 0)
+                        {
+                            barf("unconditional relative branch out of range: "
+                                 "no jump island available");
+                        }
+
+                        word = offsetToJumpIsland;
+                        if((long)word > (long)0x01FFFFFF || (long)word < (long)0xFFE00000)
+                            barf("unconditional relative branch out of range: "
+                                 "jump island out of range");
+                    }
+                    *wordPtr = (*wordPtr & 0xFC000003) | (word & 0x03FFFFFC);
+                    continue;
+                }
+#endif
+            }
+            barf("\nunknown relocation %d",reloc->r_type);
+            return 0;
+        }
+    }
+    return 1;
+}
+
+/* Collect the symbols defined by this Mach-O object module.
+
+   Registers the object's sections (allocating zeroed storage for
+   zero-fill sections), counts and records external symbols in
+   oc->symbols, enters globals into 'symhash' and non-externals into
+   the per-object oc->lochash, and allocates one shared block of zeroed
+   storage for all common symbols (repointing each symbol's n_value at
+   its slice).  Returns 1 (there is no failure path). */
+static int ocGetNames_MachO(ObjectCode* oc)
+{
+    char *image = (char*) oc->image;
+    struct mach_header *header = (struct mach_header*) image;
+    struct load_command *lc = (struct load_command*) (image + sizeof(struct mach_header));
+    unsigned i,curSymbol = 0;
+    struct segment_command *segLC = NULL;
+    struct section *sections;
+    struct symtab_command *symLC = NULL;
+    struct nlist *nlist;
+    unsigned long commonSize = 0;
+    char    *commonStorage = NULL;
+    unsigned long commonCounter;
+
+    /* Locate the segment and symtab load commands. */
+    for(i=0;i<header->ncmds;i++)
+    {
+        if(lc->cmd == LC_SEGMENT)
+            segLC = (struct segment_command*) lc;
+        else if(lc->cmd == LC_SYMTAB)
+            symLC = (struct symtab_command*) lc;
+        lc = (struct load_command *) ( ((char*)lc) + lc->cmdsize );
+    }
+
+    sections = (struct section*) (segLC+1);
+    nlist = symLC ? (struct nlist*) (image + symLC->symoff)
+                  : NULL;
+
+    for(i=0;i<segLC->nsects;i++)
+    {
+        if(sections[i].size == 0)
+            continue;
+
+        if((sections[i].flags & SECTION_TYPE) == S_ZEROFILL)
+        {
+            /* Zero-fill section: allocate zeroed storage and point the
+               section's offset at it (relative to the image base). */
+            char * zeroFillArea = stgCallocBytes(1,sections[i].size,
+                                      "ocGetNames_MachO(common symbols)");
+            sections[i].offset = zeroFillArea - image;
+        }
+
+        if(!strcmp(sections[i].sectname,"__text"))
+            addSection(oc, SECTIONKIND_CODE_OR_RODATA,
+                (void*) (image + sections[i].offset),
+                (void*) (image + sections[i].offset + sections[i].size));
+        else if(!strcmp(sections[i].sectname,"__const"))
+            addSection(oc, SECTIONKIND_RWDATA,
+                (void*) (image + sections[i].offset),
+                (void*) (image + sections[i].offset + sections[i].size));
+        else if(!strcmp(sections[i].sectname,"__data"))
+            addSection(oc, SECTIONKIND_RWDATA,
+                (void*) (image + sections[i].offset),
+                (void*) (image + sections[i].offset + sections[i].size));
+        else if(!strcmp(sections[i].sectname,"__bss")
+                || !strcmp(sections[i].sectname,"__common"))
+            addSection(oc, SECTIONKIND_RWDATA,
+                (void*) (image + sections[i].offset),
+                (void*) (image + sections[i].offset + sections[i].size));
+
+        addProddableBlock(oc, (void*) (image + sections[i].offset),
+                                        sections[i].size);
+    }
+
+    // count external symbols defined here
+    oc->n_symbols = 0;
+    if(symLC)
+    {
+        for(i=0;i<symLC->nsyms;i++)
+        {
+            if(nlist[i].n_type & N_STAB)
+                ;
+            else if(nlist[i].n_type & N_EXT)
+            {
+                if((nlist[i].n_type & N_TYPE) == N_UNDF
+                    && (nlist[i].n_value != 0))
+                {
+                    /* common symbol: n_value holds its size */
+                    commonSize += nlist[i].n_value;
+                    oc->n_symbols++;
+                }
+                else if((nlist[i].n_type & N_TYPE) == N_SECT)
+                    oc->n_symbols++;
+            }
+        }
+    }
+    oc->symbols = stgMallocBytes(oc->n_symbols * sizeof(char*),
+                                   "ocGetNames_MachO(oc->symbols)");
+
+    if(symLC)
+    {
+        for(i=0;i<symLC->nsyms;i++)
+        {
+            if(nlist[i].n_type & N_STAB)
+                ;
+            else if((nlist[i].n_type & N_TYPE) == N_SECT)
+            {
+                if(nlist[i].n_type & N_EXT)
+                {
+                    /* defined external: goes in the global table */
+                    char *nm = image + symLC->stroff + nlist[i].n_un.n_strx;
+                    ghciInsertStrHashTable(oc->fileName, symhash, nm,
+                                            image
+                                            + sections[nlist[i].n_sect-1].offset
+                                            - sections[nlist[i].n_sect-1].addr
+                                            + nlist[i].n_value);
+                    oc->symbols[curSymbol++] = nm;
+                }
+                else
+                {
+                    /* non-external: object-local table only */
+                    char *nm = image + symLC->stroff + nlist[i].n_un.n_strx;
+                    ghciInsertStrHashTable(oc->fileName, oc->lochash, nm,
+                                            image
+                                            + sections[nlist[i].n_sect-1].offset
+                                            - sections[nlist[i].n_sect-1].addr
+                                            + nlist[i].n_value);
+                }
+            }
+        }
+    }
+
+    /* Carve the common-symbol block into per-symbol slices, repointing
+       each symbol's n_value at its storage. */
+    commonStorage = stgCallocBytes(1,commonSize,"ocGetNames_MachO(common symbols)");
+    commonCounter = (unsigned long)commonStorage;
+    if(symLC)
+    {
+        for(i=0;i<symLC->nsyms;i++)
+        {
+            if((nlist[i].n_type & N_TYPE) == N_UNDF
+                    && (nlist[i].n_type & N_EXT) && (nlist[i].n_value != 0))
+            {
+                char *nm = image + symLC->stroff + nlist[i].n_un.n_strx;
+                unsigned long sz = nlist[i].n_value;
+
+                nlist[i].n_value = commonCounter;
+
+                ghciInsertStrHashTable(oc->fileName, symhash, nm,
+                                       (void*)commonCounter);
+                oc->symbols[curSymbol++] = nm;
+
+                commonCounter += sz;
+            }
+        }
+    }
+    return 1;
+}
+
+/* Resolve a Mach-O object: fill in all symbol-pointer/stub sections
+   via resolveImports, then relocate every section.  Frees the
+   per-object local-symbol hash afterwards.  Returns 1 on success,
+   0 on failure. */
+static int ocResolve_MachO(ObjectCode* oc)
+{
+    char *image = (char*) oc->image;
+    struct mach_header *header = (struct mach_header*) image;
+    struct load_command *lc = (struct load_command*) (image + sizeof(struct mach_header));
+    unsigned i;
+    struct segment_command *segLC = NULL;
+    struct section *sections;
+    struct symtab_command *symLC = NULL;
+    struct dysymtab_command *dsymLC = NULL;
+    struct nlist *nlist;
+
+    /* Locate the segment, symtab and dysymtab load commands. */
+    for(i=0;i<header->ncmds;i++)
+    {
+        if(lc->cmd == LC_SEGMENT)
+            segLC = (struct segment_command*) lc;
+        else if(lc->cmd == LC_SYMTAB)
+            symLC = (struct symtab_command*) lc;
+        else if(lc->cmd == LC_DYSYMTAB)
+            dsymLC = (struct dysymtab_command*) lc;
+        lc = (struct load_command *) ( ((char*)lc) + lc->cmdsize );
+    }
+
+    sections = (struct section*) (segLC+1);
+    nlist = symLC ? (struct nlist*) (image + symLC->symoff)
+                  : NULL;
+
+    if(dsymLC)
+    {
+        unsigned long *indirectSyms
+            = (unsigned long*) (image + dsymLC->indirectsymoff);
+
+        /* Resolve all import-style sections first so that relocation
+           can skip them. */
+        for(i=0;i<segLC->nsects;i++)
+        {
+            if(    !strcmp(sections[i].sectname,"__la_symbol_ptr")
+                || !strcmp(sections[i].sectname,"__la_sym_ptr2")
+                || !strcmp(sections[i].sectname,"__la_sym_ptr3"))
+            {
+                if(!resolveImports(oc,image,symLC,&sections[i],indirectSyms,nlist))
+                    return 0;
+            }
+            else if(!strcmp(sections[i].sectname,"__nl_symbol_ptr")
+                ||  !strcmp(sections[i].sectname,"__pointers"))
+            {
+                if(!resolveImports(oc,image,symLC,&sections[i],indirectSyms,nlist))
+                    return 0;
+            }
+            else if(!strcmp(sections[i].sectname,"__jump_table"))
+            {
+                if(!resolveImports(oc,image,symLC,&sections[i],indirectSyms,nlist))
+                    return 0;
+            }
+        }
+    }
+
+    for(i=0;i<segLC->nsects;i++)
+    {
+        if(!relocateSection(oc,image,symLC,nlist,segLC->nsects,sections,&sections[i]))
+            return 0;
+    }
+
+    /* Free the local symbol table; we won't need it again. */
+    freeHashTable(oc->lochash, NULL);
+    oc->lochash = NULL;
+
+#if defined (powerpc_HOST_ARCH)
+    /* Make the freshly written code visible to the instruction fetch
+       unit. */
+    ocFlushInstructionCache( oc );
+#endif
+
+    return 1;
+}
+
+#ifdef powerpc_HOST_ARCH
+/*
+ * The Mach-O object format uses leading underscores. But not everywhere.
+ * There is a small number of runtime support functions defined in
+ * libcc_dynamic.a whose name does not have a leading underscore.
+ * As a consequence, we can't get their address from C code.
+ * We have to use inline assembler just to take the address of a function.
+ * Yuck.
+ */
+
+/* Register the RTS_MACHO_NOUNDERLINE_SYMBOLS in 'symhash'.  The inline
+   asm builds a data table of the symbols' addresses (their names have
+   no leading underscore, so they are unreachable from C); the second
+   macro expansion then walks that table inserting each address under
+   its name. */
+static void machoInitSymbolsWithoutUnderscore()
+{
+    extern void* symbolsWithoutUnderscore[];
+    void **p = symbolsWithoutUnderscore;
+    __asm__ volatile(".globl _symbolsWithoutUnderscore\n.data\n_symbolsWithoutUnderscore:");
+
+#undef Sym
+#define Sym(x)  \
+    __asm__ volatile(".long " # x);
+
+    RTS_MACHO_NOUNDERLINE_SYMBOLS
+
+    __asm__ volatile(".text");
+    
+#undef Sym
+#define Sym(x)  \
+    ghciInsertStrHashTable("(GHCi built-in symbols)", symhash, #x, *p++);
+    
+    RTS_MACHO_NOUNDERLINE_SYMBOLS
+    
+#undef Sym
+}
+#endif
+
+/*
+ * Figure out by how much to shift the entire Mach-O file in memory
+ * when loading so that its single segment ends up 16-byte-aligned
+ */
+/*
+ * Figure out by how much to shift the entire Mach-O file in memory
+ * when loading so that its single segment ends up 16-byte-aligned.
+ * Returns 0 for non-Mach-O files or if the header cannot be read.
+ */
+static int machoGetMisalignment( FILE * f )
+{
+  struct mach_header header;
+  int misalignment;
+
+  /* Check the fread result: on a short read the header would be
+     garbage, so just report no misalignment. */
+  if( fread(&header, sizeof(header), 1, f) != 1 )
+  {
+    rewind(f);
+    return 0;
+  }
+  rewind(f);
+
+  if(header.magic != MH_MAGIC)
+    return 0;
+
+  misalignment = (header.sizeofcmds + sizeof(header))
+                  & 0xF;
+
+  return misalignment ? (16 - misalignment) : 0;
+}
+
+#endif
diff --git a/rts/LinkerInternals.h b/rts/LinkerInternals.h
new file mode 100644
index 0000000000..07d6334c7f
--- /dev/null
+++ b/rts/LinkerInternals.h
@@ -0,0 +1,110 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2000
+ *
+ * RTS Object Linker
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef LINKERINTERNALS_H
+#define LINKERINTERNALS_H
+
+typedef enum { OBJECT_LOADED, OBJECT_RESOLVED } OStatus;
+
+/* Indication of section kinds for loaded objects. Needed by
+ the GC for deciding whether or not a pointer on the stack
+ is a code pointer.
+*/
+typedef
+ enum { SECTIONKIND_CODE_OR_RODATA,
+ SECTIONKIND_RWDATA,
+ SECTIONKIND_OTHER,
+ SECTIONKIND_NOINFOAVAIL }
+ SectionKind;
+
+typedef
+ struct _Section {
+ void* start;
+ void* end;
+ SectionKind kind;
+ struct _Section* next;
+ }
+ Section;
+
+typedef
+ struct _ProddableBlock {
+ void* start;
+ int size;
+ struct _ProddableBlock* next;
+ }
+ ProddableBlock;
+
+/* Jump Islands are sniplets of machine code required for relative
+ * address relocations on the PowerPC.
+ */
+#ifdef powerpc_HOST_ARCH
+typedef struct {
+ short lis_r12, hi_addr;
+ short ori_r12_r12, lo_addr;
+ long mtctr_r12;
+ long bctr;
+} ppcJumpIsland;
+#endif
+
+/* Top-level structure for an object module. One of these is allocated
+ * for each object file in use.
+ */
+typedef struct _ObjectCode {
+ OStatus status;
+ char* fileName;
+ int fileSize;
+ char* formatName; /* eg "ELF32", "DLL", "COFF", etc. */
+
+ /* An array containing ptrs to all the symbol names copied from
+ this object into the global symbol hash table. This is so that
+ we know which parts of the latter mapping to nuke when this
+ object is removed from the system. */
+ char** symbols;
+ int n_symbols;
+
+ /* ptr to malloc'd lump of memory holding the obj file */
+ char* image;
+
+#ifdef darwin_HOST_OS
+ /* record by how much image has been deliberately misaligned
+ after allocation, so that we can use realloc */
+ int misalignment;
+#endif
+
+ /* The section-kind entries for this object module. Linked
+ list. */
+ Section* sections;
+
+ /* A private hash table for local symbols. */
+ HashTable* lochash;
+
+ /* Allow a chain of these things */
+ struct _ObjectCode * next;
+
+ /* SANITY CHECK ONLY: a list of the only memory regions which may
+ safely be prodded during relocation. Any attempt to prod
+ outside one of these is an error in the linker. */
+ ProddableBlock* proddables;
+
+#ifdef ia64_HOST_ARCH
+ /* Procedure Linkage Table for this object */
+ void *plt;
+ unsigned int pltIndex;
+#endif
+
+#ifdef powerpc_HOST_ARCH
+ ppcJumpIsland *jump_islands;
+ unsigned long island_start_symbol;
+ unsigned long n_islands;
+#endif
+
+} ObjectCode;
+
+extern ObjectCode *objects;
+
+#endif /* LINKERINTERNALS_H */
diff --git a/rts/MBlock.c b/rts/MBlock.c
new file mode 100644
index 0000000000..fa8fd49d88
--- /dev/null
+++ b/rts/MBlock.c
@@ -0,0 +1,453 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-1999
+ *
+ * MegaBlock Allocator Interface. This file contains all the dirty
+ * architecture-dependent hackery required to get a chunk of aligned
+ * memory from the operating system.
+ *
+ * ---------------------------------------------------------------------------*/
+
+/* This is non-posix compliant. */
+/* #include "PosixSource.h" */
+
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "MBlock.h"
+#include "BlockAlloc.h"
+
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#ifndef mingw32_HOST_OS
+# ifdef HAVE_SYS_MMAN_H
+# include <sys/mman.h>
+# endif
+#endif
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+#if HAVE_WINDOWS_H
+#include <windows.h>
+#endif
+#if darwin_HOST_OS
+#include <mach/vm_map.h>
+#endif
+
+#include <errno.h>
+
+lnat mblocks_allocated = 0;
+
+/* -----------------------------------------------------------------------------
+ The MBlock Map: provides our implementation of HEAP_ALLOCED()
+ -------------------------------------------------------------------------- */
+
+#if SIZEOF_VOID_P == 4
+StgWord8 mblock_map[MBLOCK_MAP_SIZE]; // initially all zeros
+#elif SIZEOF_VOID_P == 8
+static MBlockMap dummy_mblock_map;
+MBlockMap *mblock_cache = &dummy_mblock_map;
+int mblock_map_count = 0;
+MBlockMap **mblock_maps = NULL;
+
+static MBlockMap *
+findMBlockMap(void *p)
+{
+ int i;
+ StgWord32 hi = (StgWord32) (((StgWord)p) >> 32);
+ for( i = 0; i < mblock_map_count; i++ )
+ {
+ if(mblock_maps[i]->addrHigh32 == hi)
+ {
+ return mblock_maps[i];
+ }
+ }
+ return NULL;
+}
+
+StgBool
+slowIsHeapAlloced(void *p)
+{
+ MBlockMap *map = findMBlockMap(p);
+ if(map)
+ {
+ mblock_cache = map;
+ return map->mblocks[MBLOCK_MAP_ENTRY(p)];
+ }
+ else
+ return 0;
+}
+#endif
+
+static void
+markHeapAlloced(void *p)
+{
+#if SIZEOF_VOID_P == 4
+ mblock_map[MBLOCK_MAP_ENTRY(p)] = 1;
+#elif SIZEOF_VOID_P == 8
+ MBlockMap *map = findMBlockMap(p);
+ if(map == NULL)
+ {
+ mblock_map_count++;
+ mblock_maps = realloc(mblock_maps,
+ sizeof(MBlockMap*) * mblock_map_count);
+ map = mblock_maps[mblock_map_count-1] = calloc(1,sizeof(MBlockMap));
+ map->addrHigh32 = (StgWord32) (((StgWord)p) >> 32);
+ }
+ map->mblocks[MBLOCK_MAP_ENTRY(p)] = 1;
+ mblock_cache = map;
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ Allocate new mblock(s)
+ -------------------------------------------------------------------------- */
+
+void *
+getMBlock(void)
+{
+ return getMBlocks(1);
+}
+
+/* -----------------------------------------------------------------------------
+ The mmap() method
+
+ On Unix-like systems, we use mmap() to allocate our memory. We
+ want memory in chunks of MBLOCK_SIZE, and aligned on an MBLOCK_SIZE
+ boundary. The mmap() interface doesn't give us this level of
+ control, so we have to use some heuristics.
+
+ In the general case, if we want a block of n megablocks, then we
+ allocate n+1 and trim off the slop from either side (using
+ munmap()) to get an aligned chunk of size n. However, the next
+ time we'll try to allocate directly after the previously allocated
+ chunk, on the grounds that this is aligned and likely to be free.
+ If it turns out that we were wrong, we have to munmap() and try
+ again using the general method.
+
+ Note on posix_memalign(): this interface is available on recent
+ systems and appears to provide exactly what we want. However, it
+ turns out not to be as good as our mmap() implementation, because
+ it wastes extra space (using double the address space, in a test on
+ x86_64/Linux). The problem seems to be that posix_memalign()
+ returns memory that can be free()'d, so the library must store
+ extra information along with the allocated block, thus messing up
+ the alignment. Hence, we don't use posix_memalign() for now.
+
+ -------------------------------------------------------------------------- */
+
+#if !defined(mingw32_HOST_OS) && !defined(cygwin32_HOST_OS)
+
+// A wrapper around mmap(), to abstract away from OS differences in
+// the mmap() interface.
+
+static void *
+my_mmap (void *addr, lnat size)
+{
+ void *ret;
+
+#if defined(solaris2_HOST_OS) || defined(irix_HOST_OS)
+ {
+ int fd = open("/dev/zero",O_RDONLY);
+ ret = mmap(addr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+ close(fd);
+ }
+#elif hpux_HOST_OS
+ ret = mmap(addr, size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+#elif darwin_HOST_OS
+ // Without MAP_FIXED, Apple's mmap ignores addr.
+ // With MAP_FIXED, it overwrites already mapped regions, which
+ // mmap(0, ... MAP_FIXED ...) is worst of all: It unmaps the program text
+ // and replaces it with zeroes, causing instant death.
+ // This behaviour seems to be conformant with IEEE Std 1003.1-2001.
+ // Let's just use the underlying Mach Microkernel calls directly,
+ // they're much nicer.
+
+ kern_return_t err;
+ ret = addr;
+ if(addr) // try to allocate at address
+ err = vm_allocate(mach_task_self(),(vm_address_t*) &ret, size, FALSE);
+ if(!addr || err) // try to allocate anywhere
+ err = vm_allocate(mach_task_self(),(vm_address_t*) &ret, size, TRUE);
+
+ if(err) {
+ // don't know what the error codes mean exactly, assume it's
+ // not our problem though.
+ errorBelch("memory allocation failed (requested %lu bytes)", size);
+ stg_exit(EXIT_FAILURE);
+ } else {
+ vm_protect(mach_task_self(),ret,size,FALSE,VM_PROT_READ|VM_PROT_WRITE);
+ }
+#else
+ ret = mmap(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+#endif
+
+ if (ret == (void *)-1) {
+ if (errno == ENOMEM ||
+ (errno == EINVAL && sizeof(void*)==4 && size >= 0xc0000000)) {
+ // If we request more than 3Gig, then we get EINVAL
+ // instead of ENOMEM (at least on Linux).
+ errorBelch("out of memory (requested %lu bytes)", size);
+ stg_exit(EXIT_FAILURE);
+ } else {
+ barf("getMBlock: mmap: %s", strerror(errno));
+ }
+ }
+
+ return ret;
+}
+
+// Implements the general case: allocate a chunk of memory of 'size'
+// mblocks.
+
+static void *
+gen_map_mblocks (lnat size)
+{
+ int slop;
+ void *ret;
+
+ // Try to map a larger block, and take the aligned portion from
+ // it (unmap the rest).
+ size += MBLOCK_SIZE;
+ ret = my_mmap(0, size);
+
+ // unmap the slop bits around the chunk we allocated
+ slop = (W_)ret & MBLOCK_MASK;
+
+ if (munmap(ret, MBLOCK_SIZE - slop) == -1) {
+ barf("gen_map_mblocks: munmap failed");
+ }
+ if (slop > 0 && munmap(ret+size-slop, slop) == -1) {
+ barf("gen_map_mblocks: munmap failed");
+ }
+
+ // ToDo: if we happened to get an aligned block, then don't
+ // unmap the excess, just use it. For this to work, you
+ // need to keep in mind the following:
+ // * Calling my_mmap() with an 'addr' arg pointing to
+ // already my_mmap()ed space is OK and won't fail.
+ // * If my_mmap() can't satisfy the request at the
+ // given 'next_request' address in getMBlocks(), that
+ // you unmap the extra mblock mmap()ed here (or simply
+ // satisfy yourself that the slop introduced isn't worth
+ // salvaging.)
+ //
+
+ // next time, try after the block we just got.
+ ret += MBLOCK_SIZE - slop;
+ return ret;
+}
+
+
+// The external interface: allocate 'n' mblocks, and return the
+// address.
+
+void *
+getMBlocks(nat n)
+{
+ static caddr_t next_request = (caddr_t)HEAP_BASE;
+ caddr_t ret;
+ lnat size = MBLOCK_SIZE * n;
+ nat i;
+
+ if (next_request == 0) {
+ // use gen_map_mblocks the first time.
+ ret = gen_map_mblocks(size);
+ } else {
+ ret = my_mmap(next_request, size);
+
+ if (((W_)ret & MBLOCK_MASK) != 0) {
+ // misaligned block!
+#if 0 // defined(DEBUG)
+ errorBelch("warning: getMBlock: misaligned block %p returned when allocating %d megablock(s) at %p", ret, n, next_request);
+#endif
+
+ // unmap this block...
+ if (munmap(ret, size) == -1) {
+ barf("getMBlock: munmap failed");
+ }
+ // and do it the hard way
+ ret = gen_map_mblocks(size);
+ }
+ }
+
+ // Next time, we'll try to allocate right after the block we just got.
+ // ToDo: check that we haven't already grabbed the memory at next_request
+ next_request = ret + size;
+
+ IF_DEBUG(gc,debugBelch("Allocated %d megablock(s) at %p\n",n,ret));
+
+ // fill in the table
+ for (i = 0; i < n; i++) {
+ markHeapAlloced( ret + i * MBLOCK_SIZE );
+ }
+
+ mblocks_allocated += n;
+
+ return ret;
+}
+
+void
+freeAllMBlocks(void)
+{
+ /* XXX Do something here */
+}
+
+#else /* defined(mingw32_HOST_OS) || defined(cygwin32_HOST_OS) */
+
+/*
+ On Win32 platforms we make use of the two-phased virtual memory API
+ to allocate mega blocks. We proceed as follows:
+
+ Reserve a large chunk of VM (256M at the time, or what the user asked
+ for via the -M option), but don't supply a base address that's aligned on
+ a MB boundary. Instead we round up to the nearest mblock from the chunk of
+ VM we're handed back from the OS (at the moment we just leave the 'slop' at
+ the beginning of the reserved chunk unused - ToDo: reuse it .)
+
+ Reserving memory doesn't allocate physical storage (not even in the
+ page file), this is done later on by committing pages (or mega-blocks in
+ our case).
+*/
+
+static char* base_non_committed = (char*)0;
+static char* end_non_committed = (char*)0;
+
+static void *membase;
+
+/* Default is to reserve 256M of VM to minimise the slop cost. */
+#define SIZE_RESERVED_POOL ( 256 * 1024 * 1024 )
+
+/* Number of bytes reserved */
+static unsigned long size_reserved_pool = SIZE_RESERVED_POOL;
+
+void *
+getMBlocks(nat n)
+{
+ static char* base_mblocks = (char*)0;
+ static char* next_request = (char*)0;
+ void* ret = (void*)0;
+ nat i;
+
+ lnat size = MBLOCK_SIZE * n;
+
+ if ( (base_non_committed == 0) || (next_request + size > end_non_committed) ) {
+ if (base_non_committed) {
+ /* Tacky, but if no user-provided -M option is in effect,
+ * set it to the default (==256M) in time for the heap overflow PSA.
+ */
+ if (RtsFlags.GcFlags.maxHeapSize == 0) {
+ RtsFlags.GcFlags.maxHeapSize = size_reserved_pool / BLOCK_SIZE;
+ }
+ heapOverflow();
+ }
+ if (RtsFlags.GcFlags.maxHeapSize != 0) {
+ size_reserved_pool = BLOCK_SIZE * RtsFlags.GcFlags.maxHeapSize;
+ if (size_reserved_pool < MBLOCK_SIZE) {
+ size_reserved_pool = 2*MBLOCK_SIZE;
+ }
+ }
+ base_non_committed = VirtualAlloc ( NULL
+ , size_reserved_pool
+ , MEM_RESERVE
+ , PAGE_READWRITE
+ );
+ membase = base_non_committed;
+ if ( base_non_committed == 0 ) {
+ errorBelch("getMBlocks: VirtualAlloc MEM_RESERVE %lu failed with: %ld\n", size_reserved_pool, GetLastError());
+ ret=(void*)-1;
+ } else {
+ end_non_committed = (char*)base_non_committed + (unsigned long)size_reserved_pool;
+ /* The returned pointer is not aligned on a mega-block boundary. Make it. */
+ base_mblocks = (char*)((unsigned long)base_non_committed & (unsigned long)~MBLOCK_MASK) + MBLOCK_SIZE;
+# if 0
+ debugBelch("getMBlocks: Dropping %d bytes off of 256M chunk\n",
+ (unsigned)base_mblocks - (unsigned)base_non_committed);
+# endif
+
+ if ( ((char*)base_mblocks + size) > end_non_committed ) {
+ debugBelch("getMBlocks: oops, committed too small a region to start with.");
+ ret=(void*)-1;
+ } else {
+ next_request = base_mblocks;
+ }
+ }
+ }
+ /* Commit the mega block(s) to phys mem */
+ if ( ret != (void*)-1 ) {
+ ret = VirtualAlloc(next_request, size, MEM_COMMIT, PAGE_READWRITE);
+ if (ret == NULL) {
+ debugBelch("getMBlocks: VirtualAlloc MEM_COMMIT %lu failed with: %ld\n", size, GetLastError());
+ ret=(void*)-1;
+ }
+ }
+
+ if (((W_)ret & MBLOCK_MASK) != 0) {
+ barf("getMBlocks: misaligned block returned");
+ }
+
+ if (ret == (void*)-1) {
+ barf("getMBlocks: unknown memory allocation failure on Win32.");
+ }
+
+ IF_DEBUG(gc,debugBelch("Allocated %d megablock(s) at 0x%x\n",n,(nat)ret));
+ next_request = (char*)next_request + size;
+
+ mblocks_allocated += n;
+
+ // fill in the table
+ for (i = 0; i < n; i++) {
+ markHeapAlloced( ret + i * MBLOCK_SIZE );
+ }
+
+ return ret;
+}
+
+void
+freeAllMBlocks(void)
+{
+ BOOL rc;
+
+ rc = VirtualFree(membase, 0, MEM_RELEASE);
+
+ if (rc == FALSE) {
+ debugBelch("freeAllMBlocks: VirtualFree failed with: %ld\n", GetLastError());
+ }
+}
+
+/* Hand back the physical memory that is allocated to a mega-block.
+ ToDo: chain the released mega block onto some list so that
+ getMBlocks() can get at it.
+
+ Currently unused.
+*/
+#if 0
+void
+freeMBlock(void* p, nat n)
+{
+ BOOL rc;
+
+ rc = VirtualFree(p, n * MBLOCK_SIZE , MEM_DECOMMIT );
+
+ if (rc == FALSE) {
+# ifdef DEBUG
+ debugBelch("freeMBlocks: VirtualFree failed with: %d\n", GetLastError());
+# endif
+ }
+
+}
+#endif
+
+#endif
diff --git a/rts/MBlock.h b/rts/MBlock.h
new file mode 100644
index 0000000000..1cc0dc5a1f
--- /dev/null
+++ b/rts/MBlock.h
@@ -0,0 +1,90 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * MegaBlock Allocator interface.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef MBLOCK_H
+#define MBLOCK_H
+
+extern lnat RTS_VAR(mblocks_allocated);
+
+extern void * getMBlock(void);
+extern void * getMBlocks(nat n);
+extern void freeAllMBlocks(void);
+
+#if osf3_HOST_OS
+/* ToDo: Perhaps by adjusting this value we can make linking without
+ * -static work (i.e., not generate a core-dumping executable)? */
+#if SIZEOF_VOID_P == 8
+#define HEAP_BASE 0x180000000L
+#else
+#error I have no idea where to begin the heap on a non-64-bit osf3 machine.
+#endif
+
+#else
+
+// we're using the generic method
+#define HEAP_BASE 0
+
+#endif
+
+/* -----------------------------------------------------------------------------
+ The HEAP_ALLOCED() test.
+
+ HEAP_ALLOCED is called FOR EVERY SINGLE CLOSURE during GC.
+ It needs to be FAST.
+
+ Implementation of HEAP_ALLOCED
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ Since heap is allocated in chunks of megablocks (MBLOCK_SIZE), we
+ can just use a table to record which megablocks in the address
+ space belong to the heap. On a 32-bit machine, with 1Mb
+ megablocks, using 8 bits for each entry in the table, the table
+ requires 4k. Lookups during GC will be fast, because the table
+ will be quickly cached (indeed, performance measurements showed no
+ measurable difference between doing the table lookup and using a
+ constant comparison).
+
+ On 64-bit machines, we cache one 12-bit block map that describes
+ 4096 megablocks or 4GB of memory. If HEAP_ALLOCED is called for
+ an address that is not in the cache, it calls slowIsHeapAlloced
+ (see MBlock.c) which will find the block map for the 4GB block in
+ question.
+ -------------------------------------------------------------------------- */
+
+#if SIZEOF_VOID_P == 4
+extern StgWord8 mblock_map[];
+
+/* On a 32-bit machine a 4KB table is always sufficient */
+# define MBLOCK_MAP_SIZE 4096
+# define MBLOCK_MAP_ENTRY(p) ((StgWord)(p) >> MBLOCK_SHIFT)
+# define HEAP_ALLOCED(p) mblock_map[MBLOCK_MAP_ENTRY(p)]
+
+#elif SIZEOF_VOID_P == 8
+
+# define MBLOCK_MAP_SIZE 4096
+# define MBLOCK_MAP_ENTRY(p) (((StgWord)(p) & 0xffffffff) >> MBLOCK_SHIFT)
+
+typedef struct {
+ StgWord32 addrHigh32;
+ StgWord8 mblocks[MBLOCK_MAP_SIZE];
+} MBlockMap;
+
+extern MBlockMap *mblock_cache;
+
+StgBool slowIsHeapAlloced(void *p);
+
+# define HEAP_ALLOCED(p) \
+ ( ((((StgWord)(p)) >> 32) == mblock_cache->addrHigh32) \
+ ? mblock_cache->mblocks[MBLOCK_MAP_ENTRY(p)] \
+ : slowIsHeapAlloced(p) )
+
+#else
+# error HEAP_ALLOCED not defined
+#endif
+
+#endif /* MBLOCK_H */
diff --git a/rts/Main.c b/rts/Main.c
new file mode 100644
index 0000000000..6aef280e25
--- /dev/null
+++ b/rts/Main.c
@@ -0,0 +1,138 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-2000
+ *
+ * Main function for a standalone Haskell program.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#define COMPILING_RTS_MAIN
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsAPI.h"
+#include "SchedAPI.h"
+#include "Schedule.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Prelude.h"
+#include "Task.h"
+#include <stdlib.h>
+
+#ifdef DEBUG
+# include "Printer.h" /* for printing */
+#endif
+
+#ifdef PAR
+# include "Parallel.h"
+# include "ParallelRts.h"
+# include "LLC.h"
+#endif
+
+#if defined(GRAN) || defined(PAR)
+# include "GranSimRts.h"
+#endif
+
+#ifdef HAVE_WINDOWS_H
+# include <windows.h>
+#endif
+
+extern void __stginit_ZCMain(void);
+
+/* Hack: we assume that we're building a batch-mode system unless
+ * INTERPRETER is set
+ */
+#ifndef INTERPRETER /* Hack */
+int main(int argc, char *argv[])
+{
+ int exit_status;
+ SchedulerStatus status;
+ /* all GranSim/GUM init is done in startupHaskell; sets IAmMainThread! */
+
+ startupHaskell(argc,argv,__stginit_ZCMain);
+
+ /* kick off the computation by creating the main thread with a pointer
+ to mainIO_closure representing the computation of the overall program;
+ then enter the scheduler with this thread and off we go;
+
+ the same for GranSim (we have only one instance of this code)
+
+ in a parallel setup, where we have many instances of this code
+ running on different PEs, we should do this only for the main PE
+ (IAmMainThread is set in startupHaskell)
+ */
+
+# if defined(PAR)
+
+# if defined(DEBUG)
+ { /* a wait loop to allow attachment of gdb to UNIX threads */
+ nat i, j, s;
+
+ for (i=0, s=0; i<(nat)RtsFlags.ParFlags.wait; i++)
+ for (j=0; j<1000000; j++)
+ s += j % 65536;
+ }
+ IF_PAR_DEBUG(verbose,
+ belch("Passed wait loop"));
+# endif
+
+ if (IAmMainThread == rtsTrue) {
+ IF_PAR_DEBUG(verbose,
+ debugBelch("==== [%x] Main Thread Started ...\n", mytid));
+
+ /* ToDo: Dump event for the main thread */
+ status = rts_mainLazyIO((HaskellObj)mainIO_closure, NULL);
+ } else {
+ /* Just to show we're alive */
+ IF_PAR_DEBUG(verbose,
+ debugBelch("== [%x] Non-Main PE enters scheduler via taskStart() without work ...\n",
+ mytid));
+
+ /* all non-main threads enter the scheduler without work */
+ taskStart();
+ status = Success; // declare victory (see shutdownParallelSystem)
+ }
+
+# elif defined(GRAN)
+
+ /* ToDo: Dump event for the main thread */
+ status = rts_mainLazyIO(mainIO_closure, NULL);
+
+# else /* !PAR && !GRAN */
+
+ /* ToDo: want to start with a larger stack size */
+ {
+ void *cap = rts_lock();
+ cap = rts_evalLazyIO(cap,(HaskellObj)(void *)mainIO_closure, NULL);
+ status = rts_getSchedStatus(cap);
+ rts_unlock(cap);
+ }
+
+# endif /* !PAR && !GRAN */
+
+ /* check the status of the entire Haskell computation */
+ switch (status) {
+ case Killed:
+ errorBelch("main thread exited (uncaught exception)");
+ exit_status = EXIT_KILLED;
+ break;
+ case Interrupted:
+ errorBelch("interrupted");
+ exit_status = EXIT_INTERRUPTED;
+ break;
+ case Success:
+ exit_status = EXIT_SUCCESS;
+ break;
+#if defined(PAR)
+ case NoStatus:
+ errorBelch("main thread PE killed; probably due to failure of another PE; check /tmp/pvml...");
+ exit_status = EXIT_KILLED;
+ break;
+#endif
+ default:
+ barf("main thread completed with invalid status");
+ }
+ shutdownHaskellAndExit(exit_status);
+ return 0; /* never reached, keep gcc -Wall happy */
+}
+# endif /* !INTERPRETER */
diff --git a/rts/Makefile b/rts/Makefile
new file mode 100644
index 0000000000..2319788d65
--- /dev/null
+++ b/rts/Makefile
@@ -0,0 +1,370 @@
+#-----------------------------------------------------------------------------
+#
+# This is the Makefile for the runtime-system stuff.
+# This stuff is written in C (and cannot be written in Haskell).
+#
+# .c files are vanilla C,
+# .hc files are "Haskellized-C", compiled using the C compiler and
+# (possibly) the assembly-mangler. The GHC driver script
+# knows how to compile this stuff.
+#
+# Other sorta independent, compile-once subdirs are:
+# gmp -- GNU multi-precision library (for Integer)
+
+#-----------------------------------------------------------------------------
+# Preamble
+
+TOP=..
+
+# Set UseGhcForCc: this causes the fptools build system to use a different
+# set of suffix rules for compiling C code, using $(HC) rather than $(CC)
+# and prepending "-optc" to $(CC_OPTS). NB. must be done before including
+# boilerplate.mk below.
+UseGhcForCc = YES
+
+include $(TOP)/mk/boilerplate.mk
+
+PACKAGE = rts
+
+HC=$(GHC_INPLACE)
+
+# -----------------------------------------------------------------------------
+# RTS ways
+
+WAYS=$(GhcLibWays) $(GhcRTSWays)
+
+ifneq "$(findstring debug, $(way))" ""
+GhcRtsHcOpts=
+GhcRtsCcOpts=-g
+endif
+
+# -----------------------------------------------------------------------------
+
+# Tells the build system not to add various Haskellish options to $(SRC_HC_OPTS)
+NON_HS_PACKAGE = YES
+
+# grab sources from these subdirectories
+ALL_DIRS = hooks parallel
+
+ifeq "$(HOSTPLATFORM)" "i386-unknown-mingw32"
+ALL_DIRS += win32
+else
+ALL_DIRS += posix
+endif
+
+ifneq "$(DLLized)" "YES"
+EXCLUDED_SRCS += RtsDllMain.c
+else
+EXCLUDED_SRCS += Main.c
+endif
+
+# This file ends up being empty unless we're building for a powerpc
+# or darwin system, and it is reported that Solaris ld chokes on it when
+# building HSrts.o.
+ifeq "$(findstring $(TargetArch_CPP), powerpc powerpc64)" ""
+ifeq "$(findstring $(TargetOS_CPP), darwin)" ""
+EXCLUDED_SRCS += AdjustorAsm.S
+endif
+endif
+
+EXCLUDED_SRCS += parallel/SysMan.c
+
+# The build system doesn't give us these
+CMM_SRCS = $(filter-out AutoApply%.cmm, $(wildcard *.cmm)) $(EXTRA_CMM_SRCS)
+CMM_OBJS = $(patsubst %.cmm,%.$(way_)o, $(CMM_SRCS))
+
+CLEAN_FILES += $(CMM_OBJS)
+
+# Override the default $(LIBOBJS) (defaults to $(HS_OBJS))
+LIBOBJS = $(C_OBJS) $(CMM_OBJS)
+
+SplitObjs=NO
+
+H_FILES = $(wildcard ../includes/*.h) $(wildcard *.h)
+
+#-----------------------------------------------------------------------------
+# Flags for compiling RTS .c and .hc files
+
+# gcc provides lots of useful warnings if you ask it.
+# This is a pretty good list to start with - use a # to comment out
+# any you don't like.
+WARNING_OPTS += -Wall
+WARNING_OPTS += -W
+WARNING_OPTS += -Wstrict-prototypes
+WARNING_OPTS += -Wmissing-prototypes
+WARNING_OPTS += -Wmissing-declarations
+WARNING_OPTS += -Winline
+WARNING_OPTS += -Waggregate-return
+#WARNING_OPTS += -Wpointer-arith
+WARNING_OPTS += -Wbad-function-cast
+#WARNING_OPTS += -Wcast-align
+#WARNING_OPTS += -Wnested-externs
+#WARNING_OPTS += -Wshadow
+#WARNING_OPTS += -Wcast-qual
+#WARNING_OPTS += -Wno-unused
+#WARNING_OPTS += -Wredundant-decls
+#WARNING_OPTS += -Wconversion
+
+STANDARD_OPTS += -I../includes -I. -Iparallel
+# COMPILING_RTS is only used when building Win32 DLL support.
+STANDARD_OPTS += -DCOMPILING_RTS
+
+# HC_OPTS is included in both .c and .cmm compilations, whereas CC_OPTS is
+# only included in .c compilations. HC_OPTS included the WAY_* opts, which
+# must be included in both types of compilations.
+
+SRC_CC_OPTS += $(WARNING_OPTS)
+SRC_CC_OPTS += $(STANDARD_OPTS)
+
+SRC_CC_OPTS += $(GhcRtsCcOpts)
+SRC_HC_OPTS += $(GhcRtsHcOpts)
+
+ifneq "$(DLLized)" "YES"
+SRC_HC_OPTS += -static
+endif
+# SRC_HC_OPTS += -fPIC
+
+RtsMessages_CC_OPTS += -DProjectVersion=\"$(ProjectVersion)\"
+
+ifeq "$(way)" "mp"
+SRC_HC_OPTS += -I$$PVM_ROOT/include
+endif
+
+# If -DDEBUG is in effect, adjust package conf accordingly..
+ifneq "$(strip $(filter -optc-DDEBUG,$(GhcRtsHcOpts)))" ""
+PACKAGE_CPP_OPTS += -DDEBUG
+endif
+
+ifeq "$(HaveLibMingwEx)" "YES"
+PACKAGE_CPP_OPTS += -DHAVE_LIBMINGWEX
+endif
+
+ifeq "$(DotnetSupport)" "YES"
+
+#
+# Would like to just use SUBDIRS here, but need to
+# descend into dotnet/ earlier than that.
+#
+all ::
+ $(MAKE) -C dotnet all
+
+# But use SUBDIRS for other recursive targets.
+SUBDIRS += dotnet
+
+LIBOBJS += dotnet/Invoke.o
+endif
+
+# Suppress uninitialized variable warnings for GC.c
+GC_CC_OPTS += -Wno-uninitialized
+
+#-----------------------------------------------------------------------------
+# Include the Front panel code?
+
+# we need GTK+ for the front panel
+ifneq "$(GTK_CONFIG)" ""
+ifeq "$(GhcRtsWithFrontPanel)" "YES"
+SRC_HC_OPTS += `$(GTK_CONFIG) --cflags` -optc-DRTS_GTK_FRONTPANEL
+VisCallbacks_CC_OPTS += -Wno-unused
+SRC_MKDEPENDC_OPTS += `$(GTK_CONFIG) --cflags`
+else # GhcRtsWithFrontPanel
+EXCLUDED_SRCS += $(wildcard Vis*.c)
+endif
+else # GTK_CONFIG
+EXCLUDED_SRCS += $(wildcard Vis*.c)
+endif
+
+#-----------------------------------------------------------------------------
+# make depend setup
+
+SRC_MKDEPENDC_OPTS += -I. -I../includes
+
+# Hack: we define every way-related option here, so that we get (hopefully)
+# a superset of the dependencies. To do this properly, we should generate
+# a different set of dependencies for each way. Further hack: PROFILING and
+# TICKY_TICKY can't be used together, so we omit TICKY_TICKY for now.
+SRC_MKDEPENDC_OPTS += -DPROFILING -DTHREADED_RTS -DDEBUG
+
+# -----------------------------------------------------------------------------
+# The auto-generated apply code
+
+# We want a slightly different version for the unregisterised way, so we make
+# AutoApply on a per-way basis (eg. AutoApply_p.cmm).
+
+AUTO_APPLY_CMM = AutoApply$(_way).cmm
+
+ifneq "$(BootingFromHc)" "YES"
+$(AUTO_APPLY_CMM): $(GHC_GENAPPLY)
+ @$(RM) $@
+ $(GENAPPLY) $(if $(filter $(way), u debug_u), -u) >$@
+endif
+
+EXTRA_CMM_SRCS += $(AUTO_APPLY_CMM)
+
+CLEAN_FILES += $(AUTO_APPLY_CMM)
+
+# -----------------------------------------------------------------------------
+#
+# Building DLLs is only supported on mingw32 at the moment.
+#
+ifeq "$(DLLized)" "YES"
+SRC_BLD_DLL_OPTS += -lHS_imp_stub -lgmp_imp
+
+# It's not included in the DLL, but we need to compile it up separately.
+all :: Main.dll_o
+
+# Need an import library containing the symbols the RTS uses from the Prelude.
+# So, to avoid bootstrapping trouble, we build one containing just the syms
+# we need. Weirdly named to avoid clashing later on when compiling the contents
+# of ghc/lib/..
+#
+# Note: if you do change the name of the Prelude DLL, the "--dllname <nm>.dll"
+# below will need to be updated as well.
+
+$(DLL_PEN)/HSrts$(_way).dll :: libHS_imp_stub.a
+
+libHS_imp_stub.a :
+ dlltool --output-lib libHS_imp_stub.a --def HSprel.def --dllname HSstd.dll
+
+endif
+
+# -----------------------------------------------------------------------------
+# Compile GMP only if we don't have it already
+#
+# We use GMP's own configuration stuff, because it's all rather hairy
+# and not worth re-implementing in our Makefile framework.
+
+ifneq "$(HaveLibGmp)" "YES"
+ifneq "$(HaveFrameworkGMP)" "YES"
+boot ::
+ if [ -f gmp/config.status ]; then \
+ cd gmp && CC=$(WhatGccIsCalled) ./config.status; \
+ else \
+ cd gmp && CC=$(WhatGccIsCalled) ./configure --enable-shared=no \
+ --host=`echo $(HOSTPLATFORM) | sed 's/i[567]86/i486/g'`; \
+ fi
+
+# Slight cheatage here to pass host as target, but x-compilation isn't supported by ghc.
+
+ifeq "$(way)" ""
+all :: gmp/libgmp.a
+
+ifeq "$(DLLized)" "YES"
+all :: $(DLL_PEN)/gmp.dll
+
+$(DLL_PEN)/gmp.dll:
+ $(MAKE) -C gmp gmp.dll
+ $(MV) gmp/gmp.dll $(DLL_PEN)
+endif
+endif
+
+install :: gmp/libgmp.a
+
+ifeq "$(way)" ""
+clean distclean maintainer-clean ::
+ -$(MAKE) -C gmp MAKEFLAGS= $@
+
+INSTALL_LIBS += gmp/libgmp.a
+endif
+endif
+
+gmp/libgmp.a ::
+ $(MAKE) -C gmp MAKEFLAGS=
+ @$(CP) gmp/.libs/libgmp.a gmp
+ @$(RANLIB) gmp/libgmp.a
+endif
+
+CLEAN_FILES += gmp/libgmp.a
+
+#-----------------------------------------------------------------------------
+#
+# Building the GUM SysMan
+#
+
+ifeq "$(way)" "mp"
+all :: parallel/SysMan
+
+ifdef solaris2_TARGET_OS
+__socket_libs = -lsocket -lnsl
+else
+__socket_libs =
+endif
+
+parallel/SysMan : parallel/SysMan.mp_o parallel/LLComms.mp_o RtsUtils.mp_o RtsFlags.mp_o
+ $(RM) $@
+ gcc -o $@ parallel/SysMan.mp_o parallel/LLComms.mp_o -L$$PVM_ROOT/lib/$$PVM_ARCH -lgpvm3 -lpvm3 $(__socket_libs)
+
+CLEAN_FILES += parallel/SysMan.mp_o parallel/SysMan
+INSTALL_LIBEXECS += parallel/SysMan
+endif
+
+#-----------------------------------------------------------------------------
+# Compiling the cmm files
+
+# ToDo: should we really include Rts.h here? Required for GNU_ATTRIBUTE().
+SRC_HC_OPTS += \
+ -I. \
+ -\#include Prelude.h \
+ -\#include Rts.h \
+ -\#include RtsFlags.h \
+ -\#include RtsUtils.h \
+ -\#include StgRun.h \
+ -\#include Schedule.h \
+ -\#include Printer.h \
+ -\#include Sanity.h \
+ -\#include STM.h \
+ -\#include Storage.h \
+ -\#include SchedAPI.h \
+ -\#include Timer.h \
+ -\#include ProfHeap.h \
+ -\#include LdvProfile.h \
+ -\#include Profiling.h \
+ -\#include OSThreads.h \
+ -\#include Apply.h \
+ -\#include SMP.h
+
+ifeq "$(Windows)" "YES"
+PrimOps_HC_OPTS += -\#include '<windows.h>' -\#include win32/AsyncIO.h
+else
+PrimOps_HC_OPTS += -\#include posix/Itimer.h
+endif
+
+# -O3 helps unroll some loops (especially in copy() with a constant argument).
+# -fno-strict-aliasing is a hack because we often mix StgPtr and StgClosure pointers
+# to the same object, and gcc will assume these don't alias. eg. it happens in
+# copy() with gcc 3.4.3, the upd_evacee() assigments get moved before the object copy.
+GC_HC_OPTS += -optc-O3 -optc-fno-strict-aliasing
+
+# Cmm must be compiled via-C for now, because the NCG can't handle loops
+SRC_HC_OPTS += -fvia-C
+
+# We *want* type-checking of hand-written cmm.
+SRC_HC_OPTS += -dcmm-lint
+
+ifneq "$(BootingFromHc)" "YES"
+# .cmm files depend on all the .h files, to a first approximation.
+%.$(way_)o : %.cmm $(H_FILES)
+ $(HC_PRE_OPTS)
+ $(HC) $(HC_OPTS) -c $< -o $@
+ $(HC_POST_OPTS)
+
+%.$(way_)hc : %.cmm $(H_FILES)
+ $(HC) $(HC_OPTS) -C $< -o $@
+
+%.$(way_)s : %.cmm $(H_FILES)
+ $(HC) $(HC_OPTS) -S $< -o $@
+endif
+
+#-----------------------------------------------------------------------------
+#
+# Files to install
+#
+# Just libHSrts is installed uniformly across ways
+#
+INSTALL_LIBS += $(LIBRARY)
+ifeq "$(DLLized)" "YES"
+INSTALL_PROGS += $(DLL_NAME) gmp/gmp.dll
+INSTALL_LIBS += $(patsubst %.a,%_imp.a,$(LIBRARY))
+INSTALL_LIBS += gmp/libgmp_imp.a Main.dll_o
+endif
+
+include $(TOP)/mk/target.mk
diff --git a/rts/PosixSource.h b/rts/PosixSource.h
new file mode 100644
index 0000000000..a938f9bc0f
--- /dev/null
+++ b/rts/PosixSource.h
@@ -0,0 +1,18 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Include this file into sources which should not need any non-Posix services.
+ * That includes most RTS C sources.
+ * ---------------------------------------------------------------------------*/
+
+#ifndef POSIXSOURCE_H
+#define POSIXSOURCE_H
+
+#define _POSIX_SOURCE 1
+#define _POSIX_C_SOURCE 199506L
+#define _ISOC9X_SOURCE
+
+/* Let's be ISO C9X too... */
+
+#endif /* POSIXSOURCE_H */
diff --git a/rts/Prelude.h b/rts/Prelude.h
new file mode 100644
index 0000000000..c209b2b800
--- /dev/null
+++ b/rts/Prelude.h
@@ -0,0 +1,129 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Prelude identifiers that we sometimes need to refer to in the RTS.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef PRELUDE_H
+#define PRELUDE_H
+
+/* These definitions are required by the RTS .cmm files too, so we
+ * need declarations that we can #include into the generated .hc files.
+ */
+#if IN_STG_CODE
+#define PRELUDE_INFO(i) extern W_(i)[]
+#define PRELUDE_CLOSURE(i) extern W_(i)[]
+#else
+#define PRELUDE_INFO(i) extern DLL_IMPORT const StgInfoTable i
+#define PRELUDE_CLOSURE(i) extern DLL_IMPORT StgClosure i
+#endif
+
+/* Define canonical names so we can abstract away from the actual
+ * modules these names are defined in.
+ */
+
+PRELUDE_CLOSURE(GHCziBase_True_closure);
+PRELUDE_CLOSURE(GHCziBase_False_closure);
+PRELUDE_CLOSURE(GHCziPack_unpackCString_closure);
+PRELUDE_CLOSURE(GHCziWeak_runFinalizzerBatch_closure);
+
+#ifdef IN_STG_CODE
+extern W_ ZCMain_main_closure[];
+#else
+extern StgClosure ZCMain_main_closure;
+#endif
+
+PRELUDE_CLOSURE(GHCziIOBase_stackOverflow_closure);
+PRELUDE_CLOSURE(GHCziIOBase_heapOverflow_closure);
+PRELUDE_CLOSURE(GHCziIOBase_BlockedOnDeadMVar_closure);
+PRELUDE_CLOSURE(GHCziIOBase_BlockedIndefinitely_closure);
+PRELUDE_CLOSURE(GHCziIOBase_NonTermination_closure);
+PRELUDE_CLOSURE(GHCziIOBase_NestedAtomically_closure);
+
+PRELUDE_INFO(GHCziBase_Czh_static_info);
+PRELUDE_INFO(GHCziBase_Izh_static_info);
+PRELUDE_INFO(GHCziFloat_Fzh_static_info);
+PRELUDE_INFO(GHCziFloat_Dzh_static_info);
+PRELUDE_INFO(Addr_Azh_static_info);
+PRELUDE_INFO(GHCziPtr_Ptr_static_info);
+PRELUDE_INFO(GHCziPtr_FunPtr_static_info);
+PRELUDE_INFO(GHCziInt_I8zh_static_info);
+PRELUDE_INFO(GHCziInt_I16zh_static_info);
+PRELUDE_INFO(GHCziInt_I32zh_static_info);
+PRELUDE_INFO(GHCziInt_I64zh_static_info);
+PRELUDE_INFO(GHCziWord_Wzh_static_info);
+PRELUDE_INFO(GHCziWord_W8zh_static_info);
+PRELUDE_INFO(GHCziWord_W16zh_static_info);
+PRELUDE_INFO(GHCziWord_W32zh_static_info);
+PRELUDE_INFO(GHCziWord_W64zh_static_info);
+PRELUDE_INFO(GHCziBase_Czh_con_info);
+PRELUDE_INFO(GHCziBase_Izh_con_info);
+PRELUDE_INFO(GHCziFloat_Fzh_con_info);
+PRELUDE_INFO(GHCziFloat_Dzh_con_info);
+PRELUDE_INFO(GHCziPtr_Ptr_con_info);
+PRELUDE_INFO(GHCziPtr_FunPtr_con_info);
+PRELUDE_INFO(Addr_Azh_con_info);
+PRELUDE_INFO(GHCziWord_Wzh_con_info);
+PRELUDE_INFO(GHCziInt_I8zh_con_info);
+PRELUDE_INFO(GHCziInt_I16zh_con_info);
+PRELUDE_INFO(GHCziInt_I32zh_con_info);
+PRELUDE_INFO(GHCziInt_I64zh_con_info);
+PRELUDE_INFO(GHCziWord_W8zh_con_info);
+PRELUDE_INFO(GHCziWord_W16zh_con_info);
+PRELUDE_INFO(GHCziWord_W32zh_con_info);
+PRELUDE_INFO(GHCziWord_W64zh_con_info);
+PRELUDE_INFO(GHCziStable_StablePtr_static_info);
+PRELUDE_INFO(GHCziStable_StablePtr_con_info);
+
+#define True_closure (&GHCziBase_True_closure)
+#define False_closure (&GHCziBase_False_closure)
+#define unpackCString_closure (&GHCziPack_unpackCString_closure)
+#define runFinalizerBatch_closure (&GHCziWeak_runFinalizzerBatch_closure)
+#define mainIO_closure (&ZCMain_main_closure)
+
+#define stackOverflow_closure (&GHCziIOBase_stackOverflow_closure)
+#define heapOverflow_closure (&GHCziIOBase_heapOverflow_closure)
+#define BlockedOnDeadMVar_closure (&GHCziIOBase_BlockedOnDeadMVar_closure)
+#define BlockedIndefinitely_closure (&GHCziIOBase_BlockedIndefinitely_closure)
+#define NonTermination_closure (&GHCziIOBase_NonTermination_closure)
+#define NestedAtomically_closure (&GHCziIOBase_NestedAtomically_closure)
+
+#define Czh_static_info (&GHCziBase_Czh_static_info)
+#define Fzh_static_info (&GHCziFloat_Fzh_static_info)
+#define Dzh_static_info (&GHCziFloat_Dzh_static_info)
+#define Azh_static_info (&Addr_Azh_static_info)
+#define Izh_static_info (&GHCziBase_Izh_static_info)
+#define I8zh_static_info (&GHCziInt_I8zh_static_info)
+#define I16zh_static_info (&GHCziInt_I16zh_static_info)
+#define I32zh_static_info (&GHCziInt_I32zh_static_info)
+#define I64zh_static_info (&GHCziInt_I64zh_static_info)
+#define Wzh_static_info (&GHCziWord_Wzh_static_info)
+#define W8zh_static_info (&GHCziWord_W8zh_static_info)
+#define W16zh_static_info (&GHCziWord_W16zh_static_info)
+#define W32zh_static_info (&GHCziWord_W32zh_static_info)
+#define W64zh_static_info (&GHCziWord_W64zh_static_info)
+#define Ptr_static_info (&GHCziPtr_Ptr_static_info)
+#define FunPtr_static_info (&GHCziPtr_FunPtr_static_info)
+#define Czh_con_info (&GHCziBase_Czh_con_info)
+#define Izh_con_info (&GHCziBase_Izh_con_info)
+#define Fzh_con_info (&GHCziFloat_Fzh_con_info)
+#define Dzh_con_info (&GHCziFloat_Dzh_con_info)
+#define Azh_con_info (&Addr_Azh_con_info)
+#define Wzh_con_info (&GHCziWord_Wzh_con_info)
+#define W8zh_con_info (&GHCziWord_W8zh_con_info)
+#define W16zh_con_info (&GHCziWord_W16zh_con_info)
+#define W32zh_con_info (&GHCziWord_W32zh_con_info)
+#define W64zh_con_info (&GHCziWord_W64zh_con_info)
+#define I8zh_con_info (&GHCziInt_I8zh_con_info)
+#define I16zh_con_info (&GHCziInt_I16zh_con_info)
+#define I32zh_con_info (&GHCziInt_I32zh_con_info)
+#define I64zh_con_info (&GHCziInt_I64zh_con_info)
+/* (duplicate #define of I64zh_con_info removed; defined on the previous line) */
+#define Ptr_con_info (&GHCziPtr_Ptr_con_info)
+#define FunPtr_con_info (&GHCziPtr_FunPtr_con_info)
+#define StablePtr_static_info (&GHCziStable_StablePtr_static_info)
+#define StablePtr_con_info (&GHCziStable_StablePtr_con_info)
+
+#endif /* PRELUDE_H */
diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm
new file mode 100644
index 0000000000..f1c214e304
--- /dev/null
+++ b/rts/PrimOps.cmm
@@ -0,0 +1,2106 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Out-of-line primitive operations
+ *
+ * This file contains the implementations of all the primitive
+ * operations ("primops") which are not expanded inline. See
+ * ghc/compiler/prelude/primops.txt.pp for a list of all the primops;
+ * this file contains code for most of those with the attribute
+ * out_of_line=True.
+ *
+ * Entry convention: the entry convention for a primop is that all the
+ * args are in Stg registers (R1, R2, etc.). This is to make writing
+ * the primops easier. (see compiler/codeGen/CgCallConv.hs).
+ *
+ * Return convention: results from a primop are generally returned
+ * using the ordinary unboxed tuple return convention. The C-- parser
+ * implements the RET_xxxx() macros to perform unboxed-tuple returns
+ * based on the prevailing return convention.
+ *
+ * This file is written in a subset of C--, extended with various
+ * features specific to GHC. It is compiled by GHC directly. For the
+ * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Cmm.h"
+
+/*-----------------------------------------------------------------------------
+ Array Primitives
+
+ Basically just new*Array - the others are all inline macros.
+
+ The size arg is always passed in R1, and the result returned in R1.
+
+ The slow entry point is for returning from a heap check, the saved
+ size argument must be re-loaded from the stack.
+ -------------------------------------------------------------------------- */
+
+/* for objects that are *less* than the size of a word, make sure we
+ * round up to the nearest word for the size of the array.
+ */
+
+newByteArrayzh_fast
+{
+ W_ words, payload_words, n, p;
+ MAYBE_GC(NO_PTRS,newByteArrayzh_fast);
+ n = R1;
+ payload_words = ROUNDUP_BYTES_TO_WDS(n);
+ words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
+ "ptr" p = foreign "C" allocateLocal(MyCapability() "ptr",words) [];
+ TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
+ SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+ StgArrWords_words(p) = payload_words;
+ RET_P(p);
+}
+
+newPinnedByteArrayzh_fast
+{
+ W_ words, payload_words, n, p;
+
+ MAYBE_GC(NO_PTRS,newPinnedByteArrayzh_fast);
+ n = R1;
+ payload_words = ROUNDUP_BYTES_TO_WDS(n);
+
+ // We want an 8-byte aligned array. allocatePinned() gives us
+ // 8-byte aligned memory by default, but we want to align the
+ // *goods* inside the ArrWords object, so we have to check the
+ // size of the ArrWords header and adjust our size accordingly.
+ words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words;
+ if ((SIZEOF_StgArrWords & 7) != 0) {
+ words = words + 1;
+ }
+
+ "ptr" p = foreign "C" allocatePinned(words) [];
+ TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0);
+
+ // Again, if the ArrWords header isn't a multiple of 8 bytes, we
+ // have to push the object forward one word so that the goods
+ // fall on an 8-byte boundary.
+ if ((SIZEOF_StgArrWords & 7) != 0) {
+ p = p + WDS(1);
+ }
+
+ SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+ StgArrWords_words(p) = payload_words;
+ RET_P(p);
+}
+
+newArrayzh_fast
+{
+ W_ words, n, init, arr, p;
+ /* Args: R1 = words, R2 = initialisation value */
+
+ n = R1;
+ MAYBE_GC(R2_PTR,newArrayzh_fast);
+
+ words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + n;
+ "ptr" arr = foreign "C" allocateLocal(MyCapability() "ptr",words) [R2];
+ TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0);
+
+ SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]);
+ StgMutArrPtrs_ptrs(arr) = n;
+
+ // Initialise all elements of the array with the value in R2
+ init = R2;
+ p = arr + SIZEOF_StgMutArrPtrs;
+ for:
+ if (p < arr + WDS(words)) {
+ W_[p] = init;
+ p = p + WDS(1);
+ goto for;
+ }
+
+ RET_P(arr);
+}
+
+unsafeThawArrayzh_fast
+{
+ // SUBTLETY TO DO WITH THE OLD GEN MUTABLE LIST
+ //
+ // A MUT_ARR_PTRS lives on the mutable list, but a MUT_ARR_PTRS_FROZEN
+ // normally doesn't. However, when we freeze a MUT_ARR_PTRS, we leave
+ // it on the mutable list for the GC to remove (removing something from
+ // the mutable list is not easy, because the mut_list is only singly-linked).
+ //
+ // So that we can tell whether a MUT_ARR_PTRS_FROZEN is on the mutable list,
+ // when we freeze it we set the info ptr to be MUT_ARR_PTRS_FROZEN0
+ // to indicate that it is still on the mutable list.
+ //
+ // So, when we thaw a MUT_ARR_PTRS_FROZEN, we must cope with two cases:
+ // either it is on a mut_list, or it isn't. We adopt the convention that
+ // the closure type is MUT_ARR_PTRS_FROZEN0 if it is on the mutable list,
+ // and MUT_ARR_PTRS_FROZEN otherwise. In fact it wouldn't matter if
+ // we put it on the mutable list more than once, but it would get scavenged
+ // multiple times during GC, which would be unnecessarily slow.
+ //
+ if (StgHeader_info(R1) != stg_MUT_ARR_PTRS_FROZEN0_info) {
+ SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info);
+ foreign "C" recordMutableLock(R1 "ptr") [R1];
+ // must be done after SET_INFO, because it ASSERTs closure_MUTABLE()
+ RET_P(R1);
+ } else {
+ SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info);
+ RET_P(R1);
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ MutVar primitives
+ -------------------------------------------------------------------------- */
+
+newMutVarzh_fast
+{
+ W_ mv;
+ /* Args: R1 = initialisation value */
+
+ ALLOC_PRIM( SIZEOF_StgMutVar, R1_PTR, newMutVarzh_fast);
+
+ mv = Hp - SIZEOF_StgMutVar + WDS(1);
+ SET_HDR(mv,stg_MUT_VAR_DIRTY_info,W_[CCCS]);
+ StgMutVar_var(mv) = R1;
+
+ RET_P(mv);
+}
+
+atomicModifyMutVarzh_fast
+{
+ W_ mv, z, x, y, r;
+ /* Args: R1 :: MutVar#, R2 :: a -> (a,b) */
+
+ /* If x is the current contents of the MutVar#, then
+ We want to make the new contents point to
+
+ (sel_0 (f x))
+
+ and the return value is
+
+ (sel_1 (f x))
+
+ obviously we can share (f x).
+
+ z = [stg_ap_2 f x] (max (HS + 2) MIN_UPD_SIZE)
+ y = [stg_sel_0 z] (max (HS + 1) MIN_UPD_SIZE)
+ r = [stg_sel_1 z] (max (HS + 1) MIN_UPD_SIZE)
+ */
+
+#if MIN_UPD_SIZE > 1
+#define THUNK_1_SIZE (SIZEOF_StgThunkHeader + WDS(MIN_UPD_SIZE))
+#define TICK_ALLOC_THUNK_1() TICK_ALLOC_UP_THK(WDS(1),WDS(MIN_UPD_SIZE-1))
+#else
+#define THUNK_1_SIZE (SIZEOF_StgThunkHeader + WDS(1))
+#define TICK_ALLOC_THUNK_1() TICK_ALLOC_UP_THK(WDS(1),0)
+#endif
+
+#if MIN_UPD_SIZE > 2
+#define THUNK_2_SIZE (SIZEOF_StgThunkHeader + WDS(MIN_UPD_SIZE))
+#define TICK_ALLOC_THUNK_2() TICK_ALLOC_UP_THK(WDS(2),WDS(MIN_UPD_SIZE-2))
+#else
+#define THUNK_2_SIZE (SIZEOF_StgThunkHeader + WDS(2))
+#define TICK_ALLOC_THUNK_2() TICK_ALLOC_UP_THK(WDS(2),0)
+#endif
+
+#define SIZE (THUNK_2_SIZE + THUNK_1_SIZE + THUNK_1_SIZE)
+
+ HP_CHK_GEN_TICKY(SIZE, R1_PTR & R2_PTR, atomicModifyMutVarzh_fast);
+
+#if defined(THREADED_RTS)
+ foreign "C" ACQUIRE_LOCK(atomic_modify_mutvar_mutex "ptr") [R1,R2];
+#endif
+
+ x = StgMutVar_var(R1);
+
+ TICK_ALLOC_THUNK_2();
+ CCCS_ALLOC(THUNK_2_SIZE);
+ z = Hp - THUNK_2_SIZE + WDS(1);
+ SET_HDR(z, stg_ap_2_upd_info, W_[CCCS]);
+ LDV_RECORD_CREATE(z);
+ StgThunk_payload(z,0) = R2;
+ StgThunk_payload(z,1) = x;
+
+ TICK_ALLOC_THUNK_1();
+ CCCS_ALLOC(THUNK_1_SIZE);
+ y = z - THUNK_1_SIZE;
+ SET_HDR(y, stg_sel_0_upd_info, W_[CCCS]);
+ LDV_RECORD_CREATE(y);
+ StgThunk_payload(y,0) = z;
+
+ StgMutVar_var(R1) = y;
+ foreign "C" dirty_MUT_VAR(BaseReg "ptr", R1 "ptr") [R1];
+
+ TICK_ALLOC_THUNK_1();
+ CCCS_ALLOC(THUNK_1_SIZE);
+ r = y - THUNK_1_SIZE;
+ SET_HDR(r, stg_sel_1_upd_info, W_[CCCS]);
+ LDV_RECORD_CREATE(r);
+ StgThunk_payload(r,0) = z;
+
+#if defined(THREADED_RTS)
+ foreign "C" RELEASE_LOCK(atomic_modify_mutvar_mutex "ptr") [];
+#endif
+
+ RET_P(r);
+}
+
+/* -----------------------------------------------------------------------------
+ Weak Pointer Primitives
+ -------------------------------------------------------------------------- */
+
+STRING(stg_weak_msg,"New weak pointer at %p\n")
+
+mkWeakzh_fast
+{
+ /* R1 = key
+ R2 = value
+ R3 = finalizer (or NULL)
+ */
+ W_ w;
+
+ if (R3 == NULL) {
+ R3 = stg_NO_FINALIZER_closure;
+ }
+
+ ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, mkWeakzh_fast );
+
+ w = Hp - SIZEOF_StgWeak + WDS(1);
+ SET_HDR(w, stg_WEAK_info, W_[CCCS]);
+
+ StgWeak_key(w) = R1;
+ StgWeak_value(w) = R2;
+ StgWeak_finalizer(w) = R3;
+
+ StgWeak_link(w) = W_[weak_ptr_list];
+ W_[weak_ptr_list] = w;
+
+ IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []);
+
+ RET_P(w);
+}
+
+
+finalizzeWeakzh_fast
+{
+ /* R1 = weak ptr
+ */
+ W_ w, f;
+
+ w = R1;
+
+ // already dead?
+ if (GET_INFO(w) == stg_DEAD_WEAK_info) {
+ RET_NP(0,stg_NO_FINALIZER_closure);
+ }
+
+ // kill it
+#ifdef PROFILING
+ // @LDV profiling
+ // A weak pointer is inherently used, so we do not need to call
+ // LDV_recordDead_FILL_SLOP_DYNAMIC():
+ // LDV_recordDead_FILL_SLOP_DYNAMIC((StgClosure *)w);
+ // or, LDV_recordDead():
+ // LDV_recordDead((StgClosure *)w, sizeofW(StgWeak) - sizeofW(StgProfHeader));
+ // Furthermore, when PROFILING is turned on, dead weak pointers are exactly as
+ // large as weak pointers, so there is no need to fill the slop, either.
+ // See stg_DEAD_WEAK_info in StgMiscClosures.hc.
+#endif
+
+ //
+ // Todo: maybe use SET_HDR() and remove LDV_recordCreate()?
+ //
+ SET_INFO(w,stg_DEAD_WEAK_info);
+ LDV_RECORD_CREATE(w);
+
+ f = StgWeak_finalizer(w);
+ StgDeadWeak_link(w) = StgWeak_link(w);
+
+ /* return the finalizer */
+ if (f == stg_NO_FINALIZER_closure) {
+ RET_NP(0,stg_NO_FINALIZER_closure);
+ } else {
+ RET_NP(1,f);
+ }
+}
+
+deRefWeakzh_fast
+{
+ /* R1 = weak ptr */
+ W_ w, code, val;
+
+ w = R1;
+ if (GET_INFO(w) == stg_WEAK_info) {
+ code = 1;
+ val = StgWeak_value(w);
+ } else {
+ code = 0;
+ val = w;
+ }
+ RET_NP(code,val);
+}
+
+/* -----------------------------------------------------------------------------
+ Arbitrary-precision Integer operations.
+
+ There are some assumptions in this code that mp_limb_t == W_. This is
+ the case for all the platforms that GHC supports, currently.
+ -------------------------------------------------------------------------- */
+
+int2Integerzh_fast
+{
+ /* arguments: R1 = Int# */
+
+ W_ val, s, p; /* to avoid aliasing */
+
+ val = R1;
+ ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, int2Integerzh_fast );
+
+ p = Hp - SIZEOF_StgArrWords;
+ SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+ StgArrWords_words(p) = 1;
+
+ /* mpz_set_si is inlined here, makes things simpler */
+ if (%lt(val,0)) {
+ s = -1;
+ Hp(0) = -val;
+ } else {
+ if (%gt(val,0)) {
+ s = 1;
+ Hp(0) = val;
+ } else {
+ s = 0;
+ }
+ }
+
+ /* returns (# size :: Int#,
+ data :: ByteArray#
+ #)
+ */
+ RET_NP(s,p);
+}
+
+word2Integerzh_fast
+{
+ /* arguments: R1 = Word# */
+
+ W_ val, s, p; /* to avoid aliasing */
+
+ val = R1;
+
+ ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, word2Integerzh_fast);
+
+ p = Hp - SIZEOF_StgArrWords;
+ SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+ StgArrWords_words(p) = 1;
+
+ if (val != 0) {
+ s = 1;
+ W_[Hp] = val;
+ } else {
+ s = 0;
+ }
+
+ /* returns (# size :: Int#,
+ data :: ByteArray# #)
+ */
+ RET_NP(s,p);
+}
+
+
+/*
+ * 'long long' primops for converting to/from Integers.
+ */
+
+#ifdef SUPPORT_LONG_LONGS
+
+int64ToIntegerzh_fast
+{
+ /* arguments: L1 = Int64# */
+
+ L_ val;
+ W_ hi, s, neg, words_needed, p;
+
+ val = L1;
+ neg = 0;
+
+ if ( %ge(val,0x100000000::L_) || %le(val,-0x100000000::L_) ) {
+ words_needed = 2;
+ } else {
+ // minimum is one word
+ words_needed = 1;
+ }
+
+ ALLOC_PRIM( SIZEOF_StgArrWords + WDS(words_needed),
+ NO_PTRS, int64ToIntegerzh_fast );
+
+ p = Hp - SIZEOF_StgArrWords - WDS(words_needed) + WDS(1);
+ SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+ StgArrWords_words(p) = words_needed;
+
+ if ( %lt(val,0::L_) ) {
+ neg = 1;
+ val = -val;
+ }
+
+ hi = TO_W_(val >> 32);
+
+ if ( words_needed == 2 ) {
+ s = 2;
+ Hp(-1) = TO_W_(val);
+ Hp(0) = hi;
+ } else {
+ if ( val != 0::L_ ) {
+ s = 1;
+ Hp(0) = TO_W_(val);
+ } else /* val==0 */ {
+ s = 0;
+ }
+ }
+ if ( neg != 0 ) {
+ s = -s;
+ }
+
+ /* returns (# size :: Int#,
+ data :: ByteArray# #)
+ */
+ RET_NP(s,p);
+}
+
+word64ToIntegerzh_fast
+{
+ /* arguments: L1 = Word64# */
+
+ L_ val;
+ W_ hi, s, words_needed, p;
+
+ val = L1;
+ if ( val >= 0x100000000::L_ ) {
+ words_needed = 2;
+ } else {
+ words_needed = 1;
+ }
+
+ ALLOC_PRIM( SIZEOF_StgArrWords + WDS(words_needed),
+ NO_PTRS, word64ToIntegerzh_fast );
+
+ p = Hp - SIZEOF_StgArrWords - WDS(words_needed) + WDS(1);
+ SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+ StgArrWords_words(p) = words_needed;
+
+ hi = TO_W_(val >> 32);
+ if ( val >= 0x100000000::L_ ) {
+ s = 2;
+ Hp(-1) = TO_W_(val);
+ Hp(0) = hi;
+ } else {
+ if ( val != 0::L_ ) {
+ s = 1;
+ Hp(0) = TO_W_(val);
+ } else /* val==0 */ {
+ s = 0;
+ }
+ }
+
+ /* returns (# size :: Int#,
+ data :: ByteArray# #)
+ */
+ RET_NP(s,p);
+}
+
+
+#endif /* SUPPORT_LONG_LONGS */
+
+/* ToDo: this is shockingly inefficient */
+
+#ifndef THREADED_RTS
+section "bss" {
+ mp_tmp1:
+ bits8 [SIZEOF_MP_INT];
+}
+
+section "bss" {
+ mp_tmp2:
+ bits8 [SIZEOF_MP_INT];
+}
+
+section "bss" {
+ mp_result1:
+ bits8 [SIZEOF_MP_INT];
+}
+
+section "bss" {
+ mp_result2:
+ bits8 [SIZEOF_MP_INT];
+}
+#endif
+
+#ifdef THREADED_RTS
+#define FETCH_MP_TEMP(X) \
+W_ X; \
+X = BaseReg + (OFFSET_StgRegTable_r ## X);
+#else
+#define FETCH_MP_TEMP(X) /* Nothing */
+#endif
+
+#define GMP_TAKE2_RET1(name,mp_fun) \
+name \
+{ \
+ CInt s1, s2; \
+ W_ d1, d2; \
+ FETCH_MP_TEMP(mp_tmp1); \
+ FETCH_MP_TEMP(mp_tmp2); \
+ FETCH_MP_TEMP(mp_result1) \
+ FETCH_MP_TEMP(mp_result2); \
+ \
+ /* call doYouWantToGC() */ \
+ MAYBE_GC(R2_PTR & R4_PTR, name); \
+ \
+ s1 = W_TO_INT(R1); \
+ d1 = R2; \
+ s2 = W_TO_INT(R3); \
+ d2 = R4; \
+ \
+ MP_INT__mp_alloc(mp_tmp1) = W_TO_INT(StgArrWords_words(d1)); \
+ MP_INT__mp_size(mp_tmp1) = (s1); \
+ MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(d1); \
+ MP_INT__mp_alloc(mp_tmp2) = W_TO_INT(StgArrWords_words(d2)); \
+ MP_INT__mp_size(mp_tmp2) = (s2); \
+ MP_INT__mp_d(mp_tmp2) = BYTE_ARR_CTS(d2); \
+ \
+ foreign "C" mpz_init(mp_result1 "ptr") []; \
+ \
+ /* Perform the operation */ \
+ foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1 "ptr",mp_tmp2 "ptr") []; \
+ \
+ RET_NP(TO_W_(MP_INT__mp_size(mp_result1)), \
+ MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords); \
+}
+
+#define GMP_TAKE1_RET1(name,mp_fun) \
+name \
+{ \
+ CInt s1; \
+ W_ d1; \
+ FETCH_MP_TEMP(mp_tmp1); \
+ FETCH_MP_TEMP(mp_result1) \
+ \
+ /* call doYouWantToGC() */ \
+ MAYBE_GC(R2_PTR, name); \
+ \
+ d1 = R2; \
+ s1 = W_TO_INT(R1); \
+ \
+ MP_INT__mp_alloc(mp_tmp1) = W_TO_INT(StgArrWords_words(d1)); \
+ MP_INT__mp_size(mp_tmp1) = (s1); \
+ MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(d1); \
+ \
+ foreign "C" mpz_init(mp_result1 "ptr") []; \
+ \
+ /* Perform the operation */ \
+ foreign "C" mp_fun(mp_result1 "ptr",mp_tmp1 "ptr") []; \
+ \
+ RET_NP(TO_W_(MP_INT__mp_size(mp_result1)), \
+ MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords); \
+}
+
+#define GMP_TAKE2_RET2(name,mp_fun) \
+name \
+{ \
+ CInt s1, s2; \
+ W_ d1, d2; \
+ FETCH_MP_TEMP(mp_tmp1); \
+ FETCH_MP_TEMP(mp_tmp2); \
+ FETCH_MP_TEMP(mp_result1) \
+ FETCH_MP_TEMP(mp_result2) \
+ \
+ /* call doYouWantToGC() */ \
+ MAYBE_GC(R2_PTR & R4_PTR, name); \
+ \
+ s1 = W_TO_INT(R1); \
+ d1 = R2; \
+ s2 = W_TO_INT(R3); \
+ d2 = R4; \
+ \
+ MP_INT__mp_alloc(mp_tmp1) = W_TO_INT(StgArrWords_words(d1)); \
+ MP_INT__mp_size(mp_tmp1) = (s1); \
+ MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(d1); \
+ MP_INT__mp_alloc(mp_tmp2) = W_TO_INT(StgArrWords_words(d2)); \
+ MP_INT__mp_size(mp_tmp2) = (s2); \
+ MP_INT__mp_d(mp_tmp2) = BYTE_ARR_CTS(d2); \
+ \
+ foreign "C" mpz_init(mp_result1 "ptr") []; \
+ foreign "C" mpz_init(mp_result2 "ptr") []; \
+ \
+ /* Perform the operation */ \
+ foreign "C" mp_fun(mp_result1 "ptr",mp_result2 "ptr",mp_tmp1 "ptr",mp_tmp2 "ptr") []; \
+ \
+ RET_NPNP(TO_W_(MP_INT__mp_size(mp_result1)), \
+ MP_INT__mp_d(mp_result1) - SIZEOF_StgArrWords, \
+ TO_W_(MP_INT__mp_size(mp_result2)), \
+ MP_INT__mp_d(mp_result2) - SIZEOF_StgArrWords); \
+}
+
+GMP_TAKE2_RET1(plusIntegerzh_fast, mpz_add)
+GMP_TAKE2_RET1(minusIntegerzh_fast, mpz_sub)
+GMP_TAKE2_RET1(timesIntegerzh_fast, mpz_mul)
+GMP_TAKE2_RET1(gcdIntegerzh_fast, mpz_gcd)
+GMP_TAKE2_RET1(quotIntegerzh_fast, mpz_tdiv_q)
+GMP_TAKE2_RET1(remIntegerzh_fast, mpz_tdiv_r)
+GMP_TAKE2_RET1(divExactIntegerzh_fast, mpz_divexact)
+GMP_TAKE2_RET1(andIntegerzh_fast, mpz_and)
+GMP_TAKE2_RET1(orIntegerzh_fast, mpz_ior)
+GMP_TAKE2_RET1(xorIntegerzh_fast, mpz_xor)
+GMP_TAKE1_RET1(complementIntegerzh_fast, mpz_com)
+
+GMP_TAKE2_RET2(quotRemIntegerzh_fast, mpz_tdiv_qr)
+GMP_TAKE2_RET2(divModIntegerzh_fast, mpz_fdiv_qr)
+
+#ifndef THREADED_RTS
+section "bss" {
+ mp_tmp_w: W_; // NB. mp_tmp_w is really an mp_limb_t here
+}
+#endif
+
+gcdIntzh_fast
+{
+ /* R1 = the first Int#; R2 = the second Int# */
+ W_ r;
+ FETCH_MP_TEMP(mp_tmp_w);
+
+ W_[mp_tmp_w] = R1;
+ r = foreign "C" mpn_gcd_1(mp_tmp_w "ptr", 1, R2) [];
+
+ R1 = r;
+ /* Result parked in R1, return via info-pointer at TOS */
+ jump %ENTRY_CODE(Sp(0));
+}
+
+
+gcdIntegerIntzh_fast
+{
+ /* R1 = s1; R2 = d1; R3 = the int */
+ R1 = foreign "C" mpn_gcd_1( BYTE_ARR_CTS(R2) "ptr", R1, R3) [];
+
+ /* Result parked in R1, return via info-pointer at TOS */
+ jump %ENTRY_CODE(Sp(0));
+}
+
+
+cmpIntegerIntzh_fast
+{
+ /* R1 = s1; R2 = d1; R3 = the int */
+ W_ usize, vsize, v_digit, u_digit;
+
+ usize = R1;
+ vsize = 0;
+ v_digit = R3;
+
+ // paraphrased from mpz_cmp_si() in the GMP sources
+ if (%gt(v_digit,0)) {
+ vsize = 1;
+ } else {
+ if (%lt(v_digit,0)) {
+ vsize = -1;
+ v_digit = -v_digit;
+ }
+ }
+
+ if (usize != vsize) {
+ R1 = usize - vsize;
+ jump %ENTRY_CODE(Sp(0));
+ }
+
+ if (usize == 0) {
+ R1 = 0;
+ jump %ENTRY_CODE(Sp(0));
+ }
+
+ u_digit = W_[BYTE_ARR_CTS(R2)];
+
+ if (u_digit == v_digit) {
+ R1 = 0;
+ jump %ENTRY_CODE(Sp(0));
+ }
+
+ if (%gtu(u_digit,v_digit)) { // NB. unsigned: these are mp_limb_t's
+ R1 = usize;
+ } else {
+ R1 = -usize;
+ }
+
+ jump %ENTRY_CODE(Sp(0));
+}
+
+cmpIntegerzh_fast
+{
+ /* R1 = s1; R2 = d1; R3 = s2; R4 = d2 */
+ W_ usize, vsize, size, up, vp;
+ CInt cmp;
+
+ // paraphrased from mpz_cmp() in the GMP sources
+ usize = R1;
+ vsize = R3;
+
+ if (usize != vsize) {
+ R1 = usize - vsize;
+ jump %ENTRY_CODE(Sp(0));
+ }
+
+ if (usize == 0) {
+ R1 = 0;
+ jump %ENTRY_CODE(Sp(0));
+ }
+
+ if (%lt(usize,0)) { // NB. not <, which is unsigned
+ size = -usize;
+ } else {
+ size = usize;
+ }
+
+ up = BYTE_ARR_CTS(R2);
+ vp = BYTE_ARR_CTS(R4);
+
+ cmp = foreign "C" mpn_cmp(up "ptr", vp "ptr", size) [];
+
+ if (cmp == 0 :: CInt) {
+ R1 = 0;
+ jump %ENTRY_CODE(Sp(0));
+ }
+
+ if (%lt(cmp,0 :: CInt) == %lt(usize,0)) {
+ R1 = 1;
+ } else {
+ R1 = (-1);
+ }
+ /* Result parked in R1, return via info-pointer at TOS */
+ jump %ENTRY_CODE(Sp(0));
+}
+
+integer2Intzh_fast
+{
+ /* R1 = s; R2 = d */
+ W_ r, s;
+
+ s = R1;
+ if (s == 0) {
+ r = 0;
+ } else {
+ r = W_[R2 + SIZEOF_StgArrWords];
+ if (%lt(s,0)) {
+ r = -r;
+ }
+ }
+ /* Result parked in R1, return via info-pointer at TOS */
+ R1 = r;
+ jump %ENTRY_CODE(Sp(0));
+}
+
+integer2Wordzh_fast
+{
+ /* R1 = s; R2 = d */
+ W_ r, s;
+
+ s = R1;
+ if (s == 0) {
+ r = 0;
+ } else {
+ r = W_[R2 + SIZEOF_StgArrWords];
+ if (%lt(s,0)) {
+ r = -r;
+ }
+ }
+ /* Result parked in R1, return via info-pointer at TOS */
+ R1 = r;
+ jump %ENTRY_CODE(Sp(0));
+}
+
+decodeFloatzh_fast
+{
+ W_ p;
+ F_ arg;
+ FETCH_MP_TEMP(mp_tmp1);
+ FETCH_MP_TEMP(mp_tmp_w);
+
+ /* arguments: F1 = Float# */
+ arg = F1;
+
+ ALLOC_PRIM( SIZEOF_StgArrWords + WDS(1), NO_PTRS, decodeFloatzh_fast );
+
+ /* Be prepared to tell Lennart-coded __decodeFloat
+ where mantissa._mp_d can be put (it does not care about the rest) */
+ p = Hp - SIZEOF_StgArrWords;
+ SET_HDR(p,stg_ARR_WORDS_info,W_[CCCS]);
+ StgArrWords_words(p) = 1;
+ MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);
+
+ /* Perform the operation */
+ foreign "C" __decodeFloat(mp_tmp1 "ptr",mp_tmp_w "ptr" ,arg) [];
+
+ /* returns: (Int# (expn), Int#, ByteArray#) */
+ RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
+}
+
+#define DOUBLE_MANTISSA_SIZE SIZEOF_DOUBLE
+#define ARR_SIZE (SIZEOF_StgArrWords + DOUBLE_MANTISSA_SIZE)
+
+decodeDoublezh_fast
+{
+ D_ arg;
+ W_ p;
+ FETCH_MP_TEMP(mp_tmp1);
+ FETCH_MP_TEMP(mp_tmp_w);
+
+ /* arguments: D1 = Double# */
+ arg = D1;
+
+ ALLOC_PRIM( ARR_SIZE, NO_PTRS, decodeDoublezh_fast );
+
+ /* Be prepared to tell Lennart-coded __decodeDouble
+ where mantissa.d can be put (it does not care about the rest) */
+ p = Hp - ARR_SIZE + WDS(1);
+ SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]);
+ StgArrWords_words(p) = BYTES_TO_WDS(DOUBLE_MANTISSA_SIZE);
+ MP_INT__mp_d(mp_tmp1) = BYTE_ARR_CTS(p);
+
+ /* Perform the operation */
+ foreign "C" __decodeDouble(mp_tmp1 "ptr", mp_tmp_w "ptr",arg) [];
+
+ /* returns: (Int# (expn), Int#, ByteArray#) */
+ RET_NNP(W_[mp_tmp_w], TO_W_(MP_INT__mp_size(mp_tmp1)), p);
+}
+
+/* -----------------------------------------------------------------------------
+ * Concurrency primitives
+ * -------------------------------------------------------------------------- */
+
+forkzh_fast
+{
+ /* args: R1 = closure to spark */
+
+ MAYBE_GC(R1_PTR, forkzh_fast);
+
+ W_ closure;
+ W_ threadid;
+ closure = R1;
+
+ "ptr" threadid = foreign "C" createIOThread( MyCapability() "ptr",
+ RtsFlags_GcFlags_initialStkSize(RtsFlags),
+ closure "ptr") [];
+ foreign "C" scheduleThread(MyCapability() "ptr", threadid "ptr") [];
+
+ // switch at the earliest opportunity
+ CInt[context_switch] = 1 :: CInt;
+
+ RET_P(threadid);
+}
+
+forkOnzh_fast
+{
+ /* args: R1 = cpu, R2 = closure to spark */
+
+ MAYBE_GC(R2_PTR, forkOnzh_fast);
+
+ W_ cpu;
+ W_ closure;
+ W_ threadid;
+ cpu = R1;
+ closure = R2;
+
+ "ptr" threadid = foreign "C" createIOThread( MyCapability() "ptr",
+ RtsFlags_GcFlags_initialStkSize(RtsFlags),
+ closure "ptr") [];
+ foreign "C" scheduleThreadOn(MyCapability() "ptr", cpu, threadid "ptr") [];
+
+ // switch at the earliest opportunity
+ CInt[context_switch] = 1 :: CInt;
+
+ RET_P(threadid);
+}
+
+yieldzh_fast
+{
+ jump stg_yield_noregs;
+}
+
+myThreadIdzh_fast
+{
+ /* no args. */
+ RET_P(CurrentTSO);
+}
+
+labelThreadzh_fast
+{
+ /* args:
+ R1 = ThreadId#
+ R2 = Addr# */
+#ifdef DEBUG
+ foreign "C" labelThread(R1 "ptr", R2 "ptr") [];
+#endif
+ jump %ENTRY_CODE(Sp(0));
+}
+
+isCurrentThreadBoundzh_fast
+{
+ /* no args */
+ W_ r;
+ r = foreign "C" isThreadBound(CurrentTSO) [];
+ RET_N(r);
+}
+
+
+/* -----------------------------------------------------------------------------
+ * TVar primitives
+ * -------------------------------------------------------------------------- */
+
+#ifdef REG_R1
+#define SP_OFF 0
+#define IF_NOT_REG_R1(x)
+#else
+#define SP_OFF 1
+#define IF_NOT_REG_R1(x) x
+#endif
+
+// Catch retry frame ------------------------------------------------------------
+
+#define CATCH_RETRY_FRAME_ERROR(label) \
+ label { foreign "C" barf("catch_retry_frame incorrectly entered!"); }
+
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_0_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_1_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_2_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_3_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_4_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_5_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_6_ret)
+CATCH_RETRY_FRAME_ERROR(stg_catch_retry_frame_7_ret)
+
+#if MAX_VECTORED_RTN > 8
+#error MAX_VECTORED_RTN has changed: please modify stg_catch_retry_frame too.
+#endif
+
+#if defined(PROFILING)
+#define CATCH_RETRY_FRAME_BITMAP 7
+#define CATCH_RETRY_FRAME_WORDS 6
+#else
+#define CATCH_RETRY_FRAME_BITMAP 1
+#define CATCH_RETRY_FRAME_WORDS 4
+#endif
+
+INFO_TABLE_RET(stg_catch_retry_frame,
+ CATCH_RETRY_FRAME_WORDS, CATCH_RETRY_FRAME_BITMAP,
+ CATCH_RETRY_FRAME,
+ stg_catch_retry_frame_0_ret,
+ stg_catch_retry_frame_1_ret,
+ stg_catch_retry_frame_2_ret,
+ stg_catch_retry_frame_3_ret,
+ stg_catch_retry_frame_4_ret,
+ stg_catch_retry_frame_5_ret,
+ stg_catch_retry_frame_6_ret,
+ stg_catch_retry_frame_7_ret)
+{
+ W_ r, frame, trec, outer;
+ IF_NOT_REG_R1(W_ rval; rval = Sp(0); Sp_adj(1); )
+
+ frame = Sp;
+ trec = StgTSO_trec(CurrentTSO);
+ "ptr" outer = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+ r = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
+ if (r) {
+ /* Succeeded (either first branch or second branch) */
+ StgTSO_trec(CurrentTSO) = outer;
+ Sp = Sp + SIZEOF_StgCatchRetryFrame;
+ IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)
+ jump %ENTRY_CODE(Sp(SP_OFF));
+ } else {
+ /* Did not commit: retry */
+ W_ new_trec;
+ "ptr" new_trec = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+ StgTSO_trec(CurrentTSO) = new_trec;
+ if (StgCatchRetryFrame_running_alt_code(frame)) {
+ R1 = StgCatchRetryFrame_alt_code(frame);
+ } else {
+ R1 = StgCatchRetryFrame_first_code(frame);
+ StgCatchRetryFrame_first_code_trec(frame) = new_trec;
+ }
+ jump stg_ap_v_fast;
+ }
+}
+
+
+// Atomically frame -------------------------------------------------------------
+
+
+#define ATOMICALLY_FRAME_ERROR(label) \
+ label { foreign "C" barf("atomically_frame incorrectly entered!"); }
+
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_0_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_1_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_2_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_3_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_4_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_5_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_6_ret)
+ATOMICALLY_FRAME_ERROR(stg_atomically_frame_7_ret)
+
+#if MAX_VECTORED_RTN > 8
+#error MAX_VECTORED_RTN has changed: please modify stg_atomically_frame too.
+#endif
+
+#if defined(PROFILING)
+#define ATOMICALLY_FRAME_BITMAP 3
+#define ATOMICALLY_FRAME_WORDS 3
+#else
+#define ATOMICALLY_FRAME_BITMAP 0
+#define ATOMICALLY_FRAME_WORDS 1
+#endif
+
+
+/* Stack frame pushed by atomicallyzh_fast.  Control returns here when
+ * the STM code under the transaction finishes: we try to commit the
+ * current TRec, and if validation fails we restart the transaction
+ * from scratch with a fresh TRec.
+ */
+INFO_TABLE_RET(stg_atomically_frame,
+ ATOMICALLY_FRAME_WORDS, ATOMICALLY_FRAME_BITMAP,
+ ATOMICALLY_FRAME,
+ stg_atomically_frame_0_ret,
+ stg_atomically_frame_1_ret,
+ stg_atomically_frame_2_ret,
+ stg_atomically_frame_3_ret,
+ stg_atomically_frame_4_ret,
+ stg_atomically_frame_5_ret,
+ stg_atomically_frame_6_ret,
+ stg_atomically_frame_7_ret)
+{
+ W_ frame, trec, valid;
+ /* Unregisterised build: the return value lives on the stack, not R1 */
+ IF_NOT_REG_R1(W_ rval; rval = Sp(0); Sp_adj(1); )
+
+ frame = Sp;
+ trec = StgTSO_trec(CurrentTSO);
+
+ /* The TSO is not currently waiting: try to commit the transaction */
+ valid = foreign "C" stmCommitTransaction(MyCapability() "ptr", trec "ptr") [];
+ if (valid) {
+ /* Transaction was valid: commit succeeded */
+ StgTSO_trec(CurrentTSO) = NO_TREC;
+ Sp = Sp + SIZEOF_StgAtomicallyFrame;
+ IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;)
+ jump %ENTRY_CODE(Sp(SP_OFF));
+ } else {
+ /* Transaction was not valid: try again */
+ "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
+ StgTSO_trec(CurrentTSO) = trec;
+ R1 = StgAtomicallyFrame_code(frame);
+ jump stg_ap_v_fast;
+ }
+}
+
+/* Variant of the atomically frame used while the TSO is blocked in
+ * retry#: stg_atomically_frame_info is swapped for this info pointer
+ * before the thread sleeps (see retryzh_fast).  On wake-up we ask the
+ * STM (stmReWait) whether the previous attempt is still valid; if so
+ * we simply block again, otherwise we restart the transaction.
+ */
+INFO_TABLE_RET(stg_atomically_waiting_frame,
+ ATOMICALLY_FRAME_WORDS, ATOMICALLY_FRAME_BITMAP,
+ ATOMICALLY_FRAME,
+ stg_atomically_frame_0_ret,
+ stg_atomically_frame_1_ret,
+ stg_atomically_frame_2_ret,
+ stg_atomically_frame_3_ret,
+ stg_atomically_frame_4_ret,
+ stg_atomically_frame_5_ret,
+ stg_atomically_frame_6_ret,
+ stg_atomically_frame_7_ret)
+{
+ W_ frame, trec, valid;
+ IF_NOT_REG_R1(W_ rval; rval = Sp(0); Sp_adj(1); )
+
+ frame = Sp;
+
+ /* The TSO is currently waiting: should we stop waiting? */
+ valid = foreign "C" stmReWait(MyCapability() "ptr", CurrentTSO "ptr") [];
+ if (valid) {
+ /* Previous attempt is still valid: no point trying again yet */
+ IF_NOT_REG_R1(Sp_adj(-2);
+ Sp(1) = stg_NO_FINALIZER_closure;
+ Sp(0) = stg_ut_1_0_unreg_info;)
+ jump stg_block_noregs;
+ } else {
+ /* Previous attempt is no longer valid: try again */
+ "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", NO_TREC "ptr") [];
+ StgTSO_trec(CurrentTSO) = trec;
+ /* Revert the frame to the normal (non-waiting) atomically frame */
+ StgHeader_info(frame) = stg_atomically_frame_info;
+ R1 = StgAtomicallyFrame_code(frame);
+ jump stg_ap_v_fast;
+ }
+}
+
+// STM catch frame --------------------------------------------------------------
+
+#define CATCH_STM_FRAME_ENTRY_TEMPLATE(label,ret) \
+ label \
+ { \
+ IF_NOT_REG_R1(W_ rval; rval = Sp(0); Sp_adj(1); ) \
+ Sp = Sp + SIZEOF_StgCatchSTMFrame; \
+ IF_NOT_REG_R1(Sp_adj(-1); Sp(0) = rval;) \
+ jump ret; \
+ }
+
+#ifdef REG_R1
+#define SP_OFF 0
+#else
+#define SP_OFF 1
+#endif
+
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_0_ret,%RET_VEC(Sp(SP_OFF),0))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_1_ret,%RET_VEC(Sp(SP_OFF),1))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_2_ret,%RET_VEC(Sp(SP_OFF),2))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_3_ret,%RET_VEC(Sp(SP_OFF),3))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_4_ret,%RET_VEC(Sp(SP_OFF),4))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_5_ret,%RET_VEC(Sp(SP_OFF),5))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_6_ret,%RET_VEC(Sp(SP_OFF),6))
+CATCH_STM_FRAME_ENTRY_TEMPLATE(stg_catch_stm_frame_7_ret,%RET_VEC(Sp(SP_OFF),7))
+
+#if MAX_VECTORED_RTN > 8
+#error MAX_VECTORED_RTN has changed: please modify stg_catch_stm_frame too.
+#endif
+
+#if defined(PROFILING)
+#define CATCH_STM_FRAME_BITMAP 3
+#define CATCH_STM_FRAME_WORDS 3
+#else
+#define CATCH_STM_FRAME_BITMAP 0
+#define CATCH_STM_FRAME_WORDS 1
+#endif
+
+/* Catch frames are very similar to update frames, but when entering
+ * one we just pop the frame off the stack and perform the correct
+ * kind of return to the activation record underneath us on the stack.
+ */
+
+INFO_TABLE_RET(stg_catch_stm_frame,
+ CATCH_STM_FRAME_WORDS, CATCH_STM_FRAME_BITMAP,
+ CATCH_STM_FRAME,
+ stg_catch_stm_frame_0_ret,
+ stg_catch_stm_frame_1_ret,
+ stg_catch_stm_frame_2_ret,
+ stg_catch_stm_frame_3_ret,
+ stg_catch_stm_frame_4_ret,
+ stg_catch_stm_frame_5_ret,
+ stg_catch_stm_frame_6_ret,
+ stg_catch_stm_frame_7_ret)
+/* Direct (non-vectored) entry: return via the frame underneath */
+CATCH_STM_FRAME_ENTRY_TEMPLATE(,%ENTRY_CODE(Sp(SP_OFF)))
+
+
+// Primop definition ------------------------------------------------------------
+
+/* atomically# primop: run R1 (an STM computation) inside a new memory
+ * transaction.  Pushes an stg_atomically_frame so that the commit (or
+ * restart) logic runs when the computation returns.
+ */
+atomicallyzh_fast
+{
+ W_ frame;
+ W_ old_trec;
+ W_ new_trec;
+
+ // stmStartTransaction may allocate
+ MAYBE_GC (R1_PTR, atomicallyzh_fast);
+
+ /* Args: R1 = m :: STM a */
+ STK_CHK_GEN(SIZEOF_StgAtomicallyFrame + WDS(1), R1_PTR, atomicallyzh_fast);
+
+ old_trec = StgTSO_trec(CurrentTSO);
+
+ /* Nested transactions are not allowed; raise an exception */
+ if (old_trec != NO_TREC) {
+ R1 = GHCziIOBase_NestedAtomically_closure;
+ jump raisezh_fast;
+ }
+
+ /* Set up the atomically frame */
+ Sp = Sp - SIZEOF_StgAtomicallyFrame;
+ frame = Sp;
+
+ SET_HDR(frame,stg_atomically_frame_info, W_[CCCS]);
+ StgAtomicallyFrame_code(frame) = R1;
+
+ /* Start the memory transaction */
+ "ptr" new_trec = foreign "C" stmStartTransaction(MyCapability() "ptr", old_trec "ptr") [R1];
+ StgTSO_trec(CurrentTSO) = new_trec;
+
+ /* Apply R1 to the realworld token */
+ jump stg_ap_v_fast;
+}
+
+
+/* catchSTM# primop: run R1 with exception handler R2 installed, by
+ * pushing a catch-STM frame that records the handler.
+ */
+catchSTMzh_fast
+{
+ W_ frame;
+
+ /* Args: R1 :: STM a */
+ /* Args: R2 :: Exception -> STM a */
+ STK_CHK_GEN(SIZEOF_StgCatchSTMFrame + WDS(1), R1_PTR & R2_PTR, catchSTMzh_fast);
+
+ /* Set up the catch frame */
+ Sp = Sp - SIZEOF_StgCatchSTMFrame;
+ frame = Sp;
+
+ SET_HDR(frame, stg_catch_stm_frame_info, W_[CCCS]);
+ StgCatchSTMFrame_handler(frame) = R2;
+
+ /* Apply R1 to the realworld token */
+ jump stg_ap_v_fast;
+}
+
+
+/* catchRetry# primop: run R1, and if it retries run R2 instead.  Starts
+ * a nested transaction for the first branch and pushes a catch-retry
+ * frame recording both branches; the retry/commit logic lives in
+ * retryzh_fast and the catch-retry frame's return code.
+ */
+catchRetryzh_fast
+{
+ W_ frame;
+ W_ new_trec;
+ W_ trec;
+
+ // stmStartTransaction may allocate
+ MAYBE_GC (R1_PTR & R2_PTR, catchRetryzh_fast);
+
+ /* Args: R1 :: STM a */
+ /* Args: R2 :: STM a */
+ STK_CHK_GEN(SIZEOF_StgCatchRetryFrame + WDS(1), R1_PTR & R2_PTR, catchRetryzh_fast);
+
+ /* Start a nested transaction within which to run the first code */
+ trec = StgTSO_trec(CurrentTSO);
+ "ptr" new_trec = foreign "C" stmStartTransaction(MyCapability() "ptr", trec "ptr") [R1,R2];
+ StgTSO_trec(CurrentTSO) = new_trec;
+
+ /* Set up the catch-retry frame */
+ Sp = Sp - SIZEOF_StgCatchRetryFrame;
+ frame = Sp;
+
+ SET_HDR(frame, stg_catch_retry_frame_info, W_[CCCS]);
+ StgCatchRetryFrame_running_alt_code(frame) = 0 :: CInt; // false;
+ StgCatchRetryFrame_first_code(frame) = R1;
+ StgCatchRetryFrame_alt_code(frame) = R2;
+ StgCatchRetryFrame_first_code_trec(frame) = new_trec;
+
+ /* Apply R1 to the realworld token */
+ jump stg_ap_v_fast;
+}
+
+
+/* retry# primop: abandon the current attempt.  Walks the stack (via
+ * findRetryFrameHelper) to the nearest CATCH_RETRY_FRAME — where we
+ * switch to, or propagate past, the alternative branch — or to the
+ * enclosing ATOMICALLY_FRAME, where we block the thread on the TVars
+ * it read (stmWait) until one of them changes.
+ */
+retryzh_fast
+{
+ W_ frame_type;
+ W_ frame;
+ W_ trec;
+ W_ outer;
+ W_ r;
+
+ MAYBE_GC (NO_PTRS, retryzh_fast); // STM operations may allocate
+
+ // Find the enclosing ATOMICALLY_FRAME or CATCH_RETRY_FRAME
+retry_pop_stack:
+ trec = StgTSO_trec(CurrentTSO);
+ "ptr" outer = foreign "C" stmGetEnclosingTRec(trec "ptr") [];
+ // findRetryFrameHelper reads/writes the TSO's saved stack pointer
+ StgTSO_sp(CurrentTSO) = Sp;
+ frame_type = foreign "C" findRetryFrameHelper(CurrentTSO "ptr") [];
+ Sp = StgTSO_sp(CurrentTSO);
+ frame = Sp;
+
+ if (frame_type == CATCH_RETRY_FRAME) {
+ // The retry reaches a CATCH_RETRY_FRAME before the atomic frame
+ ASSERT(outer != NO_TREC);
+ if (!StgCatchRetryFrame_running_alt_code(frame)) {
+ // Retry in the first code: try the alternative
+ "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+ StgTSO_trec(CurrentTSO) = trec;
+ StgCatchRetryFrame_running_alt_code(frame) = 1 :: CInt; // true;
+ R1 = StgCatchRetryFrame_alt_code(frame);
+ jump stg_ap_v_fast;
+ } else {
+ // Retry in the alternative code: propagate
+ W_ other_trec;
+ other_trec = StgCatchRetryFrame_first_code_trec(frame);
+ r = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", other_trec "ptr") [];
+ if (r) {
+ r = foreign "C" stmCommitNestedTransaction(MyCapability() "ptr", trec "ptr") [];
+ } else {
+ foreign "C" stmAbortTransaction(MyCapability() "ptr", trec "ptr") [];
+ }
+ if (r) {
+ // Merge between siblings succeeded: commit it back to enclosing transaction
+ // and then propagate the retry
+ StgTSO_trec(CurrentTSO) = outer;
+ Sp = Sp + SIZEOF_StgCatchRetryFrame;
+ goto retry_pop_stack;
+ } else {
+ // Merge failed: we mustn't propagate the retry. Try both paths again.
+ "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+ StgCatchRetryFrame_first_code_trec(frame) = trec;
+ StgCatchRetryFrame_running_alt_code(frame) = 0 :: CInt; // false;
+ StgTSO_trec(CurrentTSO) = trec;
+ R1 = StgCatchRetryFrame_first_code(frame);
+ jump stg_ap_v_fast;
+ }
+ }
+ }
+
+ // We've reached the ATOMICALLY_FRAME: attempt to wait
+ ASSERT(frame_type == ATOMICALLY_FRAME);
+ ASSERT(outer == NO_TREC);
+ r = foreign "C" stmWait(MyCapability() "ptr", CurrentTSO "ptr", trec "ptr") [];
+ if (r) {
+ // Transaction was valid: stmWait put us on the TVars' queues, we now block
+ StgHeader_info(frame) = stg_atomically_waiting_frame_info;
+ Sp = frame;
+ // Fix up the stack in the unregisterised case: the return convention is different.
+ IF_NOT_REG_R1(Sp_adj(-2);
+ Sp(1) = stg_NO_FINALIZER_closure;
+ Sp(0) = stg_ut_1_0_unreg_info;)
+ R3 = trec; // passing to stmWaitUnblock()
+ jump stg_block_stmwait;
+ } else {
+ // Transaction was not valid: retry immediately
+ "ptr" trec = foreign "C" stmStartTransaction(MyCapability() "ptr", outer "ptr") [];
+ StgTSO_trec(CurrentTSO) = trec;
+ R1 = StgAtomicallyFrame_code(frame);
+ Sp = frame;
+ jump stg_ap_v_fast;
+ }
+}
+
+
+/* newTVar# primop: allocate a fresh TVar holding the value in R1. */
+newTVarzh_fast
+{
+ W_ tv;
+ W_ new_value;
+
+ /* Args: R1 = initialisation value */
+
+ MAYBE_GC (R1_PTR, newTVarzh_fast);
+ new_value = R1;
+ "ptr" tv = foreign "C" stmNewTVar(MyCapability() "ptr", new_value "ptr") [];
+ RET_P(tv);
+}
+
+
+/* readTVar# primop: read a TVar within the current transaction
+ * (recorded in the TSO's TRec via stmReadTVar).
+ */
+readTVarzh_fast
+{
+ W_ trec;
+ W_ tvar;
+ W_ result;
+
+ /* Args: R1 = TVar closure */
+
+ MAYBE_GC (R1_PTR, readTVarzh_fast); // Call to stmReadTVar may allocate
+ trec = StgTSO_trec(CurrentTSO);
+ tvar = R1;
+ "ptr" result = foreign "C" stmReadTVar(MyCapability() "ptr", trec "ptr", tvar "ptr") [];
+
+ RET_P(result);
+}
+
+
+/* writeTVar# primop: record a write to a TVar in the current
+ * transaction's TRec (the TVar itself is only updated at commit).
+ */
+writeTVarzh_fast
+{
+ W_ trec;
+ W_ tvar;
+ W_ new_value;
+
+ /* Args: R1 = TVar closure */
+ /* R2 = New value */
+
+ MAYBE_GC (R1_PTR & R2_PTR, writeTVarzh_fast); // Call to stmWriteTVar may allocate
+ trec = StgTSO_trec(CurrentTSO);
+ tvar = R1;
+ new_value = R2;
+ foreign "C" stmWriteTVar(MyCapability() "ptr", trec "ptr", tvar "ptr", new_value "ptr") [];
+
+ /* No result: return unit to the frame underneath */
+ jump %ENTRY_CODE(Sp(0));
+}
+
+
+/* -----------------------------------------------------------------------------
+ * MVar primitives
+ *
+ * take & putMVar work as follows. Firstly, an important invariant:
+ *
+ * If the MVar is full, then the blocking queue contains only
+ * threads blocked on putMVar, and if the MVar is empty then the
+ * blocking queue contains only threads blocked on takeMVar.
+ *
+ * takeMVar:
+ * MVar empty : then add ourselves to the blocking queue
+ * MVar full : remove the value from the MVar, and
+ * blocking queue empty : return
+ * blocking queue non-empty : perform the first blocked putMVar
+ * from the queue, and wake up the
+ * thread (MVar is now full again)
+ *
+ * putMVar is just the dual of the above algorithm.
+ *
+ * How do we "perform a putMVar"? Well, we have to fiddle around with
+ * the stack of the thread waiting to do the putMVar. See
+ * stg_block_putmvar and stg_block_takemvar in HeapStackCheck.c for
+ * the stack layout, and the PerformPut and PerformTake macros below.
+ *
+ * It is important that a blocked take or put is woken up with the
+ * take/put already performed, because otherwise there would be a
+ * small window of vulnerability where the thread could receive an
+ * exception and never perform its take or put, and we'd end up with a
+ * deadlock.
+ *
+ * -------------------------------------------------------------------------- */
+
+/* isEmptyMVar# primop: test the MVar's info pointer; returns 1 for
+ * empty, 0 for full.  Note: no locking, so the answer may be stale by
+ * the time the caller sees it.
+ */
+isEmptyMVarzh_fast
+{
+ /* args: R1 = MVar closure */
+
+ if (GET_INFO(R1) == stg_EMPTY_MVAR_info) {
+ RET_N(1);
+ } else {
+ RET_N(0);
+ }
+}
+
+/* newMVar# primop: allocate an empty MVar with empty blocking queue. */
+newMVarzh_fast
+{
+ /* args: none */
+ W_ mvar;
+
+ ALLOC_PRIM ( SIZEOF_StgMVar, NO_PTRS, newMVarzh_fast );
+
+ mvar = Hp - SIZEOF_StgMVar + WDS(1);
+ SET_HDR(mvar,stg_EMPTY_MVAR_info,W_[CCCS]);
+ /* head/tail of the blocked-TSO queue, and the (absent) value */
+ StgMVar_head(mvar) = stg_END_TSO_QUEUE_closure;
+ StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
+ StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
+ RET_P(mvar);
+}
+
+
+/* Complete a blocked takeMVar on behalf of another thread: write the
+ * taken value and a return frame onto that thread's stack.
+ * If R1 isn't available, pass it on the stack */
+#ifdef REG_R1
+#define PerformTake(tso, value) \
+ W_[StgTSO_sp(tso) + WDS(1)] = value; \
+ W_[StgTSO_sp(tso) + WDS(0)] = stg_gc_unpt_r1_info;
+#else
+#define PerformTake(tso, value) \
+ W_[StgTSO_sp(tso) + WDS(1)] = value; \
+ W_[StgTSO_sp(tso) + WDS(0)] = stg_ut_1_0_unreg_info;
+#endif
+
+/* Complete a blocked putMVar: pop the 3-word putMVar continuation off
+ * the blocked thread's stack and fetch the value it was trying to put. */
+#define PerformPut(tso,lval) \
+ StgTSO_sp(tso) = StgTSO_sp(tso) + WDS(3); \
+ lval = W_[StgTSO_sp(tso) - WDS(1)];
+
+/* takeMVar# primop.  See the invariant in the comment block above:
+ * if the MVar is empty we block on its queue; if full we take the
+ * value and, if putMVar-ers are queued, perform the first queued put
+ * and wake that thread.  Under THREADED_RTS the MVar is locked for
+ * the duration (lockClosure/unlockClosure).
+ */
+takeMVarzh_fast
+{
+ W_ mvar, val, info, tso;
+
+ /* args: R1 = MVar closure */
+ mvar = R1;
+
+#if defined(THREADED_RTS)
+ "ptr" info = foreign "C" lockClosure(mvar "ptr") [];
+#else
+ info = GET_INFO(mvar);
+#endif
+
+ /* If the MVar is empty, put ourselves on its blocking queue,
+ * and wait until we're woken up.
+ */
+ if (info == stg_EMPTY_MVAR_info) {
+ if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+ StgMVar_head(mvar) = CurrentTSO;
+ } else {
+ StgTSO_link(StgMVar_tail(mvar)) = CurrentTSO;
+ }
+ StgTSO_link(CurrentTSO) = stg_END_TSO_QUEUE_closure;
+ StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
+ StgTSO_block_info(CurrentTSO) = mvar;
+ StgMVar_tail(mvar) = CurrentTSO;
+
+ /* stg_block_takemvar also releases the lock in the threaded RTS */
+ jump stg_block_takemvar;
+ }
+
+ /* we got the value... */
+ val = StgMVar_value(mvar);
+
+ if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure)
+ {
+ /* There are putMVar(s) waiting...
+ * wake up the first thread on the queue
+ */
+ ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
+
+ /* actually perform the putMVar for the thread that we just woke up */
+ tso = StgMVar_head(mvar);
+ PerformPut(tso,StgMVar_value(mvar));
+ foreign "C" dirtyTSO(tso "ptr") [];
+
+#if defined(GRAN) || defined(PAR)
+ /* ToDo: check 2nd arg (mvar) is right */
+ "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar),mvar) [];
+ StgMVar_head(mvar) = tso;
+#else
+ "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr",
+ StgMVar_head(mvar) "ptr") [];
+ StgMVar_head(mvar) = tso;
+#endif
+
+ if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+ StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
+ }
+
+#if defined(THREADED_RTS)
+ /* MVar stays full: the queued put has refilled it */
+ foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+#endif
+ RET_P(val);
+ }
+ else
+ {
+ /* No further putMVars, MVar is now empty */
+ StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
+
+#if defined(THREADED_RTS)
+ foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+#else
+ SET_INFO(mvar,stg_EMPTY_MVAR_info);
+#endif
+
+ RET_P(val);
+ }
+}
+
+
+/* tryTakeMVar# primop: non-blocking takeMVar.  Returns (0, dummy)
+ * when the MVar is empty, otherwise (1, value), performing any queued
+ * putMVar exactly as takeMVarzh_fast does.
+ */
+tryTakeMVarzh_fast
+{
+ W_ mvar, val, info, tso;
+
+ /* args: R1 = MVar closure */
+
+ mvar = R1;
+
+#if defined(THREADED_RTS)
+ "ptr" info = foreign "C" lockClosure(mvar "ptr") [];
+#else
+ info = GET_INFO(mvar);
+#endif
+
+ if (info == stg_EMPTY_MVAR_info) {
+#if defined(THREADED_RTS)
+ foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+#endif
+ /* HACK: we need a pointer to pass back,
+ * so we abuse NO_FINALIZER_closure
+ */
+ RET_NP(0, stg_NO_FINALIZER_closure);
+ }
+
+ /* we got the value... */
+ val = StgMVar_value(mvar);
+
+ if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
+
+ /* There are putMVar(s) waiting...
+ * wake up the first thread on the queue
+ */
+ ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
+
+ /* actually perform the putMVar for the thread that we just woke up */
+ tso = StgMVar_head(mvar);
+ PerformPut(tso,StgMVar_value(mvar));
+ foreign "C" dirtyTSO(tso "ptr") [];
+
+#if defined(GRAN) || defined(PAR)
+ /* ToDo: check 2nd arg (mvar) is right */
+ "ptr" tso = foreign "C" unblockOne(StgMVar_head(mvar) "ptr", mvar "ptr") [];
+ StgMVar_head(mvar) = tso;
+#else
+ "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr",
+ StgMVar_head(mvar) "ptr") [];
+ StgMVar_head(mvar) = tso;
+#endif
+
+ if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+ StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
+ }
+#if defined(THREADED_RTS)
+ foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+#endif
+ }
+ else
+ {
+ /* No further putMVars, MVar is now empty */
+ StgMVar_value(mvar) = stg_END_TSO_QUEUE_closure;
+#if defined(THREADED_RTS)
+ foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+#else
+ SET_INFO(mvar,stg_EMPTY_MVAR_info);
+#endif
+ }
+
+ RET_NP(1, val);
+}
+
+
+/* putMVar# primop: the dual of takeMVar#.  If the MVar is full we
+ * block on its queue; otherwise, if takeMVar-ers are queued, hand the
+ * value (R2) directly to the first one and wake it, else store the
+ * value and mark the MVar full.
+ */
+putMVarzh_fast
+{
+ W_ mvar, info, tso;
+
+ /* args: R1 = MVar, R2 = value */
+ mvar = R1;
+
+#if defined(THREADED_RTS)
+ "ptr" info = foreign "C" lockClosure(mvar "ptr") [R2];
+#else
+ info = GET_INFO(mvar);
+#endif
+
+ if (info == stg_FULL_MVAR_info) {
+ if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+ StgMVar_head(mvar) = CurrentTSO;
+ } else {
+ StgTSO_link(StgMVar_tail(mvar)) = CurrentTSO;
+ }
+ StgTSO_link(CurrentTSO) = stg_END_TSO_QUEUE_closure;
+ StgTSO_why_blocked(CurrentTSO) = BlockedOnMVar::I16;
+ StgTSO_block_info(CurrentTSO) = mvar;
+ StgMVar_tail(mvar) = CurrentTSO;
+
+ /* stg_block_putmvar also releases the lock in the threaded RTS */
+ jump stg_block_putmvar;
+ }
+
+ if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
+
+ /* There are takeMVar(s) waiting: wake up the first one
+ */
+ ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
+
+ /* actually perform the takeMVar */
+ tso = StgMVar_head(mvar);
+ PerformTake(tso, R2);
+ foreign "C" dirtyTSO(tso "ptr") [];
+
+#if defined(GRAN) || defined(PAR)
+ /* ToDo: check 2nd arg (mvar) is right */
+ "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr",mvar "ptr") [];
+ StgMVar_head(mvar) = tso;
+#else
+ "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr") [];
+ StgMVar_head(mvar) = tso;
+#endif
+
+ if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+ StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
+ }
+
+#if defined(THREADED_RTS)
+ /* MVar stays empty: the value went straight to the woken taker */
+ foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+#endif
+ jump %ENTRY_CODE(Sp(0));
+ }
+ else
+ {
+ /* No further takes, the MVar is now full. */
+ StgMVar_value(mvar) = R2;
+
+#if defined(THREADED_RTS)
+ foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+#else
+ SET_INFO(mvar,stg_FULL_MVAR_info);
+#endif
+ jump %ENTRY_CODE(Sp(0));
+ }
+
+ /* ToDo: yield afterward for better communication performance? */
+}
+
+
+/* tryPutMVar# primop: non-blocking putMVar.  Returns 0 when the MVar
+ * is already full, otherwise performs the put (as putMVarzh_fast) and
+ * returns 1.
+ */
+tryPutMVarzh_fast
+{
+ W_ mvar, info, tso;
+
+ /* args: R1 = MVar, R2 = value */
+ mvar = R1;
+
+#if defined(THREADED_RTS)
+ "ptr" info = foreign "C" lockClosure(mvar "ptr") [R2];
+#else
+ info = GET_INFO(mvar);
+#endif
+
+ if (info == stg_FULL_MVAR_info) {
+#if defined(THREADED_RTS)
+ foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+#endif
+ RET_N(0);
+ }
+
+ if (StgMVar_head(mvar) != stg_END_TSO_QUEUE_closure) {
+
+ /* There are takeMVar(s) waiting: wake up the first one
+ */
+ ASSERT(StgTSO_why_blocked(StgMVar_head(mvar)) == BlockedOnMVar::I16);
+
+ /* actually perform the takeMVar */
+ tso = StgMVar_head(mvar);
+ PerformTake(tso, R2);
+ foreign "C" dirtyTSO(tso "ptr") [];
+
+#if defined(GRAN) || defined(PAR)
+ /* ToDo: check 2nd arg (mvar) is right */
+ "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr",mvar "ptr") [];
+ StgMVar_head(mvar) = tso;
+#else
+ "ptr" tso = foreign "C" unblockOne(MyCapability() "ptr", StgMVar_head(mvar) "ptr") [];
+ StgMVar_head(mvar) = tso;
+#endif
+
+ if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) {
+ StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure;
+ }
+
+#if defined(THREADED_RTS)
+ foreign "C" unlockClosure(mvar "ptr", stg_EMPTY_MVAR_info) [];
+#endif
+ }
+ else
+ {
+ /* No further takes, the MVar is now full. */
+ StgMVar_value(mvar) = R2;
+
+#if defined(THREADED_RTS)
+ foreign "C" unlockClosure(mvar "ptr", stg_FULL_MVAR_info) [];
+#else
+ SET_INFO(mvar,stg_FULL_MVAR_info);
+#endif
+ }
+
+ RET_N(1);
+ /* ToDo: yield afterward for better communication performance? */
+}
+
+
+/* -----------------------------------------------------------------------------
+ Stable pointer primitives
+ ------------------------------------------------------------------------- */
+
+/* makeStableName# primop: look up (or create) the stable-name table
+ * entry for the closure in R1 and return its StableName heap object,
+ * allocating one only if the entry has no sn_obj yet.
+ */
+makeStableNamezh_fast
+{
+ W_ index, sn_obj;
+
+ /* Allocate up front so lookupStableName's result can't be invalidated
+ * by a GC between the lookup and the table update below. */
+ ALLOC_PRIM( SIZEOF_StgStableName, R1_PTR, makeStableNamezh_fast );
+
+ index = foreign "C" lookupStableName(R1 "ptr") [];
+
+ /* Is there already a StableName for this heap object?
+ * stable_ptr_table is a pointer to an array of snEntry structs.
+ */
+ if ( snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) == NULL ) {
+ sn_obj = Hp - SIZEOF_StgStableName + WDS(1);
+ SET_HDR(sn_obj, stg_STABLE_NAME_info, W_[CCCS]);
+ StgStableName_sn(sn_obj) = index;
+ snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) = sn_obj;
+ } else {
+ sn_obj = snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry);
+ }
+
+ RET_P(sn_obj);
+}
+
+
+/* makeStablePtr# primop: register R1 in the stable-pointer table and
+ * return the StablePtr (a non-pointer word, hence RET_N).
+ */
+makeStablePtrzh_fast
+{
+ /* Args: R1 = a */
+ W_ sp;
+ MAYBE_GC(R1_PTR, makeStablePtrzh_fast);
+ "ptr" sp = foreign "C" getStablePtr(R1 "ptr") [];
+ RET_N(sp);
+}
+
+/* deRefStablePtr# primop: index straight into stable_ptr_table and
+ * return the referenced closure.  No validity check is performed.
+ */
+deRefStablePtrzh_fast
+{
+ /* Args: R1 = the stable ptr */
+ W_ r, sp;
+ sp = R1;
+ r = snEntry_addr(W_[stable_ptr_table] + sp*SIZEOF_snEntry);
+ RET_P(r);
+}
+
+/* -----------------------------------------------------------------------------
+ Bytecode object primitives
+ ------------------------------------------------------------------------- */
+
+/* newBCO# primop: allocate a bytecode object.  The BCO is sized to
+ * hold a copy of the bitmap array (R6) inline after the fixed fields.
+ */
+newBCOzh_fast
+{
+ /* R1 = instrs
+ R2 = literals
+ R3 = ptrs
+ R4 = itbls
+ R5 = arity
+ R6 = bitmap array
+ */
+ W_ bco, bitmap_arr, bytes, words;
+
+ bitmap_arr = R6;
+ words = BYTES_TO_WDS(SIZEOF_StgBCO) + StgArrWords_words(bitmap_arr);
+ bytes = WDS(words);
+
+ ALLOC_PRIM( bytes, R1_PTR&R2_PTR&R3_PTR&R4_PTR&R6_PTR, newBCOzh_fast );
+
+ bco = Hp - bytes + WDS(1);
+ SET_HDR(bco, stg_BCO_info, W_[CCCS]);
+
+ StgBCO_instrs(bco) = R1;
+ StgBCO_literals(bco) = R2;
+ StgBCO_ptrs(bco) = R3;
+ StgBCO_itbls(bco) = R4;
+ StgBCO_arity(bco) = HALF_W_(R5);
+ StgBCO_size(bco) = HALF_W_(words);
+
+ // Copy the arity/bitmap info into the BCO
+ // (Cmm has no for-loop; "for:" is a plain label driven by goto)
+ W_ i;
+ i = 0;
+for:
+ if (i < StgArrWords_words(bitmap_arr)) {
+ StgBCO_bitmap(bco,i) = StgArrWords_payload(bitmap_arr,i);
+ i = i + 1;
+ goto for;
+ }
+
+ RET_P(bco);
+}
+
+
+/* mkApUpd0# primop: wrap a zero-arity BCO in an updatable AP thunk. */
+mkApUpd0zh_fast
+{
+ // R1 = the BCO# for the AP
+ //
+ W_ ap;
+
+ // This function is *only* used to wrap zero-arity BCOs in an
+ // updatable wrapper (see ByteCodeLink.lhs). An AP thunk is always
+ // saturated and always points directly to a FUN or BCO.
+ ASSERT(%INFO_TYPE(%GET_STD_INFO(R1)) == HALF_W_(BCO) &&
+ StgBCO_arity(R1) == HALF_W_(0));
+
+ HP_CHK_GEN_TICKY(SIZEOF_StgAP, R1_PTR, mkApUpd0zh_fast);
+ TICK_ALLOC_UP_THK(0, 0);
+ CCCS_ALLOC(SIZEOF_StgAP);
+
+ ap = Hp - SIZEOF_StgAP + WDS(1);
+ SET_HDR(ap, stg_AP_info, W_[CCCS]);
+
+ StgAP_n_args(ap) = HALF_W_(0);
+ StgAP_fun(ap) = R1;
+
+ RET_P(ap);
+}
+
+/* -----------------------------------------------------------------------------
+ Thread I/O blocking primitives
+ -------------------------------------------------------------------------- */
+
+/* Add a thread to the end of the blocked queue. (C-- version of the C
+ * macro in Schedule.h).
+ */
+#define APPEND_TO_BLOCKED_QUEUE(tso) \
+ ASSERT(StgTSO_link(tso) == END_TSO_QUEUE); \
+ if (W_[blocked_queue_hd] == END_TSO_QUEUE) { \
+ W_[blocked_queue_hd] = tso; \
+ } else { \
+ StgTSO_link(W_[blocked_queue_tl]) = tso; \
+ } \
+ W_[blocked_queue_tl] = tso;
+
+/* waitRead# primop: block the current thread until the file descriptor
+ * in R1 is readable.  Non-threaded RTS only (the threaded RTS uses
+ * blocking foreign calls instead, hence the barf).
+ */
+waitReadzh_fast
+{
+ /* args: R1 */
+#ifdef THREADED_RTS
+ foreign "C" barf("waitRead# on threaded RTS");
+#else
+
+ ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
+ StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;
+ StgTSO_block_info(CurrentTSO) = R1;
+ // No locking - we're not going to use this interface in the
+ // threaded RTS anyway.
+ APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+ jump stg_block_noregs;
+#endif
+}
+
+/* waitWrite# primop: block the current thread until the file
+ * descriptor in R1 is writable.  Non-threaded RTS only.
+ */
+waitWritezh_fast
+{
+ /* args: R1 */
+#ifdef THREADED_RTS
+ foreign "C" barf("waitWrite# on threaded RTS");
+#else
+
+ ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
+ StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;
+ StgTSO_block_info(CurrentTSO) = R1;
+ // No locking - we're not going to use this interface in the
+ // threaded RTS anyway.
+ APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+ jump stg_block_noregs;
+#endif
+}
+
+
+/* delay# primop: block the current thread for R1 microseconds.
+ * Non-threaded RTS only.  On mingw32 this goes through the async I/O
+ * manager (addDelayRequest); elsewhere the thread is inserted into the
+ * sleeping_queue, kept sorted by wake-up time.
+ */
+STRING(stg_delayzh_malloc_str, "delayzh_fast")
+delayzh_fast
+{
+#ifdef mingw32_HOST_OS
+ W_ ares;
+ CInt reqID;
+#else
+ W_ t, prev, target;
+#endif
+
+#ifdef THREADED_RTS
+ foreign "C" barf("delay# on threaded RTS");
+#else
+
+ /* args: R1 (microsecond delay amount) */
+ ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
+ StgTSO_why_blocked(CurrentTSO) = BlockedOnDelay::I16;
+
+#ifdef mingw32_HOST_OS
+
+ /* could probably allocate this on the heap instead */
+ "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
+ stg_delayzh_malloc_str);
+ reqID = foreign "C" addDelayRequest(R1);
+ StgAsyncIOResult_reqID(ares) = reqID;
+ StgAsyncIOResult_len(ares) = 0;
+ StgAsyncIOResult_errCode(ares) = 0;
+ StgTSO_block_info(CurrentTSO) = ares;
+
+ /* Having all async-blocked threads reside on the blocked_queue
+ * simplifies matters, so change the status to OnDoProc and put the
+ * delayed thread on the blocked_queue.
+ */
+ StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;
+ APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+ jump stg_block_async_void;
+
+#else
+
+ /* Compute the absolute wake-up time in scheduler ticks */
+ W_ time;
+ time = foreign "C" getourtimeofday();
+ target = (R1 / (TICK_MILLISECS*1000)) + time;
+ StgTSO_block_info(CurrentTSO) = target;
+
+ /* Insert the new thread in the sleeping queue. */
+ prev = NULL;
+ t = W_[sleeping_queue];
+while:
+ if (t != END_TSO_QUEUE && StgTSO_block_info(t) < target) {
+ prev = t;
+ t = StgTSO_link(t);
+ goto while;
+ }
+
+ StgTSO_link(CurrentTSO) = t;
+ if (prev == NULL) {
+ W_[sleeping_queue] = CurrentTSO;
+ } else {
+ StgTSO_link(prev) = CurrentTSO;
+ }
+ jump stg_block_noregs;
+#endif
+#endif /* !THREADED_RTS */
+}
+
+
+#ifdef mingw32_HOST_OS
+/* asyncRead# primop (mingw32 only): issue an asynchronous read via the
+ * Windows I/O manager (addIORequest) and block until it completes.
+ */
+STRING(stg_asyncReadzh_malloc_str, "asyncReadzh_fast")
+asyncReadzh_fast
+{
+ W_ ares;
+ CInt reqID;
+
+#ifdef THREADED_RTS
+ foreign "C" barf("asyncRead# on threaded RTS");
+#else
+
+ /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
+ ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
+ StgTSO_why_blocked(CurrentTSO) = BlockedOnRead::I16;
+
+ /* could probably allocate this on the heap instead */
+ "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
+ stg_asyncReadzh_malloc_str)
+ [R1,R2,R3,R4];
+ reqID = foreign "C" addIORequest(R1, 0/*FALSE*/,R2,R3,R4 "ptr") [];
+ StgAsyncIOResult_reqID(ares) = reqID;
+ StgAsyncIOResult_len(ares) = 0;
+ StgAsyncIOResult_errCode(ares) = 0;
+ StgTSO_block_info(CurrentTSO) = ares;
+ APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+ jump stg_block_async;
+#endif
+}
+
+/* asyncWrite# primop (mingw32 only): issue an asynchronous write via
+ * the Windows I/O manager and block until it completes.
+ */
+STRING(stg_asyncWritezh_malloc_str, "asyncWritezh_fast")
+asyncWritezh_fast
+{
+ W_ ares;
+ CInt reqID;
+
+#ifdef THREADED_RTS
+ foreign "C" barf("asyncWrite# on threaded RTS");
+#else
+
+ /* args: R1 = fd, R2 = isSock, R3 = len, R4 = buf */
+ ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
+ StgTSO_why_blocked(CurrentTSO) = BlockedOnWrite::I16;
+
+ "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
+ stg_asyncWritezh_malloc_str)
+ [R1,R2,R3,R4];
+ reqID = foreign "C" addIORequest(R1, 1/*TRUE*/,R2,R3,R4 "ptr") [];
+
+ StgAsyncIOResult_reqID(ares) = reqID;
+ StgAsyncIOResult_len(ares) = 0;
+ StgAsyncIOResult_errCode(ares) = 0;
+ StgTSO_block_info(CurrentTSO) = ares;
+ APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+ jump stg_block_async;
+#endif
+}
+
+/* asyncDoProc# primop (mingw32 only): run a procedure on a worker via
+ * the Windows I/O manager (addDoProcRequest) and block until done.
+ */
+STRING(stg_asyncDoProczh_malloc_str, "asyncDoProczh_fast")
+asyncDoProczh_fast
+{
+ W_ ares;
+ CInt reqID;
+
+#ifdef THREADED_RTS
+ foreign "C" barf("asyncDoProc# on threaded RTS");
+#else
+
+ /* args: R1 = proc, R2 = param */
+ ASSERT(StgTSO_why_blocked(CurrentTSO) == NotBlocked::I16);
+ StgTSO_why_blocked(CurrentTSO) = BlockedOnDoProc::I16;
+
+ /* could probably allocate this on the heap instead */
+ "ptr" ares = foreign "C" stgMallocBytes(SIZEOF_StgAsyncIOResult,
+ stg_asyncDoProczh_malloc_str)
+ [R1,R2];
+ reqID = foreign "C" addDoProcRequest(R1 "ptr",R2 "ptr") [];
+ StgAsyncIOResult_reqID(ares) = reqID;
+ StgAsyncIOResult_len(ares) = 0;
+ StgAsyncIOResult_errCode(ares) = 0;
+ StgTSO_block_info(CurrentTSO) = ares;
+ APPEND_TO_BLOCKED_QUEUE(CurrentTSO);
+ jump stg_block_async;
+#endif
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ ** temporary **
+
+ classes CCallable and CReturnable don't really exist, but the
+ compiler insists on generating dictionaries containing references
+ to GHC_ZcCCallable_static_info etc., so we provide dummy symbols
+ for these. Some C compilers can't cope with zero-length static arrays,
+ so we have to make these one element long.
+ --------------------------------------------------------------------------- */
+
+section "rodata" {
+ GHC_ZCCCallable_static_info: W_ 0;
+}
+
+section "rodata" {
+ GHC_ZCCReturnable_static_info: W_ 0;
+}
diff --git a/rts/Printer.c b/rts/Printer.c
new file mode 100644
index 0000000000..8290d220a0
--- /dev/null
+++ b/rts/Printer.c
@@ -0,0 +1,1127 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1994-2000.
+ *
+ * Heap printer
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "Printer.h"
+#include "RtsUtils.h"
+
+#ifdef DEBUG
+
+#include "RtsFlags.h"
+#include "MBlock.h"
+#include "Storage.h"
+#include "Bytecodes.h" /* for InstrPtr */
+#include "Disassembler.h"
+#include "Apply.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#if defined(GRAN) || defined(PAR)
+// HWL: explicit fixed header size to make debugging easier
+int fixed_hs = sizeof(StgHeader), itbl_sz = sizeofW(StgInfoTable),
+ uf_sz=sizeofW(StgUpdateFrame);
+#endif
+
+/* --------------------------------------------------------------------------
+ * local function decls
+ * ------------------------------------------------------------------------*/
+
+static void printStdObjPayload( StgClosure *obj );
+#ifdef USING_LIBBFD
+static void reset_table ( int size );
+static void prepare_table ( void );
+static void insert ( unsigned value, const char *name );
+#endif
+#if 0 /* unused but might be useful sometime */
+static rtsBool lookup_name ( char *name, unsigned *result );
+static void enZcode ( char *in, char *out );
+#endif
+static char unZcode ( char ch );
+const char * lookupGHCName ( void *addr );
+static void printZcoded ( const char *raw );
+
+/* --------------------------------------------------------------------------
+ * Printer
+ * ------------------------------------------------------------------------*/
+
+/* Print the symbol name for 'p' (Z-decoded) if the loaded symbol table
+ * knows it, otherwise fall back to printing the raw address. */
+void printPtr( StgPtr p )
+{
+    const char *sym = lookupGHCName(p);
+    if (sym == NULL) {
+        debugBelch("%p", p);
+    } else {
+        printZcoded(sym);
+    }
+}
+
+/* Print a closure's address (symbolically if known) followed by a
+ * description of its contents. */
+void printObj( StgClosure *obj )
+{
+ debugBelch("Object "); printPtr((StgPtr)obj); debugBelch(" = ");
+ printClosure(obj);
+}
+
+/* Print the opening of a closure description: "TAG(" followed by the
+ * info pointer (plus the cost-centre label when profiling).  The caller
+ * prints the payload and the closing paren. */
+STATIC_INLINE void
+printStdObjHdr( StgClosure *obj, char* tag )
+{
+ debugBelch("%s(",tag);
+ printPtr((StgPtr)obj->header.info);
+#ifdef PROFILING
+ debugBelch(", %s", obj->header.prof.ccs->cc->label);
+#endif
+}
+
+/* Print a standard closure payload: pointer fields first (symbolically),
+ * then the non-pointer words, then ")\n".
+ * NOTE(review): "%pd#" renders the word with %p followed by the literal
+ * characters "d#" — confirm that is the intended output format. */
+static void
+printStdObjPayload( StgClosure *obj )
+{
+ StgWord i, j;
+ const StgInfoTable* info;
+
+ info = get_itbl(obj);
+ for (i = 0; i < info->layout.payload.ptrs; ++i) {
+ debugBelch(", ");
+ printPtr((StgPtr)obj->payload[i]);
+ }
+ /* non-pointer words sit after the pointers, hence payload[i+j] */
+ for (j = 0; j < info->layout.payload.nptrs; ++j) {
+ debugBelch(", %pd#",obj->payload[i+j]);
+ }
+ debugBelch(")\n");
+}
+
+/* Print a thunk's payload; same traversal as printStdObjPayload but
+ * reads through StgThunk, whose payload presumably starts after the
+ * (larger) thunk header — TODO confirm against Closures.h. */
+static void
+printThunkPayload( StgThunk *obj )
+{
+ StgWord i, j;
+ const StgInfoTable* info;
+
+ info = get_itbl(obj);
+ for (i = 0; i < info->layout.payload.ptrs; ++i) {
+ debugBelch(", ");
+ printPtr((StgPtr)obj->payload[i]);
+ }
+ /* non-pointer words follow the pointer fields */
+ for (j = 0; j < info->layout.payload.nptrs; ++j) {
+ debugBelch(", %pd#",obj->payload[i+j]);
+ }
+ debugBelch(")\n");
+}
+
+/* Print a complete thunk: standard header then thunk payload. */
+static void
+printThunkObject( StgThunk *obj, char* tag )
+{
+ printStdObjHdr( (StgClosure *)obj, tag );
+ printThunkPayload( obj );
+}
+
+/*
+ * Print a (usually one-line) description of the closure 'obj',
+ * dispatching on the closure type in its info table.  Unknown types
+ * are fatal (barf).  Debug builds only.
+ */
+void
+printClosure( StgClosure *obj )
+{
+ StgInfoTable *info;
+
+ info = get_itbl(obj);
+
+ switch ( info->type ) {
+ case INVALID_OBJECT:
+ barf("Invalid object");
+
+ /* Data constructors: print pointer fields symbolically, then the
+ * raw non-pointer words (inline rather than via printStdObjPayload
+ * because the non-profiling header also prints the tag). */
+ case CONSTR:
+ case CONSTR_1_0: case CONSTR_0_1:
+ case CONSTR_1_1: case CONSTR_0_2: case CONSTR_2_0:
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ case CONSTR_STATIC:
+ case CONSTR_NOCAF_STATIC:
+ {
+ StgWord i, j;
+#ifdef PROFILING
+ debugBelch("%s(", info->prof.closure_desc);
+ debugBelch("%s", obj->header.prof.ccs->cc->label);
+#else
+ debugBelch("CONSTR(");
+ printPtr((StgPtr)obj->header.info);
+ debugBelch("(tag=%d)",info->srt_bitmap);
+#endif
+ for (i = 0; i < info->layout.payload.ptrs; ++i) {
+ debugBelch(", ");
+ printPtr((StgPtr)obj->payload[i]);
+ }
+ for (j = 0; j < info->layout.payload.nptrs; ++j) {
+ debugBelch(", %p#", obj->payload[i+j]);
+ }
+ debugBelch(")\n");
+ break;
+ }
+
+ case FUN:
+ case FUN_1_0: case FUN_0_1:
+ case FUN_1_1: case FUN_0_2: case FUN_2_0:
+ case FUN_STATIC:
+ debugBelch("FUN/%d(",itbl_to_fun_itbl(info)->f.arity);
+ printPtr((StgPtr)obj->header.info);
+#ifdef PROFILING
+ debugBelch(", %s", obj->header.prof.ccs->cc->label);
+#endif
+ printStdObjPayload(obj);
+ break;
+
+ case THUNK:
+ case THUNK_1_0: case THUNK_0_1:
+ case THUNK_1_1: case THUNK_0_2: case THUNK_2_0:
+ case THUNK_STATIC:
+ /* ToDo: will this work for THUNK_STATIC too? */
+#ifdef PROFILING
+ printThunkObject((StgThunk *)obj,info->prof.closure_desc);
+#else
+ printThunkObject((StgThunk *)obj,"THUNK");
+#endif
+ break;
+
+ case THUNK_SELECTOR:
+ printStdObjHdr(obj, "THUNK_SELECTOR");
+ debugBelch(", %p)\n", ((StgSelector *)obj)->selectee);
+ break;
+
+ case BCO:
+ /* byte-code objects get a full disassembly */
+ disassemble( (StgBCO*)obj );
+ break;
+
+ case AP:
+ {
+ StgAP* ap = stgCast(StgAP*,obj);
+ StgWord i;
+ debugBelch("AP("); printPtr((StgPtr)ap->fun);
+ for (i = 0; i < ap->n_args; ++i) {
+ debugBelch(", ");
+ printPtr((P_)ap->payload[i]);
+ }
+ debugBelch(")\n");
+ break;
+ }
+
+ case PAP:
+ {
+ StgPAP* pap = stgCast(StgPAP*,obj);
+ StgWord i;
+ debugBelch("PAP/%d(",pap->arity);
+ printPtr((StgPtr)pap->fun);
+ for (i = 0; i < pap->n_args; ++i) {
+ debugBelch(", ");
+ printPtr((StgPtr)pap->payload[i]);
+ }
+ debugBelch(")\n");
+ break;
+ }
+
+ case AP_STACK:
+ {
+ StgAP_STACK* ap = stgCast(StgAP_STACK*,obj);
+ StgWord i;
+ debugBelch("AP_STACK("); printPtr((StgPtr)ap->fun);
+ for (i = 0; i < ap->size; ++i) {
+ debugBelch(", ");
+ printPtr((P_)ap->payload[i]);
+ }
+ debugBelch(")\n");
+ break;
+ }
+
+ /* all flavours of indirection just print the indirectee */
+ case IND:
+ debugBelch("IND(");
+ printPtr((StgPtr)stgCast(StgInd*,obj)->indirectee);
+ debugBelch(")\n");
+ break;
+
+ case IND_OLDGEN:
+ debugBelch("IND_OLDGEN(");
+ printPtr((StgPtr)stgCast(StgInd*,obj)->indirectee);
+ debugBelch(")\n");
+ break;
+
+ case IND_PERM:
+ debugBelch("IND(");
+ printPtr((StgPtr)stgCast(StgInd*,obj)->indirectee);
+ debugBelch(")\n");
+ break;
+
+ case IND_OLDGEN_PERM:
+ debugBelch("IND_OLDGEN_PERM(");
+ printPtr((StgPtr)stgCast(StgInd*,obj)->indirectee);
+ debugBelch(")\n");
+ break;
+
+ case IND_STATIC:
+ debugBelch("IND_STATIC(");
+ printPtr((StgPtr)stgCast(StgInd*,obj)->indirectee);
+ debugBelch(")\n");
+ break;
+
+ /* Cannot happen -- use default case.
+ case RET_BCO:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ case RET_BIG:
+ case RET_VEC_BIG:
+ case RET_DYN:
+ case RET_FUN:
+ */
+
+ case UPDATE_FRAME:
+ {
+ StgUpdateFrame* u = stgCast(StgUpdateFrame*,obj);
+ debugBelch("UPDATE_FRAME(");
+ printPtr((StgPtr)GET_INFO(u));
+ debugBelch(",");
+ printPtr((StgPtr)u->updatee);
+ debugBelch(")\n");
+ break;
+ }
+
+ case CATCH_FRAME:
+ {
+ StgCatchFrame* u = stgCast(StgCatchFrame*,obj);
+ debugBelch("CATCH_FRAME(");
+ printPtr((StgPtr)GET_INFO(u));
+ debugBelch(",");
+ printPtr((StgPtr)u->handler);
+ debugBelch(")\n");
+ break;
+ }
+
+ case STOP_FRAME:
+ {
+ StgStopFrame* u = stgCast(StgStopFrame*,obj);
+ debugBelch("STOP_FRAME(");
+ printPtr((StgPtr)GET_INFO(u));
+ debugBelch(")\n");
+ break;
+ }
+
+ case CAF_BLACKHOLE:
+ debugBelch("CAF_BH");
+ break;
+
+ case BLACKHOLE:
+ debugBelch("BH\n");
+ break;
+
+ case SE_BLACKHOLE:
+ debugBelch("SE_BH\n");
+ break;
+
+ case SE_CAF_BLACKHOLE:
+ debugBelch("SE_CAF_BH\n");
+ break;
+
+ case ARR_WORDS:
+ {
+ StgWord i;
+ debugBelch("ARR_WORDS(\"");
+ /* ToDo: we can't safely assume that this is a string!
+ for (i = 0; arrWordsGetChar(obj,i); ++i) {
+ putchar(arrWordsGetChar(obj,i));
+ } */
+ for (i=0; i<((StgArrWords *)obj)->words; i++)
+ debugBelch("%lu", (lnat)((StgArrWords *)obj)->payload[i]);
+ debugBelch("\")\n");
+ break;
+ }
+
+ case MUT_ARR_PTRS_CLEAN:
+ debugBelch("MUT_ARR_PTRS_CLEAN(size=%lu)\n", (lnat)((StgMutArrPtrs *)obj)->ptrs);
+ break;
+
+ case MUT_ARR_PTRS_DIRTY:
+ debugBelch("MUT_ARR_PTRS_DIRTY(size=%lu)\n", (lnat)((StgMutArrPtrs *)obj)->ptrs);
+ break;
+
+ case MUT_ARR_PTRS_FROZEN:
+ debugBelch("MUT_ARR_PTRS_FROZEN(size=%lu)\n", (lnat)((StgMutArrPtrs *)obj)->ptrs);
+ break;
+
+ case MVAR:
+ {
+ StgMVar* mv = (StgMVar*)obj;
+ debugBelch("MVAR(head=%p, tail=%p, value=%p)\n", mv->head, mv->tail, mv->value);
+ break;
+ }
+
+ case MUT_VAR_CLEAN:
+ {
+ StgMutVar* mv = (StgMutVar*)obj;
+ debugBelch("MUT_VAR_CLEAN(var=%p)\n", mv->var);
+ break;
+ }
+
+ case MUT_VAR_DIRTY:
+ {
+ StgMutVar* mv = (StgMutVar*)obj;
+ debugBelch("MUT_VAR_DIRTY(var=%p)\n", mv->var);
+ break;
+ }
+
+ case WEAK:
+ debugBelch("WEAK(");
+ debugBelch(" key=%p value=%p finalizer=%p",
+ (StgPtr)(((StgWeak*)obj)->key),
+ (StgPtr)(((StgWeak*)obj)->value),
+ (StgPtr)(((StgWeak*)obj)->finalizer));
+ debugBelch(")\n");
+ /* ToDo: chase 'link' ? */
+ break;
+
+ case STABLE_NAME:
+ debugBelch("STABLE_NAME(%lu)\n", (lnat)((StgStableName*)obj)->sn);
+ break;
+
+ case TSO:
+ debugBelch("TSO(");
+ debugBelch("%d (%p)",((StgTSO*)obj)->id, (StgTSO*)obj);
+ debugBelch(")\n");
+ break;
+
+#if defined(PAR)
+ case BLOCKED_FETCH:
+ debugBelch("BLOCKED_FETCH(");
+ printGA(&(stgCast(StgBlockedFetch*,obj)->ga));
+ printPtr((StgPtr)(stgCast(StgBlockedFetch*,obj)->node));
+ debugBelch(")\n");
+ break;
+
+ case FETCH_ME:
+ debugBelch("FETCH_ME(");
+ printGA((globalAddr *)stgCast(StgFetchMe*,obj)->ga);
+ debugBelch(")\n");
+ break;
+
+ case FETCH_ME_BQ:
+ debugBelch("FETCH_ME_BQ(");
+ // printGA((globalAddr *)stgCast(StgFetchMe*,obj)->ga);
+ printPtr((StgPtr)stgCast(StgFetchMeBlockingQueue*,obj)->blocking_queue);
+ debugBelch(")\n");
+ break;
+#endif
+
+#if defined(GRAN) || defined(PAR)
+ case RBH:
+ debugBelch("RBH(");
+ printPtr((StgPtr)stgCast(StgRBH*,obj)->blocking_queue);
+ debugBelch(")\n");
+ break;
+
+#endif
+
+#if 0
+ /* Symptomatic of a problem elsewhere, have it fall-through & fail */
+ case EVACUATED:
+ debugBelch("EVACUATED(");
+ printClosure((StgEvacuated*)obj->evacuee);
+ debugBelch(")\n");
+ break;
+#endif
+
+#if defined(PAR) && defined(DIST)
+ case REMOTE_REF:
+ debugBelch("REMOTE_REF(");
+ printGA((globalAddr *)stgCast(StgFetchMe*,obj)->ga);
+ debugBelch(")\n");
+ break;
+#endif
+
+ default:
+ //barf("printClosure %d",get_itbl(obj)->type);
+ debugBelch("*** printClosure: unknown type %d ****\n",
+ get_itbl(obj)->type );
+ barf("printClosure %d",get_itbl(obj)->type);
+ return;
+ }
+}
+
+/*
+void printGraph( StgClosure *obj )
+{
+ printClosure(obj);
+}
+*/
+
+/*
+ * Print the single stack word at 'sp' and return the next stack
+ * position.  The well-known byte-code-interpreter return continuations
+ * are recognised by comparing info-table addresses; BCOs are
+ * abbreviated; anything else is printed as a full closure.
+ */
+StgPtr
+printStackObj( StgPtr sp )
+{
+ /*debugBelch("Stack[%d] = ", &stgStack[STACK_SIZE] - sp); */
+
+ StgClosure* c = (StgClosure*)(*sp);
+ printPtr((StgPtr)*sp);
+ if (c == (StgClosure*)&stg_ctoi_R1p_info) {
+ debugBelch("\t\t\tstg_ctoi_ret_R1p_info\n" );
+ } else
+ if (c == (StgClosure*)&stg_ctoi_R1n_info) {
+ debugBelch("\t\t\tstg_ctoi_ret_R1n_info\n" );
+ } else
+ if (c == (StgClosure*)&stg_ctoi_F1_info) {
+ debugBelch("\t\t\tstg_ctoi_ret_F1_info\n" );
+ } else
+ if (c == (StgClosure*)&stg_ctoi_D1_info) {
+ debugBelch("\t\t\tstg_ctoi_ret_D1_info\n" );
+ } else
+ if (c == (StgClosure*)&stg_ctoi_V_info) {
+ debugBelch("\t\t\tstg_ctoi_ret_V_info\n" );
+ } else
+ if (get_itbl(c)->type == BCO) {
+ debugBelch("\t\t\t");
+ debugBelch("BCO(...)\n");
+ }
+ else {
+ debugBelch("\t\t\t");
+ printClosure ( (StgClosure*)(*sp));
+ }
+ sp += 1;
+
+ return sp;
+
+}
+
+/*
+ * Print 'size' stack words starting at 'payload', consulting 'bitmap'
+ * for each: a clear bit means the word is a pointer (printed
+ * symbolically), a set bit means a raw word.  'spBottom' is the stack
+ * base, used only to compute the stk[n] index in the output.
+ */
+static void
+printSmallBitmap( StgPtr spBottom, StgPtr payload, StgWord bitmap, nat size )
+{
+ nat i;
+
+ /* removed unused local 'p' (was assigned 'payload' and never read) */
+ for(i = 0; i < size; i++, bitmap >>= 1 ) {
+ debugBelch(" stk[%ld] (%p) = ", (long)(spBottom-(payload+i)), payload+i);
+ if ((bitmap & 1) == 0) {
+ printPtr((P_)payload[i]);
+ debugBelch("\n");
+ } else {
+ debugBelch("Word# %lu\n", (lnat)payload[i]);
+ }
+ }
+}
+
+/*
+ * As printSmallBitmap, but the liveness bits come from a multi-word
+ * StgLargeBitmap; 'bmp' indexes the bitmap words, 'i' counts stack
+ * words and 'j' counts bits consumed within the current bitmap word.
+ */
+static void
+printLargeBitmap( StgPtr spBottom, StgPtr payload, StgLargeBitmap* large_bitmap, nat size )
+{
+ StgWord bmp;
+ nat i, j;
+
+ i = 0;
+ for (bmp=0; i < size; bmp++) {
+ StgWord bitmap = large_bitmap->bitmap[bmp];
+ j = 0;
+ for(; i < size && j < BITS_IN(W_); j++, i++, bitmap >>= 1 ) {
+ debugBelch(" stk[%lu] (%p) = ", (lnat)(spBottom-(payload+i)), payload+i);
+ if ((bitmap & 1) == 0) {
+ printPtr((P_)payload[i]);
+ debugBelch("\n");
+ } else {
+ debugBelch("Word# %lu\n", (lnat)payload[i]);
+ }
+ }
+ }
+}
+
+/*
+ * Walk the stack from 'sp' (topmost frame) down to 'spBottom',
+ * printing each frame.  Returns early at a STOP_FRAME, which marks the
+ * bottom of the stack.
+ */
+void
+printStackChunk( StgPtr sp, StgPtr spBottom )
+{
+ StgWord bitmap;
+ const StgInfoTable *info;
+
+ ASSERT(sp <= spBottom);
+ for (; sp < spBottom; sp += stack_frame_sizeW((StgClosure *)sp)) {
+
+ info = get_itbl((StgClosure *)sp);
+
+ switch (info->type) {
+
+ case UPDATE_FRAME:
+ case CATCH_FRAME:
+ printObj((StgClosure*)sp);
+ continue;
+
+ case STOP_FRAME:
+ /* last frame on the stack; nothing below it */
+ printObj((StgClosure*)sp);
+ return;
+
+ case RET_DYN:
+ {
+ StgRetDyn* r;
+ StgPtr p;
+ StgWord dyn;
+ nat size;
+
+ r = (StgRetDyn *)sp;
+ dyn = r->liveness;
+ debugBelch("RET_DYN (%p)\n", r);
+
+ p = (P_)(r->payload);
+ /* BUG FIX: the bitmapped words begin at r->payload (p), not at
+ * sp, which still points at the frame header words. */
+ printSmallBitmap(spBottom, p,
+ RET_DYN_LIVENESS(r->liveness),
+ RET_DYN_BITMAP_SIZE);
+ p += RET_DYN_BITMAP_SIZE + RET_DYN_NONPTR_REGS_SIZE;
+
+ /* saved non-pointer registers, then saved pointer registers */
+ for (size = RET_DYN_NONPTRS(dyn); size > 0; size--) {
+ debugBelch(" stk[%ld] (%p) = ", (long)(spBottom-p), p);
+ debugBelch("Word# %ld\n", (long)*p);
+ p++;
+ }
+
+ for (size = RET_DYN_PTRS(dyn); size > 0; size--) {
+ debugBelch(" stk[%ld] (%p) = ", (long)(spBottom-p), p);
+ printPtr(p);
+ p++;
+ }
+ continue;
+ }
+
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ debugBelch("RET_SMALL (%p)\n", info);
+ bitmap = info->layout.bitmap;
+ printSmallBitmap(spBottom, sp+1,
+ BITMAP_BITS(bitmap), BITMAP_SIZE(bitmap));
+ continue;
+
+ case RET_BCO: {
+ StgBCO *bco;
+
+ bco = ((StgBCO *)sp[1]);
+
+ debugBelch("RET_BCO (%p)\n", sp);
+ printLargeBitmap(spBottom, sp+2,
+ BCO_BITMAP(bco), BCO_BITMAP_SIZE(bco));
+ continue;
+ }
+
+ case RET_BIG:
+ case RET_VEC_BIG:
+ barf("todo");
+
+ case RET_FUN:
+ {
+ StgFunInfoTable *fun_info;
+ StgRetFun *ret_fun;
+ nat size;
+
+ ret_fun = (StgRetFun *)sp;
+ fun_info = get_fun_itbl(ret_fun->fun);
+ size = ret_fun->size;
+ debugBelch("RET_FUN (%p) (type=%d)\n", ret_fun->fun, fun_info->f.fun_type);
+ /* the argument layout depends on the function's calling convention */
+ switch (fun_info->f.fun_type) {
+ case ARG_GEN:
+ printSmallBitmap(spBottom, sp+2,
+ BITMAP_BITS(fun_info->f.b.bitmap),
+ BITMAP_SIZE(fun_info->f.b.bitmap));
+ break;
+ case ARG_GEN_BIG:
+ printLargeBitmap(spBottom, sp+2,
+ GET_FUN_LARGE_BITMAP(fun_info),
+ GET_FUN_LARGE_BITMAP(fun_info)->size);
+ break;
+ default:
+ printSmallBitmap(spBottom, sp+2,
+ BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]),
+ BITMAP_SIZE(stg_arg_bitmaps[fun_info->f.fun_type]));
+ break;
+ }
+ continue;
+ }
+
+ default:
+ debugBelch("unknown object %d\n", info->type);
+ barf("printStackChunk");
+ }
+ }
+}
+
+/* Dump every frame on 'tso's stack, from the current sp to the base. */
+void printTSO( StgTSO *tso )
+{
+    StgPtr bottom = tso->stack + tso->stack_size;
+    printStackChunk( tso->sp, bottom );
+}
+
+/* -----------------------------------------------------------------------------
+ Closure types
+
+ NOTE: must be kept in sync with the closure types in includes/ClosureTypes.h
+ -------------------------------------------------------------------------- */
+
+static char *closure_type_names[] = {
+ "INVALID_OBJECT",
+ "CONSTR",
+ /* BUG FIX: the five specialised CONSTR names below had their
+ * ptr/nonptr suffixes truncated (e.g. "CONSTR_1"); restored to match
+ * the ordering in includes/ClosureTypes.h. */
+ "CONSTR_1_0",
+ "CONSTR_0_1",
+ "CONSTR_2_0",
+ "CONSTR_1_1",
+ "CONSTR_0_2",
+ "CONSTR_INTLIKE",
+ "CONSTR_CHARLIKE",
+ "CONSTR_STATIC",
+ "CONSTR_NOCAF_STATIC",
+ "FUN",
+ "FUN_1_0",
+ "FUN_0_1",
+ "FUN_2_0",
+ "FUN_1_1",
+ "FUN_0_2",
+ "FUN_STATIC",
+ "THUNK",
+ "THUNK_1_0",
+ "THUNK_0_1",
+ "THUNK_2_0",
+ "THUNK_1_1",
+ "THUNK_0_2",
+ "THUNK_STATIC",
+ "THUNK_SELECTOR",
+ "BCO",
+ /* NOTE(review): "AP_UPD" is the historical name for the AP closure
+ * type; confirm against ClosureTypes.h before renaming. */
+ "AP_UPD",
+ "PAP",
+ "AP_STACK",
+ "IND",
+ "IND_OLDGEN",
+ "IND_PERM",
+ "IND_OLDGEN_PERM",
+ "IND_STATIC",
+ "RET_BCO",
+ "RET_SMALL",
+ "RET_VEC_SMALL",
+ "RET_BIG",
+ "RET_VEC_BIG",
+ "RET_DYN",
+ "RET_FUN",
+ "UPDATE_FRAME",
+ "CATCH_FRAME",
+ "STOP_FRAME",
+ "CAF_BLACKHOLE",
+ "BLACKHOLE",
+ "BLACKHOLE_BQ",
+ "SE_BLACKHOLE",
+ "SE_CAF_BLACKHOLE",
+ "MVAR",
+ "ARR_WORDS",
+ "MUT_ARR_PTRS_CLEAN",
+ "MUT_ARR_PTRS_DIRTY",
+ "MUT_ARR_PTRS_FROZEN",
+ "MUT_VAR_CLEAN",
+ "MUT_VAR_DIRTY",
+ "MUT_CONS",
+ "WEAK",
+ "FOREIGN",
+ "STABLE_NAME",
+ "TSO",
+ "BLOCKED_FETCH",
+ "FETCH_ME",
+ "FETCH_ME_BQ",
+ "RBH",
+ "EVACUATED",
+ "REMOTE_REF",
+ "TVAR_WAIT_QUEUE",
+ "TVAR",
+ "TREC_CHUNK",
+ "TREC_HEADER",
+ "ATOMICALLY_FRAME",
+ "CATCH_RETRY_FRAME"
+};
+
+
+/* Return the printable name of 'closure's closure type. */
+char *
+info_type(StgClosure *closure){
+ return closure_type_names[get_itbl(closure)->type];
+}
+
+/* Return the printable name of the closure type in info table 'ip'. */
+char *
+info_type_by_ip(StgInfoTable *ip){
+ return closure_type_names[ip->type];
+}
+
+/* Copy the closure-type name of 'closure' into caller-supplied buffer
+ * 'res'; the caller must provide enough space for the longest name. */
+void
+info_hdr_type(StgClosure *closure, char *res){
+ strcpy(res,closure_type_names[get_itbl(closure)->type]);
+}
+
+/* --------------------------------------------------------------------------
+ * Address printing code
+ *
+ * Uses symbol table in (unstripped executable)
+ * ------------------------------------------------------------------------*/
+
+/* --------------------------------------------------------------------------
+ * Simple lookup table
+ *
+ * Current implementation is pretty dumb!
+ * ------------------------------------------------------------------------*/
+
+struct entry {
+ nat value;
+ const char *name;
+};
+
+static nat table_size;
+static struct entry* table;
+
+#ifdef USING_LIBBFD
+static nat max_table_size;
+
+/* Discard any previous symbol table and allocate room for 'size'
+ * entries.  NOTE(review): the old 'table' allocation, if any, is not
+ * freed here — acceptable for a one-shot debug load, but a leak if
+ * called repeatedly. */
+static void reset_table( int size )
+{
+ max_table_size = size;
+ table_size = 0;
+ table = (struct entry *)stgMallocBytes(size * sizeof(struct entry), "Printer.c:reset_table()");
+}
+
+/* Hook for post-processing the symbol table after loading; currently a
+ * no-op (lookups are linear scans). */
+static void prepare_table( void )
+{
+ /* Could sort it... */
+}
+
+/* Append one (address, name) pair to the symbol table; fatal if the
+ * capacity fixed by reset_table() is exceeded.  'name' is stored by
+ * reference, so it must outlive the table. */
+static void insert( unsigned value, const char *name )
+{
+ if ( table_size >= max_table_size ) {
+ barf( "Symbol table overflow\n" );
+ }
+ table[table_size].value = value;
+ table[table_size].name = name;
+ table_size = table_size + 1;
+}
+#endif
+
+#if 0
+static rtsBool lookup_name( char *name, unsigned *result )
+{
+ int i;
+ for( i = 0; i < table_size && strcmp(name,table[i].name) != 0; ++i ) {
+ }
+ if (i < table_size) {
+ *result = table[i].value;
+ return rtsTrue;
+ } else {
+ return rtsFalse;
+ }
+}
+#endif
+
+/* Code from somewhere inside GHC (circa 1994)
+ * * Z-escapes:
+ * "std"++xs -> "Zstd"++xs
+ * char_to_c 'Z' = "ZZ"
+ * char_to_c '&' = "Za"
+ * char_to_c '|' = "Zb"
+ * char_to_c ':' = "Zc"
+ * char_to_c '/' = "Zd"
+ * char_to_c '=' = "Ze"
+ * char_to_c '>' = "Zg"
+ * char_to_c '#' = "Zh"
+ * char_to_c '<' = "Zl"
+ * char_to_c '-' = "Zm"
+ * char_to_c '!' = "Zn"
+ * char_to_c '.' = "Zo"
+ * char_to_c '+' = "Zp"
+ * char_to_c '\'' = "Zq"
+ * char_to_c '*' = "Zt"
+ * char_to_c '_' = "Zu"
+ * char_to_c c = "Z" ++ show (ord c)
+ */
+/* Decode the character following a 'Z' in a Z-encoded name (see the
+ * escape table in the comment above).  Unrecognised escape characters
+ * are returned unchanged; 'ZZ' and a trailing lone 'Z' decode to 'Z'. */
+static char unZcode( char ch )
+{
+ switch (ch) {
+ case 'a' : return ('&');
+ case 'b' : return ('|');
+ case 'c' : return (':');
+ case 'd' : return ('/');
+ case 'e' : return ('=');
+ case 'g' : return ('>');
+ case 'h' : return ('#');
+ case 'l' : return ('<');
+ case 'm' : return ('-');
+ case 'n' : return ('!');
+ case 'o' : return ('.');
+ case 'p' : return ('+');
+ case 'q' : return ('\'');
+ case 't' : return ('*');
+ case 'u' : return ('_');
+ case 'Z' :
+ case '\0' : return ('Z');
+ default : return (ch);
+ }
+}
+
+#if 0
+/* Precondition: out big enough to handle output (about twice length of in) */
+static void enZcode( char *in, char *out )
+{
+ int i, j;
+
+ j = 0;
+ out[ j++ ] = '_';
+ for( i = 0; in[i] != '\0'; ++i ) {
+ switch (in[i]) {
+ case 'Z' :
+ out[j++] = 'Z';
+ out[j++] = 'Z';
+ break;
+ case '&' :
+ out[j++] = 'Z';
+ out[j++] = 'a';
+ break;
+ case '|' :
+ out[j++] = 'Z';
+ out[j++] = 'b';
+ break;
+ case ':' :
+ out[j++] = 'Z';
+ out[j++] = 'c';
+ break;
+ case '/' :
+ out[j++] = 'Z';
+ out[j++] = 'd';
+ break;
+ case '=' :
+ out[j++] = 'Z';
+ out[j++] = 'e';
+ break;
+ case '>' :
+ out[j++] = 'Z';
+ out[j++] = 'g';
+ break;
+ case '#' :
+ out[j++] = 'Z';
+ out[j++] = 'h';
+ break;
+ case '<' :
+ out[j++] = 'Z';
+ out[j++] = 'l';
+ break;
+ case '-' :
+ out[j++] = 'Z';
+ out[j++] = 'm';
+ break;
+ case '!' :
+ out[j++] = 'Z';
+ out[j++] = 'n';
+ break;
+ case '.' :
+ out[j++] = 'Z';
+ out[j++] = 'o';
+ break;
+ case '+' :
+ out[j++] = 'Z';
+ out[j++] = 'p';
+ break;
+ case '\'' :
+ out[j++] = 'Z';
+ out[j++] = 'q';
+ break;
+ case '*' :
+ out[j++] = 'Z';
+ out[j++] = 't';
+ break;
+ case '_' :
+ out[j++] = 'Z';
+ out[j++] = 'u';
+ break;
+ default :
+ out[j++] = in[i];
+ break;
+ }
+ }
+ out[j] = '\0';
+}
+#endif
+
+/* Linear-scan the loaded symbol table for 'addr'; return the (still
+ * Z-encoded) symbol name, or NULL if not found.
+ * NOTE(review): the cast to 'unsigned' truncates pointers on LP64
+ * platforms, so lookups can alias or miss there — confirm intent. */
+const char *lookupGHCName( void *addr )
+{
+ nat i;
+ for( i = 0; i < table_size && table[i].value != (unsigned) addr; ++i ) {
+ }
+ if (i < table_size) {
+ return table[i].name;
+ } else {
+ return NULL;
+ }
+}
+
+/*
+ * Print a Z-encoded symbol name in decoded form: a 'Z' introduces a
+ * two-character escape (decoded by unZcode); other characters are
+ * passed through.
+ */
+static void printZcoded( const char *raw )
+{
+ nat j = 0;
+
+ while ( raw[j] != '\0' ) {
+ if (raw[j] == 'Z') {
+ debugBelch("%c", unZcode(raw[j+1]));
+ j = j + 2;
+ } else {
+ /* BUG FIX: print the current character — the code previously
+ * read raw[j+1] here, skipping every other character. */
+ debugBelch("%c", unZcode(raw[j]));
+ j = j + 1;
+ }
+ }
+}
+
+/* --------------------------------------------------------------------------
+ * Symbol table loading
+ * ------------------------------------------------------------------------*/
+
+/* Causing linking trouble on Win32 plats, so I'm
+ disabling this for now.
+*/
+#ifdef USING_LIBBFD
+
+#include <bfd.h>
+
+/* Fairly ad-hoc piece of code that seems to filter out a lot of
+ * rubbish like the obj-splitting symbols
+ */
+
+/* Heuristic filter for symbols worth keeping: rejects empty names and
+ * compiler-generated "gcc..."/"cc."-prefixed symbols.  'flags' is
+ * currently ignored (see the disabled N_TYPE-based version). */
+static rtsBool isReal( flagword flags STG_UNUSED, const char *name )
+{
+#if 0
+ /* ToDo: make this work on BFD */
+ int tp = type & N_TYPE;
+ if (tp == N_TEXT || tp == N_DATA) {
+ return (name[0] == '_' && name[1] != '_');
+ } else {
+ return rtsFalse;
+ }
+#else
+ if (*name == '\0' ||
+ (name[0] == 'g' && name[1] == 'c' && name[2] == 'c') ||
+ (name[0] == 'c' && name[1] == 'c' && name[2] == '.')) {
+ return rtsFalse;
+ }
+ return rtsTrue;
+#endif
+}
+
+/*
+ * Load the symbol table of executable 'name' via libbfd so that
+ * printPtr/lookupGHCName can print addresses symbolically.  Makes two
+ * passes over the canonicalised symbols: one to count the "real"
+ * symbols (see isReal) so the table can be sized, one to insert them.
+ * Fatal (barf) on any BFD error.
+ */
+extern void DEBUG_LoadSymbols( char *name )
+{
+ bfd* abfd;
+ char **matching;
+
+ bfd_init();
+ abfd = bfd_openr(name, "default");
+ if (abfd == NULL) {
+ barf("can't open executable %s to get symbol table", name);
+ }
+ if (!bfd_check_format_matches (abfd, bfd_object, &matching)) {
+ barf("mismatch");
+ }
+
+ {
+ long storage_needed;
+ asymbol **symbol_table;
+ long number_of_symbols;
+ long num_real_syms = 0;
+ long i;
+
+ storage_needed = bfd_get_symtab_upper_bound (abfd);
+
+ if (storage_needed < 0) {
+ barf("can't read symbol table");
+ }
+#if 0
+ if (storage_needed == 0) {
+ debugBelch("no storage needed");
+ }
+#endif
+ symbol_table = (asymbol **) stgMallocBytes(storage_needed,"DEBUG_LoadSymbols");
+
+ number_of_symbols = bfd_canonicalize_symtab (abfd, symbol_table);
+
+ if (number_of_symbols < 0) {
+ barf("can't canonicalise symbol table");
+ }
+
+ /* first pass: count the symbols we are going to keep */
+ for( i = 0; i != number_of_symbols; ++i ) {
+ symbol_info info;
+ bfd_get_symbol_info(abfd,symbol_table[i],&info);
+ /*debugBelch("\t%c\t0x%x \t%s\n",info.type,(nat)info.value,info.name); */
+ if (isReal(info.type, info.name)) {
+ num_real_syms += 1;
+ }
+ }
+
+ IF_DEBUG(interpreter,
+ debugBelch("Loaded %ld symbols. Of which %ld are real symbols\n",
+ number_of_symbols, num_real_syms)
+ );
+
+ reset_table( num_real_syms );
+
+ /* second pass: populate the lookup table */
+ for( i = 0; i != number_of_symbols; ++i ) {
+ symbol_info info;
+ bfd_get_symbol_info(abfd,symbol_table[i],&info);
+ if (isReal(info.type, info.name)) {
+ insert( info.value, info.name );
+ }
+ }
+
+ stgFree(symbol_table);
+ }
+ prepare_table();
+}
+
+#else /* !USING_LIBBFD */
+
+/* Stub used when libbfd is unavailable: symbol lookup stays empty and
+ * printPtr falls back to raw addresses. */
+extern void DEBUG_LoadSymbols( char *name STG_UNUSED )
+{
+ /* nothing, yet */
+}
+
+#endif /* USING_LIBBFD */
+
+void findPtr(P_ p, int); /* keep gcc -Wall happy */
+
+/*
+ * Debugger helper: scan every block of every generation/step for heap
+ * words equal to 'p', i.e. find the closures that reference 'p'.  For
+ * each hit, walk backwards to the nearest word that looks like an info
+ * pointer (a heuristic for the enclosing closure's start) and print
+ * that closure.  At most arr_size referrers are recorded; if 'follow'
+ * is set and exactly one referrer was found, recurse on it to chase
+ * the chain of unique referrers.
+ */
+void
+findPtr(P_ p, int follow)
+{
+ nat s, g;
+ P_ q, r;
+ bdescr *bd;
+#if defined(__GNUC__)
+ const int arr_size = 1024;
+#else
+#define arr_size 1024
+#endif
+ StgPtr arr[arr_size];
+ int i = 0;
+
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ bd = generations[g].steps[s].blocks;
+ for (; bd; bd = bd->link) {
+ for (q = bd->start; q < bd->free; q++) {
+ if (*q == (W_)p) {
+ if (i < arr_size) {
+ r = q;
+ /* back up to something resembling a closure header */
+ while (!LOOKS_LIKE_INFO_PTR(*r) || (P_)*r == NULL) {
+ r--;
+ }
+ debugBelch("%p = ", r);
+ printClosure((StgClosure *)r);
+ arr[i++] = r;
+ } else {
+ return;
+ }
+ }
+ }
+ }
+ }
+ }
+ if (follow && i == 1) {
+ debugBelch("-->\n");
+ findPtr(arr[0], 1);
+ }
+}
+
+#else /* DEBUG */
+/* Non-DEBUG build: no symbol table, just print the raw address. */
+void printPtr( StgPtr p )
+{
+ debugBelch("ptr 0x%p (enable -DDEBUG for more info) " , p );
+}
+
+/* Non-DEBUG build: closure printing is unavailable, print the address. */
+void printObj( StgClosure *obj )
+{
+ debugBelch("obj 0x%p (enable -DDEBUG for more info) " , obj );
+}
+#endif /* DEBUG */
diff --git a/rts/Printer.h b/rts/Printer.h
new file mode 100644
index 0000000000..54bf611250
--- /dev/null
+++ b/rts/Printer.h
@@ -0,0 +1,31 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Prototypes for functions in Printer.c
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef PRINTER_H
+#define PRINTER_H
+
+extern void printPtr ( StgPtr p );
+extern void printObj ( StgClosure *obj );
+
+#ifdef DEBUG
+extern void printClosure ( StgClosure *obj );
+extern StgStackPtr printStackObj ( StgStackPtr sp );
+extern void printStackChunk ( StgStackPtr sp, StgStackPtr spLim );
+extern void printTSO ( StgTSO *tso );
+
+void info_hdr_type ( StgClosure *closure, char *res );
+char * info_type ( StgClosure *closure );
+char * info_type_by_ip ( StgInfoTable *ip );
+
+extern void DEBUG_LoadSymbols( char *name );
+
+extern const char *lookupGHCName( void *addr );
+#endif
+
+#endif /* PRINTER_H */
+
diff --git a/rts/ProfHeap.c b/rts/ProfHeap.c
new file mode 100644
index 0000000000..312bee735c
--- /dev/null
+++ b/rts/ProfHeap.c
@@ -0,0 +1,1156 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2003
+ *
+ * Support for heap profiling
+ *
+ * ---------------------------------------------------------------------------*/
+
+#if defined(DEBUG) && !defined(PROFILING)
+#define DEBUG_HEAP_PROF
+#else
+#undef DEBUG_HEAP_PROF
+#endif
+
+#if defined(PROFILING) || defined(DEBUG_HEAP_PROF)
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "Profiling.h"
+#include "Storage.h"
+#include "ProfHeap.h"
+#include "Stats.h"
+#include "Hash.h"
+#include "RetainerProfile.h"
+#include "LdvProfile.h"
+#include "Arena.h"
+#include "Printer.h"
+
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+
+/* -----------------------------------------------------------------------------
+ * era stores the current time period. It is the same as the
+ * number of censuses that have been performed.
+ *
+ * RESTRICTION:
+ * era must be no longer than LDV_SHIFT (15 or 30) bits.
+ * Invariants:
+ * era is initialized to 1 in initHeapProfiling().
+ *
+ * max_era is initialized to 2^LDV_SHIFT in initHeapProfiling().
+ * When era reaches max_era, the profiling stops because a closure can
+ * store only up to (max_era - 1) as its creation or last use time.
+ * -------------------------------------------------------------------------- */
+unsigned int era;
+static nat max_era;
+
+/* -----------------------------------------------------------------------------
+ * Counters
+ *
+ * For most heap profiles each closure identity gets a simple count
+ * of live words in the heap at each census. However, if we're
+ * selecting by biography, then we have to keep the various
+ * lag/drag/void counters for each identity.
+ * -------------------------------------------------------------------------- */
+typedef struct _counter {
+ void *identity;
+ union {
+ nat resid;
+ struct {
+ int prim; // total size of 'inherently used' closures
+ int not_used; // total size of 'never used' closures
+ int used; // total size of 'used at least once' closures
+ int void_total; // current total size of 'destroyed without being used' closures
+ int drag_total; // current total size of 'used at least once and waiting to die'
+ } ldv;
+ } c;
+ struct _counter *next;
+} counter;
+
+/* Zero all lag/drag/void fields of an LDV counter; used when a counter
+ * is freshly allocated from a census arena. */
+STATIC_INLINE void
+initLDVCtr( counter *ctr )
+{
+ ctr->c.ldv.prim = 0;
+ ctr->c.ldv.not_used = 0;
+ ctr->c.ldv.used = 0;
+ ctr->c.ldv.void_total = 0;
+ ctr->c.ldv.drag_total = 0;
+}
+
+typedef struct {
+ double time; // the time in MUT time when the census is made
+ HashTable * hash;
+ counter * ctrs;
+ Arena * arena;
+
+ // for LDV profiling, when just displaying by LDV
+ int prim;
+ int not_used;
+ int used;
+ int void_total;
+ int drag_total;
+} Census;
+
+static Census *censuses = NULL;
+static nat n_censuses = 0;
+
+#ifdef PROFILING
+static void aggregateCensusInfo( void );
+#endif
+
+static void dumpCensus( Census *census );
+
+/* -----------------------------------------------------------------------------
+ Closure Type Profiling;
+
+ PROBABLY TOTALLY OUT OF DATE -- ToDo (SDM)
+ -------------------------------------------------------------------------- */
+
+#ifdef DEBUG_HEAP_PROF
+static char *type_names[] = {
+ "INVALID_OBJECT"
+ , "CONSTR"
+ , "CONSTR_INTLIKE"
+ , "CONSTR_CHARLIKE"
+ , "CONSTR_STATIC"
+ , "CONSTR_NOCAF_STATIC"
+
+ , "FUN"
+ , "FUN_STATIC"
+
+ , "THUNK"
+ , "THUNK_STATIC"
+ , "THUNK_SELECTOR"
+
+ , "BCO"
+ , "AP_STACK"
+ , "AP"
+
+ , "PAP"
+
+ , "IND"
+ , "IND_OLDGEN"
+ , "IND_PERM"
+ , "IND_OLDGEN_PERM"
+ , "IND_STATIC"
+
+ , "RET_BCO"
+ , "RET_SMALL"
+ , "RET_VEC_SMALL"
+ , "RET_BIG"
+ , "RET_VEC_BIG"
+ , "RET_DYN"
+ , "UPDATE_FRAME"
+ , "CATCH_FRAME"
+ , "STOP_FRAME"
+
+ , "BLACKHOLE"
+ , "MVAR"
+
+ , "ARR_WORDS"
+
+ , "MUT_ARR_PTRS_CLEAN"
+ , "MUT_ARR_PTRS_DIRTY"
+ , "MUT_ARR_PTRS_FROZEN"
+ , "MUT_VAR_CLEAN"
+ , "MUT_VAR_DIRTY"
+
+ , "WEAK"
+
+ , "TSO"
+
+ , "BLOCKED_FETCH"
+ , "FETCH_ME"
+
+ , "EVACUATED"
+};
+
+#endif /* DEBUG_HEAP_PROF */
+
+/* -----------------------------------------------------------------------------
+ * Find the "closure identity", which is a unique pointer representing
+ * the band to which this closure's heap space is attributed in the
+ * heap profile.
+ * ------------------------------------------------------------------------- */
+/* Return the "closure identity" of 'p': the pointer that names the
+ * band this closure's space is charged to, chosen by the active
+ * heap-profile mode (+RTS -h<x>).  May return NULL for retainer
+ * profiling when the retainer-set field is invalid (e.g. DEAD_WEAK). */
+STATIC_INLINE void *
+closureIdentity( StgClosure *p )
+{
+ switch (RtsFlags.ProfFlags.doHeapProfile) {
+
+#ifdef PROFILING
+ case HEAP_BY_CCS:
+ return p->header.prof.ccs;
+ case HEAP_BY_MOD:
+ return p->header.prof.ccs->cc->module;
+ case HEAP_BY_DESCR:
+ return get_itbl(p)->prof.closure_desc;
+ case HEAP_BY_TYPE:
+ return get_itbl(p)->prof.closure_type;
+ case HEAP_BY_RETAINER:
+ // AFAIK, the only closures in the heap which might not have a
+ // valid retainer set are DEAD_WEAK closures.
+ if (isRetainerSetFieldValid(p))
+ return retainerSetOf(p);
+ else
+ return NULL;
+
+#else // DEBUG
+ case HEAP_BY_INFOPTR:
+ return (void *)((StgClosure *)p)->header.info;
+ case HEAP_BY_CLOSURE_TYPE:
+ return type_names[get_itbl(p)->type];
+
+#endif
+ default:
+ barf("closureIdentity");
+ }
+}
+
+/* --------------------------------------------------------------------------
+ * Profiling type predicates
+ * ----------------------------------------------------------------------- */
+#ifdef PROFILING
+/* True when this run performs lag/drag/void (biographical) profiling,
+ * either directly (-hb) or via a biography selector. */
+STATIC_INLINE rtsBool
+doingLDVProfiling( void )
+{
+ return (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV
+ || RtsFlags.ProfFlags.bioSelector != NULL);
+}
+
+/* True when this run performs retainer profiling, either directly
+ * (-hr) or via a retainer selector. */
+STATIC_INLINE rtsBool
+doingRetainerProfiling( void )
+{
+ return (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER
+ || RtsFlags.ProfFlags.retainerSelector != NULL);
+}
+#endif /* PROFILING */
+
+// Processes a closure 'c' being destroyed whose size is 'size'.
+// Make sure that LDV_recordDead() is not invoked on 'inherently used' closures
+// such as TSO; they should not be involved in computing dragNew or voidNew.
+//
+// Even though era is checked in both LdvCensusForDead() and
+// LdvCensusKillAll(), we still need to make sure that era is > 0 because
+// LDV_recordDead() may be called from elsewhere in the runtime system. E.g.,
+// when a thunk is replaced by an indirection object.
+
+#ifdef PROFILING
+/*
+ * Record the death of closure 'c' of size 'size' (in words) for LDV
+ * profiling.  A closure still in the CREATE state died without ever
+ * being used (void); one in the USE state was used and has been
+ * dragging since its last use (drag).  The size is moved from the
+ * census of the relevant past era into the current one, either on the
+ * global census totals (no biography selector) or on the per-identity
+ * counters, creating the current era's counter on demand.
+ */
+void
+LDV_recordDead( StgClosure *c, nat size )
+{
+ void *id;
+ nat t;
+ counter *ctr;
+
+ if (era > 0 && closureSatisfiesConstraints(c)) {
+ /* exclude the profiling header from the accounted size */
+ size -= sizeofW(StgProfHeader);
+ ASSERT(LDVW(c) != 0);
+ if ((LDVW((c)) & LDV_STATE_MASK) == LDV_STATE_CREATE) {
+ /* never used: charge 'void' from creation era t to now */
+ t = (LDVW((c)) & LDV_CREATE_MASK) >> LDV_SHIFT;
+ if (t < era) {
+ if (RtsFlags.ProfFlags.bioSelector == NULL) {
+ censuses[t].void_total += (int)size;
+ censuses[era].void_total -= (int)size;
+ ASSERT(censuses[t].void_total < censuses[t].not_used);
+ } else {
+ id = closureIdentity(c);
+ ctr = lookupHashTable(censuses[t].hash, (StgWord)id);
+ ASSERT( ctr != NULL );
+ ctr->c.ldv.void_total += (int)size;
+ ctr = lookupHashTable(censuses[era].hash, (StgWord)id);
+ if (ctr == NULL) {
+ ctr = arenaAlloc(censuses[era].arena, sizeof(counter));
+ initLDVCtr(ctr);
+ insertHashTable(censuses[era].hash, (StgWord)id, ctr);
+ ctr->identity = id;
+ ctr->next = censuses[era].ctrs;
+ censuses[era].ctrs = ctr;
+ }
+ ctr->c.ldv.void_total -= (int)size;
+ }
+ }
+ } else {
+ /* used at least once: charge 'drag' from the era after last use */
+ t = LDVW((c)) & LDV_LAST_MASK;
+ if (t + 1 < era) {
+ if (RtsFlags.ProfFlags.bioSelector == NULL) {
+ censuses[t+1].drag_total += size;
+ censuses[era].drag_total -= size;
+ } else {
+ void *id;
+ id = closureIdentity(c);
+ ctr = lookupHashTable(censuses[t+1].hash, (StgWord)id);
+ ASSERT( ctr != NULL );
+ ctr->c.ldv.drag_total += (int)size;
+ ctr = lookupHashTable(censuses[era].hash, (StgWord)id);
+ if (ctr == NULL) {
+ ctr = arenaAlloc(censuses[era].arena, sizeof(counter));
+ initLDVCtr(ctr);
+ insertHashTable(censuses[era].hash, (StgWord)id, ctr);
+ ctr->identity = id;
+ ctr->next = censuses[era].ctrs;
+ censuses[era].ctrs = ctr;
+ }
+ ctr->c.ldv.drag_total -= (int)size;
+ }
+ }
+ }
+ }
+}
+#endif
+
+/* --------------------------------------------------------------------------
+ * Initialize censuses[era];
+ * ----------------------------------------------------------------------- */
+STATIC_INLINE void
+initEra(Census *census)
+{
+ census->hash = allocHashTable();
+ census->ctrs = NULL;
+ census->arena = newArena();
+
+ census->not_used = 0;
+ census->used = 0;
+ census->prim = 0;
+ census->void_total = 0;
+ census->drag_total = 0;
+}
+
+/* --------------------------------------------------------------------------
+ * Increases era by 1 and initialize census[era].
+ * Reallocates gi[] and increases its size if needed.
+ * ----------------------------------------------------------------------- */
+static void
+nextEra( void )
+{
+#ifdef PROFILING
+ if (doingLDVProfiling()) {
+ era++;
+
+ if (era == max_era) {
+ errorBelch("maximum number of censuses reached; use +RTS -i to reduce");
+ stg_exit(EXIT_FAILURE);
+ }
+
+ if (era == n_censuses) {
+ n_censuses *= 2;
+ censuses = stgReallocBytes(censuses, sizeof(Census) * n_censuses,
+ "nextEra");
+ }
+ }
+#endif /* PROFILING */
+
+ initEra( &censuses[era] );
+}
+
+/* -----------------------------------------------------------------------------
+ * DEBUG heap profiling, by info table
+ * -------------------------------------------------------------------------- */
+
+#ifdef DEBUG_HEAP_PROF
+FILE *hp_file;
+static char *hp_filename;
+
+void initProfiling1( void )
+{
+}
+
+void initProfiling2( void )
+{
+ if (RtsFlags.ProfFlags.doHeapProfile) {
+ /* Initialise the log file name */
+ hp_filename = stgMallocBytes(strlen(prog_name) + 6, "hpFileName");
+ sprintf(hp_filename, "%s.hp", prog_name);
+
+ /* open the log file */
+ if ((hp_file = fopen(hp_filename, "w")) == NULL) {
+ debugBelch("Can't open profiling report file %s\n",
+ hp_filename);
+ RtsFlags.ProfFlags.doHeapProfile = 0;
+ return;
+ }
+ }
+
+ initHeapProfiling();
+}
+
+void endProfiling( void )
+{
+ endHeapProfiling();
+}
+#endif /* DEBUG_HEAP_PROF */
+
+static void
+printSample(rtsBool beginSample, StgDouble sampleValue)
+{
+ StgDouble fractionalPart, integralPart;
+ fractionalPart = modf(sampleValue, &integralPart);
+ fprintf(hp_file, "%s %d.%02d\n",
+ (beginSample ? "BEGIN_SAMPLE" : "END_SAMPLE"),
+ (int)integralPart, (int)(fractionalPart * 100));
+}
+
+/* --------------------------------------------------------------------------
+ * Initialize the heap profilier
+ * ----------------------------------------------------------------------- */
+nat
+initHeapProfiling(void)
+{
+ if (! RtsFlags.ProfFlags.doHeapProfile) {
+ return 0;
+ }
+
+#ifdef PROFILING
+ if (doingLDVProfiling() && doingRetainerProfiling()) {
+ errorBelch("cannot mix -hb and -hr");
+ stg_exit(EXIT_FAILURE);
+ }
+#endif
+
+ // we only count eras if we're doing LDV profiling. Otherwise era
+ // is fixed at zero.
+#ifdef PROFILING
+ if (doingLDVProfiling()) {
+ era = 1;
+ } else
+#endif
+ {
+ era = 0;
+ }
+
+ { // max_era = 2^LDV_SHIFT
+ nat p;
+ max_era = 1;
+ for (p = 0; p < LDV_SHIFT; p++)
+ max_era *= 2;
+ }
+
+ n_censuses = 32;
+ censuses = stgMallocBytes(sizeof(Census) * n_censuses, "initHeapProfiling");
+
+ initEra( &censuses[era] );
+
+ /* initProfilingLogFile(); */
+ fprintf(hp_file, "JOB \"%s", prog_name);
+
+#ifdef PROFILING
+ {
+ int count;
+ for(count = 1; count < prog_argc; count++)
+ fprintf(hp_file, " %s", prog_argv[count]);
+ fprintf(hp_file, " +RTS");
+ for(count = 0; count < rts_argc; count++)
+ fprintf(hp_file, " %s", rts_argv[count]);
+ }
+#endif /* PROFILING */
+
+ fprintf(hp_file, "\"\n" );
+
+ fprintf(hp_file, "DATE \"%s\"\n", time_str());
+
+ fprintf(hp_file, "SAMPLE_UNIT \"seconds\"\n");
+ fprintf(hp_file, "VALUE_UNIT \"bytes\"\n");
+
+ printSample(rtsTrue, 0);
+ printSample(rtsFalse, 0);
+
+#ifdef DEBUG_HEAP_PROF
+ DEBUG_LoadSymbols(prog_name);
+#endif
+
+#ifdef PROFILING
+ if (doingRetainerProfiling()) {
+ initRetainerProfiling();
+ }
+#endif
+
+ return 0;
+}
+
// Finish heap profiling: flush any deferred censuses and close the
// .hp file.  No-op when heap profiling is not enabled.
void
endHeapProfiling(void)
{
    StgDouble seconds;

    if (! RtsFlags.ProfFlags.doHeapProfile) {
        return;
    }

#ifdef PROFILING
    if (doingRetainerProfiling()) {
        endRetainerProfiling();
    }
#endif

#ifdef PROFILING
    if (doingLDVProfiling()) {
        nat t;
        // Biographical info is only complete at program end: mark
        // everything still alive as dead, aggregate the counters, then
        // dump every census collected during the run.
        LdvCensusKillAll();
        aggregateCensusInfo();
        for (t = 1; t < era; t++) {
            dumpCensus( &censuses[t] );
        }
    }
#endif

    seconds = mut_user_time();
    printSample(rtsTrue, seconds);
    printSample(rtsFalse, seconds);
    fclose(hp_file);
}
+
+
+
+#ifdef PROFILING
// Copy the NUL-terminated string 'q' into the buffer at 'p', writing no
// byte at or beyond 'end'.  Returns how far the caller's write cursor
// should advance: the length of 'q' when it (and its NUL) fit, or the
// number of bytes written when the buffer filled first -- the caller
// detects truncation by checking p >= end afterwards.  A successful
// append copies the NUL but does not count it, so the next append
// overwrites the terminator.
static size_t
buf_append(char *p, const char *q, char *end)
{
    size_t m;   // was 'int': mismatched the declared size_t return type

    for (m = 0; p < end; p++, q++, m++) {
        *p = *q;
        if (*q == '\0') { break; }
    }
    return m;
}
+
// Pretty-print a cost-centre stack to 'fp': an "(id)" prefix followed by
// up to 'max_length' characters of "/"-separated stack entries, ending
// with "..." on truncation.
static void
fprint_ccs(FILE *fp, CostCentreStack *ccs, nat max_length)
{
    // VLA sized by the caller; assumes a modest max_length (>= 4, since
    // the truncation path writes at buf+max_length-4).
    char buf[max_length+1], *p, *buf_end;

    // MAIN on its own gets printed as "MAIN", otherwise we ignore MAIN.
    if (ccs == CCS_MAIN) {
        fprintf(fp, "MAIN");
        return;
    }

    // NOTE(review): "%ld" requires ccsID to be 'long' -- confirm its
    // declared type in CostCentreStack.
    fprintf(fp, "(%ld)", ccs->ccsID);

    p = buf;
    buf_end = buf + max_length + 1;

    // keep printing components of the stack until we run out of space
    // in the buffer.  If we run out of space, end with "...".
    for (; ccs != NULL && ccs != CCS_MAIN; ccs = ccs->prevStack) {

        // CAF cost centres print as M.CAF, but we leave the module
        // name out of all the others to save space.
        if (!strcmp(ccs->cc->label,"CAF")) {
            p += buf_append(p, ccs->cc->module, buf_end);
            p += buf_append(p, ".CAF", buf_end);
        } else {
            if (ccs->prevStack != NULL && ccs->prevStack != CCS_MAIN) {
                p += buf_append(p, "/", buf_end);
            }
            p += buf_append(p, ccs->cc->label, buf_end);
        }

        if (p >= buf_end) {
            // buffer full: overwrite the tail with "..." (which also
            // NUL-terminates buf) and stop
            sprintf(buf+max_length-4, "...");
            break;
        }
    }
    fprintf(fp, "%s", buf);
}
#endif /* PROFILING */
+
+rtsBool
+strMatchesSelector( char* str, char* sel )
+{
+ char* p;
+ // debugBelch("str_matches_selector %s %s\n", str, sel);
+ while (1) {
+ // Compare str against wherever we've got to in sel.
+ p = str;
+ while (*p != '\0' && *sel != ',' && *sel != '\0' && *p == *sel) {
+ p++; sel++;
+ }
+ // Match if all of str used and have reached the end of a sel fragment.
+ if (*p == '\0' && (*sel == ',' || *sel == '\0'))
+ return rtsTrue;
+
+ // No match. Advance sel to the start of the next elem.
+ while (*sel != ',' && *sel != '\0') sel++;
+ if (*sel == ',') sel++;
+
+ /* Run out of sel ?? */
+ if (*sel == '\0') return rtsFalse;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Figure out whether a closure should be counted in this census, by
+ * testing against all the specified constraints.
+ * -------------------------------------------------------------------------- */
+rtsBool
+closureSatisfiesConstraints( StgClosure* p )
+{
+#ifdef DEBUG_HEAP_PROF
+ (void)p; /* keep gcc -Wall happy */
+ return rtsTrue;
+#else
+ rtsBool b;
+
+ // The CCS has a selected field to indicate whether this closure is
+ // deselected by not being mentioned in the module, CC, or CCS
+ // selectors.
+ if (!p->header.prof.ccs->selected) {
+ return rtsFalse;
+ }
+
+ if (RtsFlags.ProfFlags.descrSelector) {
+ b = strMatchesSelector( (get_itbl((StgClosure *)p))->prof.closure_desc,
+ RtsFlags.ProfFlags.descrSelector );
+ if (!b) return rtsFalse;
+ }
+ if (RtsFlags.ProfFlags.typeSelector) {
+ b = strMatchesSelector( (get_itbl((StgClosure *)p))->prof.closure_type,
+ RtsFlags.ProfFlags.typeSelector );
+ if (!b) return rtsFalse;
+ }
+ if (RtsFlags.ProfFlags.retainerSelector) {
+ RetainerSet *rs;
+ nat i;
+ // We must check that the retainer set is valid here. One
+ // reason it might not be valid is if this closure is a
+ // a newly deceased weak pointer (i.e. a DEAD_WEAK), since
+ // these aren't reached by the retainer profiler's traversal.
+ if (isRetainerSetFieldValid((StgClosure *)p)) {
+ rs = retainerSetOf((StgClosure *)p);
+ if (rs != NULL) {
+ for (i = 0; i < rs->num; i++) {
+ b = strMatchesSelector( rs->element[i]->cc->label,
+ RtsFlags.ProfFlags.retainerSelector );
+ if (b) return rtsTrue;
+ }
+ }
+ }
+ return rtsFalse;
+ }
+ return rtsTrue;
+#endif /* PROFILING */
+}
+
+/* -----------------------------------------------------------------------------
+ * Aggregate the heap census info for biographical profiling
+ * -------------------------------------------------------------------------- */
+#ifdef PROFILING
+static void
+aggregateCensusInfo( void )
+{
+ HashTable *acc;
+ nat t;
+ counter *c, *d, *ctrs;
+ Arena *arena;
+
+ if (!doingLDVProfiling()) return;
+
+ // Aggregate the LDV counters when displaying by biography.
+ if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV) {
+ int void_total, drag_total;
+
+ // Now we compute void_total and drag_total for each census
+ // After the program has finished, the void_total field of
+ // each census contains the count of words that were *created*
+ // in this era and were eventually void. Conversely, if a
+ // void closure was destroyed in this era, it will be
+ // represented by a negative count of words in void_total.
+ //
+ // To get the count of live words that are void at each
+ // census, just propagate the void_total count forwards:
+
+ void_total = 0;
+ drag_total = 0;
+ for (t = 1; t < era; t++) { // note: start at 1, not 0
+ void_total += censuses[t].void_total;
+ drag_total += censuses[t].drag_total;
+ censuses[t].void_total = void_total;
+ censuses[t].drag_total = drag_total;
+
+ ASSERT( censuses[t].void_total <= censuses[t].not_used );
+ // should be true because: void_total is the count of
+ // live words that are void at this census, which *must*
+ // be less than the number of live words that have not
+ // been used yet.
+
+ ASSERT( censuses[t].drag_total <= censuses[t].used );
+ // similar reasoning as above.
+ }
+
+ return;
+ }
+
+ // otherwise... we're doing a heap profile that is restricted to
+ // some combination of lag, drag, void or use. We've kept all the
+ // census info for all censuses so far, but we still need to
+ // aggregate the counters forwards.
+
+ arena = newArena();
+ acc = allocHashTable();
+ ctrs = NULL;
+
+ for (t = 1; t < era; t++) {
+
+ // first look through all the counters we're aggregating
+ for (c = ctrs; c != NULL; c = c->next) {
+ // if one of the totals is non-zero, then this closure
+ // type must be present in the heap at this census time...
+ d = lookupHashTable(censuses[t].hash, (StgWord)c->identity);
+
+ if (d == NULL) {
+ // if this closure identity isn't present in the
+ // census for this time period, then our running
+ // totals *must* be zero.
+ ASSERT(c->c.ldv.void_total == 0 && c->c.ldv.drag_total == 0);
+
+ // debugCCS(c->identity);
+ // debugBelch(" census=%d void_total=%d drag_total=%d\n",
+ // t, c->c.ldv.void_total, c->c.ldv.drag_total);
+ } else {
+ d->c.ldv.void_total += c->c.ldv.void_total;
+ d->c.ldv.drag_total += c->c.ldv.drag_total;
+ c->c.ldv.void_total = d->c.ldv.void_total;
+ c->c.ldv.drag_total = d->c.ldv.drag_total;
+
+ ASSERT( c->c.ldv.void_total >= 0 );
+ ASSERT( c->c.ldv.drag_total >= 0 );
+ }
+ }
+
+ // now look through the counters in this census to find new ones
+ for (c = censuses[t].ctrs; c != NULL; c = c->next) {
+ d = lookupHashTable(acc, (StgWord)c->identity);
+ if (d == NULL) {
+ d = arenaAlloc( arena, sizeof(counter) );
+ initLDVCtr(d);
+ insertHashTable( acc, (StgWord)c->identity, d );
+ d->identity = c->identity;
+ d->next = ctrs;
+ ctrs = d;
+ d->c.ldv.void_total = c->c.ldv.void_total;
+ d->c.ldv.drag_total = c->c.ldv.drag_total;
+ }
+ ASSERT( c->c.ldv.void_total >= 0 );
+ ASSERT( c->c.ldv.drag_total >= 0 );
+ }
+ }
+
+ freeHashTable(acc, NULL);
+ arenaFree(arena);
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Print out the results of a heap census.
+ * -------------------------------------------------------------------------- */
+static void
+dumpCensus( Census *census )
+{
+ counter *ctr;
+ int count;
+
+ printSample(rtsTrue, census->time);
+
+#ifdef PROFILING
+ if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV) {
+ fprintf(hp_file, "VOID\t%lu\n", (unsigned long)(census->void_total) * sizeof(W_));
+ fprintf(hp_file, "LAG\t%lu\n",
+ (unsigned long)(census->not_used - census->void_total) * sizeof(W_));
+ fprintf(hp_file, "USE\t%lu\n",
+ (unsigned long)(census->used - census->drag_total) * sizeof(W_));
+ fprintf(hp_file, "INHERENT_USE\t%lu\n",
+ (unsigned long)(census->prim) * sizeof(W_));
+ fprintf(hp_file, "DRAG\t%lu\n",
+ (unsigned long)(census->drag_total) * sizeof(W_));
+ printSample(rtsFalse, census->time);
+ return;
+ }
+#endif
+
+ for (ctr = census->ctrs; ctr != NULL; ctr = ctr->next) {
+
+#ifdef PROFILING
+ if (RtsFlags.ProfFlags.bioSelector != NULL) {
+ count = 0;
+ if (strMatchesSelector("lag", RtsFlags.ProfFlags.bioSelector))
+ count += ctr->c.ldv.not_used - ctr->c.ldv.void_total;
+ if (strMatchesSelector("drag", RtsFlags.ProfFlags.bioSelector))
+ count += ctr->c.ldv.drag_total;
+ if (strMatchesSelector("void", RtsFlags.ProfFlags.bioSelector))
+ count += ctr->c.ldv.void_total;
+ if (strMatchesSelector("use", RtsFlags.ProfFlags.bioSelector))
+ count += ctr->c.ldv.used - ctr->c.ldv.drag_total;
+ } else
+#endif
+ {
+ count = ctr->c.resid;
+ }
+
+ ASSERT( count >= 0 );
+
+ if (count == 0) continue;
+
+#ifdef DEBUG_HEAP_PROF
+ switch (RtsFlags.ProfFlags.doHeapProfile) {
+ case HEAP_BY_INFOPTR:
+ fprintf(hp_file, "%s", lookupGHCName(ctr->identity));
+ break;
+ case HEAP_BY_CLOSURE_TYPE:
+ fprintf(hp_file, "%s", (char *)ctr->identity);
+ break;
+ }
+#endif
+
+#ifdef PROFILING
+ switch (RtsFlags.ProfFlags.doHeapProfile) {
+ case HEAP_BY_CCS:
+ fprint_ccs(hp_file, (CostCentreStack *)ctr->identity, 25);
+ break;
+ case HEAP_BY_MOD:
+ case HEAP_BY_DESCR:
+ case HEAP_BY_TYPE:
+ fprintf(hp_file, "%s", (char *)ctr->identity);
+ break;
+ case HEAP_BY_RETAINER:
+ {
+ RetainerSet *rs = (RetainerSet *)ctr->identity;
+
+ // it might be the distinguished retainer set rs_MANY:
+ if (rs == &rs_MANY) {
+ fprintf(hp_file, "MANY");
+ break;
+ }
+
+ // Mark this retainer set by negating its id, because it
+ // has appeared in at least one census. We print the
+ // values of all such retainer sets into the log file at
+ // the end. A retainer set may exist but not feature in
+ // any censuses if it arose as the intermediate retainer
+ // set for some closure during retainer set calculation.
+ if (rs->id > 0)
+ rs->id = -(rs->id);
+
+ // report in the unit of bytes: * sizeof(StgWord)
+ printRetainerSetShort(hp_file, rs);
+ break;
+ }
+ default:
+ barf("dumpCensus; doHeapProfile");
+ }
+#endif
+
+ fprintf(hp_file, "\t%lu\n", (unsigned long)count * sizeof(W_));
+ }
+
+ printSample(rtsFalse, census->time);
+}
+
+/* -----------------------------------------------------------------------------
+ * Code to perform a heap census.
+ * -------------------------------------------------------------------------- */
+static void
+heapCensusChain( Census *census, bdescr *bd )
+{
+ StgPtr p;
+ StgInfoTable *info;
+ void *identity;
+ nat size;
+ counter *ctr;
+ nat real_size;
+ rtsBool prim;
+
+ for (; bd != NULL; bd = bd->link) {
+
+ // HACK: ignore pinned blocks, because they contain gaps.
+ // It's not clear exactly what we'd like to do here, since we
+ // can't tell which objects in the block are actually alive.
+ // Perhaps the whole block should be counted as SYSTEM memory.
+ if (bd->flags & BF_PINNED) {
+ continue;
+ }
+
+ p = bd->start;
+ while (p < bd->free) {
+ info = get_itbl((StgClosure *)p);
+ prim = rtsFalse;
+
+ switch (info->type) {
+
+ case THUNK:
+ size = thunk_sizeW_fromITBL(info);
+ break;
+
+ case THUNK_1_1:
+ case THUNK_0_2:
+ case THUNK_2_0:
+ size = sizeofW(StgThunkHeader) + 2;
+ break;
+
+ case THUNK_1_0:
+ case THUNK_0_1:
+ case THUNK_SELECTOR:
+ size = sizeofW(StgThunkHeader) + 1;
+ break;
+
+ case CONSTR:
+ case FUN:
+ case IND_PERM:
+ case IND_OLDGEN:
+ case IND_OLDGEN_PERM:
+ case CAF_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ case SE_BLACKHOLE:
+ case BLACKHOLE:
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ case FUN_1_0:
+ case FUN_0_1:
+ case FUN_1_1:
+ case FUN_0_2:
+ case FUN_2_0:
+ case CONSTR_1_0:
+ case CONSTR_0_1:
+ case CONSTR_1_1:
+ case CONSTR_0_2:
+ case CONSTR_2_0:
+ size = sizeW_fromITBL(info);
+ break;
+
+ case IND:
+ // Special case/Delicate Hack: INDs don't normally
+ // appear, since we're doing this heap census right
+ // after GC. However, GarbageCollect() also does
+ // resurrectThreads(), which can update some
+ // blackholes when it calls raiseAsync() on the
+ // resurrected threads. So we know that any IND will
+ // be the size of a BLACKHOLE.
+ size = BLACKHOLE_sizeW();
+ break;
+
+ case BCO:
+ prim = rtsTrue;
+ size = bco_sizeW((StgBCO *)p);
+ break;
+
+ case MVAR:
+ case WEAK:
+ case STABLE_NAME:
+ case MUT_VAR_CLEAN:
+ case MUT_VAR_DIRTY:
+ prim = rtsTrue;
+ size = sizeW_fromITBL(info);
+ break;
+
+ case AP:
+ size = ap_sizeW((StgAP *)p);
+ break;
+
+ case PAP:
+ size = pap_sizeW((StgPAP *)p);
+ break;
+
+ case AP_STACK:
+ size = ap_stack_sizeW((StgAP_STACK *)p);
+ break;
+
+ case ARR_WORDS:
+ prim = rtsTrue;
+ size = arr_words_sizeW(stgCast(StgArrWords*,p));
+ break;
+
+ case MUT_ARR_PTRS_CLEAN:
+ case MUT_ARR_PTRS_DIRTY:
+ case MUT_ARR_PTRS_FROZEN:
+ case MUT_ARR_PTRS_FROZEN0:
+ prim = rtsTrue;
+ size = mut_arr_ptrs_sizeW((StgMutArrPtrs *)p);
+ break;
+
+ case TSO:
+ prim = rtsTrue;
+#ifdef DEBUG_HEAP_PROF
+ size = tso_sizeW((StgTSO *)p);
+ break;
+#else
+ if (RtsFlags.ProfFlags.includeTSOs) {
+ size = tso_sizeW((StgTSO *)p);
+ break;
+ } else {
+ // Skip this TSO and move on to the next object
+ p += tso_sizeW((StgTSO *)p);
+ continue;
+ }
+#endif
+
+ case TREC_HEADER:
+ prim = rtsTrue;
+ size = sizeofW(StgTRecHeader);
+ break;
+
+ case TVAR_WAIT_QUEUE:
+ prim = rtsTrue;
+ size = sizeofW(StgTVarWaitQueue);
+ break;
+
+ case TVAR:
+ prim = rtsTrue;
+ size = sizeofW(StgTVar);
+ break;
+
+ case TREC_CHUNK:
+ prim = rtsTrue;
+ size = sizeofW(StgTRecChunk);
+ break;
+
+ default:
+ barf("heapCensus, unknown object: %d", info->type);
+ }
+
+ identity = NULL;
+
+#ifdef DEBUG_HEAP_PROF
+ real_size = size;
+#else
+ // subtract the profiling overhead
+ real_size = size - sizeofW(StgProfHeader);
+#endif
+
+ if (closureSatisfiesConstraints((StgClosure*)p)) {
+#ifdef PROFILING
+ if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_LDV) {
+ if (prim)
+ census->prim += real_size;
+ else if ((LDVW(p) & LDV_STATE_MASK) == LDV_STATE_CREATE)
+ census->not_used += real_size;
+ else
+ census->used += real_size;
+ } else
+#endif
+ {
+ identity = closureIdentity((StgClosure *)p);
+
+ if (identity != NULL) {
+ ctr = lookupHashTable( census->hash, (StgWord)identity );
+ if (ctr != NULL) {
+#ifdef PROFILING
+ if (RtsFlags.ProfFlags.bioSelector != NULL) {
+ if (prim)
+ ctr->c.ldv.prim += real_size;
+ else if ((LDVW(p) & LDV_STATE_MASK) == LDV_STATE_CREATE)
+ ctr->c.ldv.not_used += real_size;
+ else
+ ctr->c.ldv.used += real_size;
+ } else
+#endif
+ {
+ ctr->c.resid += real_size;
+ }
+ } else {
+ ctr = arenaAlloc( census->arena, sizeof(counter) );
+ initLDVCtr(ctr);
+ insertHashTable( census->hash, (StgWord)identity, ctr );
+ ctr->identity = identity;
+ ctr->next = census->ctrs;
+ census->ctrs = ctr;
+
+#ifdef PROFILING
+ if (RtsFlags.ProfFlags.bioSelector != NULL) {
+ if (prim)
+ ctr->c.ldv.prim = real_size;
+ else if ((LDVW(p) & LDV_STATE_MASK) == LDV_STATE_CREATE)
+ ctr->c.ldv.not_used = real_size;
+ else
+ ctr->c.ldv.used = real_size;
+ } else
+#endif
+ {
+ ctr->c.resid = real_size;
+ }
+ }
+ }
+ }
+ }
+
+ p += size;
+ }
+ }
+}
+
// Perform one heap census over the whole heap (called after GC), dump
// it unless we're deferring for biographical profiling, and advance to
// the next era.
void
heapCensus( void )
{
    nat g, s;
    Census *census;

    census = &censuses[era];
    census->time = mut_user_time();

    // calculate retainer sets if necessary
#ifdef PROFILING
    if (doingRetainerProfiling()) {
        retainerProfile();
    }
#endif

#ifdef PROFILING
    stat_startHeapCensus();
#endif

    // Traverse the heap, collecting the census info

    // First the small_alloc_list: we have to fix the free pointer at
    // the end by calling tidyAllocateLists() first.
    tidyAllocateLists();
    heapCensusChain( census, small_alloc_list );

    // Now traverse the heap in each generation/step.
    if (RtsFlags.GcFlags.generations == 1) {
        heapCensusChain( census, g0s0->blocks );
    } else {
        for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
            for (s = 0; s < generations[g].n_steps; s++) {
                heapCensusChain( census, generations[g].steps[s].blocks );
                // Are we interested in large objects?  might be
                // confusing to include the stack in a heap profile.
                heapCensusChain( census, generations[g].steps[s].large_objects );
            }
        }
    }

    // dump out the census info
#ifdef PROFILING
    // We can't generate any info for LDV profiling until
    // the end of the run...
    if (!doingLDVProfiling())
        dumpCensus( census );
#else
    dumpCensus( census );
#endif


    // free our storage, unless we're keeping all the census info for
    // future restriction by biography.
#ifdef PROFILING
    if (RtsFlags.ProfFlags.bioSelector == NULL)
#endif
    {
        freeHashTable( census->hash, NULL/* don't free the elements */ );
        arenaFree( census->arena );
        census->hash = NULL;
        census->arena = NULL;
    }

    // we're into the next time period now
    nextEra();

#ifdef PROFILING
    stat_endHeapCensus();
#endif
}
+
+#endif /* PROFILING || DEBUG_HEAP_PROF */
+
diff --git a/rts/ProfHeap.h b/rts/ProfHeap.h
new file mode 100644
index 0000000000..0251416762
--- /dev/null
+++ b/rts/ProfHeap.h
@@ -0,0 +1,19 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Support for heap profiling
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef PROFHEAP_H
+#define PROFHEAP_H
+
+extern void heapCensus( void );
+extern nat initHeapProfiling( void );
+extern void endHeapProfiling( void );
+extern rtsBool closureSatisfiesConstraints( StgClosure* p );
+extern void LDV_recordDead( StgClosure *c, nat size );
+extern rtsBool strMatchesSelector( char* str, char* sel );
+
+#endif /* PROFHEAP_H */
diff --git a/rts/Profiling.c b/rts/Profiling.c
new file mode 100644
index 0000000000..028dc5a509
--- /dev/null
+++ b/rts/Profiling.c
@@ -0,0 +1,941 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2000
+ *
+ * Support for profiling
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifdef PROFILING
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "Profiling.h"
+#include "Storage.h"
+#include "Proftimer.h"
+#include "Timer.h"
+#include "ProfHeap.h"
+#include "Arena.h"
+#include "RetainerProfile.h"
+#include "LdvProfile.h"
+
+#include <string.h>
+
+/*
+ * Profiling allocation arena.
+ */
+Arena *prof_arena;
+
+/*
+ * Global variables used to assign unique IDs to cc's, ccs's, and
+ * closure_cats
+ */
+
+unsigned int CC_ID;
+unsigned int CCS_ID;
+unsigned int HP_ID;
+
+/* figures for the profiling report.
+ */
+static ullong total_alloc;
+static lnat total_prof_ticks;
+
+/* Globals for opening the profiling log file(s)
+ */
+static char *prof_filename; /* prof report file name = <program>.prof */
+FILE *prof_file;
+
+static char *hp_filename; /* heap profile (hp2ps style) log file */
+FILE *hp_file;
+
+/* The Current Cost Centre Stack (for attributing costs)
+ */
+CostCentreStack *CCCS;
+
+/* Linked lists to keep track of cc's and ccs's that haven't
+ * been declared in the log file yet
+ */
+CostCentre *CC_LIST;
+CostCentreStack *CCS_LIST;
+
+/*
+ * Built-in cost centres and cost-centre stacks:
+ *
+ * MAIN is the root of the cost-centre stack tree. If there are
+ * no _scc_s in the program, all costs will be attributed
+ * to MAIN.
+ *
+ * SYSTEM is the RTS in general (scheduler, etc.). All costs for
+ * RTS operations apart from garbage collection are attributed
+ * to SYSTEM.
+ *
+ * GC is the storage manager / garbage collector.
+ *
+ * OVERHEAD gets all costs generated by the profiling system
+ * itself. These are costs that would not be incurred
+ * during non-profiled execution of the program.
+ *
+ * SUBSUMED is the one-and-only CCS placed on top-level functions.
+ * It indicates that all costs are to be attributed to the
+ * enclosing cost centre stack. SUBSUMED never accumulates
+ * any costs. The is_caf flag is set on the subsumed cost
+ * centre.
+ *
+ * DONT_CARE is a placeholder cost-centre we assign to static
+ * constructors. It should *never* accumulate any costs.
+ */
+
+CC_DECLARE(CC_MAIN, "MAIN", "MAIN", CC_IS_BORING, );
+CC_DECLARE(CC_SYSTEM, "SYSTEM", "MAIN", CC_IS_BORING, );
+CC_DECLARE(CC_GC, "GC", "GC", CC_IS_BORING, );
+CC_DECLARE(CC_OVERHEAD, "OVERHEAD_of", "PROFILING", CC_IS_CAF, );
+CC_DECLARE(CC_SUBSUMED, "SUBSUMED", "MAIN", CC_IS_CAF, );
+CC_DECLARE(CC_DONT_CARE, "DONT_CARE", "MAIN", CC_IS_BORING, );
+
+CCS_DECLARE(CCS_MAIN, CC_MAIN, );
+CCS_DECLARE(CCS_SYSTEM, CC_SYSTEM, );
+CCS_DECLARE(CCS_GC, CC_GC, );
+CCS_DECLARE(CCS_OVERHEAD, CC_OVERHEAD, );
+CCS_DECLARE(CCS_SUBSUMED, CC_SUBSUMED, );
+CCS_DECLARE(CCS_DONT_CARE, CC_DONT_CARE, );
+
+/*
+ * Uniques for the XML log-file format
+ */
+#define CC_UQ 1
+#define CCS_UQ 2
+#define TC_UQ 3
+#define HEAP_OBJ_UQ 4
+#define TIME_UPD_UQ 5
+#define HEAP_UPD_UQ 6
+
+/*
+ * Static Functions
+ */
+
+static CostCentreStack * ActualPush_ ( CostCentreStack *ccs, CostCentre *cc,
+ CostCentreStack *new_ccs );
+static rtsBool ccs_to_ignore ( CostCentreStack *ccs );
+static void count_ticks ( CostCentreStack *ccs );
+static void inherit_costs ( CostCentreStack *ccs );
+static void reportCCS ( CostCentreStack *ccs, nat indent );
+static void DecCCS ( CostCentreStack *ccs );
+static void DecBackEdge ( CostCentreStack *ccs,
+ CostCentreStack *oldccs );
+static CostCentreStack * CheckLoop ( CostCentreStack *ccs, CostCentre *cc );
+static CostCentreStack * pruneCCSTree ( CostCentreStack *ccs );
+static CostCentreStack * ActualPush ( CostCentreStack *, CostCentre * );
+static CostCentreStack * IsInIndexTable ( IndexTable *, CostCentre * );
+static IndexTable * AddToIndexTable ( IndexTable *, CostCentreStack *,
+ CostCentre *, unsigned int );
+static void ccsSetSelected ( CostCentreStack *ccs );
+
+static void initTimeProfiling ( void );
+static void initProfilingLogFile( void );
+
+static void reportCCS_XML ( CostCentreStack *ccs );
+
+/* -----------------------------------------------------------------------------
+ Initialise the profiling environment
+ -------------------------------------------------------------------------- */
+
+void
+initProfiling1 (void)
+{
+ // initialise our arena
+ prof_arena = newArena();
+
+ /* for the benefit of allocate()... */
+ CCCS = CCS_SYSTEM;
+
+ /* Initialize counters for IDs */
+ CC_ID = 1;
+ CCS_ID = 1;
+ HP_ID = 1;
+
+ /* Initialize Declaration lists to NULL */
+ CC_LIST = NULL;
+ CCS_LIST = NULL;
+
+ /* Register all the cost centres / stacks in the program
+ * CC_MAIN gets link = 0, all others have non-zero link.
+ */
+ REGISTER_CC(CC_MAIN);
+ REGISTER_CC(CC_SYSTEM);
+ REGISTER_CC(CC_GC);
+ REGISTER_CC(CC_OVERHEAD);
+ REGISTER_CC(CC_SUBSUMED);
+ REGISTER_CC(CC_DONT_CARE);
+ REGISTER_CCS(CCS_MAIN);
+ REGISTER_CCS(CCS_SYSTEM);
+ REGISTER_CCS(CCS_GC);
+ REGISTER_CCS(CCS_OVERHEAD);
+ REGISTER_CCS(CCS_SUBSUMED);
+ REGISTER_CCS(CCS_DONT_CARE);
+
+ CCCS = CCS_OVERHEAD;
+
+ /* cost centres are registered by the per-module
+ * initialisation code now...
+ */
+}
+
// Phase 2: runs after the per-module registration code.  Opens the log
// files, re-parents all registered CCSs under CCS_MAIN, and starts the
// time and/or heap profilers.
void
initProfiling2 (void)
{
    CostCentreStack *ccs, *next;

    CCCS = CCS_SYSTEM;

    /* Set up the log file, and dump the header and cost centre
     * information into it. */
    initProfilingLogFile();

    /* find all the "special" cost centre stacks, and make them children
     * of CCS_MAIN.
     */
    ASSERT(CCS_MAIN->prevStack == 0);
    CCS_MAIN->root = CC_MAIN;
    ccsSetSelected(CCS_MAIN);
    DecCCS(CCS_MAIN);

    // CCS_LIST is threaded through prevStack at this point, so read the
    // link before each node's prevStack is reset.
    for (ccs = CCS_LIST; ccs != CCS_MAIN; ) {
        next = ccs->prevStack;
        ccs->prevStack = 0;
        ActualPush_(CCS_MAIN,ccs->cc,ccs);
        ccs->root = ccs->cc;
        ccs = next;
    }

    if (RtsFlags.CcFlags.doCostCentres) {
        initTimeProfiling();
    }

    if (RtsFlags.ProfFlags.doHeapProfile) {
        initHeapProfiling();
    }
}
+
+// Decide whether closures with this CCS should contribute to the heap
+// profile.
+static void
+ccsSetSelected( CostCentreStack *ccs )
+{
+ if (RtsFlags.ProfFlags.modSelector) {
+ if (! strMatchesSelector( ccs->cc->module,
+ RtsFlags.ProfFlags.modSelector ) ) {
+ ccs->selected = 0;
+ return;
+ }
+ }
+ if (RtsFlags.ProfFlags.ccSelector) {
+ if (! strMatchesSelector( ccs->cc->label,
+ RtsFlags.ProfFlags.ccSelector ) ) {
+ ccs->selected = 0;
+ return;
+ }
+ }
+ if (RtsFlags.ProfFlags.ccsSelector) {
+ CostCentreStack *c;
+ for (c = ccs; c != NULL; c = c->prevStack) {
+ if ( strMatchesSelector( c->cc->label,
+ RtsFlags.ProfFlags.ccsSelector )) {
+ break;
+ }
+ }
+ if (c == NULL) {
+ ccs->selected = 0;
+ return;
+ }
+ }
+
+ ccs->selected = 1;
+ return;
+}
+
+
// Open "<prog>.prof" (and, if heap profiling, "<prog>.hp").  On failure
// the corresponding profile is switched off rather than aborting.
static void
initProfilingLogFile(void)
{
    /* Initialise the log file name */
    // ".prof" plus NUL is exactly 6 extra bytes
    prof_filename = arenaAlloc(prof_arena, strlen(prog_name) + 6);
    sprintf(prof_filename, "%s.prof", prog_name);

    /* open the log file */
    if ((prof_file = fopen(prof_filename, "w")) == NULL) {
        debugBelch("Can't open profiling report file %s\n", prof_filename);
        RtsFlags.CcFlags.doCostCentres = 0;
        // The following line was added by Sung; retainer/LDV profiling may need
        // two output files, i.e., <program>.prof/hp.
        if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER)
            RtsFlags.ProfFlags.doHeapProfile = 0;
        return;
    }

    if (RtsFlags.CcFlags.doCostCentres == COST_CENTRES_XML) {
        /* dump the time, and the profiling interval */
        fprintf(prof_file, "\"%s\"\n", time_str());
        fprintf(prof_file, "\"%d ms\"\n", TICK_MILLISECS);

        /* declare all the cost centres */
        {
            CostCentre *cc;
            for (cc = CC_LIST; cc != NULL; cc = cc->link) {
                fprintf(prof_file, "%d %d \"%s\" \"%s\"\n",
                        CC_UQ, cc->ccID, cc->label, cc->module);
            }
        }
    }

    if (RtsFlags.ProfFlags.doHeapProfile) {
        /* Initialise the log file name */
        // ".hp" plus NUL needs 4 bytes; +6 leaves a little slack
        hp_filename = arenaAlloc(prof_arena, strlen(prog_name) + 6);
        sprintf(hp_filename, "%s.hp", prog_name);

        /* open the log file */
        if ((hp_file = fopen(hp_filename, "w")) == NULL) {
            debugBelch("Can't open profiling report file %s\n",
                       hp_filename);
            RtsFlags.ProfFlags.doHeapProfile = 0;
            return;
        }
    }
}
+
// Begin time profiling by starting the periodic profiling timer.
// Marked 'static' to match the file-scope prototype above; the stray
// ';' after the closing brace (an illegal empty file-scope declaration
// in strict C) has been removed.
static void
initTimeProfiling(void)
{
    /* Start ticking */
    startProfTimer();
}
+
// Shut down whichever profilers were running: stop the timer for
// cost-centre profiling and finalise the heap profile.
void
endProfiling ( void )
{
    if (RtsFlags.CcFlags.doCostCentres) {
        stopProfTimer();
    }
    if (RtsFlags.ProfFlags.doHeapProfile) {
        endHeapProfiling();
    }
}
+
+/* -----------------------------------------------------------------------------
+ Set cost centre stack when entering a function.
+ -------------------------------------------------------------------------- */
+rtsBool entering_PAP;
+
+void
+EnterFunCCS ( CostCentreStack *ccsfn )
+{
+ /* PAP_entry has already set CCCS for us */
+ if (entering_PAP) {
+ entering_PAP = rtsFalse;
+ return;
+ }
+
+ if (ccsfn->root->is_caf == CC_IS_CAF) {
+ CCCS = AppendCCS(CCCS,ccsfn);
+ } else {
+ CCCS = ccsfn;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Cost-centre stack manipulation
+ -------------------------------------------------------------------------- */
+
+#ifdef DEBUG
+/* Debug wrapper: trace every push, then call the real implementation.
+ * The #define placed between this wrapper's signature and its body
+ * renames every later occurrence of PushCostCentre to _PushCostCentre,
+ * so the definition further down becomes _PushCostCentre and the call
+ * inside this body reaches it. */
+CostCentreStack * _PushCostCentre ( CostCentreStack *ccs, CostCentre *cc );
+CostCentreStack *
+PushCostCentre ( CostCentreStack *ccs, CostCentre *cc )
+#define PushCostCentre _PushCostCentre
+{
+ IF_DEBUG(prof,
+ debugBelch("Pushing %s on ", cc->label);
+ debugCCS(ccs);
+ debugBelch("\n"));
+ return PushCostCentre(ccs,cc);
+}
+#endif
+
+/* Push cost centre cc onto stack ccs, memoizing the result:
+ * - pushing onto the empty stack, or a cc not seen before, allocates
+ * a new CCS via ActualPush;
+ * - a repeated top element returns the stack unchanged;
+ * - a previously-memoized child is returned from the index table;
+ * - recursion back to an older CCS records a back edge instead of
+ * growing the stack without bound. */
+CostCentreStack *
+PushCostCentre ( CostCentreStack *ccs, CostCentre *cc )
+{
+ CostCentreStack *temp_ccs;
+
+ if (ccs == EMPTY_STACK)
+ return ActualPush(ccs,cc);
+ else {
+ if (ccs->cc == cc)
+ return ccs;
+ else {
+ /* check if we've already memoized this stack */
+ temp_ccs = IsInIndexTable(ccs->indexTable,cc);
+
+ if (temp_ccs != EMPTY_STACK)
+ return temp_ccs;
+ else {
+ temp_ccs = CheckLoop(ccs,cc);
+ if (temp_ccs != NULL) {
+ /* we have recursed to an older CCS. Mark this in
+ * the index table, and emit a "back edge" into the
+ * log file.
+ */
+ ccs->indexTable = AddToIndexTable(ccs->indexTable,temp_ccs,cc,1);
+ DecBackEdge(temp_ccs,ccs);
+ return temp_ccs;
+ } else {
+ return ActualPush(ccs,cc);
+ }
+ }
+ }
+ }
+}
+
+/* Walk the prevStack chain of ccs looking for an entry whose cost
+ * centre is cc; returns that (older) CCS if found, else NULL. Used by
+ * PushCostCentre to detect recursion (back edges). */
+static CostCentreStack *
+CheckLoop ( CostCentreStack *ccs, CostCentre *cc )
+{
+ while (ccs != EMPTY_STACK) {
+ if (ccs->cc == cc)
+ return ccs;
+ ccs = ccs->prevStack;
+ }
+ return NULL;
+}
+
+/* Append ccs1 to ccs2 (ignoring any CAF cost centre at the root of ccs1) */
+
+#ifdef DEBUG
+/* Debug wrapper for AppendCCS; same #define-renaming trick as the
+ * PushCostCentre wrapper above: the real definition below becomes
+ * _AppendCCS, which the call in this body reaches. */
+CostCentreStack *_AppendCCS ( CostCentreStack *ccs1, CostCentreStack *ccs2 );
+CostCentreStack *
+AppendCCS ( CostCentreStack *ccs1, CostCentreStack *ccs2 )
+#define AppendCCS _AppendCCS
+{
+ IF_DEBUG(prof,
+ if (ccs1 != ccs2) {
+ debugBelch("Appending ");
+ debugCCS(ccs1);
+ debugBelch(" to ");
+ debugCCS(ccs2);
+ debugBelch("\n");});
+ return AppendCCS(ccs1,ccs2);
+}
+#endif
+
+/* Push the cost centres of ccs2 (outermost first, via recursion on
+ * prevStack) onto ccs1. Identical stacks are returned unchanged, and
+ * the walk stops at a CAF cost centre in ccs2, returning ccs1. */
+CostCentreStack *
+AppendCCS ( CostCentreStack *ccs1, CostCentreStack *ccs2 )
+{
+ CostCentreStack *ccs = NULL;
+
+ if (ccs1 == ccs2) {
+ return ccs1;
+ }
+
+ if (ccs2->cc->is_caf == CC_IS_CAF) {
+ return ccs1;
+ }
+
+ /* append the rest of ccs2 first, then push its own cost centre */
+ if (ccs2->prevStack != NULL) {
+ ccs = AppendCCS(ccs1, ccs2->prevStack);
+ }
+
+ return PushCostCentre(ccs,ccs2->cc);
+}
+
+/* Allocate a fresh CostCentreStack node from the profiling arena and
+ * initialise it via ActualPush_. */
+static CostCentreStack *
+ActualPush ( CostCentreStack *ccs, CostCentre *cc )
+{
+ CostCentreStack *new_ccs;
+
+ /* allocate space for a new CostCentreStack */
+ new_ccs = (CostCentreStack *) arenaAlloc(prof_arena, sizeof(CostCentreStack));
+
+ return ActualPush_(ccs, cc, new_ccs);
+}
+
+/* Fill in new_ccs as a child of ccs carrying cost centre cc, zero its
+ * counters, link it into the parent's memoization table, and emit its
+ * declaration into the log.
+ * NOTE(review): ccs->root is read before the EMPTY_STACK check below —
+ * assumes EMPTY_STACK is a dereferenceable sentinel rather than NULL;
+ * confirm against the EMPTY_STACK definition. */
+static CostCentreStack *
+ActualPush_ ( CostCentreStack *ccs, CostCentre *cc, CostCentreStack *new_ccs )
+{
+ /* assign values to each member of the structure */
+ new_ccs->ccsID = CCS_ID++;
+ new_ccs->cc = cc;
+ new_ccs->prevStack = ccs;
+
+ new_ccs->indexTable = EMPTY_TABLE;
+
+ /* Initialise the various _scc_ counters to zero
+ */
+ new_ccs->scc_count = 0;
+
+ /* Initialize all other stats here. There should be a quick way
+ * that's easily used elsewhere too
+ */
+ new_ccs->time_ticks = 0;
+ new_ccs->mem_alloc = 0;
+ new_ccs->inherited_ticks = 0;
+ new_ccs->inherited_alloc = 0;
+
+ new_ccs->root = ccs->root;
+
+ // Set the selected field.
+ ccsSetSelected(new_ccs);
+
+ /* update the memoization table for the parent stack */
+ if (ccs != EMPTY_STACK)
+ ccs->indexTable = AddToIndexTable(ccs->indexTable, new_ccs, cc,
+ 0/*not a back edge*/);
+
+ /* make sure this CC is declared at the next heap/time sample */
+ DecCCS(new_ccs);
+
+ /* return a pointer to the new stack */
+ return new_ccs;
+}
+
+
+/* Linear search of a memoization table for cost centre cc; returns the
+ * memoized child CCS, or EMPTY_TABLE if cc has not been pushed from
+ * this stack before. */
+static CostCentreStack *
+IsInIndexTable(IndexTable *it, CostCentre *cc)
+{
+ while (it!=EMPTY_TABLE)
+ {
+ if (it->cc==cc)
+ return it->ccs;
+ else
+ it = it->next;
+ }
+
+ /* otherwise we never found it so return EMPTY_TABLE */
+ return EMPTY_TABLE;
+}
+
+/* Prepend a new (cc -> new_ccs) entry, arena-allocated, onto table it;
+ * back_edge marks entries created by recursion detection. Returns the
+ * new head of the list. */
+static IndexTable *
+AddToIndexTable(IndexTable *it, CostCentreStack *new_ccs,
+ CostCentre *cc, unsigned int back_edge)
+{
+ IndexTable *new_it;
+
+ new_it = arenaAlloc(prof_arena, sizeof(IndexTable));
+
+ new_it->cc = cc;
+ new_it->ccs = new_ccs;
+ new_it->next = it;
+ new_it->back_edge = back_edge;
+ return new_it;
+}
+
+
+/* Declare a cost-centre stack in the XML log: "1" records a root CCS,
+ * "2" records a CCS plus its parent's ID. Only emitted when the XML
+ * output mode is selected and the log file opened successfully. */
+static void
+DecCCS(CostCentreStack *ccs)
+{
+ if (prof_file && RtsFlags.CcFlags.doCostCentres == COST_CENTRES_XML) {
+ if (ccs->prevStack == EMPTY_STACK)
+ fprintf(prof_file, "%d %d 1 %d\n", CCS_UQ,
+ ccs->ccsID, ccs->cc->ccID);
+ else
+ fprintf(prof_file, "%d %d 2 %d %d\n", CCS_UQ,
+ ccs->ccsID, ccs->cc->ccID, ccs->prevStack->ccsID);
+ }
+}
+
+/* Like DecCCS, but the parent recorded is oldccs — the stack from which
+ * the back edge was taken — rather than ccs's own prevStack. */
+static void
+DecBackEdge( CostCentreStack *ccs, CostCentreStack *oldccs )
+{
+ if (prof_file && RtsFlags.CcFlags.doCostCentres == COST_CENTRES_XML) {
+ if (ccs->prevStack == EMPTY_STACK)
+ fprintf(prof_file, "%d %d 1 %d\n", CCS_UQ,
+ ccs->ccsID, ccs->cc->ccID);
+ else
+ fprintf(prof_file, "%d %d 2 %d %d\n", CCS_UQ,
+ ccs->ccsID, ccs->cc->ccID, oldccs->ccsID);
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Generating a time & allocation profiling report.
+ -------------------------------------------------------------------------- */
+
+/* We omit certain system-related CCs and CCSs from the default
+ * reports, so as not to cause confusion.
+ */
+/* True for the built-in system cost centres that are suppressed from
+ * default reports (see the note above). */
+static rtsBool
+cc_to_ignore (CostCentre *cc)
+{
+ if ( cc == CC_OVERHEAD
+ || cc == CC_DONT_CARE
+ || cc == CC_GC
+ || cc == CC_SYSTEM) {
+ return rtsTrue;
+ } else {
+ return rtsFalse;
+ }
+}
+
+/* CCS counterpart of cc_to_ignore: the corresponding built-in stacks. */
+static rtsBool
+ccs_to_ignore (CostCentreStack *ccs)
+{
+ if ( ccs == CCS_OVERHEAD
+ || ccs == CCS_DONT_CARE
+ || ccs == CCS_GC
+ || ccs == CCS_SYSTEM) {
+ return rtsTrue;
+ } else {
+ return rtsFalse;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Generating the aggregated per-cost-centre time/alloc report.
+ -------------------------------------------------------------------------- */
+
+/* Head of the per-cost-centre report list, ordered by descending
+ * time_ticks; threaded through CostCentre.link. */
+static CostCentre *sorted_cc_list;
+
+/* Fold each CCS's ticks/allocations into its cost centre, walking the
+ * tree via the memoization tables and skipping back edges (which would
+ * otherwise double-count). */
+static void
+aggregate_cc_costs( CostCentreStack *ccs )
+{
+ IndexTable *i;
+
+ ccs->cc->mem_alloc += ccs->mem_alloc;
+ ccs->cc->time_ticks += ccs->time_ticks;
+
+ for (i = ccs->indexTable; i != 0; i = i->next) {
+ if (!i->back_edge) {
+ aggregate_cc_costs(i->ccs);
+ }
+ }
+}
+
+/* Insertion into sorted_cc_list, keeping descending time_ticks order.
+ * Mutates new_cc->link, so callers must not rely on the CC_LIST
+ * threading of new_cc afterwards. */
+static void
+insert_cc_in_sorted_list( CostCentre *new_cc )
+{
+ CostCentre **prev, *cc;
+
+ prev = &sorted_cc_list;
+ for (cc = sorted_cc_list; cc != NULL; cc = cc->link) {
+ if (new_cc->time_ticks > cc->time_ticks) {
+ new_cc->link = cc;
+ *prev = new_cc;
+ return;
+ } else {
+ prev = &(cc->link);
+ }
+ }
+ new_cc->link = NULL;
+ *prev = new_cc;
+}
+
+/* Emit the aggregated per-cost-centre table: aggregate costs over the
+ * CCS tree, select centres above the 1% time or allocation threshold
+ * (or all of them under -P and higher), sort by time, and print. */
+static void
+report_per_cc_costs( void )
+{
+ CostCentre *cc, *next;
+
+ aggregate_cc_costs(CCS_MAIN);
+ sorted_cc_list = NULL;
+
+ for (cc = CC_LIST; cc != NULL; cc = next) {
+ /* save the successor first: insert_cc_in_sorted_list rewrites
+ * cc->link, which is also the CC_LIST threading */
+ next = cc->link;
+ if (cc->time_ticks > total_prof_ticks/100
+ || cc->mem_alloc > total_alloc/100
+ || RtsFlags.CcFlags.doCostCentres >= COST_CENTRES_ALL) {
+ insert_cc_in_sorted_list(cc);
+ }
+ }
+
+ fprintf(prof_file, "%-30s %-20s", "COST CENTRE", "MODULE");
+ fprintf(prof_file, "%6s %6s", "%time", "%alloc");
+ if (RtsFlags.CcFlags.doCostCentres >= COST_CENTRES_VERBOSE) {
+ fprintf(prof_file, " %5s %9s", "ticks", "bytes");
+ }
+ fprintf(prof_file, "\n\n");
+
+ for (cc = sorted_cc_list; cc != NULL; cc = cc->link) {
+ if (cc_to_ignore(cc)) {
+ continue;
+ }
+ fprintf(prof_file, "%-30s %-20s", cc->label, cc->module);
+ /* guard against division by zero when no ticks/allocs recorded */
+ fprintf(prof_file, "%6.1f %6.1f",
+ total_prof_ticks == 0 ? 0.0 : (cc->time_ticks / (StgFloat) total_prof_ticks * 100),
+ total_alloc == 0 ? 0.0 : (cc->mem_alloc / (StgFloat)
+ total_alloc * 100)
+ );
+
+ if (RtsFlags.CcFlags.doCostCentres >= COST_CENTRES_VERBOSE) {
+ fprintf(prof_file, " %5llu %9llu", (StgWord64)(cc->time_ticks), cc->mem_alloc);
+ }
+ fprintf(prof_file, "\n");
+ }
+
+ fprintf(prof_file,"\n\n");
+}
+
+/* -----------------------------------------------------------------------------
+ Generate the cost-centre-stack time/alloc report
+ -------------------------------------------------------------------------- */
+
+/* Print the column headings for the cost-centre-stack report that
+ * reportCCS produces below. */
+static void
+fprint_header( void )
+{
+ fprintf(prof_file, "%-24s %-10s individual inherited\n", "", "");
+
+ fprintf(prof_file, "%-24s %-50s", "COST CENTRE", "MODULE");
+ fprintf(prof_file, "%6s %10s %5s %5s %5s %5s", "no.", "entries", "%time", "%alloc", "%time", "%alloc");
+
+ if (RtsFlags.CcFlags.doCostCentres >= COST_CENTRES_VERBOSE) {
+ fprintf(prof_file, " %5s %9s", "ticks", "bytes");
+#if defined(PROFILING_DETAIL_COUNTS)
+ fprintf(prof_file, " %8s %8s %8s %8s %8s %8s %8s",
+ "closures", "thunks", "funcs", "PAPs", "subfuns", "subcafs", "cafssub");
+#endif
+ }
+
+ fprintf(prof_file, "\n\n");
+}
+
+/* Produce the final time/allocation profiling report: stop the timer,
+ * total up ticks and allocation, then either emit the XML log or the
+ * human-readable .prof report (banner, command line, totals,
+ * per-cost-centre table, and the CCS tree). */
+void
+reportCCSProfiling( void )
+{
+ nat count;
+ char temp[128]; /* sigh: magic constant */
+
+ stopProfTimer();
+
+ total_prof_ticks = 0;
+ total_alloc = 0;
+ count_ticks(CCS_MAIN);
+
+ switch (RtsFlags.CcFlags.doCostCentres) {
+ case 0:
+ return;
+ case COST_CENTRES_XML:
+ gen_XML_logfile();
+ return;
+ default:
+ break;
+ }
+
+ fprintf(prof_file, "\t%s Time and Allocation Profiling Report (%s)\n",
+ time_str(), "Final");
+
+ fprintf(prof_file, "\n\t ");
+ fprintf(prof_file, " %s", prog_name);
+ fprintf(prof_file, " +RTS");
+ for (count = 0; rts_argv[count]; count++)
+ fprintf(prof_file, " %s", rts_argv[count]);
+ fprintf(prof_file, " -RTS");
+ /* start at 1: prog_argv[0] is the program name, printed above */
+ for (count = 1; prog_argv[count]; count++)
+ fprintf(prof_file, " %s", prog_argv[count]);
+ fprintf(prof_file, "\n\n");
+
+ /* NOTE(review): "%lu" assumes total_prof_ticks is unsigned long —
+ * confirm against its declaration (not visible here) */
+ fprintf(prof_file, "\ttotal time = %11.2f secs (%lu ticks @ %d ms)\n",
+ total_prof_ticks / (StgFloat) TICK_FREQUENCY,
+ total_prof_ticks, TICK_MILLISECS);
+
+ fprintf(prof_file, "\ttotal alloc = %11s bytes",
+ ullong_format_string(total_alloc * sizeof(W_),
+ temp, rtsTrue/*commas*/));
+
+#if defined(PROFILING_DETAIL_COUNTS)
+ fprintf(prof_file, " (%lu closures)", total_allocs);
+#endif
+ fprintf(prof_file, " (excludes profiling overheads)\n\n");
+
+ report_per_cc_costs();
+
+ inherit_costs(CCS_MAIN);
+
+ fprint_header();
+ reportCCS(pruneCCSTree(CCS_MAIN), 0);
+}
+
+/* Recursively print one line per cost-centre stack, indenting by tree
+ * depth; children are found through the memoization tables, skipping
+ * back edges. System CCSs are suppressed unless -P -P is given.
+ * NOTE(review): the "24-indent" field width goes negative for stacks
+ * deeper than 24 — confirm fprintf's behaviour is acceptable here. */
+static void
+reportCCS(CostCentreStack *ccs, nat indent)
+{
+ CostCentre *cc;
+ IndexTable *i;
+
+ cc = ccs->cc;
+
+ /* Only print cost centres with non 0 data ! */
+
+ if ( RtsFlags.CcFlags.doCostCentres >= COST_CENTRES_ALL ||
+ ! ccs_to_ignore(ccs))
+ /* force printing of *all* cost centres if -P -P */
+ {
+
+ fprintf(prof_file, "%-*s%-*s %-50s",
+ indent, "", 24-indent, cc->label, cc->module);
+
+ /* percentages guard against zero totals */
+ fprintf(prof_file, "%6d %11.0f %5.1f %5.1f %5.1f %5.1f",
+ ccs->ccsID, (double) ccs->scc_count,
+ total_prof_ticks == 0 ? 0.0 : ((double)ccs->time_ticks / (double)total_prof_ticks * 100.0),
+ total_alloc == 0 ? 0.0 : ((double)ccs->mem_alloc / (double)total_alloc * 100.0),
+ total_prof_ticks == 0 ? 0.0 : ((double)ccs->inherited_ticks / (double)total_prof_ticks * 100.0),
+ total_alloc == 0 ? 0.0 : ((double)ccs->inherited_alloc / (double)total_alloc * 100.0)
+ );
+
+ if (RtsFlags.CcFlags.doCostCentres >= COST_CENTRES_VERBOSE) {
+ fprintf(prof_file, " %5llu %9llu", (StgWord64)(ccs->time_ticks), ccs->mem_alloc*sizeof(W_));
+#if defined(PROFILING_DETAIL_COUNTS)
+ fprintf(prof_file, " %8ld %8ld %8ld %8ld %8ld %8ld %8ld",
+ ccs->mem_allocs, ccs->thunk_count,
+ ccs->function_count, ccs->pap_count,
+ ccs->subsumed_fun_count, ccs->subsumed_caf_count,
+ ccs->caffun_subsumed);
+#endif
+ }
+ fprintf(prof_file, "\n");
+ }
+
+ for (i = ccs->indexTable; i != 0; i = i->next) {
+ if (!i->back_edge) {
+ reportCCS(i->ccs, indent+1);
+ }
+ }
+}
+
+
+/* Traverse the cost centre stack tree and accumulate
+ * ticks/allocations.
+ */
+/* Sum mem_alloc and time_ticks over the whole CCS tree into
+ * total_alloc/total_prof_ticks, skipping system CCSs and back edges. */
+static void
+count_ticks(CostCentreStack *ccs)
+{
+ IndexTable *i;
+
+ if (!ccs_to_ignore(ccs)) {
+ total_alloc += ccs->mem_alloc;
+ total_prof_ticks += ccs->time_ticks;
+ }
+ for (i = ccs->indexTable; i != NULL; i = i->next)
+ if (!i->back_edge) {
+ count_ticks(i->ccs);
+ }
+}
+
+/* Traverse the cost centre stack tree and inherit ticks & allocs.
+ */
+/* Post-order accumulation: each CCS's inherited figures are its own
+ * costs plus the inherited figures of all non-back-edge children. */
+static void
+inherit_costs(CostCentreStack *ccs)
+{
+ IndexTable *i;
+
+ if (ccs_to_ignore(ccs)) { return; }
+
+ ccs->inherited_ticks += ccs->time_ticks;
+ ccs->inherited_alloc += ccs->mem_alloc;
+
+ for (i = ccs->indexTable; i != NULL; i = i->next)
+ if (!i->back_edge) {
+ inherit_costs(i->ccs);
+ ccs->inherited_ticks += i->ccs->inherited_ticks;
+ ccs->inherited_alloc += i->ccs->inherited_alloc;
+ }
+
+ return;
+}
+
+/* Drop subtrees with no recorded activity from the report: a CCS
+ * survives if -P -P was given, or it still has children after pruning,
+ * or any of its own counters is non-zero. Returns ccs or NULL.
+ * NOTE(review): the "continue" on back edges skips the prev-pointer
+ * update, so unlinking a later pruned entry (*prev = i->next) would
+ * also drop intervening back-edge entries — confirm whether back-edge
+ * entries are ever consulted after pruning. */
+static CostCentreStack *
+pruneCCSTree( CostCentreStack *ccs )
+{
+ CostCentreStack *ccs1;
+ IndexTable *i, **prev;
+
+ prev = &ccs->indexTable;
+ for (i = ccs->indexTable; i != 0; i = i->next) {
+ if (i->back_edge) { continue; }
+
+ ccs1 = pruneCCSTree(i->ccs);
+ if (ccs1 == NULL) {
+ *prev = i->next;
+ } else {
+ prev = &(i->next);
+ }
+ }
+
+ if ( (RtsFlags.CcFlags.doCostCentres >= COST_CENTRES_ALL
+ /* force printing of *all* cost centres if -P -P */ )
+
+ || ( ccs->indexTable != 0 )
+ || ( ccs->scc_count || ccs->time_ticks || ccs->mem_alloc )
+ ) {
+ return ccs;
+ } else {
+ return NULL;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Generate the XML time/allocation profile
+ -------------------------------------------------------------------------- */
+
+/* Append the final time-sample record to the XML log — total ticks,
+ * one entry per surviving CCS, a terminating "0" — then close the log.
+ * NOTE(review): "%lu" assumes total_prof_ticks is unsigned long;
+ * confirm against its declaration. */
+void
+gen_XML_logfile( void )
+{
+ fprintf(prof_file, "%d %lu", TIME_UPD_UQ, total_prof_ticks);
+
+ reportCCS_XML(pruneCCSTree(CCS_MAIN));
+
+ fprintf(prof_file, " 0\n");
+
+ fclose(prof_file);
+}
+
+/* Emit one XML sample entry (id, entry count, ticks, allocation) per
+ * CCS, recursing over non-back-edge children and skipping system CCSs. */
+static void
+reportCCS_XML(CostCentreStack *ccs)
+{
+ CostCentre *cc;
+ IndexTable *i;
+
+ if (ccs_to_ignore(ccs)) { return; }
+
+ cc = ccs->cc;
+
+ fprintf(prof_file, " 1 %d %llu %llu %llu",
+ ccs->ccsID, ccs->scc_count, (StgWord64)(ccs->time_ticks), ccs->mem_alloc);
+
+ for (i = ccs->indexTable; i != 0; i = i->next) {
+ if (!i->back_edge) {
+ reportCCS_XML(i->ccs);
+ }
+ }
+}
+
+/* Print a CCS to f as "<Mod.label,Mod.label,...>", innermost first,
+ * stopping at (and omitting) CCS_MAIN. */
+void
+fprintCCS( FILE *f, CostCentreStack *ccs )
+{
+ fprintf(f,"<");
+ for (; ccs && ccs != CCS_MAIN; ccs = ccs->prevStack ) {
+ fprintf(f,"%s.%s", ccs->cc->module, ccs->cc->label);
+ if (ccs->prevStack && ccs->prevStack != CCS_MAIN) {
+ fprintf(f,",");
+ }
+ }
+ fprintf(f,">");
+}
+
+/* For calling from .cmm code, where we can't reliably refer to stderr */
+void
+fprintCCS_stderr( CostCentreStack *ccs )
+{
+ fprintCCS(stderr, ccs);
+}
+
+#ifdef DEBUG
+/* Debug-channel twin of fprintCCS, using debugBelch instead of a FILE. */
+void
+debugCCS( CostCentreStack *ccs )
+{
+ debugBelch("<");
+ for (; ccs && ccs != CCS_MAIN; ccs = ccs->prevStack ) {
+ debugBelch("%s.%s", ccs->cc->module, ccs->cc->label);
+ if (ccs->prevStack && ccs->prevStack != CCS_MAIN) {
+ debugBelch(",");
+ }
+ }
+ debugBelch(">");
+}
+#endif /* DEBUG */
+
+#endif /* PROFILING */
diff --git a/rts/Profiling.h b/rts/Profiling.h
new file mode 100644
index 0000000000..d968349a52
--- /dev/null
+++ b/rts/Profiling.h
@@ -0,0 +1,39 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Support for profiling
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef PROFILING_H
+#define PROFILING_H
+
+#include <stdio.h>
+
+/* Initialisation/teardown and the log-file handles are available in
+ * DEBUG builds as well as PROFILING builds. */
+#if defined(PROFILING) || defined(DEBUG)
+void initProfiling1 ( void );
+void initProfiling2 ( void );
+void endProfiling ( void );
+
+/* output streams for the .prof and .hp logs respectively */
+extern FILE *prof_file;
+extern FILE *hp_file;
+#endif
+
+#ifdef PROFILING
+
+void gen_XML_logfile ( void );
+void reportCCSProfiling ( void );
+
+void PrintNewStackDecls ( void );
+
+extern void fprintCCS( FILE *f, CostCentreStack *ccs );
+extern void fprintCCS_stderr( CostCentreStack *ccs );
+
+#ifdef DEBUG
+extern void debugCCS( CostCentreStack *ccs );
+#endif
+
+#endif
+
+#endif /* PROFILING_H */
diff --git a/rts/Proftimer.c b/rts/Proftimer.c
new file mode 100644
index 0000000000..3b499152d6
--- /dev/null
+++ b/rts/Proftimer.c
@@ -0,0 +1,85 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-1999
+ *
+ * Profiling interval timer
+ *
+ * ---------------------------------------------------------------------------*/
+
+#if defined (PROFILING)
+
+#include "PosixSource.h"
+
+#include "Rts.h"
+#include "Profiling.h"
+#include "Timer.h"
+#include "Proftimer.h"
+#include "RtsFlags.h"
+
+static rtsBool do_prof_ticks = rtsFalse; // enable profiling ticks
+static rtsBool do_heap_prof_ticks = rtsFalse; // enable heap profiling ticks
+
+// Number of ticks until next heap census
+static int ticks_to_heap_profile;
+
+// Time for a heap profile on the next context switch
+rtsBool performHeapProfile;
+
+/* Disable cost-centre tick accounting. */
+void
+stopProfTimer( void )
+{
+ do_prof_ticks = rtsFalse;
+}
+
+/* Enable cost-centre tick accounting. */
+void
+startProfTimer( void )
+{
+ do_prof_ticks = rtsTrue;
+}
+
+/* Disable heap-census countdown ticks. */
+void
+stopHeapProfTimer( void )
+{
+ do_heap_prof_ticks = rtsFalse;
+}
+
+/* Enable heap-census countdown ticks, but only when heap profiling is
+ * requested and the sampling interval is at least one tick. */
+void
+startHeapProfTimer( void )
+{
+ if (RtsFlags.ProfFlags.doHeapProfile &&
+ RtsFlags.ProfFlags.profileIntervalTicks > 0) {
+ do_heap_prof_ticks = rtsTrue;
+ }
+}
+
+/* Convert the requested heap-sample interval (ms) into ticks and arm
+ * the countdown.
+ * NOTE(review): integer division — an interval smaller than
+ * TICK_MILLISECS rounds to 0 ticks, so startHeapProfTimer will leave
+ * heap sampling disabled. */
+void
+initProfTimer( void )
+{
+ performHeapProfile = rtsFalse;
+
+ RtsFlags.ProfFlags.profileIntervalTicks =
+ RtsFlags.ProfFlags.profileInterval / TICK_MILLISECS;
+
+ ticks_to_heap_profile = RtsFlags.ProfFlags.profileIntervalTicks;
+
+ startHeapProfTimer();
+}
+
+
+/* Timer interrupt body: charge one tick to the current cost-centre
+ * stack, and count down to the next heap census, requesting one (via
+ * performHeapProfile) when the countdown expires. */
+void
+handleProfTick(void)
+{
+ if (do_prof_ticks) {
+ CCCS->time_ticks++;
+ }
+
+ if (do_heap_prof_ticks) {
+ ticks_to_heap_profile--;
+ if (ticks_to_heap_profile <= 0) {
+ ticks_to_heap_profile = RtsFlags.ProfFlags.profileIntervalTicks;
+ performHeapProfile = rtsTrue;
+ }
+ }
+}
+
+#endif /* PROFILING */
diff --git a/rts/Proftimer.h b/rts/Proftimer.h
new file mode 100644
index 0000000000..c837b855f9
--- /dev/null
+++ b/rts/Proftimer.h
@@ -0,0 +1,22 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Profiling interval timer
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef PROFTIMER_H
+#define PROFTIMER_H
+
+extern void initProfTimer ( void );
+extern void handleProfTick ( void );
+
+/* toggles for cost-centre and heap-census ticking; see Proftimer.c */
+extern void stopProfTimer ( void );
+extern void startProfTimer ( void );
+extern void stopHeapProfTimer ( void );
+extern void startHeapProfTimer ( void );
+
+/* set by handleProfTick when a heap census is due */
+extern rtsBool performHeapProfile;
+
+#endif /* PROFTIMER_H */
diff --git a/rts/RetainerProfile.c b/rts/RetainerProfile.c
new file mode 100644
index 0000000000..c5c3de5314
--- /dev/null
+++ b/rts/RetainerProfile.c
@@ -0,0 +1,2338 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001
+ * Author: Sungwoo Park
+ *
+ * Retainer profiling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifdef PROFILING
+
+// Turn off inlining when debugging - it obfuscates things
+#ifdef DEBUG
+#define INLINE
+#else
+#define INLINE inline
+#endif
+
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "RetainerProfile.h"
+#include "RetainerSet.h"
+#include "Schedule.h"
+#include "Printer.h"
+#include "Storage.h"
+#include "RtsFlags.h"
+#include "Weak.h"
+#include "Sanity.h"
+#include "Profiling.h"
+#include "Stats.h"
+#include "BlockAlloc.h"
+#include "ProfHeap.h"
+#include "Apply.h"
+
+/*
+ Note: what to change in order to plug-in a new retainer profiling scheme?
+ (1) type retainer in ../includes/StgRetainerProf.h
+ (2) retainer function R(), i.e., getRetainerFrom()
+ (3) the two hashing functions, hashKeySingleton() and hashKeyAddElement(),
+ in RetainerSet.h, if needed.
+ (4) printRetainer() and printRetainerSetShort() in RetainerSet.c.
+ */
+
+/* -----------------------------------------------------------------------------
+ * Declarations...
+ * -------------------------------------------------------------------------- */
+
+static nat retainerGeneration; // generation
+
+static nat numObjectVisited; // total number of objects visited
+static nat timesAnyObjectVisited; // number of times any objects are visited
+
+/*
+ The rs field in the profile header of any object points to its retainer
+ set in an indirect way: if flip is 0, it points to the retainer set;
+ if flip is 1, it points to the next byte after the retainer set (even
+ for NULL pointers). Therefore, with flip 1, (rs ^ 1) is the actual
+ pointer. See retainerSetOf().
+ */
+
+StgWord flip = 0; // flip bit
+ // must be 0 if DEBUG_RETAINER is on (for static closures)
+
+#define setRetainerSetToNull(c) \
+ (c)->header.prof.hp.rs = (RetainerSet *)((StgWord)NULL | flip)
+
+static void retainStack(StgClosure *, retainer, StgPtr, StgPtr);
+static void retainClosure(StgClosure *, StgClosure *, retainer);
+#ifdef DEBUG_RETAINER
+static void belongToHeap(StgPtr p);
+#endif
+
+#ifdef DEBUG_RETAINER
+/*
+ cStackSize records how many times retainStack() has been invoked recursively,
+ that is, the number of activation records for retainStack() on the C stack.
+ maxCStackSize records its max value.
+ Invariants:
+ cStackSize <= maxCStackSize
+ */
+static nat cStackSize, maxCStackSize;
+
+static nat sumOfNewCost; // sum of the cost of each object, computed
+ // when the object is first visited
+static nat sumOfNewCostExtra; // for those objects not visited during
+ // retainer profiling, e.g., MUT_VAR
+static nat costArray[N_CLOSURE_TYPES];
+
+nat sumOfCostLinear; // sum of the costs of all object, computed
+ // when linearly traversing the heap after
+ // retainer profiling
+nat costArrayLinear[N_CLOSURE_TYPES];
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Retainer stack - header
+ * Note:
+ * Although the retainer stack implementation could be separated
+ * from the retainer profiling engine, there does not seem to be
+ * any advantage in doing that; retainer stack is an integral part
+ * of the retainer profiling engine and cannot be used elsewhere at
+ * all.
+ * -------------------------------------------------------------------------- */
+
+/* Which kind of child-cursor a stack entry carries. */
+typedef enum {
+ posTypeStep,
+ posTypePtrs,
+ posTypeSRT,
+ posTypeLargeSRT,
+} nextPosType;
+
+/* Cursor over the remaining children of a closure: either a plain step
+ * counter, a position within a pointer payload, or a position within a
+ * small or large SRT bitmap. */
+typedef union {
+ // fixed layout or layout specified by a field in the closure
+ StgWord step;
+
+ // layout.payload
+ struct {
+ // See StgClosureInfo in InfoTables.h
+#if SIZEOF_VOID_P == 8
+ StgWord32 pos;
+ StgWord32 ptrs;
+#else
+ StgWord16 pos;
+ StgWord16 ptrs;
+#endif
+ StgPtr payload;
+ } ptrs;
+
+ // SRT
+ struct {
+ StgClosure **srt;
+ StgWord srt_bitmap;
+ } srt;
+
+ // Large SRT
+ struct {
+ StgLargeSRT *srt;
+ StgWord offset;
+ } large_srt;
+
+} nextPos;
+
+/* Tagged cursor: type selects which member of next is valid. */
+typedef struct {
+ nextPosType type;
+ nextPos next;
+} stackPos;
+
+/* One traverse-stack entry: the closure being visited, the retainer to
+ * charge its children to, and the cursor over its remaining children. */
+typedef struct {
+ StgClosure *c;
+ retainer c_child_r;
+ stackPos info;
+} stackElement;
+
+/*
+ Invariants:
+ firstStack points to the first block group.
+ currentStack points to the block group currently being used.
+ currentStack->free == stackLimit.
+ stackTop points to the topmost byte in the stack of currentStack.
+ Unless the whole stack is empty, stackTop must point to the topmost
+ object (or byte) in the whole stack. Thus, it is only when the whole stack
+ is empty that stackTop == stackLimit (not during the execution of push()
+ and pop()).
+ stackBottom == currentStack->start.
+ stackLimit == currentStack->start + BLOCK_SIZE_W * currentStack->blocks.
+ Note:
+ When a current stack becomes empty, stackTop is set to point to
+ the topmost element on the previous block group so as to satisfy
+ the invariants described above.
+ */
+static bdescr *firstStack = NULL;
+static bdescr *currentStack;
+static stackElement *stackBottom, *stackTop, *stackLimit;
+
+/*
+ currentStackBoundary is used to mark the current stack chunk.
+ If stackTop == currentStackBoundary, it means that the current stack chunk
+ is empty. It is the responsibility of the user to keep currentStackBoundary
+ valid all the time if it is to be employed.
+ */
+static stackElement *currentStackBoundary;
+
+/*
+ stackSize records the current size of the stack.
+ maxStackSize records its high water mark.
+ Invariants:
+ stackSize <= maxStackSize
+ Note:
+ stackSize is just an estimate measure of the depth of the graph. The reason
+ is that some heap objects have only a single child and may not result
+ in a new element being pushed onto the stack. Therefore, at the end of
+ retainer profiling, maxStackSize + maxCStackSize is some value no greater
+ than the actual depth of the graph.
+ */
+#ifdef DEBUG_RETAINER
+static int stackSize, maxStackSize;
+#endif
+
+// number of blocks allocated for one stack
+#define BLOCKS_IN_STACK 1
+
+/* -----------------------------------------------------------------------------
+ * Add a new block group to the stack.
+ * Invariants:
+ * currentStack->link == s.
+ * -------------------------------------------------------------------------- */
+/* Make bd the current stack block: the stack grows downward, so an
+ * empty block has stackTop == stackLimit at the block's top. */
+static INLINE void
+newStackBlock( bdescr *bd )
+{
+ currentStack = bd;
+ stackTop = (stackElement *)(bd->start + BLOCK_SIZE_W * bd->blocks);
+ stackBottom = (stackElement *)bd->start;
+ stackLimit = (stackElement *)stackTop;
+ bd->free = (StgPtr)stackLimit;
+}
+
+/* -----------------------------------------------------------------------------
+ * Return to the previous block group.
+ * Invariants:
+ * s->link == currentStack.
+ * -------------------------------------------------------------------------- */
+/* Reinstate bd as the current stack block; bd->free records where its
+ * topmost element was when we left it. */
+static INLINE void
+returnToOldStack( bdescr *bd )
+{
+ currentStack = bd;
+ stackTop = (stackElement *)bd->free;
+ stackBottom = (stackElement *)bd->start;
+ stackLimit = (stackElement *)(bd->start + BLOCK_SIZE_W * bd->blocks);
+ bd->free = (StgPtr)stackLimit;
+}
+
+/* -----------------------------------------------------------------------------
+ * Initializes the traverse stack.
+ * -------------------------------------------------------------------------- */
+/* (Re)create the traverse stack with a single block group, freeing any
+ * chain left over from a previous profiling pass. */
+static void
+initializeTraverseStack( void )
+{
+ if (firstStack != NULL) {
+ freeChain(firstStack);
+ }
+
+ firstStack = allocGroup(BLOCKS_IN_STACK);
+ firstStack->link = NULL;
+ firstStack->u.back = NULL;
+
+ newStackBlock(firstStack);
+}
+
+/* -----------------------------------------------------------------------------
+ * Frees all the block groups in the traverse stack.
+ * Invariants:
+ * firstStack != NULL
+ * -------------------------------------------------------------------------- */
+/* Release every block group of the traverse stack. */
+static void
+closeTraverseStack( void )
+{
+ freeChain(firstStack);
+ firstStack = NULL;
+}
+
+/* -----------------------------------------------------------------------------
+ * Returns rtsTrue if the whole stack is empty.
+ * -------------------------------------------------------------------------- */
+/* Empty means we are on the first block and its downward-growing top
+ * sits at the limit (see the invariants above). */
+static INLINE rtsBool
+isEmptyRetainerStack( void )
+{
+ return (firstStack == currentStack) && stackTop == stackLimit;
+}
+
+/* -----------------------------------------------------------------------------
+ * Returns size of stack
+ * -------------------------------------------------------------------------- */
+/* Total number of blocks across all stack block groups (DEBUG only). */
+#ifdef DEBUG
+lnat
+retainerStackBlocks( void )
+{
+ bdescr* bd;
+ lnat res = 0;
+
+ for (bd = firstStack; bd != NULL; bd = bd->link)
+ res += bd->blocks;
+
+ return res;
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Returns rtsTrue if stackTop is at the stack boundary of the current stack,
+ * i.e., if the current stack chunk is empty.
+ * -------------------------------------------------------------------------- */
+static INLINE rtsBool
+isOnBoundary( void )
+{
+ return stackTop == currentStackBoundary;
+}
+
+/* -----------------------------------------------------------------------------
+ * Initializes *info from ptrs and payload.
+ * Invariants:
+ * payload[] begins with ptrs pointers followed by non-pointers.
+ * -------------------------------------------------------------------------- */
+/* Set up a pointer-payload cursor starting at element 0 of ptrs
+ * pointers beginning at payload. */
+static INLINE void
+init_ptrs( stackPos *info, nat ptrs, StgPtr payload )
+{
+ info->type = posTypePtrs;
+ info->next.ptrs.pos = 0;
+ info->next.ptrs.ptrs = ptrs;
+ info->next.ptrs.payload = payload;
+}
+
+/* -----------------------------------------------------------------------------
+ * Find the next object from *info.
+ * -------------------------------------------------------------------------- */
+/* Advance the pointer-payload cursor; NULL when exhausted. */
+static INLINE StgClosure *
+find_ptrs( stackPos *info )
+{
+ if (info->next.ptrs.pos < info->next.ptrs.ptrs) {
+ return (StgClosure *)info->next.ptrs.payload[info->next.ptrs.pos++];
+ } else {
+ return NULL;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Initializes *info from SRT information stored in *infoTable.
+ * -------------------------------------------------------------------------- */
+/* A srt_bitmap of all-ones ((StgHalfWord)(-1)) marks a "large" SRT with
+ * an out-of-line bitmap; otherwise the bitmap is carried inline. */
+static INLINE void
+init_srt_fun( stackPos *info, StgFunInfoTable *infoTable )
+{
+ if (infoTable->i.srt_bitmap == (StgHalfWord)(-1)) {
+ info->type = posTypeLargeSRT;
+ info->next.large_srt.srt = (StgLargeSRT *)GET_FUN_SRT(infoTable);
+ info->next.large_srt.offset = 0;
+ } else {
+ info->type = posTypeSRT;
+ info->next.srt.srt = (StgClosure **)GET_FUN_SRT(infoTable);
+ info->next.srt.srt_bitmap = infoTable->i.srt_bitmap;
+ }
+}
+
+/* Same large/small SRT dispatch as init_srt_fun, but for thunk info
+ * tables (GET_SRT instead of GET_FUN_SRT). */
+static INLINE void
+init_srt_thunk( stackPos *info, StgThunkInfoTable *infoTable )
+{
+ if (infoTable->i.srt_bitmap == (StgHalfWord)(-1)) {
+ info->type = posTypeLargeSRT;
+ info->next.large_srt.srt = (StgLargeSRT *)GET_SRT(infoTable);
+ info->next.large_srt.offset = 0;
+ } else {
+ info->type = posTypeSRT;
+ info->next.srt.srt = (StgClosure **)GET_SRT(infoTable);
+ info->next.srt.srt_bitmap = infoTable->i.srt_bitmap;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Find the next object from *info.
+ * -------------------------------------------------------------------------- */
+/* Yield the next closure referenced from the SRT cursor in *info, or
+ * NULL when the SRT is exhausted. Small SRTs scan an inline bitmap;
+ * large SRTs rescan their out-of-line bitmap from the saved offset on
+ * each call.
+ * NOTE(review): the inner "bitmap" in the large-SRT branch shadows the
+ * outer declaration — harmless here, but worth cleaning up. */
+static INLINE StgClosure *
+find_srt( stackPos *info )
+{
+ StgClosure *c;
+ StgWord bitmap;
+
+ if (info->type == posTypeSRT) {
+ // Small SRT bitmap
+ bitmap = info->next.srt.srt_bitmap;
+ while (bitmap != 0) {
+ if ((bitmap & 1) != 0) {
+#ifdef ENABLE_WIN32_DLL_SUPPORT
+
+ /* low bit tags a DLL-import indirection — presumably the
+ * entry must be dereferenced once more; confirm against
+ * the Windows DLL linking scheme */
+ if ((unsigned long)(*(info->next.srt.srt)) & 0x1)
+ c = (* (StgClosure **)((unsigned long)*(info->next.srt.srt)) & ~0x1);
+ else
+ c = *(info->next.srt.srt);
+#else
+ c = *(info->next.srt.srt);
+#endif
+ /* save the advanced cursor before yielding c */
+ bitmap = bitmap >> 1;
+ info->next.srt.srt++;
+ info->next.srt.srt_bitmap = bitmap;
+ return c;
+ }
+ bitmap = bitmap >> 1;
+ info->next.srt.srt++;
+ }
+ // bitmap is now zero...
+ return NULL;
+ }
+ else {
+ // Large SRT bitmap
+ nat i = info->next.large_srt.offset;
+ StgWord bitmap;
+
+ // Follow the pattern from GC.c:scavenge_large_srt_bitmap().
+ bitmap = info->next.large_srt.srt->l.bitmap[i / BITS_IN(W_)];
+ bitmap = bitmap >> (i % BITS_IN(StgWord));
+ while (i < info->next.large_srt.srt->l.size) {
+ if ((bitmap & 1) != 0) {
+ c = ((StgClosure **)info->next.large_srt.srt->srt)[i];
+ i++;
+ info->next.large_srt.offset = i;
+ return c;
+ }
+ i++;
+ if (i % BITS_IN(W_) == 0) {
+ bitmap = info->next.large_srt.srt->l.bitmap[i / BITS_IN(W_)];
+ } else {
+ bitmap = bitmap >> 1;
+ }
+ }
+ // reached the end of this bitmap.
+ info->next.large_srt.offset = i;
+ return NULL;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * push() pushes a stackElement representing the next child of *c
+ * onto the traverse stack. If *c has no child, *first_child is set
+ * to NULL and nothing is pushed onto the stack. If *c has only one
+ * child, *c_child is set to that child and nothing is pushed onto
+ * the stack. If *c has more than two children, *first_child is set
+ * to the first child and a stackElement representing the second
+ * child is pushed onto the stack.
+
+ * Invariants:
+ * *c_child_r is the most recent retainer of *c's children.
+ * *c is not any of TSO, AP, PAP, AP_STACK, which means that
+ * there cannot be any stack objects.
+ * Note: SRTs are considered to be children as well.
+ * -------------------------------------------------------------------------- */
+static INLINE void
+push( StgClosure *c, retainer c_child_r, StgClosure **first_child )
+{
+ stackElement se;
+ bdescr *nbd; // Next Block Descriptor
+
+#ifdef DEBUG_RETAINER
+ // debugBelch("push(): stackTop = 0x%x, currentStackBoundary = 0x%x\n", stackTop, currentStackBoundary);
+#endif
+
+ ASSERT(get_itbl(c)->type != TSO);
+ ASSERT(get_itbl(c)->type != AP_STACK);
+
+ //
+ // fill in se
+ //
+
+ se.c = c;
+ se.c_child_r = c_child_r;
+
+ // fill in se.info
+ switch (get_itbl(c)->type) {
+ // no child, no SRT
+ case CONSTR_0_1:
+ case CONSTR_0_2:
+ case CAF_BLACKHOLE:
+ case BLACKHOLE:
+ case SE_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ case ARR_WORDS:
+ *first_child = NULL;
+ return;
+
+ // one child (fixed), no SRT
+ case MUT_VAR_CLEAN:
+ case MUT_VAR_DIRTY:
+ *first_child = ((StgMutVar *)c)->var;
+ return;
+ case THUNK_SELECTOR:
+ *first_child = ((StgSelector *)c)->selectee;
+ return;
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ case IND_OLDGEN:
+ *first_child = ((StgInd *)c)->indirectee;
+ return;
+ case CONSTR_1_0:
+ case CONSTR_1_1:
+ *first_child = c->payload[0];
+ return;
+
+ // For CONSTR_2_0 and MVAR, we use se.info.step to record the position
+ // of the next child. We do not write a separate initialization code.
+ // Also we do not have to initialize info.type;
+
+ // two children (fixed), no SRT
+ // need to push a stackElement, but nothing to store in se.info
+ case CONSTR_2_0:
+ *first_child = c->payload[0]; // return the first pointer
+ // se.info.type = posTypeStep;
+ // se.info.next.step = 2; // 2 = second
+ break;
+
+ // three children (fixed), no SRT
+ // need to push a stackElement
+ case MVAR:
+ // head must be TSO and the head of a linked list of TSOs.
+ // Shoule it be a child? Seems to be yes.
+ *first_child = (StgClosure *)((StgMVar *)c)->head;
+ // se.info.type = posTypeStep;
+ se.info.next.step = 2; // 2 = second
+ break;
+
+ // three children (fixed), no SRT
+ case WEAK:
+ *first_child = ((StgWeak *)c)->key;
+ // se.info.type = posTypeStep;
+ se.info.next.step = 2;
+ break;
+
+ // layout.payload.ptrs, no SRT
+ case CONSTR:
+ case STABLE_NAME:
+ case BCO:
+ case CONSTR_STATIC:
+ init_ptrs(&se.info, get_itbl(c)->layout.payload.ptrs,
+ (StgPtr)c->payload);
+ *first_child = find_ptrs(&se.info);
+ if (*first_child == NULL)
+ return; // no child
+ break;
+
+ // StgMutArrPtr.ptrs, no SRT
+ case MUT_ARR_PTRS_CLEAN:
+ case MUT_ARR_PTRS_DIRTY:
+ case MUT_ARR_PTRS_FROZEN:
+ case MUT_ARR_PTRS_FROZEN0:
+ init_ptrs(&se.info, ((StgMutArrPtrs *)c)->ptrs,
+ (StgPtr)(((StgMutArrPtrs *)c)->payload));
+ *first_child = find_ptrs(&se.info);
+ if (*first_child == NULL)
+ return;
+ break;
+
+ // layout.payload.ptrs, SRT
+ case FUN: // *c is a heap object.
+ case FUN_2_0:
+ init_ptrs(&se.info, get_itbl(c)->layout.payload.ptrs, (StgPtr)c->payload);
+ *first_child = find_ptrs(&se.info);
+ if (*first_child == NULL)
+ // no child from ptrs, so check SRT
+ goto fun_srt_only;
+ break;
+
+ case THUNK:
+ case THUNK_2_0:
+ init_ptrs(&se.info, get_itbl(c)->layout.payload.ptrs,
+ (StgPtr)((StgThunk *)c)->payload);
+ *first_child = find_ptrs(&se.info);
+ if (*first_child == NULL)
+ // no child from ptrs, so check SRT
+ goto thunk_srt_only;
+ break;
+
+ // 1 fixed child, SRT
+ case FUN_1_0:
+ case FUN_1_1:
+ *first_child = c->payload[0];
+ ASSERT(*first_child != NULL);
+ init_srt_fun(&se.info, get_fun_itbl(c));
+ break;
+
+ case THUNK_1_0:
+ case THUNK_1_1:
+ *first_child = ((StgThunk *)c)->payload[0];
+ ASSERT(*first_child != NULL);
+ init_srt_thunk(&se.info, get_thunk_itbl(c));
+ break;
+
+ case FUN_STATIC: // *c is a heap object.
+ ASSERT(get_itbl(c)->srt_bitmap != 0);
+ case FUN_0_1:
+ case FUN_0_2:
+ fun_srt_only:
+ init_srt_fun(&se.info, get_fun_itbl(c));
+ *first_child = find_srt(&se.info);
+ if (*first_child == NULL)
+ return; // no child
+ break;
+
+ // SRT only
+ case THUNK_STATIC:
+ ASSERT(get_itbl(c)->srt_bitmap != 0);
+ case THUNK_0_1:
+ case THUNK_0_2:
+ thunk_srt_only:
+ init_srt_thunk(&se.info, get_thunk_itbl(c));
+ *first_child = find_srt(&se.info);
+ if (*first_child == NULL)
+ return; // no child
+ break;
+
+ case TVAR_WAIT_QUEUE:
+ *first_child = (StgClosure *)((StgTVarWaitQueue *)c)->waiting_tso;
+ se.info.next.step = 2; // 2 = second
+ break;
+ case TVAR:
+ *first_child = (StgClosure *)((StgTVar *)c)->current_value;
+ break;
+ case TREC_HEADER:
+ *first_child = (StgClosure *)((StgTRecHeader *)c)->enclosing_trec;
+ break;
+ case TREC_CHUNK:
+ *first_child = (StgClosure *)((StgTRecChunk *)c)->prev_chunk;
+ se.info.next.step = 0; // entry no.
+ break;
+
+ // cannot appear
+ case PAP:
+ case AP:
+ case AP_STACK:
+ case TSO:
+ case IND_STATIC:
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ case CONSTR_NOCAF_STATIC:
+ // stack objects
+ case UPDATE_FRAME:
+ case CATCH_FRAME:
+ case STOP_FRAME:
+ case RET_DYN:
+ case RET_BCO:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ case RET_BIG:
+ case RET_VEC_BIG:
+ // invalid objects
+ case IND:
+ case BLOCKED_FETCH:
+ case FETCH_ME:
+ case FETCH_ME_BQ:
+ case RBH:
+ case REMOTE_REF:
+ case EVACUATED:
+ case INVALID_OBJECT:
+ default:
+ barf("Invalid object *c in push()");
+ return;
+ }
+
+ if (stackTop - 1 < stackBottom) {
+#ifdef DEBUG_RETAINER
+ // debugBelch("push() to the next stack.\n");
+#endif
+ // currentStack->free is updated when the active stack is switched
+ // to the next stack.
+ currentStack->free = (StgPtr)stackTop;
+
+ if (currentStack->link == NULL) {
+ nbd = allocGroup(BLOCKS_IN_STACK);
+ nbd->link = NULL;
+ nbd->u.back = currentStack;
+ currentStack->link = nbd;
+ } else
+ nbd = currentStack->link;
+
+ newStackBlock(nbd);
+ }
+
+ // adjust stackTop (acutal push)
+ stackTop--;
+ // If the size of stackElement was huge, we would better replace the
+ // following statement by either a memcpy() call or a switch statement
+ // on the type of the element. Currently, the size of stackElement is
+ // small enough (5 words) that this direct assignment seems to be enough.
+ *stackTop = se;
+
+#ifdef DEBUG_RETAINER
+ stackSize++;
+ if (stackSize > maxStackSize) maxStackSize = stackSize;
+ // ASSERT(stackSize >= 0);
+ // debugBelch("stackSize = %d\n", stackSize);
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ * popOff() and popOffReal(): Pop a stackElement off the traverse stack.
+ * Invariants:
+ * stackTop cannot be equal to stackLimit unless the whole stack is
+ * empty, in which case popOff() is not allowed.
+ * Note:
+ * You can think of popOffReal() as a part of popOff() which is
+ * executed at the end of popOff() if necessary. Since popOff() is
+ * likely to be executed quite often while popOffReal() is not, we
+ * separate popOffReal() from popOff(), which is declared as an
+ * INLINE function (for the sake of execution speed). popOffReal()
+ * is called only within popOff() and nowhere else.
+ * -------------------------------------------------------------------------- */
+static void
+popOffReal(void)
+{
+ bdescr *pbd; // Previous Block Descriptor
+
+#ifdef DEBUG_RETAINER
+ // debugBelch("pop() to the previous stack.\n");
+#endif
+
+ // Only reached when the current chunk has just been emptied.
+ ASSERT(stackTop + 1 == stackLimit);
+ ASSERT(stackBottom == (stackElement *)currentStack->start);
+
+ if (firstStack == currentStack) {
+ // The stack is completely empty.
+ stackTop++;
+ ASSERT(stackTop == stackLimit);
+#ifdef DEBUG_RETAINER
+ stackSize--;
+ if (stackSize > maxStackSize) maxStackSize = stackSize;
+ /*
+ ASSERT(stackSize >= 0);
+ debugBelch("stackSize = %d\n", stackSize);
+ */
+#endif
+ return;
+ }
+
+ // currentStack->free is updated when the active stack is switched back
+ // to the previous stack.
+ currentStack->free = (StgPtr)stackLimit;
+
+ // find the previous block descriptor
+ pbd = currentStack->u.back;
+ ASSERT(pbd != NULL);
+
+ returnToOldStack(pbd);
+
+#ifdef DEBUG_RETAINER
+ stackSize--;
+ if (stackSize > maxStackSize) maxStackSize = stackSize;
+ /*
+ ASSERT(stackSize >= 0);
+ debugBelch("stackSize = %d\n", stackSize);
+ */
+#endif
+}
+
+// Pop one stackElement off the traverse stack; delegates to popOffReal()
+// when the pop empties the current stack chunk.
+static INLINE void
+popOff(void) {
+#ifdef DEBUG_RETAINER
+ // debugBelch("\tpopOff(): stackTop = 0x%x, currentStackBoundary = 0x%x\n", stackTop, currentStackBoundary);
+#endif
+
+ ASSERT(stackTop != stackLimit);
+ ASSERT(!isEmptyRetainerStack());
+
+ // <= (instead of <) is wrong!
+ if (stackTop + 1 < stackLimit) {
+ // fast path: still inside the current stack chunk
+ stackTop++;
+#ifdef DEBUG_RETAINER
+ stackSize--;
+ if (stackSize > maxStackSize) maxStackSize = stackSize;
+ /*
+ ASSERT(stackSize >= 0);
+ debugBelch("stackSize = %d\n", stackSize);
+ */
+#endif
+ return;
+ }
+
+ // slow path: this pop exhausts the chunk
+ popOffReal();
+}
+
+/* -----------------------------------------------------------------------------
+ * Finds the next object to be considered for retainer profiling and stores
+ * its pointer to *c.
+ * Test if the topmost stack element indicates that more objects are left,
+ * and if so, retrieve the first object and store its pointer to *c. Also,
+ * set *cp and *r appropriately, both of which are stored in the stack element.
+ * The topmost stack element then is overwritten so as for it to now denote
+ * the next object.
+ * If the topmost stack element indicates no more objects are left, pop
+ * off the stack element until either an object can be retrieved or
+ * the current stack chunk becomes empty, indicated by rtsTrue returned by
+ * isOnBoundary(), in which case *c is set to NULL.
+ * Note:
+ * It is okay to call this function even when the current stack chunk
+ * is empty.
+ * -------------------------------------------------------------------------- */
+static INLINE void
+pop( StgClosure **c, StgClosure **cp, retainer *r )
+{
+ stackElement *se;
+
+#ifdef DEBUG_RETAINER
+ // debugBelch("pop(): stackTop = 0x%x, currentStackBoundary = 0x%x\n", stackTop, currentStackBoundary);
+#endif
+
+ do {
+ if (isOnBoundary()) { // if the current stack chunk is depleted
+ *c = NULL;
+ return;
+ }
+
+ se = stackTop;
+
+ switch (get_itbl(se->c)->type) {
+ // two children (fixed), no SRT
+ // nothing in se.info
+ case CONSTR_2_0:
+ *c = se->c->payload[1];
+ *cp = se->c;
+ *r = se->c_child_r;
+ popOff();
+ return;
+
+ // three children (fixed), no SRT
+ // need to push a stackElement
+ case MVAR:
+ if (se->info.next.step == 2) {
+ *c = (StgClosure *)((StgMVar *)se->c)->tail;
+ se->info.next.step++; // move to the next step
+ // no popOff
+ } else {
+ *c = ((StgMVar *)se->c)->value;
+ popOff();
+ }
+ *cp = se->c;
+ *r = se->c_child_r;
+ return;
+
+ // three children (fixed), no SRT
+ case WEAK:
+ if (se->info.next.step == 2) {
+ *c = ((StgWeak *)se->c)->value;
+ se->info.next.step++;
+ // no popOff
+ } else {
+ *c = ((StgWeak *)se->c)->finalizer;
+ popOff();
+ }
+ *cp = se->c;
+ *r = se->c_child_r;
+ return;
+
+ case TVAR_WAIT_QUEUE:
+ if (se->info.next.step == 2) {
+ *c = (StgClosure *)((StgTVarWaitQueue *)se->c)->next_queue_entry;
+ se->info.next.step++; // move to the next step
+ // no popOff
+ } else {
+ *c = (StgClosure *)((StgTVarWaitQueue *)se->c)->prev_queue_entry;
+ popOff();
+ }
+ *cp = se->c;
+ *r = se->c_child_r;
+ return;
+
+ case TVAR:
+ *c = (StgClosure *)((StgTVar *)se->c)->first_wait_queue_entry;
+ *cp = se->c;
+ *r = se->c_child_r;
+ popOff();
+ return;
+
+ case TREC_HEADER:
+ *c = (StgClosure *)((StgTRecHeader *)se->c)->current_chunk;
+ *cp = se->c;
+ *r = se->c_child_r;
+ popOff();
+ return;
+
+ case TREC_CHUNK: {
+ // These are pretty complicated: we have N entries, each
+ // of which contains 3 fields that we want to follow. So
+ // we divide the step counter: the 2 low bits indicate
+ // which field, and the rest of the bits indicate the
+ // entry number (starting from zero).
+ nat entry_no = se->info.next.step >> 2;
+ nat field_no = se->info.next.step & 3;
+ if (entry_no == ((StgTRecChunk *)se->c)->next_entry_idx) {
+ *c = NULL;
+ popOff();
+ // NOTE(review): returning NULL here ends the caller's pop loop
+ // even though older elements may remain on the traverse stack;
+ // later GHC versions 'break' here so the do-while keeps popping
+ // -- verify the intended behavior.
+ return;
+ }
+ TRecEntry *entry = &((StgTRecChunk *)se->c)->entries[entry_no];
+ if (field_no == 0) {
+ *c = (StgClosure *)entry->tvar;
+ } else if (field_no == 1) {
+ *c = entry->expected_value;
+ } else {
+ *c = entry->new_value;
+ }
+ *cp = se->c;
+ *r = se->c_child_r;
+ se->info.next.step++;
+ return;
+ }
+
+ case CONSTR:
+ case STABLE_NAME:
+ case BCO:
+ case CONSTR_STATIC:
+ // StgMutArrPtr.ptrs, no SRT
+ case MUT_ARR_PTRS_CLEAN:
+ case MUT_ARR_PTRS_DIRTY:
+ case MUT_ARR_PTRS_FROZEN:
+ case MUT_ARR_PTRS_FROZEN0:
+ *c = find_ptrs(&se->info);
+ if (*c == NULL) {
+ popOff();
+ break; // keep popping: try the next stack element
+ }
+ *cp = se->c;
+ *r = se->c_child_r;
+ return;
+
+ // layout.payload.ptrs, SRT
+ case FUN: // always a heap object
+ case FUN_2_0:
+ if (se->info.type == posTypePtrs) {
+ *c = find_ptrs(&se->info);
+ if (*c != NULL) {
+ *cp = se->c;
+ *r = se->c_child_r;
+ return;
+ }
+ // pointers exhausted: switch this element over to its SRT
+ init_srt_fun(&se->info, get_fun_itbl(se->c));
+ }
+ goto do_srt;
+
+ case THUNK:
+ case THUNK_2_0:
+ if (se->info.type == posTypePtrs) {
+ *c = find_ptrs(&se->info);
+ if (*c != NULL) {
+ *cp = se->c;
+ *r = se->c_child_r;
+ return;
+ }
+ // pointers exhausted: switch this element over to its SRT
+ init_srt_thunk(&se->info, get_thunk_itbl(se->c));
+ }
+ goto do_srt;
+
+ // SRT
+ do_srt:
+ case THUNK_STATIC:
+ case FUN_STATIC:
+ case FUN_0_1:
+ case FUN_0_2:
+ case THUNK_0_1:
+ case THUNK_0_2:
+ case FUN_1_0:
+ case FUN_1_1:
+ case THUNK_1_0:
+ case THUNK_1_1:
+ *c = find_srt(&se->info);
+ if (*c != NULL) {
+ *cp = se->c;
+ *r = se->c_child_r;
+ return;
+ }
+ popOff();
+ break;
+
+ // no child (fixed), no SRT
+ case CONSTR_0_1:
+ case CONSTR_0_2:
+ case CAF_BLACKHOLE:
+ case BLACKHOLE:
+ case SE_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ case ARR_WORDS:
+ // one child (fixed), no SRT
+ case MUT_VAR_CLEAN:
+ case MUT_VAR_DIRTY:
+ case THUNK_SELECTOR:
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ case IND_OLDGEN:
+ case CONSTR_1_1:
+ // cannot appear
+ case PAP:
+ case AP:
+ case AP_STACK:
+ case TSO:
+ case IND_STATIC:
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ case CONSTR_NOCAF_STATIC:
+ // stack objects
+ case RET_DYN:
+ case UPDATE_FRAME:
+ case CATCH_FRAME:
+ case STOP_FRAME:
+ case RET_BCO:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ case RET_BIG:
+ case RET_VEC_BIG:
+ // invalid objects
+ case IND:
+ case BLOCKED_FETCH:
+ case FETCH_ME:
+ case FETCH_ME_BQ:
+ case RBH:
+ case REMOTE_REF:
+ case EVACUATED:
+ case INVALID_OBJECT:
+ default:
+ barf("Invalid object *c in pop()");
+ return;
+ }
+ } while (rtsTrue);
+}
+
+/* -----------------------------------------------------------------------------
+ * RETAINER PROFILING ENGINE
+ * -------------------------------------------------------------------------- */
+
+// Prepare for retainer profiling: reset the retainer-set store and restart
+// the census counter at generation 0.
+void
+initRetainerProfiling( void )
+{
+ initializeAllRetainerSet();
+ retainerGeneration = 0;
+}
+
+/* -----------------------------------------------------------------------------
+ * This function must be called before f-closing prof_file.
+ * -------------------------------------------------------------------------- */
+void
+endRetainerProfiling( void )
+{
+#ifdef SECOND_APPROACH
+ // Dump every retainer set accumulated during the run to the profile file.
+ outputAllRetainerSet(prof_file);
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ * Ensures the retainer-set field of the closure *c is valid, i.e. that it
+ * conforms to the current value of flip.
+ * Side effects:
+ * RSET(c) is initialized to NULL if its current value does not
+ * conform to flip.
+ * Note:
+ * Even though this function has side effects, they CAN be ignored because
+ * subsequent calls to retainerSetOf() always result in the same return value
+ * and retainerSetOf() is the only way to retrieve retainerSet of a given
+ * closure.
+ * We have to perform an XOR (^) operation each time a closure is examined.
+ * The reason is that we do not know when a closure is visited last.
+ * -------------------------------------------------------------------------- */
+static INLINE void
+maybeInitRetainerSet( StgClosure *c )
+{
+ if (!isRetainerSetFieldValid(c)) {
+ setRetainerSetToNull(c);
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Returns rtsTrue if *c is a retainer.
+ * Pure classification by closure type; barfs on types that can never be
+ * examined by the retainer profiling loop.
+ * -------------------------------------------------------------------------- */
+static INLINE rtsBool
+isRetainer( StgClosure *c )
+{
+ switch (get_itbl(c)->type) {
+ //
+ // True case
+ //
+ // TSOs MUST be retainers: they constitute the set of roots.
+ case TSO:
+
+ // mutable objects
+ case MVAR:
+ case MUT_VAR_CLEAN:
+ case MUT_VAR_DIRTY:
+ case MUT_ARR_PTRS_CLEAN:
+ case MUT_ARR_PTRS_DIRTY:
+ case MUT_ARR_PTRS_FROZEN:
+ case MUT_ARR_PTRS_FROZEN0:
+
+ // thunks are retainers.
+ case THUNK:
+ case THUNK_1_0:
+ case THUNK_0_1:
+ case THUNK_2_0:
+ case THUNK_1_1:
+ case THUNK_0_2:
+ case THUNK_SELECTOR:
+ case AP:
+ case AP_STACK:
+
+ // Static thunks, or CAFs, are obviously retainers.
+ case THUNK_STATIC:
+
+ // WEAK objects are roots; there is separate code in which traversing
+ // begins from WEAK objects.
+ case WEAK:
+
+ // Since the other mutvar-type things are retainers, seems
+ // like the right thing to do:
+ case TVAR:
+ return rtsTrue;
+
+ //
+ // False case
+ //
+
+ // constructors
+ case CONSTR:
+ case CONSTR_1_0:
+ case CONSTR_0_1:
+ case CONSTR_2_0:
+ case CONSTR_1_1:
+ case CONSTR_0_2:
+ // functions
+ case FUN:
+ case FUN_1_0:
+ case FUN_0_1:
+ case FUN_2_0:
+ case FUN_1_1:
+ case FUN_0_2:
+ // partial applications
+ case PAP:
+ // blackholes
+ case CAF_BLACKHOLE:
+ case BLACKHOLE:
+ case SE_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ // indirection
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ case IND_OLDGEN:
+ // static objects
+ case CONSTR_STATIC:
+ case FUN_STATIC:
+ // misc
+ case STABLE_NAME:
+ case BCO:
+ case ARR_WORDS:
+ // STM
+ case TVAR_WAIT_QUEUE:
+ case TREC_HEADER:
+ case TREC_CHUNK:
+ return rtsFalse;
+
+ //
+ // Error case
+ //
+ // IND_STATIC cannot be *c, *cp, *r in the retainer profiling loop.
+ case IND_STATIC:
+ // CONSTR_INTLIKE, CONSTR_CHARLIKE, and CONSTR_NOCAF_STATIC
+ // cannot be *c, *cp, *r in the retainer profiling loop.
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ case CONSTR_NOCAF_STATIC:
+ // Stack objects are invalid because they are never treated as
+ // legal objects during retainer profiling.
+ case UPDATE_FRAME:
+ case CATCH_FRAME:
+ case STOP_FRAME:
+ case RET_DYN:
+ case RET_BCO:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ case RET_BIG:
+ case RET_VEC_BIG:
+ // other cases
+ case IND:
+ case BLOCKED_FETCH:
+ case FETCH_ME:
+ case FETCH_ME_BQ:
+ case RBH:
+ case REMOTE_REF:
+ case EVACUATED:
+ case INVALID_OBJECT:
+ default:
+ barf("Invalid object in isRetainer(): %d", get_itbl(c)->type);
+ return rtsFalse;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Returns the retainer function value for the closure *c, i.e., R(*c).
+ * This function does NOT return the retainer(s) of *c.
+ * Invariants:
+ * *c must be a retainer.
+ * Note:
+ * Depending on the definition of this function, the maintenance of retainer
+ * sets can be made easier. If most retainer sets are likely to be created
+ * again across garbage collections, refreshAllRetainerSet() in
+ * RetainerSet.c can simply do nothing.
+ * If this is not the case, we can free all the retainer sets and
+ * re-initialize the hash table.
+ * See refreshAllRetainerSet() in RetainerSet.c.
+ * -------------------------------------------------------------------------- */
+static INLINE retainer
+getRetainerFrom( StgClosure *c )
+{
+ ASSERT(isRetainer(c));
+
+ // Exactly one RETAINER_SCHEME_* macro must be defined by the build;
+ // otherwise this function falls off the end without returning a value.
+#if defined(RETAINER_SCHEME_INFO)
+ // Retainer scheme 1: retainer = info table
+ return get_itbl(c);
+#elif defined(RETAINER_SCHEME_CCS)
+ // Retainer scheme 2: retainer = cost centre stack
+ return c->header.prof.ccs;
+#elif defined(RETAINER_SCHEME_CC)
+ // Retainer scheme 3: retainer = cost centre
+ return c->header.prof.ccs->cc;
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ * Associates the retainer set *s with the closure *c, that is, *s becomes
+ * the retainer set of *c.
+ * Invariants:
+ * c != NULL
+ * s != NULL
+ * -------------------------------------------------------------------------- */
+static INLINE void
+associate( StgClosure *c, RetainerSet *s )
+{
+ // StgWord has the same size as pointers, so the following type
+ // casting is okay.
+ // The current flip bit is OR-ed in so that the stored field reads as
+ // valid for this census (see maybeInitRetainerSet()).
+ RSET(c) = (RetainerSet *)((StgWord)s | flip);
+}
+
+/* -----------------------------------------------------------------------------
+ Call retainClosure for each of the closures covered by a large bitmap.
+ -------------------------------------------------------------------------- */
+
+// Walks 'size' stack words starting at p. A CLEAR bit marks a pointer
+// word here (note: the opposite convention from SRT bitmaps, where a set
+// bit marks a live entry); each pointer word is retained with parent c
+// and retainer c_child_r.
+static void
+retain_large_bitmap (StgPtr p, StgLargeBitmap *large_bitmap, nat size,
+ StgClosure *c, retainer c_child_r)
+{
+ nat i, b;
+ StgWord bitmap;
+
+ b = 0;
+ bitmap = large_bitmap->bitmap[b];
+ for (i = 0; i < size; ) {
+ if ((bitmap & 1) == 0) {
+ retainClosure((StgClosure *)*p, c, c_child_r);
+ }
+ i++;
+ p++;
+ if (i % BITS_IN(W_) == 0) {
+ // crossed a word boundary: load the next bitmap word
+ b++;
+ bitmap = large_bitmap->bitmap[b];
+ } else {
+ bitmap = bitmap >> 1;
+ }
+ }
+}
+
+// Walks 'size' stack words at p: a clear bit in the small bitmap marks a
+// pointer word, which is retained with parent c and retainer c_child_r.
+// Returns the address just past the last word examined.
+static INLINE StgPtr
+retain_small_bitmap (StgPtr p, nat size, StgWord bitmap,
+ StgClosure *c, retainer c_child_r)
+{
+ nat remaining;
+
+ for (remaining = size; remaining > 0; remaining--) {
+ if ((bitmap & 1) == 0) {
+ retainClosure((StgClosure *)*p, c, c_child_r);
+ }
+ bitmap >>= 1;
+ p++;
+ }
+ return p;
+}
+
+/* -----------------------------------------------------------------------------
+ * Call retainClosure for each of the closures in an SRT.
+ * ------------------------------------------------------------------------- */
+
+// Large-SRT variant: the live entries of the SRT are described by the
+// multi-word bitmap in srt->l; a SET bit marks a live entry.
+static void
+retain_large_srt_bitmap (StgLargeSRT *srt, StgClosure *c, retainer c_child_r)
+{
+ nat i, b, size;
+ StgWord bitmap;
+ StgClosure **p;
+
+ b = 0;
+ p = (StgClosure **)srt->srt;
+ size = srt->l.size;
+ bitmap = srt->l.bitmap[b];
+ for (i = 0; i < size; ) {
+ if ((bitmap & 1) != 0) {
+ retainClosure((StgClosure *)*p, c, c_child_r);
+ }
+ i++;
+ p++;
+ if (i % BITS_IN(W_) == 0) {
+ // crossed a word boundary: load the next bitmap word
+ b++;
+ bitmap = srt->l.bitmap[b];
+ } else {
+ bitmap = bitmap >> 1;
+ }
+ }
+}
+
+// Walks a small (half-word) SRT bitmap, retaining each closure whose bit
+// is set; delegates to retain_large_srt_bitmap() when the bitmap is the
+// all-ones sentinel meaning "this SRT uses a large bitmap".
+// BUG FIX: the loop previously dereferenced the fixed base pointer 'srt'
+// while advancing the cursor 'p', so every set bit retained only the
+// FIRST SRT entry; the cursor 'p' must be dereferenced instead.
+static INLINE void
+retainSRT (StgClosure **srt, nat srt_bitmap, StgClosure *c, retainer c_child_r)
+{
+ nat bitmap;
+ StgClosure **p;
+
+ bitmap = srt_bitmap;
+ p = srt;
+
+ if (bitmap == (StgHalfWord)(-1)) {
+ retain_large_srt_bitmap( (StgLargeSRT *)srt, c, c_child_r );
+ return;
+ }
+
+ while (bitmap != 0) {
+ if ((bitmap & 1) != 0) {
+#ifdef ENABLE_WIN32_DLL_SUPPORT
+ // A tagged (low-bit-set) entry points at a DLL import slot which
+ // in turn holds the closure address; untag before dereferencing.
+ if ( (unsigned long)(*p) & 0x1 ) {
+ retainClosure(*stgCast(StgClosure**,(stgCast(unsigned long, *p) & ~0x1)),
+ c, c_child_r);
+ } else {
+ retainClosure(*p,c,c_child_r);
+ }
+#else
+ retainClosure(*p,c,c_child_r);
+#endif
+ }
+ p++;
+ bitmap = bitmap >> 1;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Process all the objects in the stack chunk from stackStart to stackEnd
+ * with *c and *c_child_r being their parent and their most recent retainer,
+ * respectively. Treat stackOptionalFun as another child of *c if it is
+ * not NULL.
+ * Invariants:
+ * *c is one of the following: TSO, AP_STACK.
+ * If *c is TSO, c == c_child_r.
+ * stackStart < stackEnd.
+ * RSET(c) and RSET(c_child_r) are valid, i.e., their
+ * interpretation conforms to the current value of flip (even when they
+ * are interpreted to be NULL).
+ * If *c is TSO, its state is not any of ThreadRelocated, ThreadComplete,
+ * or ThreadKilled, which means that its stack is ready to process.
+ * Note:
+ * This code was almost plagiarized from GC.c! For each pointer,
+ * retainClosure() is invoked instead of evacuate().
+ * -------------------------------------------------------------------------- */
+static void
+retainStack( StgClosure *c, retainer c_child_r,
+ StgPtr stackStart, StgPtr stackEnd )
+{
+ stackElement *oldStackBoundary;
+ StgPtr p;
+ StgRetInfoTable *info;
+ StgWord32 bitmap;
+ nat size;
+
+#ifdef DEBUG_RETAINER
+ cStackSize++;
+ if (cStackSize > maxCStackSize) maxCStackSize = cStackSize;
+#endif
+
+ /*
+ Each invocation of retainStack() creates a new virtual
+ stack. Since all such stacks share a single common stack, we
+ record the current currentStackBoundary, which will be restored
+ at the exit.
+ */
+ oldStackBoundary = currentStackBoundary;
+ currentStackBoundary = stackTop;
+
+#ifdef DEBUG_RETAINER
+ // debugBelch("retainStack() called: oldStackBoundary = 0x%x, currentStackBoundary = 0x%x\n", oldStackBoundary, currentStackBoundary);
+#endif
+
+ ASSERT(get_itbl(c)->type != TSO ||
+ (((StgTSO *)c)->what_next != ThreadRelocated &&
+ ((StgTSO *)c)->what_next != ThreadComplete &&
+ ((StgTSO *)c)->what_next != ThreadKilled));
+
+ // walk the stack one frame at a time, dispatching on the frame's
+ // return info table
+ p = stackStart;
+ while (p < stackEnd) {
+ info = get_ret_itbl((StgClosure *)p);
+
+ switch(info->i.type) {
+
+ case UPDATE_FRAME:
+ retainClosure(((StgUpdateFrame *)p)->updatee, c, c_child_r);
+ p += sizeofW(StgUpdateFrame);
+ continue;
+
+ case STOP_FRAME:
+ case CATCH_FRAME:
+ case CATCH_STM_FRAME:
+ case CATCH_RETRY_FRAME:
+ case ATOMICALLY_FRAME:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ bitmap = BITMAP_BITS(info->i.layout.bitmap);
+ size = BITMAP_SIZE(info->i.layout.bitmap);
+ p++;
+ p = retain_small_bitmap(p, size, bitmap, c, c_child_r);
+
+ follow_srt:
+ retainSRT((StgClosure **)GET_SRT(info), info->i.srt_bitmap, c, c_child_r);
+ continue;
+
+ case RET_BCO: {
+ StgBCO *bco;
+
+ p++;
+ retainClosure((StgClosure *)*p, c, c_child_r);
+ bco = (StgBCO *)*p;
+ p++;
+ size = BCO_BITMAP_SIZE(bco);
+ retain_large_bitmap(p, BCO_BITMAP(bco), size, c, c_child_r);
+ p += size;
+ continue;
+ }
+
+ // large bitmap (> 32 entries, or > 64 on a 64-bit machine)
+ case RET_BIG:
+ case RET_VEC_BIG:
+ size = GET_LARGE_BITMAP(&info->i)->size;
+ p++;
+ retain_large_bitmap(p, GET_LARGE_BITMAP(&info->i),
+ size, c, c_child_r);
+ p += size;
+ // and don't forget to follow the SRT
+ goto follow_srt;
+
+ // Dynamic bitmap: the mask is stored on the stack
+ case RET_DYN: {
+ StgWord dyn;
+ dyn = ((StgRetDyn *)p)->liveness;
+
+ // traverse the bitmap first
+ bitmap = RET_DYN_LIVENESS(dyn);
+ p = (P_)&((StgRetDyn *)p)->payload[0];
+ size = RET_DYN_BITMAP_SIZE;
+ p = retain_small_bitmap(p, size, bitmap, c, c_child_r);
+
+ // skip over the non-ptr words
+ p += RET_DYN_NONPTRS(dyn) + RET_DYN_NONPTR_REGS_SIZE;
+
+ // follow the ptr words
+ for (size = RET_DYN_PTRS(dyn); size > 0; size--) {
+ retainClosure((StgClosure *)*p, c, c_child_r);
+ p++;
+ }
+ continue;
+ }
+
+ case RET_FUN: {
+ StgRetFun *ret_fun = (StgRetFun *)p;
+ StgFunInfoTable *fun_info;
+
+ retainClosure(ret_fun->fun, c, c_child_r);
+ fun_info = get_fun_itbl(ret_fun->fun);
+
+ // the argument layout comes from the function's info table
+ p = (P_)&ret_fun->payload;
+ switch (fun_info->f.fun_type) {
+ case ARG_GEN:
+ bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+ size = BITMAP_SIZE(fun_info->f.b.bitmap);
+ p = retain_small_bitmap(p, size, bitmap, c, c_child_r);
+ break;
+ case ARG_GEN_BIG:
+ size = GET_FUN_LARGE_BITMAP(fun_info)->size;
+ retain_large_bitmap(p, GET_FUN_LARGE_BITMAP(fun_info),
+ size, c, c_child_r);
+ p += size;
+ break;
+ default:
+ bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+ size = BITMAP_SIZE(stg_arg_bitmaps[fun_info->f.fun_type]);
+ p = retain_small_bitmap(p, size, bitmap, c, c_child_r);
+ break;
+ }
+ goto follow_srt;
+ }
+
+ default:
+ barf("Invalid object found in retainStack(): %d",
+ (int)(info->i.type));
+ }
+ }
+
+ // restore currentStackBoundary
+ currentStackBoundary = oldStackBoundary;
+#ifdef DEBUG_RETAINER
+ // debugBelch("retainStack() finished: currentStackBoundary = 0x%x\n", currentStackBoundary);
+#endif
+
+#ifdef DEBUG_RETAINER
+ cStackSize--;
+#endif
+}
+
+/* ----------------------------------------------------------------------------
+ * Call retainClosure for each of the children of a PAP/AP
+ * ------------------------------------------------------------------------- */
+
+// Retains the function closure and then every pointer argument in the
+// payload, using the function's argument bitmap to distinguish pointers
+// from non-pointers. Returns the address just past the payload consumed.
+static INLINE StgPtr
+retain_PAP_payload (StgClosure *pap, retainer c_child_r, StgClosure *fun,
+ StgClosure** payload, StgWord n_args)
+{
+ StgPtr p;
+ StgWord bitmap;
+ StgFunInfoTable *fun_info;
+
+ retainClosure(fun, pap, c_child_r);
+ fun_info = get_fun_itbl(fun);
+ ASSERT(fun_info->i.type != PAP);
+
+ p = (StgPtr)payload;
+
+ switch (fun_info->f.fun_type) {
+ case ARG_GEN:
+ bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+ p = retain_small_bitmap(p, n_args, bitmap,
+ pap, c_child_r);
+ break;
+ case ARG_GEN_BIG:
+ retain_large_bitmap(p, GET_FUN_LARGE_BITMAP(fun_info),
+ n_args, pap, c_child_r);
+ p += n_args;
+ break;
+ case ARG_BCO:
+ retain_large_bitmap((StgPtr)payload, BCO_BITMAP(fun),
+ n_args, pap, c_child_r);
+ p += n_args;
+ break;
+ default:
+ // standard argument pattern: bitmap comes from the static table
+ bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+ p = retain_small_bitmap(p, n_args, bitmap, pap, c_child_r);
+ break;
+ }
+ return p;
+}
+
+/* -----------------------------------------------------------------------------
+ * Compute the retainer set of *c0 and all its descendants by traversing.
+ * *cp0 is the parent of *c0, and *r0 is the most recent retainer of *c0.
+ * Invariants:
+ * c0 = cp0 = r0 holds only for root objects.
+ * RSET(cp0) and RSET(r0) are valid, i.e., their
+ * interpretation conforms to the current value of flip (even when they
+ * are interpreted to be NULL).
+ * However, RSET(c0) may be corrupt, i.e., it may not conform to
+ * the current value of flip. If it does not, during the execution
+ * of this function, RSET(c0) must be initialized as well as all
+ * its descendants.
+ * Note:
+ * stackTop must be the same at the beginning and the exit of this function.
+ * *c0 can be TSO (as well as AP_STACK).
+ * -------------------------------------------------------------------------- */
+static void
+retainClosure( StgClosure *c0, StgClosure *cp0, retainer r0 )
+{
+ // c = Current closure
+ // cp = Current closure's Parent
+ // r = current closures' most recent Retainer
+ // c_child_r = current closure's children's most recent retainer
+ // first_child = first child of c
+ StgClosure *c, *cp, *first_child;
+ RetainerSet *s, *retainerSetOfc;
+ retainer r, c_child_r;
+ StgWord typeOfc;
+
+#ifdef DEBUG_RETAINER
+ // StgPtr oldStackTop;
+#endif
+
+#ifdef DEBUG_RETAINER
+ // oldStackTop = stackTop;
+ // debugBelch("retainClosure() called: c0 = 0x%x, cp0 = 0x%x, r0 = 0x%x\n", c0, cp0, r0);
+#endif
+
+ // (c, cp, r) = (c0, cp0, r0)
+ c = c0;
+ cp = cp0;
+ r = r0;
+ goto inner_loop;
+
+loop:
+ //debugBelch("loop");
+ // pop to (c, cp, r);
+ pop(&c, &cp, &r);
+
+ if (c == NULL) {
+#ifdef DEBUG_RETAINER
+ // debugBelch("retainClosure() ends: oldStackTop = 0x%x, stackTop = 0x%x\n", oldStackTop, stackTop);
+#endif
+ return;
+ }
+
+ //debugBelch("inner_loop");
+
+inner_loop:
+ // c = current closure under consideration,
+ // cp = current closure's parent,
+ // r = current closure's most recent retainer
+ //
+ // Loop invariants (on the meaning of c, cp, r, and their retainer sets):
+ // RSET(cp) and RSET(r) are valid.
+ // RSET(c) is valid only if c has been visited before.
+ //
+ // Loop invariants (on the relation between c, cp, and r)
+ // if cp is not a retainer, r belongs to RSET(cp).
+ // if cp is a retainer, r == cp.
+
+ typeOfc = get_itbl(c)->type;
+
+#ifdef DEBUG_RETAINER
+ switch (typeOfc) {
+ case IND_STATIC:
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ case CONSTR_NOCAF_STATIC:
+ case CONSTR_STATIC:
+ case THUNK_STATIC:
+ case FUN_STATIC:
+ break;
+ default:
+ if (retainerSetOf(c) == NULL) { // first visit?
+ costArray[typeOfc] += cost(c);
+ sumOfNewCost += cost(c);
+ }
+ break;
+ }
+#endif
+
+ // special cases
+ switch (typeOfc) {
+ case TSO:
+ if (((StgTSO *)c)->what_next == ThreadComplete ||
+ ((StgTSO *)c)->what_next == ThreadKilled) {
+#ifdef DEBUG_RETAINER
+ debugBelch("ThreadComplete or ThreadKilled encountered in retainClosure()\n");
+#endif
+ goto loop;
+ }
+ if (((StgTSO *)c)->what_next == ThreadRelocated) {
+#ifdef DEBUG_RETAINER
+ debugBelch("ThreadRelocated encountered in retainClosure()\n");
+#endif
+ c = (StgClosure *)((StgTSO *)c)->link;
+ goto inner_loop;
+ }
+ break;
+
+ case IND_STATIC:
+ // We just skip IND_STATIC, so its retainer set is never computed.
+ c = ((StgIndStatic *)c)->indirectee;
+ goto inner_loop;
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ // static objects with no pointers out, so goto loop.
+ case CONSTR_NOCAF_STATIC:
+ // It is not just enough not to compute the retainer set for *c; it is
+ // mandatory because CONSTR_NOCAF_STATIC are not reachable from
+ // scavenged_static_objects, the list from which is assumed to traverse
+ // all static objects after major garbage collections.
+ goto loop;
+ case THUNK_STATIC:
+ case FUN_STATIC:
+ if (get_itbl(c)->srt_bitmap == 0) {
+ // No need to compute the retainer set; no dynamic objects
+ // are reachable from *c.
+ //
+ // Static objects: if we traverse all the live closures,
+ // including static closures, during each heap census then
+ // we will observe that some static closures appear and
+ // disappear. eg. a closure may contain a pointer to a
+ // static function 'f' which is not otherwise reachable
+ // (it doesn't indirectly point to any CAFs, so it doesn't
+ // appear in any SRTs), so we would find 'f' during
+ // traversal. However on the next sweep there may be no
+ // closures pointing to 'f'.
+ //
+ // We must therefore ignore static closures whose SRT is
+ // empty, because these are exactly the closures that may
+ // "appear". A closure with a non-empty SRT, and which is
+ // still required, will always be reachable.
+ //
+ // But what about CONSTR_STATIC? Surely these may be able
+ // to appear, and they don't have SRTs, so we can't
+ // check. So for now, we're calling
+ // resetStaticObjectForRetainerProfiling() from the
+ // garbage collector to reset the retainer sets in all the
+ // reachable static objects.
+ goto loop;
+ }
+ default:
+ break;
+ }
+
+ // The above objects are ignored in computing the average number of times
+ // an object is visited.
+ timesAnyObjectVisited++;
+
+ // If this is the first visit to c, initialize its retainer set.
+ maybeInitRetainerSet(c);
+ retainerSetOfc = retainerSetOf(c);
+
+ // Now compute s:
+ // isRetainer(cp) == rtsTrue => s == NULL
+ // isRetainer(cp) == rtsFalse => s == cp.retainer
+ if (isRetainer(cp))
+ s = NULL;
+ else
+ s = retainerSetOf(cp);
+
+ // (c, cp, r, s) is available.
+
+ // (c, cp, r, s, R_r) is available, so compute the retainer set for *c.
+ if (retainerSetOfc == NULL) {
+ // This is the first visit to *c.
+ numObjectVisited++;
+
+ if (s == NULL)
+ associate(c, singleton(r));
+ else
+ // s is actually the retainer set of *c!
+ associate(c, s);
+
+ // compute c_child_r
+ c_child_r = isRetainer(c) ? getRetainerFrom(c) : r;
+ } else {
+ // This is not the first visit to *c.
+ if (isMember(r, retainerSetOfc))
+ goto loop; // no need to process child
+
+ if (s == NULL)
+ associate(c, addElement(r, retainerSetOfc));
+ else {
+ // s is not NULL and cp is not a retainer. This means that
+ // each time *cp is visited, so is *c. Thus, if s has
+ // exactly one more element in its retainer set than c, s
+ // is also the new retainer set for *c.
+ if (s->num == retainerSetOfc->num + 1) {
+ associate(c, s);
+ }
+ // Otherwise, just add R_r to the current retainer set of *c.
+ else {
+ associate(c, addElement(r, retainerSetOfc));
+ }
+ }
+
+ if (isRetainer(c))
+ goto loop; // no need to process child
+
+ // compute c_child_r
+ c_child_r = r;
+ }
+
+ // now, RSET() of all of *c, *cp, and *r is valid.
+ // (c, c_child_r) are available.
+
+ // process child
+
+ // Special case closures: we process these all in one go rather
+ // than attempting to save the current position, because doing so
+ // would be hard.
+ switch (typeOfc) {
+ case TSO:
+ retainStack(c, c_child_r,
+ ((StgTSO *)c)->sp,
+ ((StgTSO *)c)->stack + ((StgTSO *)c)->stack_size);
+ goto loop;
+
+ case PAP:
+ {
+ StgPAP *pap = (StgPAP *)c;
+ retain_PAP_payload(c, c_child_r, pap->fun, pap->payload, pap->n_args);
+ goto loop;
+ }
+
+ case AP:
+ {
+ StgAP *ap = (StgAP *)c;
+ retain_PAP_payload(c, c_child_r, ap->fun, ap->payload, ap->n_args);
+ goto loop;
+ }
+
+ case AP_STACK:
+ retainClosure(((StgAP_STACK *)c)->fun, c, c_child_r);
+ retainStack(c, c_child_r,
+ (StgPtr)((StgAP_STACK *)c)->payload,
+ (StgPtr)((StgAP_STACK *)c)->payload +
+ ((StgAP_STACK *)c)->size);
+ goto loop;
+ }
+
+ push(c, c_child_r, &first_child);
+
+ // If first_child is null, c has no child.
+ // If first_child is not null, the top stack element points to the next
+ // object. push() may or may not push a stackElement on the stack.
+ if (first_child == NULL)
+ goto loop;
+
+ // (c, cp, r) = (first_child, c, c_child_r)
+ r = c_child_r;
+ cp = c;
+ c = first_child;
+ goto inner_loop;
+}
+
+/* -----------------------------------------------------------------------------
+ * Compute the retainer set for every object reachable from *tl.
+ * -------------------------------------------------------------------------- */
+static void
+retainRoot( StgClosure **tl )
+{
+ // We no longer assume that only TSOs and WEAKs are roots; any closure can
+ // be a root.
+
+ ASSERT(isEmptyRetainerStack());
+ currentStackBoundary = stackTop;
+
+ if (*tl != &stg_END_TSO_QUEUE_closure && isRetainer(*tl)) {
+ retainClosure(*tl, *tl, getRetainerFrom(*tl));
+ } else {
+ retainClosure(*tl, *tl, CCS_SYSTEM);
+ }
+
+ // NOT TRUE: ASSERT(isMember(getRetainerFrom(*tl), retainerSetOf(*tl)));
+ // *tl might be a TSO which is ThreadComplete, in which
+ // case we ignore it for the purposes of retainer profiling.
+}
+
+/* -----------------------------------------------------------------------------
+ * Compute the retainer set for each of the objects in the heap.
+ * -------------------------------------------------------------------------- */
+static void
+computeRetainerSet( void )
+{
+ StgWeak *weak;
+ RetainerSet *rtl;
+ nat g;
+ StgPtr ml;
+ bdescr *bd;
+#ifdef DEBUG_RETAINER
+ RetainerSet tmpRetainerSet;
+#endif
+
+ GetRoots(retainRoot); // for scheduler roots
+
+ // This function is called after a major GC, when key, value, and finalizer
+ // all are guaranteed to be valid, or reachable.
+ //
+ // The following code assumes that WEAK objects are considered to be roots
+    // for retainer profiling.
+ for (weak = weak_ptr_list; weak != NULL; weak = weak->link)
+ // retainRoot((StgClosure *)weak);
+ retainRoot((StgClosure **)&weak);
+
+ // Consider roots from the stable ptr table.
+ markStablePtrTable(retainRoot);
+
+ // The following code resets the rs field of each unvisited mutable
+ // object (computing sumOfNewCostExtra and updating costArray[] when
+ // debugging retainer profiler).
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ // NOT TRUE: even G0 has a block on its mutable list
+ // ASSERT(g != 0 || (generations[g].mut_list == NULL));
+
+ // Traversing through mut_list is necessary
+ // because we can find MUT_VAR objects which have not been
+ // visited during retainer profiling.
+ for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
+ for (ml = bd->start; ml < bd->free; ml++) {
+
+ maybeInitRetainerSet((StgClosure *)*ml);
+ rtl = retainerSetOf((StgClosure *)*ml);
+
+#ifdef DEBUG_RETAINER
+ if (rtl == NULL) {
+ // first visit to *ml
+ // This is a violation of the interface rule!
+ RSET(ml) = (RetainerSet *)((StgWord)(&tmpRetainerSet) | flip);
+
+ switch (get_itbl((StgClosure *)ml)->type) {
+ case IND_STATIC:
+ // no cost involved
+ break;
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ case CONSTR_NOCAF_STATIC:
+ case CONSTR_STATIC:
+ case THUNK_STATIC:
+ case FUN_STATIC:
+ barf("Invalid object in computeRetainerSet(): %d", get_itbl((StgClosure*)ml)->type);
+ break;
+ default:
+ // dynamic objects
+ costArray[get_itbl((StgClosure *)ml)->type] += cost((StgClosure *)ml);
+ sumOfNewCostExtra += cost((StgClosure *)ml);
+ break;
+ }
+ }
+#endif
+ }
+ }
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Traverse all static objects for which we compute retainer sets,
+ * and reset their rs fields to NULL, which is accomplished by
+ * invoking maybeInitRetainerSet(). This function must be called
+ * before zeroing all objects reachable from scavenged_static_objects
+ * in the case of major garbage collections. See GarbageCollect() in
+ * GC.c.
+ * Note:
+ * The mut_once_list of the oldest generation must also be traversed?
+ * Why? Because if the evacuation of an object pointed to by a static
+ * indirection object fails, it is put back to the mut_once_list of
+ * the oldest generation.
+ * However, this is not necessary because any static indirection objects
+ * are just traversed through to reach dynamic objects. In other words,
+ * they are not taken into consideration in computing retainer sets.
+ * -------------------------------------------------------------------------- */
+void
+resetStaticObjectForRetainerProfiling( void )
+{
+#ifdef DEBUG_RETAINER
+ nat count;
+#endif
+ StgClosure *p;
+
+#ifdef DEBUG_RETAINER
+ count = 0;
+#endif
+ p = scavenged_static_objects;
+ while (p != END_OF_STATIC_LIST) {
+#ifdef DEBUG_RETAINER
+ count++;
+#endif
+ switch (get_itbl(p)->type) {
+ case IND_STATIC:
+ // Since we do not compute the retainer set of any
+ // IND_STATIC object, we don't have to reset its retainer
+ // field.
+ p = (StgClosure*)*IND_STATIC_LINK(p);
+ break;
+ case THUNK_STATIC:
+ maybeInitRetainerSet(p);
+ p = (StgClosure*)*THUNK_STATIC_LINK(p);
+ break;
+ case FUN_STATIC:
+ maybeInitRetainerSet(p);
+ p = (StgClosure*)*FUN_STATIC_LINK(p);
+ break;
+ case CONSTR_STATIC:
+ maybeInitRetainerSet(p);
+ p = (StgClosure*)*STATIC_LINK(get_itbl(p), p);
+ break;
+ default:
+ barf("resetStaticObjectForRetainerProfiling: %p (%s)",
+ p, get_itbl(p)->type);
+ break;
+ }
+ }
+#ifdef DEBUG_RETAINER
+ // debugBelch("count in scavenged_static_objects = %d\n", count);
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ * Perform retainer profiling.
+ * N is the oldest generation being profiled, where the generations are
+ * numbered starting at 0.
+ * Invariants:
+ * Note:
+ * This function should be called only immediately after major garbage
+ * collection.
+ * ------------------------------------------------------------------------- */
+void
+retainerProfile(void)
+{
+#ifdef DEBUG_RETAINER
+ nat i;
+ nat totalHeapSize; // total raw heap size (computed by linear scanning)
+#endif
+
+#ifdef DEBUG_RETAINER
+ debugBelch(" < retainerProfile() invoked : %d>\n", retainerGeneration);
+#endif
+
+ stat_startRP();
+
+ // We haven't flipped the bit yet.
+#ifdef DEBUG_RETAINER
+ debugBelch("Before traversing:\n");
+ sumOfCostLinear = 0;
+ for (i = 0;i < N_CLOSURE_TYPES; i++)
+ costArrayLinear[i] = 0;
+ totalHeapSize = checkHeapSanityForRetainerProfiling();
+
+ debugBelch("\tsumOfCostLinear = %d, totalHeapSize = %d\n", sumOfCostLinear, totalHeapSize);
+ /*
+ debugBelch("costArrayLinear[] = ");
+ for (i = 0;i < N_CLOSURE_TYPES; i++)
+ debugBelch("[%u:%u] ", i, costArrayLinear[i]);
+ debugBelch("\n");
+ */
+
+ ASSERT(sumOfCostLinear == totalHeapSize);
+
+/*
+#define pcostArrayLinear(index) \
+ if (costArrayLinear[index] > 0) \
+ debugBelch("costArrayLinear[" #index "] = %u\n", costArrayLinear[index])
+ pcostArrayLinear(THUNK_STATIC);
+ pcostArrayLinear(FUN_STATIC);
+ pcostArrayLinear(CONSTR_STATIC);
+ pcostArrayLinear(CONSTR_NOCAF_STATIC);
+ pcostArrayLinear(CONSTR_INTLIKE);
+ pcostArrayLinear(CONSTR_CHARLIKE);
+*/
+#endif
+
+    // Now we flip the 'flip' bit.
+ flip = flip ^ 1;
+
+#ifdef DEBUG_RETAINER
+ stackSize = 0;
+ maxStackSize = 0;
+ cStackSize = 0;
+ maxCStackSize = 0;
+#endif
+ numObjectVisited = 0;
+ timesAnyObjectVisited = 0;
+
+#ifdef DEBUG_RETAINER
+ debugBelch("During traversing:\n");
+ sumOfNewCost = 0;
+ sumOfNewCostExtra = 0;
+ for (i = 0;i < N_CLOSURE_TYPES; i++)
+ costArray[i] = 0;
+#endif
+
+ /*
+ We initialize the traverse stack each time the retainer profiling is
+ performed (because the traverse stack size varies on each retainer profiling
+ and this operation is not costly anyhow). However, we just refresh the
+ retainer sets.
+ */
+ initializeTraverseStack();
+#ifdef DEBUG_RETAINER
+ initializeAllRetainerSet();
+#else
+ refreshAllRetainerSet();
+#endif
+ computeRetainerSet();
+
+#ifdef DEBUG_RETAINER
+ debugBelch("After traversing:\n");
+ sumOfCostLinear = 0;
+ for (i = 0;i < N_CLOSURE_TYPES; i++)
+ costArrayLinear[i] = 0;
+ totalHeapSize = checkHeapSanityForRetainerProfiling();
+
+ debugBelch("\tsumOfCostLinear = %d, totalHeapSize = %d\n", sumOfCostLinear, totalHeapSize);
+ ASSERT(sumOfCostLinear == totalHeapSize);
+
+ // now, compare the two results
+ /*
+ Note:
+ costArray[] must be exactly the same as costArrayLinear[].
+ Known exceptions:
+ 1) Dead weak pointers, whose type is CONSTR. These objects are not
+ reachable from any roots.
+ */
+ debugBelch("Comparison:\n");
+ debugBelch("\tcostArrayLinear[] (must be empty) = ");
+ for (i = 0;i < N_CLOSURE_TYPES; i++)
+ if (costArray[i] != costArrayLinear[i])
+ // nothing should be printed except MUT_VAR after major GCs
+ debugBelch("[%u:%u] ", i, costArrayLinear[i]);
+ debugBelch("\n");
+
+ debugBelch("\tsumOfNewCost = %u\n", sumOfNewCost);
+ debugBelch("\tsumOfNewCostExtra = %u\n", sumOfNewCostExtra);
+ debugBelch("\tcostArray[] (must be empty) = ");
+ for (i = 0;i < N_CLOSURE_TYPES; i++)
+ if (costArray[i] != costArrayLinear[i])
+ // nothing should be printed except MUT_VAR after major GCs
+ debugBelch("[%u:%u] ", i, costArray[i]);
+ debugBelch("\n");
+
+ // only for major garbage collection
+ ASSERT(sumOfNewCost + sumOfNewCostExtra == sumOfCostLinear);
+#endif
+
+ // post-processing
+ closeTraverseStack();
+#ifdef DEBUG_RETAINER
+ closeAllRetainerSet();
+#else
+ // Note that there is no post-processing for the retainer sets.
+#endif
+ retainerGeneration++;
+
+ stat_endRP(
+ retainerGeneration - 1, // retainerGeneration has just been incremented!
+#ifdef DEBUG_RETAINER
+ maxCStackSize, maxStackSize,
+#endif
+ (double)timesAnyObjectVisited / numObjectVisited);
+}
+
+/* -----------------------------------------------------------------------------
+ * DEBUGGING CODE
+ * -------------------------------------------------------------------------- */
+
+#ifdef DEBUG_RETAINER
+
+#define LOOKS_LIKE_PTR(r) ((LOOKS_LIKE_STATIC_CLOSURE(r) || \
+ ((HEAP_ALLOCED(r) && ((Bdescr((P_)r)->flags & BF_FREE) == 0)))) && \
+ ((StgWord)(*(StgPtr)r)!=0xaaaaaaaa))
+
+static nat
+sanityCheckHeapClosure( StgClosure *c )
+{
+ StgInfoTable *info;
+
+ ASSERT(LOOKS_LIKE_GHC_INFO(c->header.info));
+ ASSERT(!closure_STATIC(c));
+ ASSERT(LOOKS_LIKE_PTR(c));
+
+ if ((((StgWord)RSET(c) & 1) ^ flip) != 0) {
+ if (get_itbl(c)->type == CONSTR &&
+ !strcmp(get_itbl(c)->prof.closure_type, "DEAD_WEAK") &&
+ !strcmp(get_itbl(c)->prof.closure_desc, "DEAD_WEAK")) {
+ debugBelch("\tUnvisited dead weak pointer object found: c = %p\n", c);
+ costArray[get_itbl(c)->type] += cost(c);
+ sumOfNewCost += cost(c);
+ } else
+ debugBelch(
+ "Unvisited object: flip = %d, c = %p(%d, %s, %s), rs = %p\n",
+ flip, c, get_itbl(c)->type,
+ get_itbl(c)->prof.closure_type, get_itbl(c)->prof.closure_desc,
+ RSET(c));
+ } else {
+ // debugBelch("sanityCheckHeapClosure) S: flip = %d, c = %p(%d), rs = %p\n", flip, c, get_itbl(c)->type, RSET(c));
+ }
+
+ return closure_sizeW(c);
+}
+
+static nat
+heapCheck( bdescr *bd )
+{
+ StgPtr p;
+ static nat costSum, size;
+
+ costSum = 0;
+ while (bd != NULL) {
+ p = bd->start;
+ while (p < bd->free) {
+ size = sanityCheckHeapClosure((StgClosure *)p);
+ sumOfCostLinear += size;
+ costArrayLinear[get_itbl((StgClosure *)p)->type] += size;
+ p += size;
+ // no need for slop check; I think slops are not used currently.
+ }
+ ASSERT(p == bd->free);
+ costSum += bd->free - bd->start;
+ bd = bd->link;
+ }
+
+ return costSum;
+}
+
+static nat
+smallObjectPoolCheck(void)
+{
+ bdescr *bd;
+ StgPtr p;
+ static nat costSum, size;
+
+ bd = small_alloc_list;
+ costSum = 0;
+
+ // first block
+ if (bd == NULL)
+ return costSum;
+
+ p = bd->start;
+ while (p < alloc_Hp) {
+ size = sanityCheckHeapClosure((StgClosure *)p);
+ sumOfCostLinear += size;
+ costArrayLinear[get_itbl((StgClosure *)p)->type] += size;
+ p += size;
+ }
+ ASSERT(p == alloc_Hp);
+ costSum += alloc_Hp - bd->start;
+
+ bd = bd->link;
+ while (bd != NULL) {
+ p = bd->start;
+ while (p < bd->free) {
+ size = sanityCheckHeapClosure((StgClosure *)p);
+ sumOfCostLinear += size;
+ costArrayLinear[get_itbl((StgClosure *)p)->type] += size;
+ p += size;
+ }
+ ASSERT(p == bd->free);
+ costSum += bd->free - bd->start;
+ bd = bd->link;
+ }
+
+ return costSum;
+}
+
+static nat
+chainCheck(bdescr *bd)
+{
+ nat costSum, size;
+
+ costSum = 0;
+ while (bd != NULL) {
+ // bd->free - bd->start is not an accurate measurement of the
+ // object size. Actually it is always zero, so we compute its
+ // size explicitly.
+ size = sanityCheckHeapClosure((StgClosure *)bd->start);
+ sumOfCostLinear += size;
+ costArrayLinear[get_itbl((StgClosure *)bd->start)->type] += size;
+ costSum += size;
+ bd = bd->link;
+ }
+
+ return costSum;
+}
+
+static nat
+checkHeapSanityForRetainerProfiling( void )
+{
+ nat costSum, g, s;
+
+ costSum = 0;
+ debugBelch("START: sumOfCostLinear = %d, costSum = %d\n", sumOfCostLinear, costSum);
+ if (RtsFlags.GcFlags.generations == 1) {
+ costSum += heapCheck(g0s0->to_blocks);
+ debugBelch("heapCheck: sumOfCostLinear = %d, costSum = %d\n", sumOfCostLinear, costSum);
+ costSum += chainCheck(g0s0->large_objects);
+ debugBelch("chainCheck: sumOfCostLinear = %d, costSum = %d\n", sumOfCostLinear, costSum);
+ } else {
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++)
+ for (s = 0; s < generations[g].n_steps; s++) {
+ /*
+ After all live objects have been scavenged, the garbage
+ collector may create some objects in
+	      scheduleFinalizers(). These objects are created through
+ allocate(), so the small object pool or the large object
+ pool of the g0s0 may not be empty.
+ */
+ if (g == 0 && s == 0) {
+ costSum += smallObjectPoolCheck();
+ debugBelch("smallObjectPoolCheck(): sumOfCostLinear = %d, costSum = %d\n", sumOfCostLinear, costSum);
+ costSum += chainCheck(generations[g].steps[s].large_objects);
+ debugBelch("chainCheck(): sumOfCostLinear = %d, costSum = %d\n", sumOfCostLinear, costSum);
+ } else {
+ costSum += heapCheck(generations[g].steps[s].blocks);
+ debugBelch("heapCheck(): sumOfCostLinear = %d, costSum = %d\n", sumOfCostLinear, costSum);
+ costSum += chainCheck(generations[g].steps[s].large_objects);
+ debugBelch("chainCheck(): sumOfCostLinear = %d, costSum = %d\n", sumOfCostLinear, costSum);
+ }
+ }
+ }
+
+ return costSum;
+}
+
+void
+findPointer(StgPtr p)
+{
+ StgPtr q, r, e;
+ bdescr *bd;
+ nat g, s;
+
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ // if (g == 0 && s == 0) continue;
+ bd = generations[g].steps[s].blocks;
+ for (; bd; bd = bd->link) {
+ for (q = bd->start; q < bd->free; q++) {
+ if (*q == (StgWord)p) {
+ r = q;
+ while (!LOOKS_LIKE_GHC_INFO(*r)) r--;
+ debugBelch("Found in gen[%d], step[%d]: q = %p, r = %p\n", g, s, q, r);
+ // return;
+ }
+ }
+ }
+ bd = generations[g].steps[s].large_objects;
+ for (; bd; bd = bd->link) {
+ e = bd->start + cost((StgClosure *)bd->start);
+ for (q = bd->start; q < e; q++) {
+ if (*q == (StgWord)p) {
+ r = q;
+ while (*r == 0 || !LOOKS_LIKE_GHC_INFO(*r)) r--;
+ debugBelch("Found in gen[%d], large_objects: %p\n", g, r);
+ // return;
+ }
+ }
+ }
+ }
+ }
+}
+
+static void
+belongToHeap(StgPtr p)
+{
+ bdescr *bd;
+ nat g, s;
+
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ // if (g == 0 && s == 0) continue;
+ bd = generations[g].steps[s].blocks;
+ for (; bd; bd = bd->link) {
+ if (bd->start <= p && p < bd->free) {
+ debugBelch("Belongs to gen[%d], step[%d]", g, s);
+ return;
+ }
+ }
+ bd = generations[g].steps[s].large_objects;
+ for (; bd; bd = bd->link) {
+ if (bd->start <= p && p < bd->start + getHeapClosureSize((StgClosure *)bd->start)) {
+ debugBelch("Found in gen[%d], large_objects: %p\n", g, bd->start);
+ return;
+ }
+ }
+ }
+ }
+}
+#endif /* DEBUG_RETAINER */
+
+#endif /* PROFILING */
diff --git a/rts/RetainerProfile.h b/rts/RetainerProfile.h
new file mode 100644
index 0000000000..827daa8ef4
--- /dev/null
+++ b/rts/RetainerProfile.h
@@ -0,0 +1,47 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001
+ * Author: Sungwoo Park
+ *
+ * Retainer profiling interface.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef RETAINERPROFILE_H
+#define RETAINERPROFILE_H
+
+#ifdef PROFILING
+
+#include "RetainerSet.h"
+
+extern void initRetainerProfiling ( void );
+extern void endRetainerProfiling ( void );
+extern void printRetainer ( FILE *, retainer );
+extern void retainerProfile ( void );
+extern void resetStaticObjectForRetainerProfiling ( void );
+
+extern StgWord RTS_VAR(flip);
+
+// extract the retainer set field from c
+#define RSET(c) ((c)->header.prof.hp.rs)
+
+#define isRetainerSetFieldValid(c) \
+ ((((StgWord)(c)->header.prof.hp.rs & 1) ^ flip) == 0)
+
+static inline RetainerSet *
+retainerSetOf( StgClosure *c )
+{
+ ASSERT( isRetainerSetFieldValid(c) );
+ // StgWord has the same size as pointers, so the following type
+ // casting is okay.
+ return (RetainerSet *)((StgWord)RSET(c) ^ flip);
+}
+
+// Used by Storage.c:memInventory()
+#ifdef DEBUG
+extern lnat retainerStackBlocks ( void );
+#endif
+
+#endif /* PROFILING */
+
+#endif /* RETAINERPROFILE_H */
diff --git a/rts/RetainerSet.c b/rts/RetainerSet.c
new file mode 100644
index 0000000000..bfa0bc8acf
--- /dev/null
+++ b/rts/RetainerSet.c
@@ -0,0 +1,498 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001
+ * Author: Sungwoo Park
+ *
+ * Retainer set implementation for retainer profiling (see RetainerProfile.c)
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifdef PROFILING
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "Stats.h"
+#include "RtsUtils.h"
+#include "RetainerSet.h"
+#include "Arena.h"
+#include "Profiling.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#define HASH_TABLE_SIZE 255
+#define hash(hk) (hk % HASH_TABLE_SIZE)
+static RetainerSet *hashTable[HASH_TABLE_SIZE];
+
+static Arena *arena; // arena in which we store retainer sets
+
+static int nextId; // id of next retainer set
+
+/* -----------------------------------------------------------------------------
+ * rs_MANY is a distinguished retainer set, such that
+ *
+ * isMember(e, rs_MANY) = True
+ *
+ * addElement(e, rs) = rs_MANY, if rs->num >= maxRetainerSetSize
+ * addElement(e, rs_MANY) = rs_MANY
+ *
+ * The point of rs_MANY is to keep the total number of retainer sets
+ * from growing too large.
+ * -------------------------------------------------------------------------- */
+RetainerSet rs_MANY = {
+ num : 0,
+ hashKey : 0,
+ link : NULL,
+ id : 1,
+ element : {}
+};
+
+/* -----------------------------------------------------------------------------
+ * calculate the size of a RetainerSet structure
+ * -------------------------------------------------------------------------- */
+STATIC_INLINE size_t
+sizeofRetainerSet( int elems )
+{
+ return (sizeof(RetainerSet) + elems * sizeof(retainer));
+}
+
+/* -----------------------------------------------------------------------------
+ * Creates the first pool and initializes hashTable[].
+ * Frees all pools if any.
+ * -------------------------------------------------------------------------- */
+void
+initializeAllRetainerSet(void)
+{
+ int i;
+
+ arena = newArena();
+
+ for (i = 0; i < HASH_TABLE_SIZE; i++)
+ hashTable[i] = NULL;
+ nextId = 2; // Initial value must be positive, 2 is MANY.
+}
+
+/* -----------------------------------------------------------------------------
+ * Refreshes all pools for reuse and initializes hashTable[].
+ * -------------------------------------------------------------------------- */
+void
+refreshAllRetainerSet(void)
+{
+#ifdef FIRST_APPROACH
+ int i;
+
+ // first approach: completely refresh
+ arenaFree(arena);
+ arena = newArena();
+
+ for (i = 0; i < HASH_TABLE_SIZE; i++)
+ hashTable[i] = NULL;
+ nextId = 2;
+#endif /* FIRST_APPROACH */
+}
+
+/* -----------------------------------------------------------------------------
+ * Frees all pools.
+ * -------------------------------------------------------------------------- */
+void
+closeAllRetainerSet(void)
+{
+ arenaFree(arena);
+}
+
+/* -----------------------------------------------------------------------------
+ * Finds or creates if needed a singleton retainer set.
+ * -------------------------------------------------------------------------- */
+RetainerSet *
+singleton(retainer r)
+{
+ RetainerSet *rs;
+ StgWord hk;
+
+ hk = hashKeySingleton(r);
+ for (rs = hashTable[hash(hk)]; rs != NULL; rs = rs->link)
+ if (rs->num == 1 && rs->element[0] == r) return rs; // found it
+
+ // create it
+ rs = arenaAlloc( arena, sizeofRetainerSet(1) );
+ rs->num = 1;
+ rs->hashKey = hk;
+ rs->link = hashTable[hash(hk)];
+ rs->id = nextId++;
+ rs->element[0] = r;
+
+ // The new retainer set is placed at the head of the linked list.
+ hashTable[hash(hk)] = rs;
+
+ return rs;
+}
+
+/* -----------------------------------------------------------------------------
+ * Finds or creates a retainer set *rs augmented with r.
+ * Invariants:
+ * r is not a member of rs, i.e., isMember(r, rs) returns rtsFalse.
+ * rs is not NULL.
+ * Note:
+ * We could check if rs is NULL, in which case this function call
+ * reverts to singleton(). We do not choose this strategy because
+ * in most cases addElement() is invoked with non-NULL rs.
+ * -------------------------------------------------------------------------- */
+RetainerSet *
+addElement(retainer r, RetainerSet *rs)
+{
+ nat i;
+ nat nl; // Number of retainers in *rs Less than r
+ RetainerSet *nrs; // New Retainer Set
+ StgWord hk; // Hash Key
+
+#ifdef DEBUG_RETAINER
+ // debugBelch("addElement(%p, %p) = ", r, rs);
+#endif
+
+ ASSERT(rs != NULL);
+ ASSERT(rs->num <= RtsFlags.ProfFlags.maxRetainerSetSize);
+
+ if (rs == &rs_MANY || rs->num == RtsFlags.ProfFlags.maxRetainerSetSize) {
+ return &rs_MANY;
+ }
+
+ ASSERT(!isMember(r, rs));
+
+ for (nl = 0; nl < rs->num; nl++)
+ if (r < rs->element[nl]) break;
+ // Now nl is the index for r into the new set.
+ // Also it denotes the number of retainers less than r in *rs.
+ // Thus, compare the first nl retainers, then r itself, and finally the
+ // remaining (rs->num - nl) retainers.
+
+ hk = hashKeyAddElement(r, rs);
+ for (nrs = hashTable[hash(hk)]; nrs != NULL; nrs = nrs->link) {
+ // test *rs and *nrs for equality
+
+ // check their size
+ if (rs->num + 1 != nrs->num) continue;
+
+ // compare the first nl retainers and find the first non-matching one.
+ for (i = 0; i < nl; i++)
+ if (rs->element[i] != nrs->element[i]) break;
+ if (i < nl) continue;
+
+ // compare r itself
+ if (r != nrs->element[i]) continue; // i == nl
+
+ // compare the remaining retainers
+ for (; i < rs->num; i++)
+ if (rs->element[i] != nrs->element[i + 1]) break;
+ if (i < rs->num) continue;
+
+#ifdef DEBUG_RETAINER
+ // debugBelch("%p\n", nrs);
+#endif
+ // The set we are seeking already exists!
+ return nrs;
+ }
+
+ // create a new retainer set
+ nrs = arenaAlloc( arena, sizeofRetainerSet(rs->num + 1) );
+ nrs->num = rs->num + 1;
+ nrs->hashKey = hk;
+ nrs->link = hashTable[hash(hk)];
+ nrs->id = nextId++;
+ for (i = 0; i < nl; i++) { // copy the first nl retainers
+ nrs->element[i] = rs->element[i];
+ }
+ nrs->element[i] = r; // copy r
+ for (; i < rs->num; i++) { // copy the remaining retainers
+ nrs->element[i + 1] = rs->element[i];
+ }
+
+ hashTable[hash(hk)] = nrs;
+
+#ifdef DEBUG_RETAINER
+ // debugBelch("%p\n", nrs);
+#endif
+ return nrs;
+}
+
+/* -----------------------------------------------------------------------------
+ * Call f() for each retainer set.
+ * -------------------------------------------------------------------------- */
+void
+traverseAllRetainerSet(void (*f)(RetainerSet *))
+{
+ int i;
+ RetainerSet *rs;
+
+ (*f)(&rs_MANY);
+ for (i = 0; i < HASH_TABLE_SIZE; i++)
+ for (rs = hashTable[i]; rs != NULL; rs = rs->link)
+ (*f)(rs);
+}
+
+
+/* -----------------------------------------------------------------------------
+ * printRetainer() prints the full information on a given retainer,
+ * not a retainer set.
+ * -------------------------------------------------------------------------- */
+#if defined(RETAINER_SCHEME_INFO)
+// Retainer scheme 1: retainer = info table
+void
+printRetainer(FILE *f, retainer itbl)
+{
+ fprintf(f, "%s[%s]", itbl->prof.closure_desc, itbl->prof.closure_type);
+}
+#elif defined(RETAINER_SCHEME_CCS)
+// Retainer scheme 2: retainer = cost centre stack
+void
+printRetainer(FILE *f, retainer ccs)
+{
+ fprintCCS(f, ccs);
+}
+#elif defined(RETAINER_SCHEME_CC)
+// Retainer scheme 3: retainer = cost centre
+void
+printRetainer(FILE *f, retainer cc)
+{
+ fprintf(f,"%s.%s", cc->module, cc->label);
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ * printRetainerSetShort() should always display the same output for
+ * a given retainer set regardless of the time of invocation.
+ * -------------------------------------------------------------------------- */
+#ifdef SECOND_APPROACH
+#if defined(RETAINER_SCHEME_INFO)
+// Retainer scheme 1: retainer = info table
+void
+printRetainerSetShort(FILE *f, RetainerSet *rs)
+{
+#define MAX_RETAINER_SET_SPACE 24
+ char tmp[MAX_RETAINER_SET_SPACE + 1];
+ int size;
+ nat j;
+
+ ASSERT(rs->id < 0);
+
+ tmp[MAX_RETAINER_SET_SPACE] = '\0';
+
+ // No blank characters are allowed.
+ sprintf(tmp + 0, "(%d)", -(rs->id));
+ size = strlen(tmp);
+ ASSERT(size < MAX_RETAINER_SET_SPACE);
+
+ for (j = 0; j < rs->num; j++) {
+ if (j < rs->num - 1) {
+ strncpy(tmp + size, rs->element[j]->prof.closure_desc, MAX_RETAINER_SET_SPACE - size);
+ size = strlen(tmp);
+ if (size == MAX_RETAINER_SET_SPACE)
+ break;
+ strncpy(tmp + size, ",", MAX_RETAINER_SET_SPACE - size);
+ size = strlen(tmp);
+ if (size == MAX_RETAINER_SET_SPACE)
+ break;
+ }
+ else {
+ strncpy(tmp + size, rs->element[j]->prof.closure_desc, MAX_RETAINER_SET_SPACE - size);
+ // size = strlen(tmp);
+ }
+ }
+ fprintf(f, tmp);
+}
+#elif defined(RETAINER_SCHEME_CC)
+// Retainer scheme 3: retainer = cost centre
+void
+printRetainerSetShort(FILE *f, RetainerSet *rs)
+{
+#define MAX_RETAINER_SET_SPACE 24
+ char tmp[MAX_RETAINER_SET_SPACE + 1];
+ int size;
+ nat j;
+
+}
+#elif defined(RETAINER_SCHEME_CCS)
+// Retainer scheme 2: retainer = cost centre stack
+void
+printRetainerSetShort(FILE *f, RetainerSet *rs)
+{
+#define MAX_RETAINER_SET_SPACE 24
+ char tmp[MAX_RETAINER_SET_SPACE + 1];
+ int size;
+ nat j;
+
+ ASSERT(rs->id < 0);
+
+ tmp[MAX_RETAINER_SET_SPACE] = '\0';
+
+ // No blank characters are allowed.
+ sprintf(tmp + 0, "(%d)", -(rs->id));
+ size = strlen(tmp);
+ ASSERT(size < MAX_RETAINER_SET_SPACE);
+
+ for (j = 0; j < rs->num; j++) {
+ if (j < rs->num - 1) {
+ strncpy(tmp + size, rs->element[j]->cc->label, MAX_RETAINER_SET_SPACE - size);
+ size = strlen(tmp);
+ if (size == MAX_RETAINER_SET_SPACE)
+ break;
+ strncpy(tmp + size, ",", MAX_RETAINER_SET_SPACE - size);
+ size = strlen(tmp);
+ if (size == MAX_RETAINER_SET_SPACE)
+ break;
+ }
+ else {
+ strncpy(tmp + size, rs->element[j]->cc->label, MAX_RETAINER_SET_SPACE - size);
+ // size = strlen(tmp);
+ }
+ }
+ fprintf(f, tmp);
+}
+#elif defined(RETAINER_SCHEME_CC)
+// Retainer scheme 3: retainer = cost centre
+static void
+printRetainerSetShort(FILE *f, retainerSet *rs)
+{
+#define MAX_RETAINER_SET_SPACE 24
+ char tmp[MAX_RETAINER_SET_SPACE + 1];
+ int size;
+ nat j;
+
+ ASSERT(rs->id < 0);
+
+ tmp[MAX_RETAINER_SET_SPACE] = '\0';
+
+ // No blank characters are allowed.
+ sprintf(tmp + 0, "(%d)", -(rs->id));
+ size = strlen(tmp);
+ ASSERT(size < MAX_RETAINER_SET_SPACE);
+
+ for (j = 0; j < rs->num; j++) {
+ if (j < rs->num - 1) {
+ strncpy(tmp + size, rs->element[j]->label,
+ MAX_RETAINER_SET_SPACE - size);
+ size = strlen(tmp);
+ if (size == MAX_RETAINER_SET_SPACE)
+ break;
+ strncpy(tmp + size, ",", MAX_RETAINER_SET_SPACE - size);
+ size = strlen(tmp);
+ if (size == MAX_RETAINER_SET_SPACE)
+ break;
+ }
+ else {
+ strncpy(tmp + size, rs->element[j]->label,
+ MAX_RETAINER_SET_SPACE - size);
+ // size = strlen(tmp);
+ }
+ }
+ fprintf(f, tmp);
+/*
+ #define MAX_RETAINER_SET_SPACE 24
+ #define DOT_NUMBER 3
+ // 1. 32 > MAX_RETAINER_SET_SPACE + 1 (1 for '\0')
+ // 2. (MAX_RETAINER_SET_SPACE - DOT_NUMBER ) characters should be enough for
+ // printing one natural number (plus '(' and ')').
+ char tmp[32];
+ int size, ts;
+ nat j;
+
+ ASSERT(rs->id < 0);
+
+ // No blank characters are allowed.
+ sprintf(tmp + 0, "(%d)", -(rs->id));
+ size = strlen(tmp);
+ ASSERT(size < MAX_RETAINER_SET_SPACE - DOT_NUMBER);
+
+ for (j = 0; j < rs->num; j++) {
+ ts = strlen(rs->element[j]->label);
+ if (j < rs->num - 1) {
+ if (size + ts + 1 > MAX_RETAINER_SET_SPACE - DOT_NUMBER) {
+ sprintf(tmp + size, "...");
+ break;
+ }
+ sprintf(tmp + size, "%s,", rs->element[j]->label);
+ size += ts + 1;
+ }
+ else {
+ if (size + ts > MAX_RETAINER_SET_SPACE - DOT_NUMBER) {
+ sprintf(tmp + size, "...");
+ break;
+ }
+ sprintf(tmp + size, "%s", rs->element[j]->label);
+ size += ts;
+ }
+ }
+ fprintf(f, tmp);
+*/
+}
+#endif /* RETAINER_SCHEME_CC */
+#endif /* SECOND_APPROACH */
+
+/* -----------------------------------------------------------------------------
+ * Dump the contents of each retainer set into the log file at the end
+ * of the run, so the user can find out for a given retainer set ID
+ * the full contents of that set.
+ * --------------------------------------------------------------------------- */
+#ifdef SECOND_APPROACH
+void
+outputAllRetainerSet(FILE *prof_file)
+{
+ nat i, j;
+ nat numSet;
+ RetainerSet *rs, **rsArray, *tmp;
+
+ // find out the number of retainer sets which have had a non-zero cost at
+ // least once during retainer profiling
+ numSet = 0;
+ for (i = 0; i < HASH_TABLE_SIZE; i++)
+ for (rs = hashTable[i]; rs != NULL; rs = rs->link) {
+ if (rs->id < 0)
+ numSet++;
+ }
+
+ if (numSet == 0) // retainer profiling was not done at all.
+ return;
+
+ // allocate memory
+ rsArray = stgMallocBytes(numSet * sizeof(RetainerSet *),
+ "outputAllRetainerSet()");
+
+ // prepare for sorting
+ j = 0;
+ for (i = 0; i < HASH_TABLE_SIZE; i++)
+ for (rs = hashTable[i]; rs != NULL; rs = rs->link) {
+ if (rs->id < 0) {
+ rsArray[j] = rs;
+ j++;
+ }
+ }
+
+ ASSERT(j == numSet);
+
+ // sort rsArray[] according to the id of each retainer set
+ for (i = numSet - 1; i > 0; i--) {
+ for (j = 0; j <= i - 1; j++) {
+ // if (-(rsArray[j]->id) < -(rsArray[j + 1]->id))
+ if (rsArray[j]->id < rsArray[j + 1]->id) {
+ tmp = rsArray[j];
+ rsArray[j] = rsArray[j + 1];
+ rsArray[j + 1] = tmp;
+ }
+ }
+ }
+
+ fprintf(prof_file, "\nRetainer sets created during profiling:\n");
+ for (i = 0;i < numSet; i++) {
+ fprintf(prof_file, "SET %u = {", -(rsArray[i]->id));
+ for (j = 0; j < rsArray[i]->num - 1; j++) {
+ printRetainer(prof_file, rsArray[i]->element[j]);
+ fprintf(prof_file, ", ");
+ }
+ printRetainer(prof_file, rsArray[i]->element[j]);
+ fprintf(prof_file, "}\n");
+ }
+
+ stgFree(rsArray);
+}
+#endif /* SECOND_APPROACH */
+
+#endif /* PROFILING */
diff --git a/rts/RetainerSet.h b/rts/RetainerSet.h
new file mode 100644
index 0000000000..6a00e1395e
--- /dev/null
+++ b/rts/RetainerSet.h
@@ -0,0 +1,201 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001
+ * Author: Sungwoo Park
+ *
+ * Retainer set interface for retainer profiling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef RETAINERSET_H
+#define RETAINERSET_H
+
+#include <stdio.h>
+
+#ifdef PROFILING
+
+/*
+ Type 'retainer' defines the retainer identity.
+
+ Invariant:
+ 1. The retainer identity of a given retainer cannot change during
+ program execution, no matter where it is actually stored.
+ For instance, the memory address of a retainer cannot be used as
+ its retainer identity because its location may change during garbage
+ collections.
+ 2. Type 'retainer' must come with comparison operations as well as
+ an equality operation. That is, <, >, and == must be supported -
+ this is necessary to store retainers in a sorted order in retainer sets.
+ Therefore, you cannot use a huge structure type as 'retainer', for instance.
+
+ We illustrate three possibilities of defining 'retainer identity'.
+ Choose one of the following three compiler directives:
+
+ Retainer scheme 1 (RETAINER_SCHEME_INFO) : retainer = info table
+ Retainer scheme 2 (RETAINER_SCHEME_CCS) : retainer = cost centre stack
+ Retainer scheme 3 (RETAINER_SCHEME_CC) : retainer = cost centre
+*/
+
+// #define RETAINER_SCHEME_INFO
+#define RETAINER_SCHEME_CCS
+// #define RETAINER_SCHEME_CC
+
+#ifdef RETAINER_SCHEME_INFO
+struct _StgInfoTable;
+typedef struct _StgInfoTable *retainer;
+#endif
+
+#ifdef RETAINER_SCHEME_CCS
+typedef CostCentreStack *retainer;
+#endif
+
+#ifdef RETAINER_SCHEME_CC
+typedef CostCentre *retainer;
+#endif
+
+/*
+ Type 'retainerSet' defines an abstract datatype for sets of retainers.
+
+ Invariants:
+ A retainer set stores its elements in increasing order (in element[] array).
+ */
+
+typedef struct _RetainerSet {
+ nat num; // number of elements
+ StgWord hashKey; // hash key for this retainer set
+ struct _RetainerSet *link; // link to the next retainer set in the bucket
+ int id; // unique id of this retainer set (used when printing)
+ // Its absolute value is interpreted as its true id; if id is
+ // negative, it indicates that this retainer set has had a positive
+ // cost after some retainer profiling.
+ retainer element[0]; // elements of this retainer set
+ // do not put anything below here!
+} RetainerSet;
+
+/*
+ Note:
+ There are two ways of maintaining all retainer sets. The first is simply by
+ freeing all the retainer sets and re-initialize the hash table at each
+ retainer profiling. The second is by setting the cost field of each
+ retainer set. The second is preferred to the first if most retainer sets
+ are likely to be observed again during the next retainer profiling. Note
+ that in the first approach, we do not free the memory allocated for
+ retainer sets; we just invalidate all retainer sets.
+ */
+#ifdef DEBUG_RETAINER
+// In this case, FIRST_APPROACH must be turned on because the memory pool
+// for retainer sets is freed each time.
+#define FIRST_APPROACH
+#else
+// #define FIRST_APPROACH
+#define SECOND_APPROACH
+#endif
+
+// Creates the first pool and initializes a hash table. Frees all pools if any.
+void initializeAllRetainerSet(void);
+
+// Refreshes all pools for reuse and initializes a hash table.
+void refreshAllRetainerSet(void);
+
+// Frees all pools.
+void closeAllRetainerSet(void);
+
+// Finds or creates if needed a singleton retainer set.
+RetainerSet *singleton(retainer r);
+
+extern RetainerSet rs_MANY;
+
+// Checks if a given retainer is a member of the retainer set.
+//
+// Note & (maybe) Todo:
+// This function needs to be declared as an inline function, so it is declared
+// as an inline static function here.
+// This makes the interface really bad, but isMember() returns a value, so
+// it is not easy either to write it as a macro (due to my lack of C
+// programming experience). Sungwoo
+//
+// rtsBool isMember(retainer, retainerSet *);
+/*
+ Returns rtsTrue if r is a member of *rs.
+ Invariants:
+ rs is not NULL.
+ Note:
+ The efficiency of this function is subject to the typical size of
+ retainer sets. If it is small, linear scan is better. If it
+ is large in most cases, binary scan is better.
+ The current implementation mixes the two search strategies.
+ */
+
+#define BINARY_SEARCH_THRESHOLD 8
+INLINE_HEADER rtsBool
+isMember(retainer r, RetainerSet *rs)
+{
+ int i, left, right; // must be int, not nat (because -1 can appear)
+ retainer ri;
+
+ if (rs == &rs_MANY) { return rtsTrue; }
+
+ if (rs->num < BINARY_SEARCH_THRESHOLD) {
+ for (i = 0; i < (int)rs->num; i++) {
+ ri = rs->element[i];
+ if (r == ri) return rtsTrue;
+ else if (r < ri) return rtsFalse;
+ }
+ } else {
+ left = 0;
+ right = rs->num - 1;
+ while (left <= right) {
+ i = (left + right) / 2;
+ ri = rs->element[i];
+ if (r == ri) return rtsTrue;
+ else if (r < ri) right = i - 1;
+ else left = i + 1;
+ }
+ }
+ return rtsFalse;
+}
+
+// Finds or creates a retainer set augmented with a new retainer.
+RetainerSet *addElement(retainer, RetainerSet *);
+
+// Call f() for each retainer set.
+void traverseAllRetainerSet(void (*f)(RetainerSet *));
+
+#ifdef SECOND_APPROACH
+// Prints a single retainer set.
+void printRetainerSetShort(FILE *, RetainerSet *);
+#endif
+
+// Print the statistics on all the retainer sets.
+// store the sum of all costs and the number of all retainer sets.
+void outputRetainerSet(FILE *, nat *, nat *);
+
+#ifdef SECOND_APPROACH
+// Print all retainer sets at the exit of the program.
+void outputAllRetainerSet(FILE *);
+#endif
+
+// Hashing functions
+/*
+ Invariants:
+ Once either initializeAllRetainerSet() or refreshAllRetainerSet()
+ is called, there exists only one copy of any retainer set created
+ through singleton() and addElement(). The pool (the storage for
+ retainer sets) is consumed linearly. All the retainer sets of the
+ same hash function value are linked together from an element in
+ hashTable[]. See the invariants of allocateInPool() for the
+ maximum size of retainer sets. The hashing function is defined by
+ hashKeySingleton() and hashKeyAddElement(). The hash key for a set
+ must be unique regardless of the order its elements are inserted,
+ i.e., the hashing function must be additive(?).
+*/
+#define hashKeySingleton(r) ((StgWord)(r))
+#define hashKeyAddElement(r, s) (hashKeySingleton((r)) + (s)->hashKey)
+
+// Prints the full information on a given retainer.
+// Note: This function is not part of retainerSet interface, but this is
+// the best place to define it.
+void printRetainer(FILE *, retainer);
+
+#endif /* PROFILING */
+#endif /* RETAINERSET_H */
diff --git a/rts/RtsAPI.c b/rts/RtsAPI.c
new file mode 100644
index 0000000000..b1b1d9c52d
--- /dev/null
+++ b/rts/RtsAPI.c
@@ -0,0 +1,597 @@
+/* ----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2001
+ *
+ * API for invoking Haskell functions via the RTS
+ *
+ * --------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "OSThreads.h"
+#include "Storage.h"
+#include "RtsAPI.h"
+#include "SchedAPI.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Prelude.h"
+#include "Schedule.h"
+#include "Capability.h"
+
+#include <stdlib.h>
+
+/* ----------------------------------------------------------------------------
+ Building Haskell objects from C datatypes.
+ ------------------------------------------------------------------------- */
+HaskellObj
+rts_mkChar (Capability *cap, HsChar c)
+{
+ StgClosure *p = (StgClosure *)allocateLocal(cap, CONSTR_sizeW(0,1));
+ SET_HDR(p, Czh_con_info, CCS_SYSTEM);
+ p->payload[0] = (StgClosure *)(StgWord)(StgChar)c;
+ return p;
+}
+
+HaskellObj
+rts_mkInt (Capability *cap, HsInt i)
+{
+ StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+ SET_HDR(p, Izh_con_info, CCS_SYSTEM);
+ p->payload[0] = (StgClosure *)(StgInt)i;
+ return p;
+}
+
+HaskellObj
+rts_mkInt8 (Capability *cap, HsInt8 i)
+{
+ StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+ SET_HDR(p, I8zh_con_info, CCS_SYSTEM);
+ /* Make sure we mask out the bits above the lowest 8 */
+ p->payload[0] = (StgClosure *)(StgInt)((unsigned)i & 0xff);
+ return p;
+}
+
+HaskellObj
+rts_mkInt16 (Capability *cap, HsInt16 i)
+{
+ StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+ SET_HDR(p, I16zh_con_info, CCS_SYSTEM);
+ /* Make sure we mask out the relevant bits */
+ p->payload[0] = (StgClosure *)(StgInt)((unsigned)i & 0xffff);
+ return p;
+}
+
+HaskellObj
+rts_mkInt32 (Capability *cap, HsInt32 i)
+{
+ StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
+ SET_HDR(p, I32zh_con_info, CCS_SYSTEM);
+ p->payload[0] = (StgClosure *)(StgInt)((unsigned)i & 0xffffffff);
+ return p;
+}
+
HaskellObj
rts_mkInt64 (Capability *cap, HsInt64 i)
{
  llong *tmp;
  // two payload words: a 64-bit value may span two machine words
  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,2));
  SET_HDR(p, I64zh_con_info, CCS_SYSTEM);
  // Store the 64-bit value through an llong pointer into the payload.
  // NOTE(review): assumes the payload is sufficiently aligned for a
  // 64-bit store -- TODO confirm on 32-bit targets.
  tmp = (llong*)&(p->payload[0]);
  *tmp = (StgInt64)i;
  return p;
}

HaskellObj
rts_mkWord (Capability *cap, HsWord i)
{
  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
  SET_HDR(p, Wzh_con_info, CCS_SYSTEM);
  p->payload[0] = (StgClosure *)(StgWord)i;
  return p;
}

HaskellObj
rts_mkWord8 (Capability *cap, HsWord8 w)
{
  /* see rts_mkInt* comments: mask down to the low 8 bits */
  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
  SET_HDR(p, W8zh_con_info, CCS_SYSTEM);
  p->payload[0] = (StgClosure *)(StgWord)(w & 0xff);
  return p;
}

HaskellObj
rts_mkWord16 (Capability *cap, HsWord16 w)
{
  /* see rts_mkInt* comments: mask down to the low 16 bits */
  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
  SET_HDR(p, W16zh_con_info, CCS_SYSTEM);
  p->payload[0] = (StgClosure *)(StgWord)(w & 0xffff);
  return p;
}

HaskellObj
rts_mkWord32 (Capability *cap, HsWord32 w)
{
  /* see rts_mkInt* comments: mask down to the low 32 bits */
  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
  SET_HDR(p, W32zh_con_info, CCS_SYSTEM);
  p->payload[0] = (StgClosure *)(StgWord)(w & 0xffffffff);
  return p;
}

HaskellObj
rts_mkWord64 (Capability *cap, HsWord64 w)
{
  ullong *tmp;

  // two payload words, as in rts_mkInt64
  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,2));
  /* see mk_Int8 comment */
  SET_HDR(p, W64zh_con_info, CCS_SYSTEM);
  // NOTE(review): same alignment assumption as rts_mkInt64.
  tmp = (ullong*)&(p->payload[0]);
  *tmp = (StgWord64)w;
  return p;
}
+
HaskellObj
rts_mkFloat (Capability *cap, HsFloat f)
{
  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,1));
  SET_HDR(p, Fzh_con_info, CCS_SYSTEM);
  // ASSIGN_FLT writes the float value into the payload word(s)
  ASSIGN_FLT((P_)p->payload, (StgFloat)f);
  return p;
}

HaskellObj
rts_mkDouble (Capability *cap, HsDouble d)
{
  // payload sized for a full StgDouble (may be more than one word)
  StgClosure *p = (StgClosure *)allocateLocal(cap,CONSTR_sizeW(0,sizeofW(StgDouble)));
  SET_HDR(p, Dzh_con_info, CCS_SYSTEM);
  ASSIGN_DBL((P_)p->payload, (StgDouble)d);
  return p;
}

HaskellObj
rts_mkStablePtr (Capability *cap, HsStablePtr s)
{
  // header plus one payload word holding the stable pointer
  StgClosure *p = (StgClosure *)allocateLocal(cap,sizeofW(StgHeader)+1);
  SET_HDR(p, StablePtr_con_info, CCS_SYSTEM);
  p->payload[0] = (StgClosure *)s;
  return p;
}

HaskellObj
rts_mkPtr (Capability *cap, HsPtr a)
{
  // header plus one payload word holding the raw address
  StgClosure *p = (StgClosure *)allocateLocal(cap,sizeofW(StgHeader)+1);
  SET_HDR(p, Ptr_con_info, CCS_SYSTEM);
  p->payload[0] = (StgClosure *)a;
  return p;
}

HaskellObj
rts_mkFunPtr (Capability *cap, HsFunPtr a)
{
  // header plus one payload word holding the function address
  StgClosure *p = (StgClosure *)allocateLocal(cap,sizeofW(StgHeader)+1);
  SET_HDR(p, FunPtr_con_info, CCS_SYSTEM);
  p->payload[0] = (StgClosure *)a;
  return p;
}
+
+HaskellObj
+rts_mkBool (Capability *cap STG_UNUSED, HsBool b)
+{
+ if (b) {
+ return (StgClosure *)True_closure;
+ } else {
+ return (StgClosure *)False_closure;
+ }
+}
+
+HaskellObj
+rts_mkString (Capability *cap, char *s)
+{
+ return rts_apply(cap, (StgClosure *)unpackCString_closure, rts_mkPtr(cap,s));
+}
+
+HaskellObj
+rts_apply (Capability *cap, HaskellObj f, HaskellObj arg)
+{
+ StgThunk *ap;
+
+ ap = (StgThunk *)allocateLocal(cap,sizeofW(StgThunk) + 2);
+ SET_HDR(ap, (StgInfoTable *)&stg_ap_2_upd_info, CCS_SYSTEM);
+ ap->payload[0] = f;
+ ap->payload[1] = arg;
+ return (StgClosure *)ap;
+}
+
/* ----------------------------------------------------------------------------
   Deconstructing Haskell objects

   We would like to assert that we have the right kind of object in
   each case, but this is problematic because in GHCi the info table
   for the D# constructor (say) might be dynamically loaded.  Hence we
   omit these assertions for now.
   ------------------------------------------------------------------------- */

HsChar
rts_getChar (HaskellObj p)
{
  // See comment above:
  // ASSERT(p->header.info == Czh_con_info ||
  //        p->header.info == Czh_static_info);
  // payload word holds the character; narrow it back to StgChar
  return (StgChar)(StgWord)(p->payload[0]);
}

HsInt
rts_getInt (HaskellObj p)
{
  // See comment above:
  // ASSERT(p->header.info == Izh_con_info ||
  //        p->header.info == Izh_static_info);
  return (HsInt)(p->payload[0]);
}

HsInt8
rts_getInt8 (HaskellObj p)
{
  // See comment above:
  // ASSERT(p->header.info == I8zh_con_info ||
  //        p->header.info == I8zh_static_info);
  // narrow via HsInt, mirroring how rts_mkInt8 stored the value
  return (HsInt8)(HsInt)(p->payload[0]);
}

HsInt16
rts_getInt16 (HaskellObj p)
{
  // See comment above:
  // ASSERT(p->header.info == I16zh_con_info ||
  //        p->header.info == I16zh_static_info);
  return (HsInt16)(HsInt)(p->payload[0]);
}

HsInt32
rts_getInt32 (HaskellObj p)
{
  // See comment above:
  // ASSERT(p->header.info == I32zh_con_info ||
  //        p->header.info == I32zh_static_info);
  return (HsInt32)(HsInt)(p->payload[0]);
}

HsInt64
rts_getInt64 (HaskellObj p)
{
  HsInt64* tmp;
  // See comment above:
  // ASSERT(p->header.info == I64zh_con_info ||
  //        p->header.info == I64zh_static_info);
  // read the 64-bit value back through a pointer, as rts_mkInt64 wrote it
  tmp = (HsInt64*)&(p->payload[0]);
  return *tmp;
}
HsWord
rts_getWord (HaskellObj p)
{
  // See comment above:
  // ASSERT(p->header.info == Wzh_con_info ||
  //        p->header.info == Wzh_static_info);
  return (HsWord)(p->payload[0]);
}

HsWord8
rts_getWord8 (HaskellObj p)
{
  // See comment above:
  // ASSERT(p->header.info == W8zh_con_info ||
  //        p->header.info == W8zh_static_info);
  // narrow via HsWord, mirroring how rts_mkWord8 stored the value
  return (HsWord8)(HsWord)(p->payload[0]);
}

HsWord16
rts_getWord16 (HaskellObj p)
{
  // See comment above:
  // ASSERT(p->header.info == W16zh_con_info ||
  //        p->header.info == W16zh_static_info);
  return (HsWord16)(HsWord)(p->payload[0]);
}

HsWord32
rts_getWord32 (HaskellObj p)
{
  // See comment above:
  // ASSERT(p->header.info == W32zh_con_info ||
  //        p->header.info == W32zh_static_info);
  return (HsWord32)(HsWord)(p->payload[0]);
}


HsWord64
rts_getWord64 (HaskellObj p)
{
  HsWord64* tmp;
  // See comment above:
  // ASSERT(p->header.info == W64zh_con_info ||
  //        p->header.info == W64zh_static_info);
  // read the 64-bit value back through a pointer, as rts_mkWord64 wrote it
  tmp = (HsWord64*)&(p->payload[0]);
  return *tmp;
}
+
HsFloat
rts_getFloat (HaskellObj p)
{
  // See comment above:
  // ASSERT(p->header.info == Fzh_con_info ||
  //        p->header.info == Fzh_static_info);
  // PK_FLT reads the float back out of the payload (see ASSIGN_FLT)
  return (float)(PK_FLT((P_)p->payload));
}

HsDouble
rts_getDouble (HaskellObj p)
{
  // See comment above:
  // ASSERT(p->header.info == Dzh_con_info ||
  //        p->header.info == Dzh_static_info);
  return (double)(PK_DBL((P_)p->payload));
}

HsStablePtr
rts_getStablePtr (HaskellObj p)
{
  // See comment above:
  // ASSERT(p->header.info == StablePtr_con_info ||
  //        p->header.info == StablePtr_static_info);
  return (StgStablePtr)(p->payload[0]);
}
+
+HsPtr
+rts_getPtr (HaskellObj p)
+{
+ // See comment above:
+ // ASSERT(p->header.info == Ptr_con_info ||
+ // p->header.info == Ptr_static_info);
+ return (Capability *)(p->payload[0]);
+}
+
HsFunPtr
rts_getFunPtr (HaskellObj p)
{
  // See comment above:
  // ASSERT(p->header.info == FunPtr_con_info ||
  //        p->header.info == FunPtr_static_info);
  return (void *)(p->payload[0]);
}

// Decode a Bool closure: the constructor tag is read out of the info
// table's srt_bitmap field; tag 0 is False, anything else is True.
HsBool
rts_getBool (HaskellObj p)
{
  StgInfoTable *info;

  info = get_itbl((StgClosure *)p);
  if (info->srt_bitmap == 0) { // srt_bitmap is the constructor tag
    return 0;
  } else {
    return 1;
  }
}
+
+/* -----------------------------------------------------------------------------
+ Creating threads
+ -------------------------------------------------------------------------- */
+
+INLINE_HEADER void pushClosure (StgTSO *tso, StgWord c) {
+ tso->sp--;
+ tso->sp[0] = (W_) c;
+}
+
// Create a thread that evaluates 'closure' to WHNF: the stack holds
// the closure with an stg_enter frame on top.
StgTSO *
createGenThread (Capability *cap, nat stack_size, StgClosure *closure)
{
  StgTSO *t;
#if defined(GRAN)
  t = createThread (cap, stack_size, NO_PRI);  // GranSim takes a priority
#else
  t = createThread (cap, stack_size);
#endif
  // push the closure, then the enter frame on top of it
  pushClosure(t, (W_)closure);
  pushClosure(t, (W_)&stg_enter_info);
  return t;
}

// Create a thread that runs the IO action 'closure' without forcing
// its result (stg_noforceIO sits under the application frame).
StgTSO *
createIOThread (Capability *cap, nat stack_size, StgClosure *closure)
{
  StgTSO *t;
#if defined(GRAN)
  t = createThread (cap, stack_size, NO_PRI);
#else
  t = createThread (cap, stack_size);
#endif
  // stack, bottom to top: noforceIO frame, ap_v frame, closure, enter frame
  pushClosure(t, (W_)&stg_noforceIO_info);
  pushClosure(t, (W_)&stg_ap_v_info);
  pushClosure(t, (W_)closure);
  pushClosure(t, (W_)&stg_enter_info);
  return t;
}

/*
 * Same as above, but also evaluate the result of the IO action
 * to whnf while we're at it.
 */

StgTSO *
createStrictIOThread(Capability *cap, nat stack_size, StgClosure *closure)
{
  StgTSO *t;
#if defined(GRAN)
  t = createThread(cap, stack_size, NO_PRI);
#else
  t = createThread(cap, stack_size);
#endif
  // as createIOThread, but with a forceIO frame at the bottom so the
  // action's result is evaluated to WHNF
  pushClosure(t, (W_)&stg_forceIO_info);
  pushClosure(t, (W_)&stg_ap_v_info);
  pushClosure(t, (W_)closure);
  pushClosure(t, (W_)&stg_enter_info);
  return t;
}
+
+/* ----------------------------------------------------------------------------
+ Evaluating Haskell expressions
+ ------------------------------------------------------------------------- */
+
+Capability *
+rts_eval (Capability *cap, HaskellObj p, /*out*/HaskellObj *ret)
+{
+ StgTSO *tso;
+
+ tso = createGenThread(cap, RtsFlags.GcFlags.initialStkSize, p);
+ return scheduleWaitThread(tso,ret,cap);
+}
+
+Capability *
+rts_eval_ (Capability *cap, HaskellObj p, unsigned int stack_size,
+ /*out*/HaskellObj *ret)
+{
+ StgTSO *tso;
+
+ tso = createGenThread(cap, stack_size, p);
+ return scheduleWaitThread(tso,ret,cap);
+}
+
+/*
+ * rts_evalIO() evaluates a value of the form (IO a), forcing the action's
+ * result to WHNF before returning.
+ */
+Capability *
+rts_evalIO (Capability *cap, HaskellObj p, /*out*/HaskellObj *ret)
+{
+ StgTSO* tso;
+
+ tso = createStrictIOThread(cap, RtsFlags.GcFlags.initialStkSize, p);
+ return scheduleWaitThread(tso,ret,cap);
+}
+
/*
 * rts_evalStableIO() is suitable for calling from Haskell.  It
 * evaluates a value of the form (StablePtr (IO a)), forcing the
 * action's result to WHNF before returning.  The result is returned
 * in a StablePtr.
 */
Capability *
rts_evalStableIO (Capability *cap, HsStablePtr s, /*out*/HsStablePtr *ret)
{
  StgTSO* tso;
  StgClosure *p, *r;
  SchedulerStatus stat;

  p = (StgClosure *)deRefStablePtr(s);
  tso = createStrictIOThread(cap, RtsFlags.GcFlags.initialStkSize, p);
  // scheduleWaitThread() may hand back a different Capability
  cap = scheduleWaitThread(tso,&r,cap);
  stat = rts_getSchedStatus(cap);

  // only produce a result on success; 'ret' may be NULL if the
  // caller does not want one
  if (stat == Success && ret != NULL) {
    ASSERT(r != NULL);
    *ret = getStablePtr((StgPtr)r);
  }

  return cap;
}
+
+/*
+ * Like rts_evalIO(), but doesn't force the action's result.
+ */
+Capability *
+rts_evalLazyIO (Capability *cap, HaskellObj p, /*out*/HaskellObj *ret)
+{
+ StgTSO *tso;
+
+ tso = createIOThread(cap, RtsFlags.GcFlags.initialStkSize, p);
+ return scheduleWaitThread(tso,ret,cap);
+}
+
+Capability *
+rts_evalLazyIO_ (Capability *cap, HaskellObj p, unsigned int stack_size,
+ /*out*/HaskellObj *ret)
+{
+ StgTSO *tso;
+
+ tso = createIOThread(cap, stack_size, p);
+ return scheduleWaitThread(tso,ret,cap);
+}
+
+/* Convenience function for decoding the returned status. */
+
+void
+rts_checkSchedStatus (char* site, Capability *cap)
+{
+ SchedulerStatus rc = cap->running_task->stat;
+ switch (rc) {
+ case Success:
+ return;
+ case Killed:
+ errorBelch("%s: uncaught exception",site);
+ stg_exit(EXIT_FAILURE);
+ case Interrupted:
+ errorBelch("%s: interrupted", site);
+ stg_exit(EXIT_FAILURE);
+ default:
+ errorBelch("%s: Return code (%d) not ok",(site),(rc));
+ stg_exit(EXIT_FAILURE);
+ }
+}
+
+SchedulerStatus
+rts_getSchedStatus (Capability *cap)
+{
+ return cap->running_task->stat;
+}
+
// Enter the RTS: allocate a bound Task for the calling OS thread and
// wait for a free Capability.  Pair every rts_lock() with rts_unlock().
Capability *
rts_lock (void)
{
    Capability *cap;
    Task *task;

    // ToDo: get rid of this lock in the common case.  We could store
    // a free Task in thread-local storage, for example.  That would
    // leave just one lock on the path into the RTS: cap->lock when
    // acquiring the Capability.
    ACQUIRE_LOCK(&sched_mutex);
    task = newBoundTask();      // called under sched_mutex
    RELEASE_LOCK(&sched_mutex);

    cap = NULL;
    // blocks until a Capability is handed to this task
    waitForReturnCapability(&cap, task);
    return (Capability *)cap;
}
+
// Exiting the RTS: we hold a Capability that is not necessarily the
// same one that was originally returned by rts_lock(), because
// rts_evalIO() etc. may return a new one.  Now that we have
// investigated the return value, we can release the Capability,
// and free the Task (in that order).

void
rts_unlock (Capability *cap)
{
    Task *task;

    task = cap->running_task;
    ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);

    // slightly delicate ordering of operations below, pay attention!

    // We are no longer a bound task/thread.  This is important,
    // because the GC can run when we release the Capability below,
    // and we don't want it to treat this as a live TSO pointer.
    task->tso = NULL;

    // Now release the Capability.  With the capability released, GC
    // may happen.  NB. does not try to put the current Task on the
    // worker queue.
    releaseCapability(cap);

    // Finally, we can release the Task to the free list.
    boundTaskExiting(task);
}
diff --git a/rts/RtsDllMain.c b/rts/RtsDllMain.c
new file mode 100644
index 0000000000..af3c5090de
--- /dev/null
+++ b/rts/RtsDllMain.c
@@ -0,0 +1,39 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1999-2000
+ *
+ * Entry point for RTS-in-a-DLL
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsAPI.h"
+
+#ifdef HAVE_WINDOWS_H
+#include <windows.h>
+#endif
+
/* I'd be mildly surprised if this wasn't defined, but still. */
#ifdef ENABLE_WIN32_DLL_SUPPORT

/* DLL entry point: shuts the Haskell RTS down when the process
 * detaches from the DLL.  Startup is deliberately left to the caller
 * (see the note in the body).
 */
BOOL
WINAPI
DllMain ( HINSTANCE hInstance
        , DWORD reason
        , LPVOID reserved
        )
{
  /*
   * Note: the DllMain() doesn't call startupHaskell() for you,
   *       that is the task of users of the RTS. The reason is
   *       that *you* want to be able to control the arguments
   *       you pass to the RTS.
   */
  switch (reason) {
  case DLL_PROCESS_DETACH:
      shutdownHaskell();
      break;    // explicit break: guards against accidental fall-through
                // if further cases are ever added
  default:
      break;    // other notifications (attach etc.) need no action
  }
  return TRUE;
}

#endif /* ENABLE_WIN32_DLL_SUPPORT */
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c
new file mode 100644
index 0000000000..0f83b3356c
--- /dev/null
+++ b/rts/RtsFlags.c
@@ -0,0 +1,2281 @@
+
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The AQUA Project, Glasgow University, 1994-1997
+ * (c) The GHC Team, 1998-1999
+ *
+ * Functions for parsing the argument list.
+ *
+ * ---------------------------------------------------------------------------*/
+
+//@menu
+//* Includes::
+//* Constants::
+//* Static function decls::
+//* Command-line option parsing routines::
+//* GranSim specific options::
+//* Aux fcts::
+//@end menu
+//*/
+
+//@node Includes, Constants
+//@subsection Includes
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "BlockAlloc.h"
+#include "Timer.h" /* CS_MIN_MILLISECS */
+#include "Profiling.h"
+
+#ifdef HAVE_CTYPE_H
+#include <ctype.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+
+// Flag Structure
+RTS_FLAGS RtsFlags;
+
+/*
+ * Split argument lists
+ */
+int prog_argc = 0; /* an "int" so as to match normal "argc" */
+char **prog_argv = NULL;
+char *prog_name = NULL; /* 'basename' of prog_argv[0] */
+int rts_argc = 0; /* ditto */
+char *rts_argv[MAX_RTS_ARGS];
+
+//@node Constants, Static function decls, Includes
+//@subsection Constants
+
+/*
+ * constants, used later
+ */
+#define RTS 1
+#define PGM 0
+
+#if defined(GRAN)
+
+static char *gran_debug_opts_strs[] = {
+ "DEBUG (-bDe, -bD1): event_trace; printing event trace.\n",
+ "DEBUG (-bDE, -bD2): event_stats; printing event statistics.\n",
+ "DEBUG (-bDb, -bD4): bq; check blocking queues\n",
+ "DEBUG (-bDG, -bD8): pack; routines for (un-)packing graph structures.\n",
+ "DEBUG (-bDq, -bD16): checkSparkQ; check consistency of the spark queues.\n",
+ "DEBUG (-bDf, -bD32): thunkStealing; print forwarding of fetches.\n",
+ "DEBUG (-bDr, -bD64): randomSteal; stealing sparks/threads from random PEs.\n",
+ "DEBUG (-bDF, -bD128): findWork; searching spark-pools (local & remote), thread queues for work.\n",
+ "DEBUG (-bDu, -bD256): unused; currently unused flag.\n",
+ "DEBUG (-bDS, -bD512): pri; priority sparking or scheduling.\n",
+ "DEBUG (-bD:, -bD1024): checkLight; check GranSim-Light setup.\n",
+ "DEBUG (-bDo, -bD2048): sortedQ; check whether spark/thread queues are sorted.\n",
+ "DEBUG (-bDz, -bD4096): blockOnFetch; check for blocked on fetch.\n",
+ "DEBUG (-bDP, -bD8192): packBuffer; routines handling pack buffer (GranSim internal!).\n",
+ "DEBUG (-bDt, -bD16384): blockOnFetch_sanity; check for TSO asleep on fetch.\n",
+};
+
+/* one character codes for the available debug options */
+static char gran_debug_opts_flags[] = {
+ 'e', 'E', 'b', 'G', 'q', 'f', 'r', 'F', 'u', 'S', ':', 'o', 'z', 'P', 't'
+};
+
+#elif defined(PAR)
+
+static char *par_debug_opts_strs[] = {
+ "DEBUG (-qDv, -qD1): verbose; be generally verbose with parallel related stuff.\n",
+ "DEBUG (-qDq, -qD2): bq; print blocking queues.\n",
+ "DEBUG (-qDs, -qD4): schedule; scheduling of parallel threads.\n",
+ "DEBUG (-qDe, -qD8): free; free messages.\n",
+ "DEBUG (-qDr, -qD16): resume; resume messages.\n",
+ "DEBUG (-qDw, -qD32): weight; print weights and distrib GC stuff.\n",
+ "DEBUG (-qDF, -qD64): fetch; fetch messages.\n",
+ // "DEBUG (-qDa, -qD128): ack; ack messages.\n",
+ "DEBUG (-qDf, -qD128): fish; fish messages.\n",
+ //"DEBUG (-qDo, -qD512): forward; forwarding messages to other PEs.\n",
+ "DEBUG (-qDl, -qD256): tables; print internal LAGA etc tables.\n",
+ "DEBUG (-qDo, -qD512): packet; packets and graph structures when packing.\n",
+ "DEBUG (-qDp, -qD1024): pack; packing and unpacking graphs.\n",
+ "DEBUG (-qDz, -qD2048): paranoia; ridiculously detailed output (excellent for filling a partition).\n"
+};
+
+/* one character codes for the available debug options */
+static char par_debug_opts_flags[] = {
+ 'v', 'q', 's', 'e', 'r', 'w', 'F', 'f', 'l', 'o', 'p', 'z'
+};
+
+#endif /* PAR */
+
+//@node Static function decls, Command-line option parsing routines, Constants
+//@subsection Static function decls
+
+/* -----------------------------------------------------------------------------
+ Static function decls
+ -------------------------------------------------------------------------- */
+
+static int /* return NULL on error */
+open_stats_file (
+ I_ arg,
+ int argc, char *argv[],
+ int rts_argc, char *rts_argv[],
+ const char *FILENAME_FMT,
+ FILE **file_ret);
+
+static I_ decode(const char *s);
+static void bad_option(const char *s);
+
+#if defined(GRAN)
+static void enable_GranSimLight(void);
+static void process_gran_option(int arg, int *rts_argc, char *rts_argv[], rtsBool *error);
+static void set_GranSim_debug_options(nat n);
+static void help_GranSim_debug_options(nat n);
+#elif defined(PAR)
+static void process_par_option(int arg, int *rts_argc, char *rts_argv[], rtsBool *error);
+static void set_par_debug_options(nat n);
+static void help_par_debug_options(nat n);
+#endif
+
+//@node Command-line option parsing routines, GranSim specific options, Static function decls
+//@subsection Command-line option parsing routines
+
+/* -----------------------------------------------------------------------------
+ * Command-line option parsing routines.
+ * ---------------------------------------------------------------------------*/
+
+void initRtsFlagsDefaults(void)
+{
+ RtsFlags.GcFlags.statsFile = NULL;
+ RtsFlags.GcFlags.giveStats = NO_GC_STATS;
+
+ RtsFlags.GcFlags.maxStkSize = (8 * 1024 * 1024) / sizeof(W_);
+ RtsFlags.GcFlags.initialStkSize = 1024 / sizeof(W_);
+
+ RtsFlags.GcFlags.minAllocAreaSize = (512 * 1024) / BLOCK_SIZE;
+ RtsFlags.GcFlags.minOldGenSize = (1024 * 1024) / BLOCK_SIZE;
+ RtsFlags.GcFlags.maxHeapSize = 0; /* off by default */
+ RtsFlags.GcFlags.heapSizeSuggestion = 0; /* none */
+ RtsFlags.GcFlags.pcFreeHeap = 3; /* 3% */
+ RtsFlags.GcFlags.oldGenFactor = 2;
+#if defined(PAR)
+ /* A hack currently needed for GUM -- HWL */
+ RtsFlags.GcFlags.generations = 1;
+ RtsFlags.GcFlags.steps = 2;
+ RtsFlags.GcFlags.squeezeUpdFrames = rtsFalse;
+#else
+ RtsFlags.GcFlags.generations = 2;
+ RtsFlags.GcFlags.steps = 2;
+ RtsFlags.GcFlags.squeezeUpdFrames = rtsTrue;
+#endif
+ RtsFlags.GcFlags.compact = rtsFalse;
+ RtsFlags.GcFlags.compactThreshold = 30.0;
+#ifdef RTS_GTK_FRONTPANEL
+ RtsFlags.GcFlags.frontpanel = rtsFalse;
+#endif
+ RtsFlags.GcFlags.idleGCDelayTicks = 300 / TICK_MILLISECS; /* ticks */
+
+#ifdef DEBUG
+ RtsFlags.DebugFlags.scheduler = rtsFalse;
+ RtsFlags.DebugFlags.interpreter = rtsFalse;
+ RtsFlags.DebugFlags.codegen = rtsFalse;
+ RtsFlags.DebugFlags.weak = rtsFalse;
+ RtsFlags.DebugFlags.gccafs = rtsFalse;
+ RtsFlags.DebugFlags.gc = rtsFalse;
+ RtsFlags.DebugFlags.block_alloc = rtsFalse;
+ RtsFlags.DebugFlags.sanity = rtsFalse;
+ RtsFlags.DebugFlags.stable = rtsFalse;
+ RtsFlags.DebugFlags.stm = rtsFalse;
+ RtsFlags.DebugFlags.prof = rtsFalse;
+ RtsFlags.DebugFlags.gran = rtsFalse;
+ RtsFlags.DebugFlags.par = rtsFalse;
+ RtsFlags.DebugFlags.linker = rtsFalse;
+ RtsFlags.DebugFlags.squeeze = rtsFalse;
+#endif
+
+#if defined(PROFILING) || defined(PAR)
+ RtsFlags.CcFlags.doCostCentres = 0;
+#endif /* PROFILING or PAR */
+
+#ifdef PROFILING
+ RtsFlags.ProfFlags.doHeapProfile = rtsFalse;
+ RtsFlags.ProfFlags.profileInterval = 100;
+ RtsFlags.ProfFlags.includeTSOs = rtsFalse;
+ RtsFlags.ProfFlags.showCCSOnException = rtsFalse;
+ RtsFlags.ProfFlags.maxRetainerSetSize = 8;
+ RtsFlags.ProfFlags.modSelector = NULL;
+ RtsFlags.ProfFlags.descrSelector = NULL;
+ RtsFlags.ProfFlags.typeSelector = NULL;
+ RtsFlags.ProfFlags.ccSelector = NULL;
+ RtsFlags.ProfFlags.ccsSelector = NULL;
+ RtsFlags.ProfFlags.retainerSelector = NULL;
+ RtsFlags.ProfFlags.bioSelector = NULL;
+
+#elif defined(DEBUG)
+ RtsFlags.ProfFlags.doHeapProfile = rtsFalse;
+#endif
+
+ RtsFlags.ConcFlags.ctxtSwitchTime = CS_MIN_MILLISECS; /* In milliseconds */
+
+#ifdef THREADED_RTS
+ RtsFlags.ParFlags.nNodes = 1;
+ RtsFlags.ParFlags.migrate = rtsTrue;
+ RtsFlags.ParFlags.wakeupMigrate = rtsFalse;
+#endif
+
+#ifdef PAR
+ RtsFlags.ParFlags.ParStats.Full = rtsFalse;
+ RtsFlags.ParFlags.ParStats.Suppressed = rtsFalse;
+ RtsFlags.ParFlags.ParStats.Binary = rtsFalse;
+ RtsFlags.ParFlags.ParStats.Sparks = rtsFalse;
+ RtsFlags.ParFlags.ParStats.Heap = rtsFalse;
+ RtsFlags.ParFlags.ParStats.NewLogfile = rtsFalse;
+ RtsFlags.ParFlags.ParStats.Global = rtsFalse;
+
+ RtsFlags.ParFlags.outputDisabled = rtsFalse;
+#ifdef DIST
+ RtsFlags.ParFlags.doFairScheduling = rtsTrue; /* fair sched by def */
+#else
+ RtsFlags.ParFlags.doFairScheduling = rtsFalse; /* unfair sched by def */
+#endif
+ RtsFlags.ParFlags.packBufferSize = 1024;
+ RtsFlags.ParFlags.thunksToPack = 1; /* 0 ... infinity; */
+ RtsFlags.ParFlags.globalising = 1; /* 0 ... everything */
+ RtsFlags.ParFlags.maxThreads = 1024;
+ RtsFlags.ParFlags.maxFishes = MAX_FISHES;
+ RtsFlags.ParFlags.fishDelay = FISH_DELAY;
+#endif
+
+#if defined(PAR) || defined(THREADED_RTS)
+ RtsFlags.ParFlags.maxLocalSparks = 4096;
+#endif /* PAR || THREADED_RTS */
+
+#if defined(GRAN)
+ /* ToDo: check defaults for GranSim and GUM */
+ RtsFlags.GcFlags.maxStkSize = (8 * 1024 * 1024) / sizeof(W_);
+ RtsFlags.GcFlags.initialStkSize = 1024 / sizeof(W_);
+
+ RtsFlags.GranFlags.maxThreads = 65536; // refers to mandatory threads
+ RtsFlags.GranFlags.GranSimStats.Full = rtsFalse;
+ RtsFlags.GranFlags.GranSimStats.Suppressed = rtsFalse;
+ RtsFlags.GranFlags.GranSimStats.Binary = rtsFalse;
+ RtsFlags.GranFlags.GranSimStats.Sparks = rtsFalse;
+ RtsFlags.GranFlags.GranSimStats.Heap = rtsFalse;
+ RtsFlags.GranFlags.GranSimStats.NewLogfile = rtsFalse;
+ RtsFlags.GranFlags.GranSimStats.Global = rtsFalse;
+
+ RtsFlags.GranFlags.packBufferSize = 1024;
+ RtsFlags.GranFlags.packBufferSize_internal = GRANSIM_DEFAULT_PACK_BUFFER_SIZE;
+
+ RtsFlags.GranFlags.proc = MAX_PROC;
+ RtsFlags.GranFlags.Fishing = rtsFalse;
+ RtsFlags.GranFlags.maxFishes = MAX_FISHES;
+ RtsFlags.GranFlags.time_slice = GRAN_TIME_SLICE;
+ RtsFlags.GranFlags.Light = rtsFalse;
+
+ RtsFlags.GranFlags.Costs.latency = LATENCY;
+ RtsFlags.GranFlags.Costs.additional_latency = ADDITIONAL_LATENCY;
+ RtsFlags.GranFlags.Costs.fetchtime = FETCHTIME;
+ RtsFlags.GranFlags.Costs.lunblocktime = LOCALUNBLOCKTIME;
+ RtsFlags.GranFlags.Costs.gunblocktime = GLOBALUNBLOCKTIME;
+ RtsFlags.GranFlags.Costs.mpacktime = MSGPACKTIME;
+ RtsFlags.GranFlags.Costs.munpacktime = MSGUNPACKTIME;
+ RtsFlags.GranFlags.Costs.mtidytime = MSGTIDYTIME;
+
+ RtsFlags.GranFlags.Costs.threadcreatetime = THREADCREATETIME;
+ RtsFlags.GranFlags.Costs.threadqueuetime = THREADQUEUETIME;
+ RtsFlags.GranFlags.Costs.threaddescheduletime = THREADDESCHEDULETIME;
+ RtsFlags.GranFlags.Costs.threadscheduletime = THREADSCHEDULETIME;
+ RtsFlags.GranFlags.Costs.threadcontextswitchtime = THREADCONTEXTSWITCHTIME;
+
+ RtsFlags.GranFlags.Costs.arith_cost = ARITH_COST;
+ RtsFlags.GranFlags.Costs.branch_cost = BRANCH_COST;
+ RtsFlags.GranFlags.Costs.load_cost = LOAD_COST;
+ RtsFlags.GranFlags.Costs.store_cost = STORE_COST;
+ RtsFlags.GranFlags.Costs.float_cost = FLOAT_COST;
+
+ RtsFlags.GranFlags.Costs.heapalloc_cost = HEAPALLOC_COST;
+
+ RtsFlags.GranFlags.Costs.pri_spark_overhead = PRI_SPARK_OVERHEAD;
+ RtsFlags.GranFlags.Costs.pri_sched_overhead = PRI_SCHED_OVERHEAD;
+
+ RtsFlags.GranFlags.DoFairSchedule = rtsFalse;
+ RtsFlags.GranFlags.DoAsyncFetch = rtsFalse;
+ RtsFlags.GranFlags.DoStealThreadsFirst = rtsFalse;
+ RtsFlags.GranFlags.DoAlwaysCreateThreads = rtsFalse;
+ RtsFlags.GranFlags.DoBulkFetching = rtsFalse;
+ RtsFlags.GranFlags.DoThreadMigration = rtsFalse;
+ RtsFlags.GranFlags.FetchStrategy = 2;
+ RtsFlags.GranFlags.PreferSparksOfLocalNodes = rtsFalse;
+ RtsFlags.GranFlags.DoPrioritySparking = rtsFalse;
+ RtsFlags.GranFlags.DoPriorityScheduling = rtsFalse;
+ RtsFlags.GranFlags.SparkPriority = 0;
+ RtsFlags.GranFlags.SparkPriority2 = 0;
+ RtsFlags.GranFlags.RandomPriorities = rtsFalse;
+ RtsFlags.GranFlags.InversePriorities = rtsFalse;
+ RtsFlags.GranFlags.IgnorePriorities = rtsFalse;
+ RtsFlags.GranFlags.ThunksToPack = 0;
+ RtsFlags.GranFlags.RandomSteal = rtsTrue;
+#endif
+
+#ifdef TICKY_TICKY
+ RtsFlags.TickyFlags.showTickyStats = rtsFalse;
+ RtsFlags.TickyFlags.tickyFile = NULL;
+#endif
+}
+
+/* NULL-terminated table of usage lines printed (one per line, via
+   errorBelch) when RTS flag parsing fails.  The set of lines shown is
+   selected at compile time by the same feature macros (PROFILING, PAR,
+   THREADED_RTS, GRAN, TICKY_TICKY, DEBUG, ...) that guard the flags
+   themselves, so the help text always matches the build. */
+static const char *
+usage_text[] = {
+"",
+"Usage: <prog> <args> [+RTS <rtsopts> | -RTS <args>] ... --RTS <args>",
+"",
+" +RTS Indicates run time system options follow",
+" -RTS Indicates program arguments follow",
+" --RTS Indicates that ALL subsequent arguments will be given to the",
+" program (including any of these RTS flags)",
+"",
+"The following run time system options are available:",
+"",
+" -? Prints this message and exits; the program is not executed",
+"",
+" -K<size> Sets the maximum stack size (default 8M) Egs: -K32k -K512k",
+" -k<size> Sets the initial thread stack size (default 1k) Egs: -k4k -k2m",
+"",
+" -A<size> Sets the minimum allocation area size (default 256k) Egs: -A1m -A10k",
+" -M<size> Sets the maximum heap size (default unlimited) Egs: -M256k -M1G",
+" -H<size> Sets the minimum heap size (default 0M) Egs: -H24m -H1G",
+" -m<n> Minimum % of heap which must be available (default 3%)",
+" -G<n> Number of generations (default: 2)",
+" -T<n> Number of steps in younger generations (default: 2)",
+" -c<n> Auto-enable compaction of the oldest generation when live data is",
+" at least <n>% of the maximum heap size set with -M (default: 30%)",
+" -c Enable compaction for all major collections",
+#if defined(THREADED_RTS)
+" -I<sec> Perform full GC after <sec> idle time (default: 0.3, 0 == off)",
+#endif
+"",
+" -t<file> One-line GC statistics (default file: <program>.stat)",
+" -s<file> Summary GC statistics (with -Sstderr going to stderr)",
+" -S<file> Detailed GC statistics",
+#ifdef RTS_GTK_FRONTPANEL
+" -f Display front panel (requires X11 & GTK+)",
+#endif
+"",
+"",
+" -Z Don't squeeze out update frames on stack overflow",
+" -B Sound the bell at the start of each garbage collection",
+#if defined(PROFILING) || defined(PAR)
+"",
+" -px Time/allocation profile (XML) (output file <program>.prof)",
+" -p Time/allocation profile (output file <program>.prof)",
+" -P More detailed Time/Allocation profile",
+" -Pa Give information about *all* cost centres",
+
+# if defined(PROFILING)
+"",
+" -hx Heap residency profile (XML) (output file <program>.prof)",
+" -h<break-down> Heap residency profile (hp2ps) (output file <program>.hp)",
+" break-down: c = cost centre stack (default)",
+" m = module",
+" d = closure description",
+" y = type description",
+" r = retainer",
+" b = biography (LAG,DRAG,VOID,USE)",
+" A subset of closures may be selected thusly:",
+" -hc<cc>,... specific cost centre(s) (top of stack only)",
+" -hC<cc>,... specific cost centre(s) (anywhere in stack)",
+" -hm<mod>... all cost centres from the specified module(s)",
+" -hd<des>,... closures with specified closure descriptions",
+" -hy<typ>... closures with specified type descriptions",
+" -hr<cc>... closures with specified retainers",
+" -hb<bio>... closures with specified biographies (lag,drag,void,use)",
+"",
+" -R<size> Set the maximum retainer set size (default: 8)",
+"",
+" -i<sec> Time between heap samples (seconds, default: 0.1)",
+"",
+" -xt Include threads (TSOs) in a heap profile",
+"",
+" -xc Show current cost centre stack on raising an exception",
+# endif
+#endif /* PROFILING or PAR */
+#if !defined(PROFILING) && defined(DEBUG)
+"",
+" -h<break-down> Debugging Heap residency profile",
+" (output file <program>.hp)",
+" break-down: L = closure label (default)",
+" T = closure type (constructor, thunk etc.)",
+#endif
+"",
+#if defined(TICKY_TICKY)
+" -r<file> Produce reduction profiling statistics (with -rstderr for stderr)",
+"",
+#endif
+#if defined(PAR)
+" -N<n> Use <n> PVMish processors in parallel (default: 2)",
+/* NB: the -N<n> is implemented by the driver!! */
+#endif
+" -C<secs> Context-switch interval in seconds",
+" (0 or no argument means switch as often as possible)",
+" the default is .02 sec; resolution is .02 sec",
+"",
+#if defined(DEBUG)
+" -Ds DEBUG: scheduler",
+" -Di DEBUG: interpreter",
+" -Dc DEBUG: codegen",
+" -Dw DEBUG: weak",
+" -DG DEBUG: gccafs",
+" -Dg DEBUG: gc",
+" -Db DEBUG: block",
+" -DS DEBUG: sanity",
+" -Dt DEBUG: stable",
+" -Dp DEBUG: prof",
+" -Dr DEBUG: gran",
+" -DP DEBUG: par",
+" -Dl DEBUG: linker",
+" -Dm DEBUG: stm",
+" -Dz DEBUG: stack squeezing",
+"",
+#endif /* DEBUG */
+#if defined(THREADED_RTS)
+" -N<n> Use <n> OS threads (default: 1)",
+" -qm Don't automatically migrate threads between CPUs",
+" -qw Migrate a thread to the current CPU when it is woken up",
+#endif
+#if defined(THREADED_RTS) || defined(PAR)
+" -e<size> Size of spark pools (default 100)",
+#endif
+#if defined(PAR)
+" -t<num> Set maximum number of advisory threads per PE (default 32)",
+" -qP Enable activity profile (output files in ~/<program>*.gr)",
+" -qQ<size> Set pack-buffer size (default: 1024)",
+" -qd Turn on PVM-ish debugging",
+" -qO Disable output for performance measurement",
+#endif
+#if defined(THREADED_RTS) || defined(PAR)
+" -e<n> Maximum number of outstanding local sparks (default: 4096)",
+#endif
+#if defined(PAR)
+" -d Turn on PVM-ish debugging",
+" -O Disable output for performance measurement",
+#endif /* PAR */
+#if defined(GRAN) /* ToDo: fill in decent Docu here */
+" -b... All GranSim options start with -b; see GranSim User's Guide for details",
+#endif
+"",
+"RTS options may also be specified using the GHCRTS environment variable.",
+"",
+"Other RTS options may be available for programs compiled a different way.",
+"The GHC User's Guide has full details.",
+"",
+0
+};
+
+/* Returns rtsTrue iff the two NUL-terminated strings are equal
+   (thin wrapper over strcmp, used by the +RTS/-RTS marker matching
+   in setupRtsFlags). */
+STATIC_INLINE rtsBool
+strequal(const char *a, const char * b)
+{
+ return(strcmp(a, b) == 0);
+}
+
+/* Tokenise the whitespace-separated flag string 's' (as obtained from
+ * the ghc_rts_opts global or the GHCRTS environment variable) and
+ * append a freshly-allocated copy of each token to rts_argv, bumping
+ * *rts_argc for each one.  The caller owns (and in practice never
+ * frees) the allocated tokens, which live as long as the RTS.
+ * Aborts via barf() if MAX_RTS_ARGS-1 arguments would be exceeded.
+ */
+static void
+splitRtsFlags(char *s, int *rts_argc, char *rts_argv[])
+{
+    char *c1, *c2;
+
+    c1 = s;
+    do {
+        /* Cast to unsigned char before isspace(): passing a plain
+           (possibly negative) char value is undefined behaviour
+           (CERT STR37-C). */
+        while (isspace((unsigned char)*c1)) { c1++; };
+        c2 = c1;
+        while (!isspace((unsigned char)*c2) && *c2 != '\0') { c2++; };
+
+        /* no token found: only trailing whitespace (or empty input) left */
+        if (c1 == c2) { break; }
+
+        if (*rts_argc < MAX_RTS_ARGS-1) {
+            /* copy the token [c1,c2) into its own NUL-terminated buffer */
+            s = stgMallocBytes(c2-c1+1, "RtsFlags.c:splitRtsFlags()");
+            strncpy(s, c1, c2-c1);
+            s[c2-c1] = '\0';
+            rts_argv[(*rts_argc)++] = s;
+        } else {
+            barf("too many RTS arguments (max %d)", MAX_RTS_ARGS-1);
+        }
+
+        c1 = c2;
+    } while (*c1 != '\0');
+}
+
+/* Split the command line (argv) into program arguments (left in argv)
+ * and RTS arguments (collected in rts_argv), honouring the
+ * +RTS / -RTS / --RTS / -- markers, then parse each RTS argument into
+ * the global RtsFlags structure.  Precedence, lowest first: the
+ * ghc_rts_opts global, the GHCRTS environment variable, the command
+ * line.  On any parse error the usage_text table is printed and the
+ * process exits via stg_exit(EXIT_FAILURE).
+ */
+void
+setupRtsFlags(int *argc, char *argv[], int *rts_argc, char *rts_argv[])
+{
+ rtsBool error = rtsFalse;
+ I_ mode;
+ I_ arg, total_arg;
+
+ setProgName (argv);
+ total_arg = *argc;
+ arg = 1;
+
+ *argc = 1;
+ *rts_argc = 0;
+
+ // process arguments from the ghc_rts_opts global variable first.
+ // (arguments from the GHCRTS environment variable and the command
+ // line override these).
+ {
+ if (ghc_rts_opts != NULL) {
+ splitRtsFlags(ghc_rts_opts, rts_argc, rts_argv);
+ }
+ }
+
+ // process arguments from the GHCRTS environment variable next
+ // (arguments from the command line override these).
+ {
+ char *ghc_rts = getenv("GHCRTS");
+
+ if (ghc_rts != NULL) {
+ splitRtsFlags(ghc_rts, rts_argc, rts_argv);
+ }
+ }
+
+ // Split arguments (argv) into PGM (argv) and RTS (rts_argv) parts
+ // argv[0] must be PGM argument -- leave in argv
+
+ for (mode = PGM; arg < total_arg; arg++) {
+ // The '--RTS' argument disables all future +RTS ... -RTS processing.
+ if (strequal("--RTS", argv[arg])) {
+ arg++;
+ break;
+ }
+ // The '--' argument is passed through to the program, but
+ // disables all further +RTS ... -RTS processing.
+ else if (strequal("--", argv[arg])) {
+ break;
+ }
+ else if (strequal("+RTS", argv[arg])) {
+ mode = RTS;
+ }
+ else if (strequal("-RTS", argv[arg])) {
+ mode = PGM;
+ }
+ else if (mode == RTS && *rts_argc < MAX_RTS_ARGS-1) {
+ rts_argv[(*rts_argc)++] = argv[arg];
+ }
+ else if (mode == PGM) {
+ argv[(*argc)++] = argv[arg];
+ }
+ else {
+ barf("too many RTS arguments (max %d)", MAX_RTS_ARGS-1);
+ }
+ }
+ // process remaining program arguments
+ for (; arg < total_arg; arg++) {
+ argv[(*argc)++] = argv[arg];
+ }
+ argv[*argc] = (char *) 0;
+ rts_argv[*rts_argc] = (char *) 0;
+
+ // Process RTS (rts_argv) part: mainly to determine statsfile
+ for (arg = 0; arg < *rts_argc; arg++) {
+ if (rts_argv[arg][0] != '-') {
+ fflush(stdout);
+ errorBelch("unexpected RTS argument: %s", rts_argv[arg]);
+ error = rtsTrue;
+
+ } else {
+ switch(rts_argv[arg][1]) {
+
+ /* process: general args, then PROFILING-only ones,
+ then CONCURRENT-only, PARallel-only, GRAN-only,
+ TICKY-only (same order as defined in RtsFlags.lh);
+ within those groups, mostly in case-insensitive
+ alphabetical order.
+ Final group is x*, which allows for more options.
+ */
+
+ /* Each *_BUILD_ONLY(x) macro expands to its argument when this
+ RTS was built with the corresponding feature, and to an
+ error message + error-flag assignment otherwise. */
+#ifdef TICKY_TICKY
+# define TICKY_BUILD_ONLY(x) x
+#else
+# define TICKY_BUILD_ONLY(x) \
+errorBelch("not built for: ticky-ticky stats"); \
+error = rtsTrue;
+#endif
+
+#if defined(PROFILING)
+# define COST_CENTRE_USING_BUILD_ONLY(x) x
+#else
+# define COST_CENTRE_USING_BUILD_ONLY(x) \
+errorBelch("not built for: -prof or -parallel"); \
+error = rtsTrue;
+#endif
+
+#ifdef PROFILING
+# define PROFILING_BUILD_ONLY(x) x
+#else
+# define PROFILING_BUILD_ONLY(x) \
+errorBelch("not built for: -prof"); \
+error = rtsTrue;
+#endif
+
+#ifdef PAR
+# define PAR_BUILD_ONLY(x) x
+#else
+# define PAR_BUILD_ONLY(x) \
+errorBelch("not built for: -parallel"); \
+error = rtsTrue;
+#endif
+
+#ifdef THREADED_RTS
+# define THREADED_BUILD_ONLY(x) x
+#else
+# define THREADED_BUILD_ONLY(x) \
+errorBelch("not built for: -smp"); \
+error = rtsTrue;
+#endif
+
+#if defined(THREADED_RTS) || defined(PAR)
+# define PAR_OR_THREADED_BUILD_ONLY(x) x
+#else
+# define PAR_OR_THREADED_BUILD_ONLY(x) \
+errorBelch("not built for: -parallel or -smp"); \
+error = rtsTrue;
+#endif
+
+#ifdef GRAN
+# define GRAN_BUILD_ONLY(x) x
+#else
+# define GRAN_BUILD_ONLY(x) \
+errorBelch("not built for: -gransim"); \
+error = rtsTrue;
+#endif
+
+ /* =========== GENERAL ========================== */
+ case '?':
+ error = rtsTrue;
+ break;
+
+ case 'A':
+ /* decode() presumably parses a size with k/m/g-style suffixes;
+ it is defined elsewhere in this file (not visible here). */
+ RtsFlags.GcFlags.minAllocAreaSize
+ = decode(rts_argv[arg]+2) / BLOCK_SIZE;
+ if (RtsFlags.GcFlags.minAllocAreaSize <= 0) {
+ bad_option(rts_argv[arg]);
+ }
+ break;
+
+ case 'B':
+ RtsFlags.GcFlags.ringBell = rtsTrue;
+ break;
+
+ case 'c':
+ if (rts_argv[arg][2] != '\0') {
+ RtsFlags.GcFlags.compactThreshold =
+ atof(rts_argv[arg]+2);
+ } else {
+ RtsFlags.GcFlags.compact = rtsTrue;
+ }
+ break;
+
+ case 'F':
+ RtsFlags.GcFlags.oldGenFactor = atof(rts_argv[arg]+2);
+
+ if (RtsFlags.GcFlags.oldGenFactor < 0)
+ bad_option( rts_argv[arg] );
+ break;
+
+#ifdef DEBUG
+ case 'D':
+ {
+ char *c;
+
+ /* -D takes a string of single-letter debug-channel selectors;
+ each letter enables one DebugFlags field. */
+ for (c = rts_argv[arg] + 2; *c != '\0'; c++) {
+ switch (*c) {
+ case 's':
+ RtsFlags.DebugFlags.scheduler = rtsTrue;
+ break;
+ case 'i':
+ RtsFlags.DebugFlags.interpreter = rtsTrue;
+ break;
+ case 'c':
+ RtsFlags.DebugFlags.codegen = rtsTrue;
+ break;
+ case 'w':
+ RtsFlags.DebugFlags.weak = rtsTrue;
+ break;
+ case 'G':
+ RtsFlags.DebugFlags.gccafs = rtsTrue;
+ break;
+ case 'g':
+ RtsFlags.DebugFlags.gc = rtsTrue;
+ break;
+ case 'b':
+ RtsFlags.DebugFlags.block_alloc = rtsTrue;
+ break;
+ case 'S':
+ RtsFlags.DebugFlags.sanity = rtsTrue;
+ break;
+ case 't':
+ RtsFlags.DebugFlags.stable = rtsTrue;
+ break;
+ case 'p':
+ RtsFlags.DebugFlags.prof = rtsTrue;
+ break;
+ case 'r':
+ RtsFlags.DebugFlags.gran = rtsTrue;
+ break;
+ case 'P':
+ RtsFlags.DebugFlags.par = rtsTrue;
+ break;
+ case 'l':
+ RtsFlags.DebugFlags.linker = rtsTrue;
+ break;
+ case 'a':
+ RtsFlags.DebugFlags.apply = rtsTrue;
+ break;
+ case 'm':
+ RtsFlags.DebugFlags.stm = rtsTrue;
+ break;
+ case 'z':
+ RtsFlags.DebugFlags.squeeze = rtsTrue;
+ break;
+ default:
+ bad_option( rts_argv[arg] );
+ }
+ }
+ break;
+ }
+#endif
+
+ case 'K':
+ RtsFlags.GcFlags.maxStkSize =
+ decode(rts_argv[arg]+2) / sizeof(W_);
+
+ if (RtsFlags.GcFlags.maxStkSize == 0)
+ bad_option( rts_argv[arg] );
+ break;
+
+ case 'k':
+ RtsFlags.GcFlags.initialStkSize =
+ decode(rts_argv[arg]+2) / sizeof(W_);
+
+ if (RtsFlags.GcFlags.initialStkSize == 0)
+ bad_option( rts_argv[arg] );
+ break;
+
+ case 'M':
+ RtsFlags.GcFlags.maxHeapSize =
+ decode(rts_argv[arg]+2) / BLOCK_SIZE;
+ /* the user gives the size in *bytes*, but "maxHeapSize" is in *blocks* */
+
+ if (RtsFlags.GcFlags.maxHeapSize <= 0) {
+ bad_option(rts_argv[arg]);
+ }
+ break;
+
+ case 'm':
+ RtsFlags.GcFlags.pcFreeHeap = atof(rts_argv[arg]+2);
+
+ if (RtsFlags.GcFlags.pcFreeHeap < 0 ||
+ RtsFlags.GcFlags.pcFreeHeap > 100)
+ bad_option( rts_argv[arg] );
+ break;
+
+ case 'G':
+ RtsFlags.GcFlags.generations = decode(rts_argv[arg]+2);
+ if (RtsFlags.GcFlags.generations < 1) {
+ bad_option(rts_argv[arg]);
+ }
+ break;
+
+ case 'T':
+ RtsFlags.GcFlags.steps = decode(rts_argv[arg]+2);
+ if (RtsFlags.GcFlags.steps < 1) {
+ bad_option(rts_argv[arg]);
+ }
+ break;
+
+ case 'H':
+ RtsFlags.GcFlags.heapSizeSuggestion =
+ decode(rts_argv[arg]+2) / BLOCK_SIZE;
+
+ if (RtsFlags.GcFlags.heapSizeSuggestion <= 0) {
+ bad_option(rts_argv[arg]);
+ }
+ break;
+
+#ifdef RTS_GTK_FRONTPANEL
+ case 'f':
+ RtsFlags.GcFlags.frontpanel = rtsTrue;
+ break;
+#endif
+
+ case 'I': /* idle GC delay */
+ if (rts_argv[arg][2] == '\0') {
+ /* use default */
+ } else {
+ I_ cst; /* tmp */
+
+ /* Convert to ticks */
+ cst = (I_) ((atof(rts_argv[arg]+2) * 1000));
+ if (cst > 0 && cst < TICK_MILLISECS) {
+ cst = TICK_MILLISECS;
+ } else {
+ cst = cst / TICK_MILLISECS;
+ }
+ RtsFlags.GcFlags.idleGCDelayTicks = cst;
+ }
+ break;
+
+ case 'S':
+ RtsFlags.GcFlags.giveStats = VERBOSE_GC_STATS;
+ goto stats;
+
+ case 's':
+ RtsFlags.GcFlags.giveStats = SUMMARY_GC_STATS;
+ goto stats;
+
+ case 't':
+ RtsFlags.GcFlags.giveStats = ONELINE_GC_STATS;
+ goto stats;
+
+ /* -S/-s/-t all jump here to open the shared stats file */
+ stats:
+#ifdef PAR
+ /* Opening all those files would almost certainly fail... */
+ // RtsFlags.ParFlags.ParStats.Full = rtsTrue;
+ RtsFlags.GcFlags.statsFile = NULL; /* temporary; ToDo: rm */
+#else
+ {
+ int r;
+ r = open_stats_file(arg, *argc, argv,
+ *rts_argc, rts_argv, STAT_FILENAME_FMT,
+ &RtsFlags.GcFlags.statsFile);
+ if (r == -1) { error = rtsTrue; }
+ }
+#endif
+ break;
+
+ case 'Z':
+ RtsFlags.GcFlags.squeezeUpdFrames = rtsFalse;
+ break;
+
+ /* =========== PROFILING ========================== */
+
+ case 'P': /* detailed cost centre profiling (time/alloc) */
+ case 'p': /* cost centre profiling (time/alloc) */
+ COST_CENTRE_USING_BUILD_ONLY(
+ switch (rts_argv[arg][2]) {
+ case 'x':
+ RtsFlags.CcFlags.doCostCentres = COST_CENTRES_XML;
+ break;
+ case 'a':
+ RtsFlags.CcFlags.doCostCentres = COST_CENTRES_ALL;
+ break;
+ default:
+ if (rts_argv[arg][1] == 'P') {
+ RtsFlags.CcFlags.doCostCentres =
+ COST_CENTRES_VERBOSE;
+ } else {
+ RtsFlags.CcFlags.doCostCentres =
+ COST_CENTRES_SUMMARY;
+ }
+ break;
+ }
+ ) break;
+
+ case 'R':
+ PROFILING_BUILD_ONLY(
+ RtsFlags.ProfFlags.maxRetainerSetSize = atof(rts_argv[arg]+2);
+ ) break;
+
+ case 'h': /* serial heap profile */
+#if !defined(PROFILING) && defined(DEBUG)
+ switch (rts_argv[arg][2]) {
+ case '\0':
+ case 'L':
+ RtsFlags.ProfFlags.doHeapProfile = HEAP_BY_INFOPTR;
+ break;
+ case 'T':
+ RtsFlags.ProfFlags.doHeapProfile = HEAP_BY_CLOSURE_TYPE;
+ break;
+ default:
+ errorBelch("invalid heap profile option: %s",rts_argv[arg]);
+ error = rtsTrue;
+ }
+#else
+ PROFILING_BUILD_ONLY(
+ switch (rts_argv[arg][2]) {
+ case '\0':
+ case 'C':
+ case 'c':
+ case 'M':
+ case 'm':
+ case 'D':
+ case 'd':
+ case 'Y':
+ case 'y':
+ case 'R':
+ case 'r':
+ case 'B':
+ case 'b':
+ if (rts_argv[arg][2] != '\0' && rts_argv[arg][3] != '\0') {
+ {
+ char *left = strchr(rts_argv[arg], '{');
+ char *right = strrchr(rts_argv[arg], '}');
+
+ // curly braces are optional, for
+ // backwards compat.
+ if (left)
+ left = left+1;
+ else
+ left = rts_argv[arg] + 3;
+
+ if (!right)
+ right = rts_argv[arg] + strlen(rts_argv[arg]);
+
+ // NOTE(review): this truncates the argument string
+ // in place; the selector pointers stored below alias
+ // rts_argv storage, which must stay live for the
+ // lifetime of the program.
+ *right = '\0';
+
+ switch (rts_argv[arg][2]) {
+ case 'c': // cost centre label select
+ RtsFlags.ProfFlags.ccSelector = left;
+ break;
+ case 'C':
+ RtsFlags.ProfFlags.ccsSelector = left;
+ break;
+ case 'M':
+ case 'm': // cost centre module select
+ RtsFlags.ProfFlags.modSelector = left;
+ break;
+ case 'D':
+ case 'd': // closure descr select
+ RtsFlags.ProfFlags.descrSelector = left;
+ break;
+ case 'Y':
+ case 'y': // closure type select
+ RtsFlags.ProfFlags.typeSelector = left;
+ break;
+ case 'R':
+ case 'r': // retainer select
+ RtsFlags.ProfFlags.retainerSelector = left;
+ break;
+ case 'B':
+ case 'b': // biography select
+ RtsFlags.ProfFlags.bioSelector = left;
+ break;
+ }
+ }
+ break;
+ }
+
+ if (RtsFlags.ProfFlags.doHeapProfile != 0) {
+ errorBelch("multiple heap profile options");
+ error = rtsTrue;
+ break;
+ }
+
+ switch (rts_argv[arg][2]) {
+ case '\0':
+ case 'C':
+ case 'c':
+ RtsFlags.ProfFlags.doHeapProfile = HEAP_BY_CCS;
+ break;
+ case 'M':
+ case 'm':
+ RtsFlags.ProfFlags.doHeapProfile = HEAP_BY_MOD;
+ break;
+ case 'D':
+ case 'd':
+ RtsFlags.ProfFlags.doHeapProfile = HEAP_BY_DESCR;
+ break;
+ case 'Y':
+ case 'y':
+ RtsFlags.ProfFlags.doHeapProfile = HEAP_BY_TYPE;
+ break;
+ case 'R':
+ case 'r':
+ RtsFlags.ProfFlags.doHeapProfile = HEAP_BY_RETAINER;
+ break;
+ case 'B':
+ case 'b':
+ RtsFlags.ProfFlags.doHeapProfile = HEAP_BY_LDV;
+ break;
+ }
+ break;
+
+ default:
+ errorBelch("invalid heap profile option: %s",rts_argv[arg]);
+ error = rtsTrue;
+ }
+ )
+#endif /* PROFILING */
+ break;
+
+#if defined(PROFILING)
+ case 'i': /* heap sample interval */
+ if (rts_argv[arg][2] == '\0') {
+ /* use default */
+ } else {
+ I_ cst; /* tmp */
+
+ /* Convert to milliseconds */
+ cst = (I_) ((atof(rts_argv[arg]+2) * 1000));
+ cst = (cst / CS_MIN_MILLISECS) * CS_MIN_MILLISECS;
+ if (cst != 0 && cst < CS_MIN_MILLISECS)
+ cst = CS_MIN_MILLISECS;
+
+ RtsFlags.ProfFlags.profileInterval = cst;
+ }
+ break;
+#endif
+
+ /* =========== CONCURRENT ========================= */
+ case 'C': /* context switch interval */
+ if (rts_argv[arg][2] == '\0')
+ RtsFlags.ConcFlags.ctxtSwitchTime = 0;
+ else {
+ I_ cst; /* tmp */
+
+ /* Convert to milliseconds */
+ cst = (I_) ((atof(rts_argv[arg]+2) * 1000));
+ cst = (cst / CS_MIN_MILLISECS) * CS_MIN_MILLISECS;
+ if (cst != 0 && cst < CS_MIN_MILLISECS)
+ cst = CS_MIN_MILLISECS;
+
+ RtsFlags.ConcFlags.ctxtSwitchTime = cst;
+ }
+ break;
+
+#ifdef THREADED_RTS
+ case 'N':
+ THREADED_BUILD_ONLY(
+ if (rts_argv[arg][2] != '\0') {
+ RtsFlags.ParFlags.nNodes
+ = strtol(rts_argv[arg]+2, (char **) NULL, 10);
+ if (RtsFlags.ParFlags.nNodes <= 0) {
+ errorBelch("bad value for -N");
+ error = rtsTrue;
+ }
+ }
+ ) break;
+
+ case 'q':
+ switch (rts_argv[arg][2]) {
+ case '\0':
+ errorBelch("incomplete RTS option: %s",rts_argv[arg]);
+ error = rtsTrue;
+ break;
+ case 'm':
+ RtsFlags.ParFlags.migrate = rtsFalse;
+ break;
+ case 'w':
+ RtsFlags.ParFlags.wakeupMigrate = rtsTrue;
+ break;
+ default:
+ errorBelch("unknown RTS option: %s",rts_argv[arg]);
+ error = rtsTrue;
+ break;
+ }
+ break;
+#endif
+ /* =========== PARALLEL =========================== */
+ case 'e':
+ PAR_OR_THREADED_BUILD_ONLY(
+ if (rts_argv[arg][2] != '\0') {
+ RtsFlags.ParFlags.maxLocalSparks
+ = strtol(rts_argv[arg]+2, (char **) NULL, 10);
+ if (RtsFlags.ParFlags.maxLocalSparks <= 0) {
+ errorBelch("bad value for -e");
+ error = rtsTrue;
+ }
+ }
+ ) break;
+
+#ifdef PAR
+ case 'q':
+ PAR_BUILD_ONLY(
+ process_par_option(arg, rts_argc, rts_argv, &error);
+ ) break;
+#endif
+
+ /* =========== GRAN =============================== */
+
+ case 'b':
+ GRAN_BUILD_ONLY(
+ process_gran_option(arg, rts_argc, rts_argv, &error);
+ ) break;
+
+ /* =========== TICKY ============================== */
+
+ case 'r': /* Basic profiling stats */
+ TICKY_BUILD_ONLY(
+
+ RtsFlags.TickyFlags.showTickyStats = rtsTrue;
+
+ {
+ int r;
+ r = open_stats_file(arg, *argc, argv,
+ *rts_argc, rts_argv, TICKY_FILENAME_FMT,
+ &RtsFlags.TickyFlags.tickyFile);
+ if (r == -1) { error = rtsTrue; }
+ }
+ ) break;
+
+ /* =========== EXTENDED OPTIONS =================== */
+
+ case 'x': /* Extend the argument space */
+ switch(rts_argv[arg][2]) {
+ case '\0':
+ errorBelch("incomplete RTS option: %s",rts_argv[arg]);
+ error = rtsTrue;
+ break;
+
+ case 'c': /* Debugging tool: show current cost centre on an exception */
+ PROFILING_BUILD_ONLY(
+ RtsFlags.ProfFlags.showCCSOnException = rtsTrue;
+ );
+ break;
+
+ case 't': /* Include memory used by TSOs in a heap profile */
+ PROFILING_BUILD_ONLY(
+ RtsFlags.ProfFlags.includeTSOs = rtsTrue;
+ );
+ break;
+
+ /* The option prefix '-xx' is reserved for future extension. KSW 1999-11. */
+
+ default:
+ errorBelch("unknown RTS option: %s",rts_argv[arg]);
+ error = rtsTrue;
+ break;
+ }
+ break; /* defensive programming */
+
+ /* =========== OH DEAR ============================ */
+ default:
+ errorBelch("unknown RTS option: %s",rts_argv[arg]);
+ error = rtsTrue;
+ break;
+ }
+ }
+ }
+ /* on any parse error: dump the usage table and abort the program */
+ if (error) {
+ const char **p;
+
+ fflush(stdout);
+ for (p = usage_text; *p; p++)
+ errorBelch("%s", *p);
+ stg_exit(EXIT_FAILURE);
+ }
+}
+
+#if defined(GRAN)
+
+//@node GranSim specific options, Aux fcts, Command-line option parsing routines
+//@subsection GranSim specific options
+
+/* Switch the simulator into GrAnSim-Light mode: one processor, all
+ * communication and thread-management costs zeroed, fair scheduling
+ * and eager thread creation enabled.  Invoked from process_gran_option
+ * (the "-b:" flag, and "-bp0"). */
+static void
+enable_GranSimLight(void) {
+
+ debugBelch("GrAnSim Light enabled (infinite number of processors; 0 communication costs)\n");
+ RtsFlags.GranFlags.Light=rtsTrue;
+ /* zero every latency / fetch / (un)block / thread-management cost */
+ RtsFlags.GranFlags.Costs.latency =
+ RtsFlags.GranFlags.Costs.fetchtime =
+ RtsFlags.GranFlags.Costs.additional_latency =
+ RtsFlags.GranFlags.Costs.gunblocktime =
+ RtsFlags.GranFlags.Costs.lunblocktime =
+ RtsFlags.GranFlags.Costs.threadcreatetime =
+ RtsFlags.GranFlags.Costs.threadqueuetime =
+ RtsFlags.GranFlags.Costs.threadscheduletime =
+ RtsFlags.GranFlags.Costs.threaddescheduletime =
+ RtsFlags.GranFlags.Costs.threadcontextswitchtime = 0;
+
+ RtsFlags.GranFlags.Costs.mpacktime =
+ RtsFlags.GranFlags.Costs.munpacktime = 0;
+
+ RtsFlags.GranFlags.DoFairSchedule = rtsTrue;
+ RtsFlags.GranFlags.DoAsyncFetch = rtsFalse;
+ RtsFlags.GranFlags.DoAlwaysCreateThreads = rtsTrue;
+ /* FetchStrategy is irrelevant in GrAnSim-Light */
+
+ /* GrAnSim Light often creates an abundance of parallel threads,
+ each with its own stack etc. Therefore, it's in general a good
+ idea to use small stack chunks (use the -o<size> option to
+ increase it again).
+ */
+ // RtsFlags.ConcFlags.stkChunkSize = 100;
+
+ RtsFlags.GranFlags.proc = 1;
+}
+
+static void
+process_gran_option(int arg, int *rts_argc, char *rts_argv[], rtsBool *error)
+{
+ if (rts_argv[arg][1] != 'b') /* All GranSim options start with -b */
+ return;
+
+ /* or a ridiculously idealised simulator */
+ if(strcmp((rts_argv[arg]+2),"oring")==0) {
+ RtsFlags.GranFlags.Costs.latency =
+ RtsFlags.GranFlags.Costs.fetchtime =
+ RtsFlags.GranFlags.Costs.additional_latency =
+ RtsFlags.GranFlags.Costs.gunblocktime =
+ RtsFlags.GranFlags.Costs.lunblocktime =
+ RtsFlags.GranFlags.Costs.threadcreatetime =
+ RtsFlags.GranFlags.Costs.threadqueuetime =
+ RtsFlags.GranFlags.Costs.threadscheduletime =
+ RtsFlags.GranFlags.Costs.threaddescheduletime =
+ RtsFlags.GranFlags.Costs.threadcontextswitchtime = 0;
+
+ RtsFlags.GranFlags.Costs.mpacktime =
+ RtsFlags.GranFlags.Costs.munpacktime = 0;
+
+ RtsFlags.GranFlags.Costs.arith_cost =
+ RtsFlags.GranFlags.Costs.float_cost =
+ RtsFlags.GranFlags.Costs.load_cost =
+ RtsFlags.GranFlags.Costs.store_cost =
+ RtsFlags.GranFlags.Costs.branch_cost = 0;
+
+ RtsFlags.GranFlags.Costs.heapalloc_cost = 1;
+
+ /* ++RtsFlags.GranFlags.DoFairSchedule; */
+ RtsFlags.GranFlags.DoStealThreadsFirst = rtsTrue; /* -bZ */
+ RtsFlags.GranFlags.DoThreadMigration = rtsTrue; /* -bM */
+ RtsFlags.GranFlags.GranSimStats.Full = rtsTrue; /* -bP */
+ return;
+ }
+
+ /* or a somewhat idealised simulator */
+ if(strcmp((rts_argv[arg]+2),"onzo")==0) {
+ RtsFlags.GranFlags.Costs.latency =
+ RtsFlags.GranFlags.Costs.fetchtime =
+ RtsFlags.GranFlags.Costs.additional_latency =
+ RtsFlags.GranFlags.Costs.gunblocktime =
+ RtsFlags.GranFlags.Costs.lunblocktime =
+ RtsFlags.GranFlags.Costs.threadcreatetime =
+ RtsFlags.GranFlags.Costs.threadqueuetime =
+ RtsFlags.GranFlags.Costs.threadscheduletime =
+ RtsFlags.GranFlags.Costs.threaddescheduletime =
+ RtsFlags.GranFlags.Costs.threadcontextswitchtime = 0;
+
+ RtsFlags.GranFlags.Costs.mpacktime =
+ RtsFlags.GranFlags.Costs.munpacktime = 0;
+
+ RtsFlags.GranFlags.Costs.heapalloc_cost = 1;
+
+ /* RtsFlags.GranFlags.DoFairSchedule = rtsTrue; */ /* -b-R */
+ /* RtsFlags.GranFlags.DoStealThreadsFirst = rtsTrue; */ /* -b-T */
+ RtsFlags.GranFlags.DoAsyncFetch = rtsTrue; /* -bZ */
+ RtsFlags.GranFlags.DoThreadMigration = rtsTrue; /* -bM */
+ RtsFlags.GranFlags.GranSimStats.Full = rtsTrue; /* -bP */
+# if defined(GRAN_CHECK) && defined(GRAN)
+ RtsFlags.GranFlags.Debug.event_stats = rtsTrue; /* print event statistics */
+# endif
+ return;
+ }
+
+ /* Communication and task creation cost parameters */
+ switch(rts_argv[arg][2]) {
+ case '.':
+ IgnoreYields = rtsTrue; // HWL HACK
+ break;
+
+ case ':':
+ enable_GranSimLight(); /* set flags for GrAnSim-Light mode */
+ break;
+
+ case 'l':
+ if (rts_argv[arg][3] != '\0')
+ {
+ RtsFlags.GranFlags.Costs.gunblocktime =
+ RtsFlags.GranFlags.Costs.latency = decode(rts_argv[arg]+3);
+ RtsFlags.GranFlags.Costs.fetchtime = 2*RtsFlags.GranFlags.Costs.latency;
+ }
+ else
+ RtsFlags.GranFlags.Costs.latency = LATENCY;
+ break;
+
+ case 'a':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.additional_latency = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.additional_latency = ADDITIONAL_LATENCY;
+ break;
+
+ case 'm':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.mpacktime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.mpacktime = MSGPACKTIME;
+ break;
+
+ case 'x':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.mtidytime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.mtidytime = 0;
+ break;
+
+ case 'r':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.munpacktime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.munpacktime = MSGUNPACKTIME;
+ break;
+
+ case 'g':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.fetchtime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.fetchtime = FETCHTIME;
+ break;
+
+ case 'n':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.gunblocktime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.gunblocktime = GLOBALUNBLOCKTIME;
+ break;
+
+ case 'u':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.lunblocktime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.lunblocktime = LOCALUNBLOCKTIME;
+ break;
+
+ /* Thread-related metrics */
+ case 't':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.threadcreatetime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.threadcreatetime = THREADCREATETIME;
+ break;
+
+ case 'q':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.threadqueuetime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.threadqueuetime = THREADQUEUETIME;
+ break;
+
+ case 'c':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.threadscheduletime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.threadscheduletime = THREADSCHEDULETIME;
+
+ RtsFlags.GranFlags.Costs.threadcontextswitchtime = RtsFlags.GranFlags.Costs.threadscheduletime
+ + RtsFlags.GranFlags.Costs.threaddescheduletime;
+ break;
+
+ case 'd':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.threaddescheduletime = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.threaddescheduletime = THREADDESCHEDULETIME;
+
+ RtsFlags.GranFlags.Costs.threadcontextswitchtime = RtsFlags.GranFlags.Costs.threadscheduletime
+ + RtsFlags.GranFlags.Costs.threaddescheduletime;
+ break;
+
+ /* Instruction Cost Metrics */
+ case 'A':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.arith_cost = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.arith_cost = ARITH_COST;
+ break;
+
+ case 'F':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.float_cost = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.float_cost = FLOAT_COST;
+ break;
+
+ case 'B':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.branch_cost = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.branch_cost = BRANCH_COST;
+ break;
+
+ case 'L':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.load_cost = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.load_cost = LOAD_COST;
+ break;
+
+ case 'S':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.store_cost = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.store_cost = STORE_COST;
+ break;
+
+ case 'H':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.heapalloc_cost = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.heapalloc_cost = 0;
+ break;
+
+ case 'y':
+ RtsFlags.GranFlags.DoAsyncFetch = rtsTrue;
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.FetchStrategy = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.FetchStrategy = 2;
+ if (RtsFlags.GranFlags.FetchStrategy == 0)
+ RtsFlags.GranFlags.DoAsyncFetch = rtsFalse;
+ break;
+
+ case 'K': /* sort overhead (per elem in spark list) */
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.pri_spark_overhead = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.pri_spark_overhead = PRI_SPARK_OVERHEAD;
+ debugBelch("Overhead for pri spark: %d (per elem).\n",
+ RtsFlags.GranFlags.Costs.pri_spark_overhead);
+ break;
+
+ case 'O': /* sort overhead (per elem in spark list) */
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.Costs.pri_sched_overhead = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.Costs.pri_sched_overhead = PRI_SCHED_OVERHEAD;
+ debugBelch("Overhead for pri sched: %d (per elem).\n",
+ RtsFlags.GranFlags.Costs.pri_sched_overhead);
+ break;
+
+ /* General Parameters */
+ case 'p':
+ if (rts_argv[arg][3] != '\0')
+ {
+ RtsFlags.GranFlags.proc = decode(rts_argv[arg]+3);
+ if (RtsFlags.GranFlags.proc==0) {
+ enable_GranSimLight(); /* set flags for GrAnSim-Light mode */
+ } else if (RtsFlags.GranFlags.proc > MAX_PROC ||
+ RtsFlags.GranFlags.proc < 1)
+ {
+ debugBelch("setupRtsFlags: no more than %u processors allowed\n",
+ MAX_PROC);
+ *error = rtsTrue;
+ }
+ }
+ else
+ RtsFlags.GranFlags.proc = MAX_PROC;
+ break;
+
+ case 'f':
+ RtsFlags.GranFlags.Fishing = rtsTrue;
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.maxFishes = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.maxFishes = MAX_FISHES;
+ break;
+
+ case 'w':
+ if (rts_argv[arg][3] != '\0')
+ RtsFlags.GranFlags.time_slice = decode(rts_argv[arg]+3);
+ else
+ RtsFlags.GranFlags.time_slice = GRAN_TIME_SLICE;
+ break;
+
+ case 'C':
+ RtsFlags.GranFlags.DoAlwaysCreateThreads=rtsTrue;
+ RtsFlags.GranFlags.DoThreadMigration=rtsTrue;
+ break;
+
+ case 'G':
+ debugBelch("Bulk fetching enabled.\n");
+ RtsFlags.GranFlags.DoBulkFetching=rtsTrue;
+ break;
+
+ case 'M':
+ debugBelch("Thread migration enabled.\n");
+ RtsFlags.GranFlags.DoThreadMigration=rtsTrue;
+ break;
+
+ case 'R':
+ debugBelch("Fair Scheduling enabled.\n");
+ RtsFlags.GranFlags.DoFairSchedule=rtsTrue;
+ break;
+
+ case 'I':
+ debugBelch("Priority Scheduling enabled.\n");
+ RtsFlags.GranFlags.DoPriorityScheduling=rtsTrue;
+ break;
+
+ case 'T':
+ RtsFlags.GranFlags.DoStealThreadsFirst=rtsTrue;
+ RtsFlags.GranFlags.DoThreadMigration=rtsTrue;
+ break;
+
+ case 'Z':
+ RtsFlags.GranFlags.DoAsyncFetch=rtsTrue;
+ break;
+
+/* case 'z': */
+/* RtsFlags.GranFlags.SimplifiedFetch=rtsTrue; */
+/* break; */
+
+ case 'N':
+ RtsFlags.GranFlags.PreferSparksOfLocalNodes=rtsTrue;
+ break;
+
+ case 'b':
+ RtsFlags.GranFlags.GranSimStats.Binary=rtsTrue;
+ break;
+
+ case 'P':
+ /* format is -bP<c> where <c> is one char describing kind of profile */
+ RtsFlags.GranFlags.GranSimStats.Full = rtsTrue;
+ switch(rts_argv[arg][3]) {
+ case '\0': break; // nothing special, just an ordinary profile
+ case '0': RtsFlags.GranFlags.GranSimStats.Suppressed = rtsTrue;
+ break;
+ case 'b': RtsFlags.GranFlags.GranSimStats.Binary = rtsTrue;
+ break;
+ case 's': RtsFlags.GranFlags.GranSimStats.Sparks = rtsTrue;
+ break;
+ case 'h': RtsFlags.GranFlags.GranSimStats.Heap = rtsTrue;
+ break;
+ case 'n': RtsFlags.GranFlags.GranSimStats.NewLogfile = rtsTrue;
+ break;
+ case 'g': RtsFlags.GranFlags.GranSimStats.Global = rtsTrue;
+ break;
+ default: barf("Unknown option -bP%c", rts_argv[arg][3]);
+ }
+ break;
+
+ case 's':
+ RtsFlags.GranFlags.GranSimStats.Sparks=rtsTrue;
+ break;
+
+ case 'h':
+ RtsFlags.GranFlags.GranSimStats.Heap=rtsTrue;
+ break;
+
+ case 'Y': /* syntax: -bY<n>[,<n>] n ... pos int */
+ if (rts_argv[arg][3] != '\0') {
+ char *arg0, *tmp;
+
+ arg0 = rts_argv[arg]+3;
+ if ((tmp = strstr(arg0,","))==NULL) {
+ RtsFlags.GranFlags.SparkPriority = decode(arg0);
+ debugBelch("SparkPriority: %u.\n",RtsFlags.GranFlags.SparkPriority);
+ } else {
+ *(tmp++) = '\0';
+ RtsFlags.GranFlags.SparkPriority = decode(arg0);
+ RtsFlags.GranFlags.SparkPriority2 = decode(tmp);
+ debugBelch("SparkPriority: %u.\n",
+ RtsFlags.GranFlags.SparkPriority);
+ debugBelch("SparkPriority2:%u.\n",
+ RtsFlags.GranFlags.SparkPriority2);
+ if (RtsFlags.GranFlags.SparkPriority2 <
+ RtsFlags.GranFlags.SparkPriority) {
+ debugBelch("WARNING: 2nd pri < main pri (%u<%u); 2nd pri has no effect\n",
+ RtsFlags.GranFlags.SparkPriority2,
+ RtsFlags.GranFlags.SparkPriority);
+ }
+ }
+ } else {
+ /* plain pri spark is now invoked with -bX
+ RtsFlags.GranFlags.DoPrioritySparking = 1;
+ debugBelch("PrioritySparking.\n");
+ */
+ }
+ break;
+
+ case 'Q':
+ if (rts_argv[arg][3] != '\0') {
+ RtsFlags.GranFlags.ThunksToPack = decode(rts_argv[arg]+3);
+ } else {
+ RtsFlags.GranFlags.ThunksToPack = 1;
+ }
+ debugBelch("Thunks To Pack in one packet: %u.\n",
+ RtsFlags.GranFlags.ThunksToPack);
+ break;
+
+ case 'e':
+ RtsFlags.GranFlags.RandomSteal = rtsFalse;
+ debugBelch("Deterministic mode (no random stealing)\n");
+ break;
+
+ /* The following class of options contains eXperimental */
+ /* features in connection with exploiting granularity */
+ /* information. I.e. if -bY is chosen these options */
+ /* tell the RTS what to do with the supplied info --HWL */
+
+ case 'W':
+ if (rts_argv[arg][3] != '\0') {
+ RtsFlags.GranFlags.packBufferSize_internal = decode(rts_argv[arg]+3);
+ } else {
+ RtsFlags.GranFlags.packBufferSize_internal = GRANSIM_DEFAULT_PACK_BUFFER_SIZE;
+ }
+ debugBelch("Size of GranSim internal pack buffer: %u.\n",
+ RtsFlags.GranFlags.packBufferSize_internal);
+ break;
+
+ case 'X':
+ switch(rts_argv[arg][3]) {
+
+ case '\0':
+ RtsFlags.GranFlags.DoPrioritySparking = 1;
+ debugBelch("Priority Sparking with Normal Priorities.\n");
+ RtsFlags.GranFlags.InversePriorities = rtsFalse;
+ RtsFlags.GranFlags.RandomPriorities = rtsFalse;
+ RtsFlags.GranFlags.IgnorePriorities = rtsFalse;
+ break;
+
+ case 'I':
+ RtsFlags.GranFlags.DoPrioritySparking = 1;
+ debugBelch("Priority Sparking with Inverse Priorities.\n");
+ RtsFlags.GranFlags.InversePriorities++;
+ break;
+
+ case 'R':
+ RtsFlags.GranFlags.DoPrioritySparking = 1;
+ debugBelch("Priority Sparking with Random Priorities.\n");
+ RtsFlags.GranFlags.RandomPriorities++;
+ break;
+
+ case 'N':
+ RtsFlags.GranFlags.DoPrioritySparking = 1;
+ debugBelch("Priority Sparking with No Priorities.\n");
+ RtsFlags.GranFlags.IgnorePriorities++;
+ break;
+
+ default:
+ bad_option( rts_argv[arg] );
+ break;
+ }
+ break;
+
+ case '-':
+ switch(rts_argv[arg][3]) {
+
+ case 'C':
+ RtsFlags.GranFlags.DoAlwaysCreateThreads=rtsFalse;
+ RtsFlags.GranFlags.DoThreadMigration=rtsFalse;
+ break;
+
+ case 'G':
+ RtsFlags.GranFlags.DoBulkFetching=rtsFalse;
+ break;
+
+ case 'M':
+ RtsFlags.GranFlags.DoThreadMigration=rtsFalse;
+ break;
+
+ case 'R':
+ RtsFlags.GranFlags.DoFairSchedule=rtsFalse;
+ break;
+
+ case 'T':
+ RtsFlags.GranFlags.DoStealThreadsFirst=rtsFalse;
+ RtsFlags.GranFlags.DoThreadMigration=rtsFalse;
+ break;
+
+ case 'Z':
+ RtsFlags.GranFlags.DoAsyncFetch=rtsFalse;
+ break;
+
+ case 'N':
+ RtsFlags.GranFlags.PreferSparksOfLocalNodes=rtsFalse;
+ break;
+
+ case 'P':
+ RtsFlags.GranFlags.GranSimStats.Suppressed=rtsTrue;
+ break;
+
+ case 's':
+ RtsFlags.GranFlags.GranSimStats.Sparks=rtsFalse;
+ break;
+
+ case 'h':
+ RtsFlags.GranFlags.GranSimStats.Heap=rtsFalse;
+ break;
+
+ case 'b':
+ RtsFlags.GranFlags.GranSimStats.Binary=rtsFalse;
+ break;
+
+ case 'X':
+ RtsFlags.GranFlags.DoPrioritySparking = rtsFalse;
+ break;
+
+ case 'Y':
+ RtsFlags.GranFlags.DoPrioritySparking = rtsFalse;
+ RtsFlags.GranFlags.SparkPriority = rtsFalse;
+ break;
+
+ case 'I':
+ RtsFlags.GranFlags.DoPriorityScheduling = rtsFalse;
+ break;
+
+ case 'e':
+ RtsFlags.GranFlags.RandomSteal = rtsFalse;
+ break;
+
+ default:
+ bad_option( rts_argv[arg] );
+ break;
+ }
+ break;
+
+# if defined(GRAN_CHECK) && defined(GRAN)
+ case 'D':
+ switch(rts_argv[arg][3]) {
+ case 'Q': /* Set pack buffer size (same as 'Q' in GUM) */
+ if (rts_argv[arg][4] != '\0') {
+ RtsFlags.GranFlags.packBufferSize = decode(rts_argv[arg]+4);
+ debugBelch("Pack buffer size: %d\n",
+ RtsFlags.GranFlags.packBufferSize);
+ } else {
+ debugBelch("setupRtsFlags: missing size of PackBuffer (for -Q)\n");
+ *error = rtsTrue;
+ }
+ break;
+
+ default:
+ if (isdigit(rts_argv[arg][3])) {/* Set all debugging options in one */
+ /* hack warning: interpret the flags as a binary number */
+ nat n = decode(rts_argv[arg]+3);
+ set_GranSim_debug_options(n);
+ } else {
+ nat i;
+ for (i=0; i<=MAX_GRAN_DEBUG_OPTION; i++)
+ if (rts_argv[arg][3] == gran_debug_opts_flags[i])
+ break;
+
+ if (i==MAX_GRAN_DEBUG_OPTION+1) {
+ debugBelch("Valid GranSim debug options are:\n");
+ help_GranSim_debug_options(MAX_GRAN_DEBUG_MASK);
+ bad_option( rts_argv[arg] );
+ } else { // flag found; now set it
+ set_GranSim_debug_options(GRAN_DEBUG_MASK(i)); // 2^i
+ }
+ }
+ break;
+
+#if 0
+ case 'e': /* event trace; also -bD1 */
+ debugBelch("DEBUG: event_trace; printing event trace.\n");
+ RtsFlags.GranFlags.Debug.event_trace = rtsTrue;
+ /* RtsFlags.GranFlags.event_trace=rtsTrue; */
+ break;
+
+ case 'E': /* event statistics; also -bD2 */
+ debugBelch("DEBUG: event_stats; printing event statistics.\n");
+ RtsFlags.GranFlags.Debug.event_stats = rtsTrue;
+ /* RtsFlags.GranFlags.Debug |= 0x20; print event statistics */
+ break;
+
+ case 'f': /* thunkStealing; also -bD4 */
+ debugBelch("DEBUG: thunkStealing; printing forwarding of FETCHNODES.\n");
+ RtsFlags.GranFlags.Debug.thunkStealing = rtsTrue;
+ /* RtsFlags.GranFlags.Debug |= 0x2; print fwd messages */
+ break;
+
+ case 'z': /* blockOnFetch; also -bD8 */
+ debugBelch("DEBUG: blockOnFetch; check for blocked on fetch.\n");
+ RtsFlags.GranFlags.Debug.blockOnFetch = rtsTrue;
+ /* RtsFlags.GranFlags.Debug |= 0x4; debug non-reschedule-on-fetch */
+ break;
+
+ case 't': /* blockOnFetch_sanity; also -bD16 */
+ debugBelch("DEBUG: blockOnFetch_sanity; check for TSO asleep on fetch.\n");
+ RtsFlags.GranFlags.Debug.blockOnFetch_sanity = rtsTrue;
+ /* RtsFlags.GranFlags.Debug |= 0x10; debug TSO asleep for fetch */
+ break;
+
+ case 'S': /* priSpark; also -bD32 */
+ debugBelch("DEBUG: priSpark; priority sparking.\n");
+ RtsFlags.GranFlags.Debug.priSpark = rtsTrue;
+ break;
+
+ case 's': /* priSched; also -bD64 */
+ debugBelch("DEBUG: priSched; priority scheduling.\n");
+ RtsFlags.GranFlags.Debug.priSched = rtsTrue;
+ break;
+
+ case 'F': /* findWork; also -bD128 */
+ debugBelch("DEBUG: findWork; searching spark-pools (local & remote), thread queues for work.\n");
+ RtsFlags.GranFlags.Debug.findWork = rtsTrue;
+ break;
+
+ case 'g': /* globalBlock; also -bD256 */
+ debugBelch("DEBUG: globalBlock; blocking on remote closures (FETCHMEs etc in GUM).\n");
+ RtsFlags.GranFlags.Debug.globalBlock = rtsTrue;
+ break;
+
+ case 'G': /* pack; also -bD512 */
+ debugBelch("DEBUG: pack; routines for (un-)packing graph structures.\n");
+ RtsFlags.GranFlags.Debug.pack = rtsTrue;
+ break;
+
+ case 'P': /* packBuffer; also -bD1024 */
+ debugBelch("DEBUG: packBuffer; routines handling pack buffer (GranSim internal!).\n");
+ RtsFlags.GranFlags.Debug.packBuffer = rtsTrue;
+ break;
+
+ case 'o': /* sortedQ; also -bD2048 */
+ debugBelch("DEBUG: sortedQ; check whether spark/thread queues are sorted.\n");
+ RtsFlags.GranFlags.Debug.sortedQ = rtsTrue;
+ break;
+
+ case 'r': /* randomSteal; also -bD4096 */
+ debugBelch("DEBUG: randomSteal; stealing sparks/threads from random PEs.\n");
+ RtsFlags.GranFlags.Debug.randomSteal = rtsTrue;
+ break;
+
+ case 'q': /* checkSparkQ; also -bD8192 */
+ debugBelch("DEBUG: checkSparkQ; check consistency of the spark queues.\n");
+ RtsFlags.GranFlags.Debug.checkSparkQ = rtsTrue;
+ break;
+
+ case ':': /* checkLight; also -bD16384 */
+ debugBelch("DEBUG: checkLight; check GranSim-Light setup.\n");
+ RtsFlags.GranFlags.Debug.checkLight = rtsTrue;
+ break;
+
+ case 'b': /* bq; also -bD32768 */
+ debugBelch("DEBUG: bq; check blocking queues\n");
+ RtsFlags.GranFlags.Debug.bq = rtsTrue;
+ break;
+
+ case 'd': /* all options turned on */
+ debugBelch("DEBUG: all options turned on.\n");
+ set_GranSim_debug_options(MAX_GRAN_DEBUG_MASK);
+ /* RtsFlags.GranFlags.Debug |= 0x40; */
+ break;
+
+/* case '\0': */
+/* RtsFlags.GranFlags.Debug = 1; */
+/* break; */
+#endif
+
+ }
+ break;
+# endif /* GRAN_CHECK */
+ default:
+ bad_option( rts_argv[arg] );
+ break;
+ }
+}
+
+/*
+ Interpret n as a binary number masking GranSim debug options and set the
+  corresponding option. See gran_debug_opts_strs for explanations of the flags.
+*/
+/* Turn on every GranSim debug option whose bit is set in the mask n,
+   printing the one-line description of each as it is enabled. */
+static void
+set_GranSim_debug_options(nat n) {
+  nat i;
+
+  for (i=0; i<=MAX_GRAN_DEBUG_OPTION; i++)
+    if ((n>>i)&1) {
+      /* "%s" guard: never use the description itself as a format string */
+      errorBelch("%s", gran_debug_opts_strs[i]);
+      switch (i) {
+        case 0: RtsFlags.GranFlags.Debug.event_trace = rtsTrue; break;
+        case 1: RtsFlags.GranFlags.Debug.event_stats = rtsTrue; break;
+        case 2: RtsFlags.GranFlags.Debug.bq = rtsTrue; break;
+        case 3: RtsFlags.GranFlags.Debug.pack = rtsTrue; break;
+        case 4: RtsFlags.GranFlags.Debug.checkSparkQ = rtsTrue; break;
+        case 5: RtsFlags.GranFlags.Debug.thunkStealing = rtsTrue; break;
+        case 6: RtsFlags.GranFlags.Debug.randomSteal = rtsTrue; break;
+        case 7: RtsFlags.GranFlags.Debug.findWork = rtsTrue; break;
+        case 8: RtsFlags.GranFlags.Debug.unused = rtsTrue; break;
+        case 9: RtsFlags.GranFlags.Debug.pri = rtsTrue; break;
+        case 10: RtsFlags.GranFlags.Debug.checkLight = rtsTrue; break;
+        case 11: RtsFlags.GranFlags.Debug.sortedQ = rtsTrue; break;
+        case 12: RtsFlags.GranFlags.Debug.blockOnFetch = rtsTrue; break;
+        case 13: RtsFlags.GranFlags.Debug.packBuffer = rtsTrue; break;
+        case 14: RtsFlags.GranFlags.Debug.blockOnFetch_sanity = rtsTrue; break;
+        /* BUGFIX: the "%d" previously had no matching argument (UB);
+           supply the count, as the PAR variant of this function does */
+        default: barf("set_GranSim_debug_options: only %d debug options expected",
+                      MAX_GRAN_DEBUG_OPTION);
+      } /* switch */
+    } /* if */
+}
+
+/*
+ Print one line explanation for each of the GranSim debug options specified
+ in the bitmask n.
+*/
+/* Print the one-line explanation of each GranSim debug option whose
+   bit is set in the mask n. */
+static void
+help_GranSim_debug_options(nat n) {
+  nat i;
+
+  for (i=0; i<=MAX_GRAN_DEBUG_OPTION; i++)
+    if ((n>>i)&1)
+      /* "%s" guard: never use the description itself as a format string */
+      debugBelch("%s", gran_debug_opts_strs[i]);
+}
+
+# elif defined(PAR)
+
+/*
+  Process one GUM (parallel) RTS option; all GUM options begin with -q.
+  Malformed values set *error so that the caller can abort startup;
+  unknown options are merely reported.
+*/
+static void
+process_par_option(int arg, int *rts_argc, char *rts_argv[], rtsBool *error)
+{
+
+  if (rts_argv[arg][1] != 'q') { /* All GUM options start with -q */
+    errorBelch("Warning: GUM option does not start with -q: %s", rts_argv[arg]);
+    return;
+  }
+
+  /* Communication and task creation cost parameters */
+  switch(rts_argv[arg][2]) {
+  case 'e':  /* -qe<n>  ... allow <n> local sparks */
+    if (rts_argv[arg][3] != '\0') { /* otherwise, stick w/ the default */
+      RtsFlags.ParFlags.maxLocalSparks
+        = strtol(rts_argv[arg]+3, (char **) NULL, 10);
+
+      if (RtsFlags.ParFlags.maxLocalSparks <= 0) {
+        errorBelch("setupRtsFlags: bad value for -e\n");
+        *error = rtsTrue;
+      }
+    }
+    IF_PAR_DEBUG(verbose,
+                 errorBelch("-qe<n>: max %d local sparks",
+                            RtsFlags.ParFlags.maxLocalSparks));
+    break;
+
+  case 't':  /* -qt<n> ... upper bound on number of threads */
+    if (rts_argv[arg][3] != '\0') {
+      RtsFlags.ParFlags.maxThreads
+        = strtol(rts_argv[arg]+3, (char **) NULL, 10);
+    } else {
+      errorBelch("missing size for -qt\n");
+      *error = rtsTrue;
+    }
+    IF_PAR_DEBUG(verbose,
+                 errorBelch("-qt<n>: max %d threads",
+                            RtsFlags.ParFlags.maxThreads));
+    break;
+
+  case 'f':  /* -qf<n> ... max number of fishes out at one time */
+    if (rts_argv[arg][3] != '\0')
+      RtsFlags.ParFlags.maxFishes = decode(rts_argv[arg]+3);
+    else
+      RtsFlags.ParFlags.maxFishes = MAX_FISHES;
+    /* BUGFIX: a stray "break" used to sit before this debug message,
+       making it unreachable */
+    IF_PAR_DEBUG(verbose,
+                 errorBelch("-qf<n>: max %d fishes sent out at one time",
+                            RtsFlags.ParFlags.maxFishes));
+    break;
+
+  case 'F':  /* -qF<n> ... fish delay time (us) */
+    if (rts_argv[arg][3] != '\0') {
+      RtsFlags.ParFlags.fishDelay
+        = strtol(rts_argv[arg]+3, (char **) NULL, 10);
+    } else {
+      errorBelch("missing fish delay time for -qF\n");
+      *error = rtsTrue;
+    }
+    IF_PAR_DEBUG(verbose,
+                 errorBelch("-qF<n>: fish delay time %d us",
+                            RtsFlags.ParFlags.fishDelay));
+    break;
+
+  case 'O':  /* -qO ... disable output */
+    RtsFlags.ParFlags.outputDisabled = rtsTrue;
+    IF_PAR_DEBUG(verbose,
+                 errorBelch("-qO: output disabled"));
+    break;
+
+  case 'g': /* -qg<n> ... globalisation scheme */
+    if (rts_argv[arg][3] != '\0') {
+      RtsFlags.ParFlags.globalising = decode(rts_argv[arg]+3);
+    } else {
+      errorBelch("missing identifier for globalisation scheme (for -qg)\n");
+      *error = rtsTrue;
+    }
+    IF_PAR_DEBUG(verbose,
+                 debugBelch("-qg<n>: globalisation scheme set to %d",
+                            RtsFlags.ParFlags.globalising));
+    break;
+
+  case 'h': /* -qh<n> ... max number of thunks (except root) in packet */
+    if (rts_argv[arg][3] != '\0') {
+      RtsFlags.ParFlags.thunksToPack = decode(rts_argv[arg]+3);
+    } else {
+      errorBelch("missing number of thunks per packet (for -qh)\n");
+      *error = rtsTrue;
+    }
+    IF_PAR_DEBUG(verbose,
+                 debugBelch("-qh<n>: thunks per packet set to %d",
+                            RtsFlags.ParFlags.thunksToPack));
+    break;
+
+  case 'P': /* -qP for writing a log file */
+    //RtsFlags.ParFlags.ParStats.Full = rtsFalse;
+    /* same encoding as in GranSim after -bP */
+    switch(rts_argv[arg][3]) {
+    case '\0': RtsFlags.ParFlags.ParStats.Full = rtsTrue;
+      break; // nothing special, just an ordinary profile
+    case '0': RtsFlags.ParFlags.ParStats.Suppressed = rtsTrue;
+      RtsFlags.ParFlags.ParStats.Full = rtsFalse;
+      break;
+    case 'b': RtsFlags.ParFlags.ParStats.Binary = rtsTrue;
+      break;
+    case 's': RtsFlags.ParFlags.ParStats.Sparks = rtsTrue;
+      break;
+    //case 'h': RtsFlags.parFlags.ParStats.Heap = rtsTrue;
+    //  break;
+    case 'n': RtsFlags.ParFlags.ParStats.NewLogfile = rtsTrue;
+      break;
+    case 'g':
+# if defined(PAR_TICKY)
+      RtsFlags.ParFlags.ParStats.Global = rtsTrue;
+# else
+      errorBelch("-qPg is only possible for a PAR_TICKY RTS, which this is not");
+      stg_exit(EXIT_FAILURE);
+# endif
+      break;
+      /* BUGFIX: report the sub-option character ([3]); [2] always
+         printed 'P' itself */
+    default: barf("Unknown option -qP%c", rts_argv[arg][3]);
+    }
+    IF_PAR_DEBUG(verbose,
+                 debugBelch("(-qP) writing to log-file (RtsFlags.ParFlags.ParStats.Full=%s)",
+                            (RtsFlags.ParFlags.ParStats.Full ? "rtsTrue" : "rtsFalse")));
+    break;
+
+  case 'Q': /* -qQ<n> ... set pack buffer size to <n> */
+    if (rts_argv[arg][3] != '\0') {
+      RtsFlags.ParFlags.packBufferSize = decode(rts_argv[arg]+3);
+    } else {
+      errorBelch("missing size of PackBuffer (for -qQ)\n");
+      *error = rtsTrue;
+    }
+    IF_PAR_DEBUG(verbose,
+                 debugBelch("-qQ<n>: pack buffer size set to %d",
+                            RtsFlags.ParFlags.packBufferSize));
+    break;
+
+  case 'R':  /* -qR ... fair-ish scheduling */
+    RtsFlags.ParFlags.doFairScheduling = rtsTrue;
+    IF_PAR_DEBUG(verbose,
+                 debugBelch("-qR: fair-ish scheduling"));
+    break;
+
+# if defined(DEBUG)
+  case 'w':  /* -qw<n> ... wait-loop length after synchronisation */
+    if (rts_argv[arg][3] != '\0') {
+      RtsFlags.ParFlags.wait
+        = strtol(rts_argv[arg]+3, (char **) NULL, 10);
+    } else {
+      RtsFlags.ParFlags.wait = 1000;
+    }
+    IF_PAR_DEBUG(verbose,
+                 debugBelch("-qw<n>: length of wait loop after synchr before reduction: %d",
+                            RtsFlags.ParFlags.wait));
+    break;
+
+  case 'D':  /* -qD ... all the debugging options */
+    if (isdigit(rts_argv[arg][3])) {/* Set all debugging options in one */
+      /* hack warning: interpret the flags as a binary number */
+      nat n = decode(rts_argv[arg]+3);
+      set_par_debug_options(n);
+    } else {
+      nat i;
+      for (i=0; i<=MAX_PAR_DEBUG_OPTION; i++)
+        if (rts_argv[arg][3] == par_debug_opts_flags[i])
+          break;
+
+      if (i==MAX_PAR_DEBUG_OPTION+1) {
+        errorBelch("Valid GUM debug options are:\n");
+        help_par_debug_options(MAX_PAR_DEBUG_MASK);
+        bad_option( rts_argv[arg] );
+      } else { // flag found; now set it
+        set_par_debug_options(PAR_DEBUG_MASK(i));  // 2^i
+      }
+    }
+    break;
+# endif
+  default:
+    errorBelch("Unknown option -q%c (%d opts in total)",
+               rts_argv[arg][2], *rts_argc);
+    break;
+  } /* switch */
+}
+
+/*
+ Interpret n as a binary number masking Par debug options and set the
+  corresponding option. See par_debug_opts_strs for explanations of the flags.
+*/
+/* Turn on every GUM (parallel) debug option whose bit is set in the
+   mask n, printing the one-line description of each as it is enabled. */
+static void
+set_par_debug_options(nat n) {
+  nat i;
+
+  for (i=0; i<=MAX_PAR_DEBUG_OPTION; i++)
+    if ((n>>i)&1) {
+      /* "%s" guard: never use the description itself as a format string */
+      debugBelch("%s", par_debug_opts_strs[i]);
+      switch (i) {
+        case 0: RtsFlags.ParFlags.Debug.verbose = rtsTrue; break;
+        case 1: RtsFlags.ParFlags.Debug.bq = rtsTrue; break;
+        case 2: RtsFlags.ParFlags.Debug.schedule = rtsTrue; break;
+        case 3: RtsFlags.ParFlags.Debug.free = rtsTrue; break;
+        case 4: RtsFlags.ParFlags.Debug.resume = rtsTrue; break;
+        case 5: RtsFlags.ParFlags.Debug.weight = rtsTrue; break;
+        case 6: RtsFlags.ParFlags.Debug.fetch = rtsTrue; break;
+        //case 7: RtsFlags.ParFlags.Debug.ack = rtsTrue; break;
+        case 7: RtsFlags.ParFlags.Debug.fish = rtsTrue; break;
+        case 8: RtsFlags.ParFlags.Debug.tables = rtsTrue; break;
+        case 9: RtsFlags.ParFlags.Debug.packet = rtsTrue; break;
+        case 10: RtsFlags.ParFlags.Debug.pack = rtsTrue; break;
+        case 11: RtsFlags.ParFlags.Debug.paranoia = rtsTrue; break;
+        default: barf("set_par_debug_options: only %d debug options expected",
+                      MAX_PAR_DEBUG_OPTION);
+      } /* switch */
+    } /* if */
+}
+
+/*
+  Print one line of explanation for each of the GUM (parallel) debug options
+ in the bitmask n.
+*/
+/* Print the one-line explanation of each GUM debug option whose bit
+   is set in the mask n. */
+static void
+help_par_debug_options(nat n) {
+  nat i;
+
+  for (i=0; i<=MAX_PAR_DEBUG_OPTION; i++)
+    if ((n>>i)&1)
+      /* "%s" guard: never use the description itself as a format string */
+      debugBelch("%s", par_debug_opts_strs[i]);
+}
+
+#endif /* PAR */
+
+//@node Aux fcts, , GranSim specific options
+//@subsection Aux fcts
+
+/* printf-style output for the stats machinery: writes to the stream f,
+   or to the RTS debug channel when f is NULL. */
+static void
+stats_fprintf(FILE *f, char *s, ...)
+{
+    va_list args;
+
+    va_start(args, s);
+    if (f != NULL) {
+        vfprintf(f, s, args);
+    } else {
+        /* a NULL stream means "send it to debugBelch" */
+        vdebugBelch(s, args);
+    }
+    va_end(args);
+}
+
+/* Open the statistics output file named by RTS option `arg` (or the
+   default <program>.<ext> given by FILENAME_FMT), write the full
+   command line into it, and return it via *file_ret.  A NULL stream in
+   *file_ret means "use debugBelch".  Returns -1 if the file could not
+   be opened, 0 otherwise. */
+static int /* return -1 on error */
+open_stats_file (
+    I_ arg,
+    int argc, char *argv[],
+    int rts_argc, char *rts_argv[],
+    const char *FILENAME_FMT,
+    FILE **file_ret)
+{
+    FILE *f = NULL;
+
+    if (strequal(rts_argv[arg]+2, "stderr")) { /* use debugBelch */
+        f = NULL; /* NULL means use debugBelch */
+    } else {
+        char stats_filename[STATS_FILENAME_MAXLEN]; /* default <program>.<ext> */
+        const char *name;
+
+        if (rts_argv[arg][2] != '\0') { /* stats file specified */
+            name = rts_argv[arg]+2;
+        } else {
+            /* snprintf, not sprintf: argv[0] may exceed the buffer */
+            snprintf(stats_filename, STATS_FILENAME_MAXLEN,
+                     FILENAME_FMT, argv[0]);
+            name = stats_filename;
+        }
+        f = fopen(name,"w");
+        if (f == NULL) {
+            /* BUGFIX: report the name we actually tried to open; the old
+               code always printed the (possibly empty) argument text */
+            errorBelch("Can't open stats file %s\n", name);
+            return -1;
+        }
+    }
+    *file_ret = f;
+
+    {
+        /* Write argv and rtsv into start of stats file */
+        int count;
+        for(count = 0; count < argc; count++) {
+            stats_fprintf(f, "%s ", argv[count]);
+        }
+        stats_fprintf(f, "+RTS ");
+        for(count = 0; count < rts_argc; count++)
+            stats_fprintf(f, "%s ", rts_argv[count]);
+        stats_fprintf(f, "\n");
+    }
+    return 0;
+}
+
+
+
+/* Parse a numeric RTS-option value with an optional size suffix:
+   g/G = 10^9, m/M = 10^6, k/K = 10^3, w/W = word size.  An empty
+   string decodes to 0. */
+static I_
+decode(const char *s)
+{
+    StgDouble val;
+
+    if (*s == '\0')
+        return 0;
+
+    val = atof(s);
+
+    /* Decimal (not 1024-based) multipliers, to avoid possible bad
+       effects on a direct-mapped cache; the 'g' case is UNchecked
+       for overflow. */
+    switch (s[strlen(s)-1]) {
+    case 'g': case 'G': val *= 1000*1000*1000; break;
+    case 'm': case 'M': val *= 1000*1000;      break;
+    case 'k': case 'K': val *= 1000;           break;
+    case 'w': case 'W': val *= sizeof(W_);     break;
+    default:            break; /* plain number, no suffix */
+    }
+
+    return (I_)val;
+}
+
+/* Report an unrecognised or malformed RTS option and terminate the
+   program with a failure exit code. */
+static void
+bad_option(const char *s)
+{
+ errorBelch("bad RTS option: %s", s);
+ stg_exit(EXIT_FAILURE);
+}
+
+/* -----------------------------------------------------------------------------
+ Getting/Setting the program's arguments.
+
+ These are used by System.Environment, and parts of the RTS.
+ -------------------------------------------------------------------------- */
+
+/* Set prog_name to the basename of argv[0], so that default output
+   files are created in the current directory. */
+void
+setProgName(char *argv[])
+{
+#if !defined(mingw32_HOST_OS)
+    char *sep = strrchr(argv[0], '/');
+    prog_name = (sep == NULL) ? argv[0] : sep + 1;
+#else
+    /* On Windows both '/' and '\\' separate path components, so scan
+       backwards for whichever occurs last. */
+    char *p;
+    for (p = argv[0] + strlen(argv[0]) - 1; p > argv[0]; p--) {
+        if (*p == '/' || *p == '\\') {
+            prog_name = p + 1;
+            return;
+        }
+    }
+    prog_name = argv[0];
+#endif
+}
+
+/* Return the program's saved argument count/vector (as recorded at
+   startup or by setProgArgv).  Either out-parameter may be NULL if the
+   caller is not interested in that value. */
+void
+getProgArgv(int *argc, char **argv[])
+{
+ if (argc) { *argc = prog_argc; }
+ if (argv) { *argv = prog_argv; }
+}
+
+/* Record the arguments later returned by getProgArgv, and recompute
+   prog_name from the new argv[0].  NOTE: the argv array is stored, not
+   copied — the caller must keep it alive. */
+void
+setProgArgv(int argc, char *argv[])
+{
+ /* Usually this is done by startupHaskell, so we don't need to call this.
+ However, sometimes Hugs wants to change the arguments which Haskell
+ getArgs >>= ... will be fed. So you can do that by calling here
+ _after_ calling startupHaskell.
+ */
+ prog_argc = argc;
+ prog_argv = argv;
+ setProgName(prog_argv);
+}
diff --git a/rts/RtsMessages.c b/rts/RtsMessages.c
new file mode 100644
index 0000000000..1242d886eb
--- /dev/null
+++ b/rts/RtsMessages.c
@@ -0,0 +1,201 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * General utility functions used in the RTS.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+
+#include <stdio.h>
+
+#ifdef HAVE_WINDOWS_H
+#include <windows.h>
+#endif
+
+/* -----------------------------------------------------------------------------
+ General message generation functions
+
+ All messages should go through here. We can't guarantee that
+ stdout/stderr will be available - e.g. in a Windows program there
+ is no console for generating messages, so they have to either go to
+ to the debug console, or pop up message boxes.
+ -------------------------------------------------------------------------- */
+
+// Default to the stdio implementation of these hooks.
+RtsMsgFunction *fatalInternalErrorFn = rtsFatalInternalErrorFn;
+RtsMsgFunction *debugMsgFn = rtsDebugMsgFn;
+RtsMsgFunction *errorMsgFn = rtsErrorMsgFn;
+
+void
+barf(char *s, ...)
+{
+ va_list ap;
+ va_start(ap,s);
+ (*fatalInternalErrorFn)(s,ap);
+ stg_exit(EXIT_INTERNAL_ERROR); // just in case fatalInternalErrorFn() returns
+ va_end(ap);
+}
+
+/* va_list variant of barf(): report a fatal internal error via the
+   installed hook and terminate.  The caller owns ap (and its va_end). */
+void
+vbarf(char *s, va_list ap)
+{
+ (*fatalInternalErrorFn)(s,ap);
+ stg_exit(EXIT_INTERNAL_ERROR); // just in case fatalInternalErrorFn() returns
+}
+
+/* Called when an ASSERT fails: reports the source location and dies
+   via barf(). */
+void
+_assertFail(char *filename, unsigned int linenum)
+{
+ barf("ASSERTION FAILED: file %s, line %u\n", filename, linenum);
+}
+
+void
+errorBelch(char *s, ...)
+{
+ va_list ap;
+ va_start(ap,s);
+ (*errorMsgFn)(s,ap);
+ va_end(ap);
+}
+
+/* va_list variant of errorBelch(): forward the message to the
+   installed errorMsgFn hook.  The caller owns ap. */
+void
+verrorBelch(char *s, va_list ap)
+{
+ (*errorMsgFn)(s,ap);
+}
+
+/* Emit a debug message through the pluggable debugMsgFn hook
+   (stdio-based rtsDebugMsgFn by default). */
+void
+debugBelch(char *s, ...)
+{
+  va_list args;
+
+  va_start(args, s);
+  (*debugMsgFn)(s, args);
+  va_end(args);
+}
+
+/* va_list variant of debugBelch(): forward the message to the
+   installed debugMsgFn hook.  The caller owns ap. */
+void
+vdebugBelch(char *s, va_list ap)
+{
+ (*debugMsgFn)(s,ap);
+}
+
+/* -----------------------------------------------------------------------------
+ stdio versions of the message functions
+ -------------------------------------------------------------------------- */
+
+#define BUFSIZE 512
+
+#if defined(cygwin32_TARGET_OS) || defined (mingw32_TARGET_OS)
+/* Windows only: return non-zero if the running image was linked as a
+   GUI-subsystem application (and therefore has no console to print
+   messages to).  Inspects the PE header of the current module. */
+static int
+isGUIApp(void)             /* (void): C prototype, not old-style () */
+{
+    PIMAGE_DOS_HEADER pDOSHeader;
+    PIMAGE_NT_HEADERS pPEHeader;
+
+    /* the module handle of the current image points at its DOS header */
+    pDOSHeader = (PIMAGE_DOS_HEADER) GetModuleHandleA(NULL);
+    if (pDOSHeader->e_magic != IMAGE_DOS_SIGNATURE)
+        return 0;
+
+    /* e_lfanew is the byte offset of the PE header from the DOS header */
+    pPEHeader = (PIMAGE_NT_HEADERS) ((char *)pDOSHeader + pDOSHeader->e_lfanew);
+    if (pPEHeader->Signature != IMAGE_NT_SIGNATURE)
+        return 0;
+
+    return (pPEHeader->OptionalHeader.Subsystem == IMAGE_SUBSYSTEM_WINDOWS_GUI);
+}
+#endif
+
+#define xstr(s) str(s)
+#define str(s) #s
+
+/* Default fatalInternalErrorFn hook: report an "internal error" with
+   the GHC version and a bug-report URL, then abort().  On Windows GUI
+   apps (which have no console) the report goes to a message box;
+   everywhere else it goes to stderr.  abort() rather than stg_exit()
+   — presumably so a debugger/core dump can catch the failure. */
+void
+rtsFatalInternalErrorFn(char *s, va_list ap)
+{
+#if defined(cygwin32_TARGET_OS) || defined (mingw32_TARGET_OS)
+ if (isGUIApp())
+ {
+ char title[BUFSIZE], message[BUFSIZE];
+
+ snprintf(title, BUFSIZE, "%s: internal error", prog_name);
+ vsnprintf(message, BUFSIZE, s, ap);
+
+ MessageBox(NULL /* hWnd */,
+ message,
+ title,
+ MB_OK | MB_ICONERROR | MB_TASKMODAL
+ );
+ }
+ else
+#endif
+ {
+ /* don't fflush(stdout); WORKAROUND bug in Linux glibc */
+ if (prog_argv != NULL && prog_name != NULL) {
+ fprintf(stderr, "%s: internal error: ", prog_name);
+ } else {
+ fprintf(stderr, "internal error: ");
+ }
+ vfprintf(stderr, s, ap);
+ fprintf(stderr, "\n");
+ fprintf(stderr, " (GHC version %s for %s)\n", ProjectVersion, xstr(HostPlatform_TYPE));
+ fprintf(stderr, " Please report this as a GHC bug: http://www.haskell.org/ghc/reportabug\n");
+ fflush(stderr);
+ }
+
+ abort();
+ // stg_exit(EXIT_INTERNAL_ERROR);
+}
+
+/* Default errorMsgFn hook: write "<prog>: <msg>\n" to stderr, or show
+   a message box for Windows GUI apps that have no console. */
+void
+rtsErrorMsgFn(char *s, va_list ap)
+{
+#if defined(cygwin32_TARGET_OS) || defined (mingw32_TARGET_OS)
+ if (isGUIApp())
+ {
+ char buf[BUFSIZE];
+ int r;
+
+ /* only pop up the box if the message formatted without truncation */
+ r = vsnprintf(buf, BUFSIZE, s, ap);
+ if (r > 0 && r < BUFSIZE) {
+ MessageBox(NULL /* hWnd */,
+ buf,
+ prog_name,
+ MB_OK | MB_ICONERROR | MB_TASKMODAL
+ );
+ }
+ }
+ else
+#endif
+ {
+ /* don't fflush(stdout); WORKAROUND bug in Linux glibc */
+ if (prog_argv != NULL && prog_name != NULL) {
+ fprintf(stderr, "%s: ", prog_name);
+ }
+ vfprintf(stderr, s, ap);
+ fprintf(stderr, "\n");
+ }
+}
+
+/* Default debugMsgFn hook: write the message to stderr (flushed), or
+   to the Windows debugger output stream for GUI apps. */
+void
+rtsDebugMsgFn(char *s, va_list ap)
+{
+#if defined(cygwin32_TARGET_OS) || defined (mingw32_TARGET_OS)
+ if (isGUIApp())
+ {
+ char buf[BUFSIZE];
+ int r;
+
+ /* only emit if the message formatted without truncation */
+ r = vsnprintf(buf, BUFSIZE, s, ap);
+ if (r > 0 && r < BUFSIZE) {
+ OutputDebugString(buf);
+ }
+ }
+ else
+#endif
+ {
+ /* don't fflush(stdout); WORKAROUND bug in Linux glibc */
+ vfprintf(stderr, s, ap);
+ fflush(stderr);
+ }
+}
diff --git a/rts/RtsSignals.h b/rts/RtsSignals.h
new file mode 100644
index 0000000000..eafeeaaf55
--- /dev/null
+++ b/rts/RtsSignals.h
@@ -0,0 +1,78 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Signal processing / handling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef RTS_SIGNALS_H
+#define RTS_SIGNALS_H
+
+#if !defined(PAR) && !defined(mingw32_HOST_OS)
+
+#include "posix/Signals.h"
+
+#elif defined(mingw32_HOST_OS)
+
+#include "win32/ConsoleHandler.h"
+
+#else /* PAR */
+
+#define signals_pending() (rtsFalse)
+
+#endif /* PAR */
+
+
+#if RTS_USER_SIGNALS
+
+/*
+ * Function: initUserSignals()
+ *
+ * Initialize the console handling substrate.
+ */
+extern void initUserSignals(void);
+
+/*
+ * Function: initDefaultHandlers()
+ *
+ * Install any default signal/console handlers. Currently we install a
+ * Ctrl+C handler that shuts down the RTS in an orderly manner.
+ */
+extern void initDefaultHandlers(void);
+
+/*
+ * Function: blockUserSignals()
+ *
+ * Temporarily block the delivery of further console events. Needed to
+ * avoid race conditions when GCing the queue of outstanding handlers or
+ * when emptying the queue by running the handlers.
+ *
+ */
+extern void blockUserSignals(void);
+
+/*
+ * Function: unblockUserSignals()
+ *
+ * The inverse of blockUserSignals(); re-enable the delivery of console events.
+ */
+extern void unblockUserSignals(void);
+
+/*
+ * Function: awaitUserSignals()
+ *
+ * Wait for the next console event. Currently a NOP (returns immediately.)
+ */
+extern void awaitUserSignals(void);
+
+/*
+ * Function: markSignalHandlers()
+ *
+ * Evacuate the handler queue. _Assumes_ that console event delivery
+ * has already been blocked.
+ */
+extern void markSignalHandlers (evac_fn evac);
+
+#endif /* RTS_USER_SIGNALS */
+
+#endif /* RTS_SIGNALS_H */
diff --git a/rts/RtsStartup.c b/rts/RtsStartup.c
new file mode 100644
index 0000000000..147de7b857
--- /dev/null
+++ b/rts/RtsStartup.c
@@ -0,0 +1,457 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2002
+ *
+ * Main function for a standalone Haskell program.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsAPI.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "OSThreads.h"
+#include "Storage.h" /* initStorage, exitStorage */
+#include "Schedule.h" /* initScheduler */
+#include "Stats.h" /* initStats */
+#include "STM.h" /* initSTM */
+#include "Signals.h"
+#include "RtsSignals.h"
+#include "Timer.h" /* startTimer, stopTimer */
+#include "Weak.h"
+#include "Ticky.h"
+#include "StgRun.h"
+#include "Prelude.h" /* fixupRTStoPreludeRefs */
+#include "HsFFI.h"
+#include "Linker.h"
+#include "ThreadLabels.h"
+#include "BlockAlloc.h"
+
+#if defined(RTS_GTK_FRONTPANEL)
+#include "FrontPanel.h"
+#endif
+
+#if defined(PROFILING) || defined(DEBUG)
+# include "Profiling.h"
+# include "ProfHeap.h"
+# include "RetainerProfile.h"
+#endif
+
+#if defined(GRAN)
+# include "GranSimRts.h"
+#endif
+
+#if defined(GRAN) || defined(PAR)
+# include "ParallelRts.h"
+#endif
+
+#if defined(PAR)
+# include "Parallel.h"
+# include "LLC.h"
+#endif
+
+#if defined(mingw32_HOST_OS)
+#include "win32/AsyncIO.h"
+#endif
+
+#include <stdlib.h>
+
+#ifdef HAVE_TERMIOS_H
+#include <termios.h>
+#endif
+#ifdef HAVE_SIGNAL_H
+#include <signal.h>
+#endif
+
+// Count of how many outstanding hs_init()s there have been.
+static int hs_init_count = 0;
+
+// Here we save the terminal settings on the standard file
+// descriptors, if we need to change them (eg. to support NoBuffering
+// input).
+static void *saved_termios[3] = {NULL,NULL,NULL};
+
+void*
+__hscore_get_saved_termios(int fd)
+{
+  if (fd < 0 || fd >= (int)(sizeof(saved_termios) / sizeof(*saved_termios)))
+    return NULL; /* out-of-range descriptors have no saved settings */
+  return saved_termios[fd];
+}
+
+void
+__hscore_set_saved_termios(int fd, void* ts)
+{
+  if (fd < 0 || fd >= (int)(sizeof(saved_termios) / sizeof(*saved_termios)))
+    return; /* ignore out-of-range descriptors */
+  saved_termios[fd] = ts;
+}
+
+/* -----------------------------------------------------------------------------
+ Initialise floating point unit on x86 (currently disabled. why?)
+ (see comment in ghc/compiler/nativeGen/MachInstrs.lhs).
+ -------------------------------------------------------------------------- */
+
+#define X86_INIT_FPU 0
+
+#if X86_INIT_FPU
+static void
+x86_init_fpu ( void )
+{
+ __volatile unsigned short int fpu_cw;
+
+ // Grab the control word
+ __asm __volatile ("fnstcw %0" : "=m" (fpu_cw));
+
+#if 0
+ printf("fpu_cw: %x\n", fpu_cw);
+#endif
+
+ // Set bits 8-9 to 10 (64-bit precision).
+ fpu_cw = (fpu_cw & 0xfcff) | 0x0200;
+
+ // Store the new control word back
+ __asm __volatile ("fldcw %0" : : "m" (fpu_cw));
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ Starting up the RTS
+ -------------------------------------------------------------------------- */
+
+void
+hs_init(int *argc, char **argv[])
+{
+ hs_init_count++;
+ if (hs_init_count > 1) {
+ // second and subsequent inits are ignored
+ return;
+ }
+
+ /* The very first thing we do is grab the start time...just in case we're
+ * collecting timing statistics.
+ */
+ stat_startInit();
+
+#ifdef PAR
+ /*
+ * The parallel system needs to be initialised and synchronised before
+ * the program is run.
+ */
+ startupParallelSystem(argv);
+
+ if (*argv[0] == '-') { /* Strip off mainPE flag argument */
+ argv++;
+ argc--;
+ }
+
+ argv[1] = argv[0]; /* ignore the nPEs argument */
+ argv++; argc--;
+#endif
+
+ /* Set the RTS flags to default values. */
+ initRtsFlagsDefaults();
+
+ /* Call the user hook to reset defaults, if present */
+ defaultsHook();
+
+ /* Parse the flags, separating the RTS flags from the programs args */
+ if (argc != NULL && argv != NULL) {
+ setupRtsFlags(argc, *argv, &rts_argc, rts_argv);
+ setProgArgv(*argc,*argv);
+ }
+
+#if defined(PAR)
+ /* NB: this really must be done after processing the RTS flags */
+ IF_PAR_DEBUG(verbose,
+ debugBelch("==== Synchronising system (%d PEs)\n", nPEs));
+ synchroniseSystem(); // calls initParallelSystem etc
+#endif /* PAR */
+
+ /* Perform initialisation of adjustor thunk layer. */
+ initAdjustor();
+
+ /* initialise scheduler data structures (needs to be done before
+ * initStorage()).
+ */
+ initScheduler();
+
+#if defined(GRAN)
+ /* And start GranSim profiling if required: */
+ if (RtsFlags.GranFlags.GranSimStats.Full)
+ init_gr_simulation(rts_argc, rts_argv, prog_argc, prog_argv);
+#elif defined(PAR)
+ /* And start GUM profiling if required: */
+ if (RtsFlags.ParFlags.ParStats.Full)
+ init_gr_simulation(rts_argc, rts_argv, prog_argc, prog_argv);
+#endif /* PAR || GRAN */
+
+ /* initialize the storage manager */
+ initStorage();
+
+ /* initialise the stable pointer table */
+ initStablePtrTable();
+
+#if defined(DEBUG)
+ /* initialise thread label table (tso->char*) */
+ initThreadLabelTable();
+#endif
+
+#if defined(PROFILING) || defined(DEBUG)
+ initProfiling1();
+#endif
+
+ /* start the virtual timer 'subsystem'. */
+ startTimer(TICK_MILLISECS);
+
+ /* Initialise the stats department */
+ initStats();
+
+#if defined(RTS_USER_SIGNALS)
+ /* Initialise the user signal handler set */
+ initUserSignals();
+ /* Set up handler to run on SIGINT, etc. */
+ initDefaultHandlers();
+#endif
+
+#if defined(mingw32_HOST_OS)
+ startupAsyncIO();
+#endif
+
+#ifdef RTS_GTK_FRONTPANEL
+ if (RtsFlags.GcFlags.frontpanel) {
+ initFrontPanel();
+ }
+#endif
+
+#if X86_INIT_FPU
+ x86_init_fpu();
+#endif
+
+ /* Record initialization times */
+ stat_endInit();
+}
+
+// Compatibility interface
+void
+startupHaskell(int argc, char *argv[], void (*init_root)(void))
+{
+ hs_init(&argc, &argv);
+ if(init_root)
+ hs_add_root(init_root);
+}
+
+
+/* -----------------------------------------------------------------------------
+ Per-module initialisation
+
+ This process traverses all the compiled modules in the program
+ starting with "Main", and performing per-module initialisation for
+ each one.
+
+ So far, two things happen at initialisation time:
+
+ - we register stable names for each foreign-exported function
+ in that module. This prevents foreign-exported entities, and
+ things they depend on, from being garbage collected.
+
+ - we supply a unique integer to each statically declared cost
+ centre and cost centre stack in the program.
+
+ The code generator inserts a small function "__stginit_<module>" in each
+ module and calls the registration functions in each of the modules it
+ imports.
+
+ The init* functions are compiled in the same way as STG code,
+ i.e. without normal C call/return conventions. Hence we must use
+ StgRun to call this stuff.
+ -------------------------------------------------------------------------- */
+
+/* The init functions use an explicit stack...
+ */
+#define INIT_STACK_BLOCKS 4
+static F_ *init_stack = NULL;
+
+void
+hs_add_root(void (*init_root)(void))
+{
+ bdescr *bd;
+ nat init_sp;
+ Capability *cap = &MainCapability;
+
+ if (hs_init_count <= 0) {
+ barf("hs_add_root() must be called after hs_init()");
+ }
+
+ /* The initialisation stack grows downward, with sp pointing
+ to the last occupied word */
+ init_sp = INIT_STACK_BLOCKS*BLOCK_SIZE_W;
+ bd = allocGroup_lock(INIT_STACK_BLOCKS);
+ init_stack = (F_ *)bd->start;
+ init_stack[--init_sp] = (F_)stg_init_finish;
+ if (init_root != NULL) {
+ init_stack[--init_sp] = (F_)init_root;
+ }
+
+ cap->r.rSp = (P_)(init_stack + init_sp);
+ StgRun((StgFunPtr)stg_init, &cap->r);
+
+ freeGroup_lock(bd);
+
+#if defined(PROFILING) || defined(DEBUG)
+ // This must be done after module initialisation.
+ // ToDo: make this work in the presence of multiple hs_add_root()s.
+ initProfiling2();
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ Shutting down the RTS
+ -------------------------------------------------------------------------- */
+
+void
+hs_exit(void)
+{
+ if (hs_init_count <= 0) {
+ errorBelch("warning: too many hs_exit()s");
+ return;
+ }
+ hs_init_count--;
+ if (hs_init_count > 0) {
+ // ignore until it's the last one
+ return;
+ }
+
+ /* start timing the shutdown */
+ stat_startExit();
+
+ /* stop all running tasks */
+ exitScheduler();
+
+#if defined(GRAN)
+ /* end_gr_simulation prints global stats if requested -- HWL */
+ if (!RtsFlags.GranFlags.GranSimStats.Suppressed)
+ end_gr_simulation();
+#endif
+
+ /* stop the ticker */
+ stopTimer();
+
+ /* reset the standard file descriptors to blocking mode */
+ resetNonBlockingFd(0);
+ resetNonBlockingFd(1);
+ resetNonBlockingFd(2);
+
+#if HAVE_TERMIOS_H
+ // Reset the terminal settings on the standard file descriptors,
+ // if we changed them. See System.Posix.Internals.tcSetAttr for
+ // more details, including the reason we temporarily disable
+ // SIGTTOU here.
+ {
+ int fd;
+ sigset_t sigset, old_sigset;
+ sigemptyset(&sigset);
+ sigaddset(&sigset, SIGTTOU);
+ sigprocmask(SIG_BLOCK, &sigset, &old_sigset);
+ for (fd = 0; fd <= 2; fd++) {
+ struct termios* ts = (struct termios*)__hscore_get_saved_termios(fd);
+ if (ts != NULL) {
+ tcsetattr(fd,TCSANOW,ts);
+ }
+ }
+ sigprocmask(SIG_SETMASK, &old_sigset, NULL);
+ }
+#endif
+
+#if defined(PAR)
+ /* controlled exit; good thread! */
+ shutdownParallelSystem(0);
+
+ /* global statistics in parallel system */
+ PAR_TICKY_PAR_END();
+#endif
+
+ /* stop timing the shutdown, we're about to print stats */
+ stat_endExit();
+
+ // clean up things from the storage manager's point of view.
+ // also outputs the stats (+RTS -s) info.
+ exitStorage();
+
+#ifdef RTS_GTK_FRONTPANEL
+ if (RtsFlags.GcFlags.frontpanel) {
+ stopFrontPanel();
+ }
+#endif
+
+#if defined(PROFILING)
+ reportCCSProfiling();
+#endif
+
+#if defined(PROFILING) || defined(DEBUG)
+ endProfiling();
+#endif
+
+#ifdef PROFILING
+ // Originally, this was in report_ccs_profiling(). Now, retainer
+ // profiling might tack some extra stuff on to the end of this file
+ // during endProfiling().
+ fclose(prof_file);
+#endif
+
+#if defined(TICKY_TICKY)
+ if (RtsFlags.TickyFlags.showTickyStats) PrintTickyInfo();
+#endif
+
+#if defined(mingw32_HOST_OS)
+ shutdownAsyncIO();
+#endif
+
+ // Finally, free all our storage.
+ freeStorage();
+}
+
+// Compatibility interfaces
+void
+shutdownHaskell(void)
+{
+ hs_exit();
+}
+
+void
+shutdownHaskellAndExit(int n)
+{
+ if (hs_init_count == 1) {
+ OnExitHook();
+ hs_exit();
+#if defined(PAR)
+ /* really exit (stg_exit() would call shutdownParallelSystem() again) */
+ exit(n);
+#else
+ stg_exit(n);
+#endif
+ }
+}
+
+/*
+ * called from STG-land to exit the program
+ */
+
+#ifdef PAR
+static int exit_started=rtsFalse;
+#endif
+
+void
+stg_exit(int n)
+{
+#ifdef PAR
+ /* HACK: avoid a loop when exiting due to a stupid error */
+ if (exit_started)
+ return;
+ exit_started=rtsTrue;
+
+ IF_PAR_DEBUG(verbose, debugBelch("==-- stg_exit %d on [%x]...", n, mytid));
+ shutdownParallelSystem(n);
+#endif
+ exit(n);
+}
diff --git a/rts/RtsUtils.c b/rts/RtsUtils.c
new file mode 100644
index 0000000000..3e7e225dda
--- /dev/null
+++ b/rts/RtsUtils.c
@@ -0,0 +1,367 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * General utility functions used in the RTS.
+ *
+ * ---------------------------------------------------------------------------*/
+
+/* gettimeofday isn't POSIX */
+/* #include "PosixSource.h" */
+
+#include "Rts.h"
+#include "RtsAPI.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Ticky.h"
+
+#ifdef HAVE_TIME_H
+#include <time.h>
+#endif
+
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+
+#ifdef HAVE_GETTIMEOFDAY
+#include <sys/time.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+#ifdef HAVE_SIGNAL_H
+#include <signal.h>
+#endif
+
+#if defined(THREADED_RTS) && defined(openbsd_HOST_OS) && defined(HAVE_PTHREAD_H)
+#include <pthread.h>
+#endif
+
+#if defined(openbsd_HOST_OS) || defined(linux_HOST_OS) || defined(darwin_HOST_OS)
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+
+/* no C99 header stdint.h on OpenBSD? */
+#if defined(openbsd_HOST_OS)
+typedef unsigned long my_uintptr_t;
+#else
+#include <stdint.h>
+typedef uintptr_t my_uintptr_t;
+#endif
+#endif
+
+#if defined(_WIN32)
+#include <windows.h>
+#endif
+
+/* -----------------------------------------------------------------------------
+ Result-checking malloc wrappers.
+ -------------------------------------------------------------------------- */
+
+void *
+stgMallocBytes (int n, char *msg)
+{
+  /* malloc with fail-fast policy: report via MallocFailHook and exit on NULL */
+  void *result = malloc((size_t) n);
+  if (result == NULL) {
+    /* don't fflush(stdout); WORKAROUND bug in Linux glibc */
+    MallocFailHook((W_) n, msg);
+    stg_exit(EXIT_INTERNAL_ERROR);
+  }
+  return result;
+}
+
+void *
+stgReallocBytes (void *p, int n, char *msg)
+{
+  /* realloc with fail-fast policy: report via MallocFailHook and exit on NULL */
+  void *result = realloc(p, (size_t) n);
+  if (result == NULL) {
+    /* don't fflush(stdout); WORKAROUND bug in Linux glibc */
+    MallocFailHook((W_) n, msg);
+    stg_exit(EXIT_INTERNAL_ERROR);
+  }
+  return result;
+}
+
+void *
+stgCallocBytes (int n, int m, char *msg)
+{
+  /* zero-initialising allocation with fail-fast policy, mirroring stgMallocBytes */
+  void *result = calloc((size_t) n, (size_t) m);
+  if (result == NULL) {
+    /* don't fflush(stdout); WORKAROUND bug in Linux glibc */
+    MallocFailHook((W_) n*m, msg);
+    stg_exit(EXIT_INTERNAL_ERROR);
+  }
+  return result;
+}
+
+/* To simplify changing the underlying allocator used
+ * by stgMallocBytes(), provide stgFree() as well.
+ */
+void
+stgFree(void* p)
+{
+ free(p); /* free(NULL) is a no-op, so callers need not check */
+}
+
+/* -----------------------------------------------------------------------------
+ Stack overflow
+
+ Not sure if this belongs here.
+ -------------------------------------------------------------------------- */
+
+void
+stackOverflow(void)
+{
+ StackOverflowHook(RtsFlags.GcFlags.maxStkSize * sizeof(W_));
+
+#if defined(TICKY_TICKY)
+ if (RtsFlags.TickyFlags.showTickyStats) PrintTickyInfo();
+#endif
+}
+
+void
+heapOverflow(void)
+{
+ /* don't fflush(stdout); WORKAROUND bug in Linux glibc */
+ OutOfHeapHook(0/*unknown request size*/,
+ RtsFlags.GcFlags.maxHeapSize * BLOCK_SIZE);
+
+#if defined(TICKY_TICKY)
+ if (RtsFlags.TickyFlags.showTickyStats) PrintTickyInfo();
+#endif
+
+ stg_exit(EXIT_HEAPOVERFLOW);
+}
+
+/* -----------------------------------------------------------------------------
+ Out-of-line strlen.
+
+ Used in addr2Integer because the C compiler on x86 chokes on
+ strlen, trying to inline it with not enough registers available.
+ -------------------------------------------------------------------------- */
+
+nat stg_strlen(char *s)
+{
+  /* Hand-rolled strlen; kept out of line (see the comment above). */
+  char *end = s;
+  while (*end != '\0') { end++; }
+  return end - s;
+}
+
+
+/* -----------------------------------------------------------------------------
+ genSym stuff, used by GHC itself for its splitting unique supply.
+
+ ToDo: put this somewhere sensible.
+ ------------------------------------------------------------------------- */
+
+static I_ __GenSymCounter = 0;
+
+I_
+genSymZh(void)
+{
+ return(__GenSymCounter++); /* NOTE(review): not atomic; assumes callers serialise -- confirm for THREADED_RTS */
+}
+I_
+resetGenSymZh(void) /* it's your funeral */
+{
+ __GenSymCounter=0;
+ return(__GenSymCounter);
+}
+
+/* -----------------------------------------------------------------------------
+ Get the current time as a string. Used in profiling reports.
+ -------------------------------------------------------------------------- */
+
+#if defined(PROFILING) || defined(DEBUG) || defined(PAR) || defined(GRAN)
+char *
+time_str(void)
+{
+ static time_t now = 0;
+ static char nowstr[26];
+
+ if (now == 0) {
+ time(&now);
+#if HAVE_CTIME_R
+ ctime_r(&now, nowstr);
+#else
+ strcpy(nowstr, ctime(&now));
+#endif
+ memmove(nowstr+16,nowstr+19,7);
+ nowstr[21] = '\0'; // removes the \n
+ }
+ return nowstr;
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Reset a file handle to blocking mode. We do this for the standard
+ * file descriptors before exiting, because the shell doesn't always
+ * clean up for us.
+ * -------------------------------------------------------------------------- */
+
+#if !defined(mingw32_HOST_OS)
+void
+resetNonBlockingFd(int fd)
+{
+  long fd_flags;
+
+  /* clear the non-blocking flag on this file descriptor */
+  fd_flags = fcntl(fd, F_GETFL);
+  if (fd_flags != -1 && (fd_flags & O_NONBLOCK)) {
+    /* fcntl returns -1 on failure; don't write a garbage flag word back */
+    fcntl(fd, F_SETFL, fd_flags & ~O_NONBLOCK);
+  }
+}
+
+void
+setNonBlockingFd(int fd)
+{
+  long fd_flags;
+
+  /* set the non-blocking flag on this file descriptor */
+  fd_flags = fcntl(fd, F_GETFL);
+  if (fd_flags != -1 && !(fd_flags & O_NONBLOCK)) {
+    /* fcntl returns -1 on failure; don't write a garbage flag word back */
+    fcntl(fd, F_SETFL, fd_flags | O_NONBLOCK);
+  }
+}
+#else
+/* Stub defns -- async / non-blocking IO is not done
+ * via O_NONBLOCK and select() under Win32.
+ */
+void resetNonBlockingFd(int fd STG_UNUSED) {}
+void setNonBlockingFd(int fd STG_UNUSED) {}
+#endif
+
+#ifdef PAR
+static ullong startTime = 0;
+
+/* used in a parallel setup */
+ullong
+msTime(void)
+{
+# if defined(HAVE_GETCLOCK) && !defined(alpha_HOST_ARCH) && !defined(hppa1_1_HOST_ARCH)
+ struct timespec tv;
+
+ if (getclock(TIMEOFDAY, &tv) != 0) {
+ fflush(stdout);
+ fprintf(stderr, "Clock failed\n");
+ stg_exit(EXIT_FAILURE);
+ }
+ return tv.tv_sec * LL(1000) + tv.tv_nsec / LL(1000000) - startTime;
+# elif HAVE_GETTIMEOFDAY && !defined(alpha_HOST_ARCH)
+ struct timeval tv;
+
+ if (gettimeofday(&tv, NULL) != 0) {
+ fflush(stdout);
+ fprintf(stderr, "Clock failed\n");
+ stg_exit(EXIT_FAILURE);
+ }
+ return tv.tv_sec * LL(1000) + tv.tv_usec / LL(1000) - startTime;
+# else
+ time_t t;
+ if ((t = time(NULL)) == (time_t) -1) {
+ fflush(stdout);
+ fprintf(stderr, "Clock failed\n");
+ stg_exit(EXIT_FAILURE);
+ }
+ return t * LL(1000) - startTime;
+# endif
+}
+#endif /* PAR */
+
+/* -----------------------------------------------------------------------------
+ Print large numbers, with punctuation.
+ -------------------------------------------------------------------------- */
+
+char *
+ullong_format_string(ullong x, char *s, rtsBool with_commas)
+{
+ if (x < (ullong)1000)
+ sprintf(s, "%lu", (lnat)x);
+ else if (x < (ullong)1000000)
+ sprintf(s, (with_commas) ? "%lu,%3.3lu" : "%lu%3.3lu",
+ (lnat)((x)/(ullong)1000),
+ (lnat)((x)%(ullong)1000));
+ else if (x < (ullong)1000000000)
+ sprintf(s, (with_commas) ? "%lu,%3.3lu,%3.3lu" : "%lu%3.3lu%3.3lu",
+ (lnat)((x)/(ullong)1000000),
+ (lnat)((x)/(ullong)1000%(ullong)1000),
+ (lnat)((x)%(ullong)1000));
+ else
+ sprintf(s, (with_commas) ? "%lu,%3.3lu,%3.3lu,%3.3lu" : "%lu%3.3lu%3.3lu%3.3lu",
+ (lnat)((x)/(ullong)1000000000),
+ (lnat)((x)/(ullong)1000000%(ullong)1000),
+ (lnat)((x)/(ullong)1000%(ullong)1000),
+ (lnat)((x)%(ullong)1000));
+ return s;
+}
+
+
+// Can be used as a breakpoint to set on every heap check failure.
+#ifdef DEBUG
+void
+heapCheckFail( void )
+{
+}
+#endif
+
+/*
+ * It seems that pthreads and signals interact oddly in OpenBSD & FreeBSD
+ * pthreads (and possibly others). When linking with -lpthreads, we
+ * have to use pthread_kill to send blockable signals. So use that
+ * when we have a threaded rts. So System.Posix.Signals will call
+ * genericRaise(), rather than raise(3).
+ */
+int genericRaise(int sig) {
+#if defined(THREADED_RTS) && (defined(openbsd_HOST_OS) || defined(freebsd_HOST_OS))
+ return pthread_kill(pthread_self(), sig);
+#else
+ return raise(sig);
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ Allocating executable memory
+ -------------------------------------------------------------------------- */
+
+/* Heavily arch-specific, I'm afraid.. */
+
+/*
+ * Allocate len bytes which are readable, writable, and executable.
+ *
+ * ToDo: If this turns out to be a performance bottleneck, one could
+ * e.g. cache the last VirtualProtect/mprotect-ed region and do
+ * nothing in case of a cache hit.
+ */
+void*
+stgMallocBytesRWX(int len)
+{
+ void *addr = stgMallocBytes(len, "mallocBytesRWX");
+#if defined(i386_HOST_ARCH) && defined(_WIN32)
+ /* This could be necessary for processors which distinguish between READ and
+ EXECUTE memory accesses, e.g. Itaniums. */
+ DWORD dwOldProtect = 0;
+ if (VirtualProtect (addr, len, PAGE_EXECUTE_READWRITE, &dwOldProtect) == 0) {
+ barf("mallocBytesRWX: failed to protect 0x%p; error=%lu; old protection: %lu\n",
+ addr, (unsigned long)GetLastError(), (unsigned long)dwOldProtect);
+ }
+#elif defined(openbsd_HOST_OS) || defined(linux_HOST_OS) || defined(darwin_HOST_OS)
+ /* malloced memory isn't executable by default on OpenBSD */
+ my_uintptr_t pageSize = sysconf(_SC_PAGESIZE);
+ my_uintptr_t mask = ~(pageSize - 1);
+ my_uintptr_t startOfFirstPage = ((my_uintptr_t)addr ) & mask;
+ my_uintptr_t startOfLastPage = ((my_uintptr_t)addr + len - 1) & mask;
+ my_uintptr_t size = startOfLastPage - startOfFirstPage + pageSize;
+ if (mprotect((void*)startOfFirstPage, (size_t)size, PROT_EXEC | PROT_READ | PROT_WRITE) != 0) {
+ barf("mallocBytesRWX: failed to protect 0x%p\n", addr);
+ }
+#endif
+ return addr;
+}
diff --git a/rts/RtsUtils.h b/rts/RtsUtils.h
new file mode 100644
index 0000000000..96a5f0d82f
--- /dev/null
+++ b/rts/RtsUtils.h
@@ -0,0 +1,54 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * General utility functions used in the RTS.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef RTSUTILS_H
+#define RTSUTILS_H
+
+/* -----------------------------------------------------------------------------
+ * (Checked) dynamic allocation
+ * -------------------------------------------------------------------------- */
+
+extern void *stgMallocBytes(int n, char *msg)
+ GNUC3_ATTRIBUTE(__malloc__);
+
+extern void* stgMallocBytesRWX(int len)
+ GNUC3_ATTRIBUTE(__malloc__);
+
+extern void *stgReallocBytes(void *p, int n, char *msg);
+
+extern void *stgCallocBytes(int n, int m, char *msg)
+ GNUC3_ATTRIBUTE(__malloc__);
+
+extern void stgFree(void* p);
+
+/* -----------------------------------------------------------------------------
+ * Misc other utilities
+ * -------------------------------------------------------------------------- */
+
+extern void heapOverflow(void);
+
+extern void setNonBlockingFd(int fd);
+extern void resetNonBlockingFd(int fd);
+
+extern nat stg_strlen(char *str);
+
+extern char *time_str(void);
+extern char *ullong_format_string(ullong, char *, rtsBool);
+
+#ifdef PAR
+extern ullong msTime(void);
+#endif
+
+#ifdef DEBUG
+extern void heapCheckFail( void );
+#endif
+
+extern void* __hscore_get_saved_termios(int fd);
+extern void __hscore_set_saved_termios(int fd, void* ts);
+
+#endif /* RTSUTILS_H */
diff --git a/rts/STM.c b/rts/STM.c
new file mode 100644
index 0000000000..d3283a92f0
--- /dev/null
+++ b/rts/STM.c
@@ -0,0 +1,1261 @@
+/* -----------------------------------------------------------------------------
+ * (c) The GHC Team 1998-2005
+ *
+ * STM implementation.
+ *
+ * Overview
+ * --------
+ *
+ * See the PPoPP 2005 paper "Composable memory transactions". In summary,
+ * each transaction has a TRec (transaction record) holding entries for each of the
+ * TVars (transactional variables) that it has accessed. Each entry records
+ * (a) the TVar, (b) the expected value seen in the TVar, (c) the new value that
+ * the transaction wants to write to the TVar, (d) during commit, the identity of
+ * the TRec that wrote the expected value.
+ *
+ * Separate TRecs are used for each level in a nest of transactions. This allows
+ * a nested transaction to be aborted without condemning its enclosing transactions.
+ * This is needed in the implementation of catchRetry. Note that the "expected value"
+ * in a nested transaction's TRec is the value expected to be *held in memory* if
+ * the transaction commits -- not the "new value" stored in one of the enclosing
+ * transactions. This means that validation can be done without searching through
+ * a nest of TRecs.
+ *
+ * Concurrency control
+ * -------------------
+ *
+ * Three different concurrency control schemes can be built according to the settings
+ * in STM.h:
+ *
+ * STM_UNIPROC assumes that the caller serialises invocations on the STM interface.
+ * In the Haskell RTS this means it is suitable only for non-THREADED_RTS builds.
+ *
+ * STM_CG_LOCK uses coarse-grained locking -- a single 'stm lock' is acquired during
+ * an invocation on the STM interface. Note that this does not mean that
+ * transactions are simply serialized -- the lock is only held *within* the
+ * implementation of stmCommitTransaction, stmWait etc.
+ *
+ * STM_FG_LOCKS uses fine-grained locking -- locking is done on a per-TVar basis
+ * and, when committing a transaction, no locks are acquired for TVars that have
+ * been read but not updated.
+ *
+ * Concurrency control is implemented in the functions:
+ *
+ * lock_stm
+ * unlock_stm
+ * lock_tvar / cond_lock_tvar
+ * unlock_tvar
+ *
+ * The choice between STM_UNIPROC / STM_CG_LOCK / STM_FG_LOCKS affects the
+ * implementation of these functions.
+ *
+ * lock_stm & unlock_stm are straightforward : they acquire a simple spin-lock
+ * using STM_CG_LOCK, and otherwise they are no-ops.
+ *
+ * lock_tvar / cond_lock_tvar and unlock_tvar are more complex because they
+ * have other effects (present in STM_UNIPROC and STM_CG_LOCK builds) as well
+ * as the actual business of manipulating a lock (present only in STM_FG_LOCKS
+ * builds). This is because locking a TVar is implemented by writing the lock
+ * holder's TRec into the TVar's current_value field:
+ *
+ * lock_tvar - lock a specified TVar (STM_FG_LOCKS only), returning the value
+ * it contained.
+ *
+ * cond_lock_tvar - lock a specified TVar (STM_FG_LOCKS only) if it
+ * contains a specified value. Return TRUE if this succeeds,
+ * FALSE otherwise.
+ *
+ * unlock_tvar - release the lock on a specified TVar (STM_FG_LOCKS only),
+ * storing a specified value in place of the lock entry.
+ *
+ * Using these operations, the typical pattern of a commit/validate/wait operation
+ * is to (a) lock the STM, (b) lock all the TVars being updated, (c) check that
+ * the TVars that were only read from still contain their expected values,
+ * (d) release the locks on the TVars, writing updates to them in the case of a
+ * commit, (e) unlock the STM.
+ *
+ * Queues of waiting threads hang off the first_wait_queue_entry field of each
+ * TVar. This may only be manipulated when holding that TVar's lock. In
+ * particular, when a thread is putting itself to sleep, it mustn't release
+ * the TVar's lock until it has added itself to the wait queue and marked its
+ * TSO as BlockedOnSTM -- this makes sure that other threads will know to wake it.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Schedule.h"
+#include "SMP.h"
+#include "STM.h"
+#include "Storage.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#define TRUE 1
+#define FALSE 0
+
+// ACQ_ASSERT is used for assertions which are only required for
+// THREADED_RTS builds with fine-grained locking.
+
+#if defined(STM_FG_LOCKS)
+#define ACQ_ASSERT(_X) ASSERT(_X)
+#define NACQ_ASSERT(_X) /*Nothing*/
+#else
+#define ACQ_ASSERT(_X) /*Nothing*/
+#define NACQ_ASSERT(_X) ASSERT(_X)
+#endif
+
+/*......................................................................*/
+
+// If SHAKE is defined then validation will sometimes spuriously fail. This helps
+// test unusual code paths when genuine contention is rare
+
+#if defined(DEBUG)
+#define SHAKE
+#if defined(THREADED_RTS)
+#define TRACE(_x...) IF_DEBUG(stm, debugBelch("STM (task %p): ", (void *)(unsigned long)(unsigned int)osThreadId()); debugBelch ( _x ))
+#else
+#define TRACE(_x...) IF_DEBUG(stm, debugBelch ( _x ))
+#endif
+#else
+#define TRACE(_x...) /*Nothing*/
+#endif
+
+#ifdef SHAKE
+static const int do_shake = TRUE;
+#else
+static const int do_shake = FALSE;
+#endif
+static int shake_ctr = 0;
+static int shake_lim = 1;
+
+static int shake(void) {
+  /* With SHAKE: return TRUE at ever-lengthening intervals, never otherwise. */
+  if (!do_shake) {
+    return FALSE;
+  }
+  if (((shake_ctr++) % shake_lim) != 0) {
+    return FALSE;
+  }
+  shake_ctr = 1;
+  shake_lim ++;
+  return TRUE;
+}
+
+/*......................................................................*/
+
+// Helper macros for iterating over entries within a transaction
+// record
+
+#define FOR_EACH_ENTRY(_t,_x,CODE) do { \
+ StgTRecHeader *__t = (_t); \
+ StgTRecChunk *__c = __t -> current_chunk; \
+ StgWord __limit = __c -> next_entry_idx; \
+ TRACE("%p : FOR_EACH_ENTRY, current_chunk=%p limit=%ld\n", __t, __c, __limit); \
+ while (__c != END_STM_CHUNK_LIST) { \
+ StgWord __i; \
+ for (__i = 0; __i < __limit; __i ++) { \
+ TRecEntry *_x = &(__c -> entries[__i]); \
+ do { CODE } while (0); \
+ } \
+ __c = __c -> prev_chunk; \
+ __limit = TREC_CHUNK_NUM_ENTRIES; \
+ } \
+ exit_for_each: \
+ if (FALSE) goto exit_for_each; \
+} while (0)
+
+#define BREAK_FOR_EACH goto exit_for_each
+
+/*......................................................................*/
+
+// if REUSE_MEMORY is defined then attempt to re-use descriptors, log chunks,
+// and wait queue entries without GC
+
+#define REUSE_MEMORY
+
+/*......................................................................*/
+
+#define IF_STM_UNIPROC(__X) do { } while (0)
+#define IF_STM_CG_LOCK(__X) do { } while (0)
+#define IF_STM_FG_LOCKS(__X) do { } while (0)
+
+#if defined(STM_UNIPROC)
+#undef IF_STM_UNIPROC
+#define IF_STM_UNIPROC(__X) do { __X } while (0)
+static const StgBool use_read_phase = FALSE;
+
+static void lock_stm(StgTRecHeader *trec STG_UNUSED) {
+ TRACE("%p : lock_stm()\n", trec);
+}
+
+static void unlock_stm(StgTRecHeader *trec STG_UNUSED) {
+ TRACE("%p : unlock_stm()\n", trec);
+}
+
+static StgClosure *lock_tvar(StgTRecHeader *trec STG_UNUSED,
+ StgTVar *s STG_UNUSED) {
+ StgClosure *result;
+ TRACE("%p : lock_tvar(%p)\n", trec, s);
+ result = s -> current_value;
+ return result;
+}
+
+static void unlock_tvar(StgTRecHeader *trec STG_UNUSED,
+ StgTVar *s STG_UNUSED,
+ StgClosure *c,
+ StgBool force_update) {
+ TRACE("%p : unlock_tvar(%p)\n", trec, s);
+ if (force_update) {
+ s -> current_value = c;
+ }
+}
+
+static StgBool cond_lock_tvar(StgTRecHeader *trec STG_UNUSED,
+ StgTVar *s STG_UNUSED,
+ StgClosure *expected) {
+ StgClosure *result;
+ TRACE("%p : cond_lock_tvar(%p, %p)\n", trec, s, expected);
+ result = s -> current_value;
+ TRACE("%p : %s\n", trec, (result == expected) ? "success" : "failure");
+ return (result == expected);
+}
+#endif
+
+#if defined(STM_CG_LOCK) /*........................................*/
+
+#undef IF_STM_CG_LOCK
+#define IF_STM_CG_LOCK(__X) do { __X } while (0)
+static const StgBool use_read_phase = FALSE;
+static volatile StgTRecHeader *smp_locked = NULL;
+
+static void lock_stm(StgTRecHeader *trec) {
+ while (cas(&smp_locked, NULL, trec) != NULL) { }
+ TRACE("%p : lock_stm()\n", trec);
+}
+
+static void unlock_stm(StgTRecHeader *trec STG_UNUSED) {
+ TRACE("%p : unlock_stm()\n", trec);
+ ASSERT (smp_locked == trec);
+ smp_locked = 0;
+}
+
+static StgClosure *lock_tvar(StgTRecHeader *trec STG_UNUSED,
+ StgTVar *s STG_UNUSED) {
+ StgClosure *result;
+ TRACE("%p : lock_tvar(%p)\n", trec, s);
+ ASSERT (smp_locked == trec);
+ result = s -> current_value;
+ return result;
+}
+
+static void unlock_tvar(StgTRecHeader *trec STG_UNUSED,
+                        StgTVar *s STG_UNUSED,
+                        StgClosure *c,
+                        StgBool force_update) {
+  /* Return type fixed: was declared 'void *' but returned nothing
+     (UB if the value were used); the other build variants are 'void'. */
+  TRACE("%p : unlock_tvar(%p, %p)\n", trec, s, c);
+  ASSERT (smp_locked == trec);
+  if (force_update) {
+    s -> current_value = c;
+  }
+}
+
+static StgBool cond_lock_tvar(StgTRecHeader *trec STG_UNUSED,
+                              StgTVar *s STG_UNUSED,
+                              StgClosure *expected) {
+  StgClosure *result;
+  TRACE("%p : cond_lock_tvar(%p, %p)\n", trec, s, expected);
+  ASSERT (smp_locked == trec);
+  result = s -> current_value;
+  /* Fixed TRACE: was "%p : %d" with a string arg and a missing first
+     argument (format/argument mismatch); now matches the UNIPROC variant. */
+  TRACE("%p : %s\n", trec, (result == expected) ? "success" : "failure");
+  return (result == expected);
+}
+#endif
+
+#if defined(STM_FG_LOCKS) /*...................................*/
+
+#undef IF_STM_FG_LOCKS
+#define IF_STM_FG_LOCKS(__X) do { __X } while (0)
+static const StgBool use_read_phase = TRUE;
+
// Fine-grained locking: there is no global STM lock, so this is a no-op
// (kept so callers are identical across the three locking schemes).
static void lock_stm(StgTRecHeader *trec STG_UNUSED) {
  TRACE("%p : lock_stm()\n", trec);
}
+
// No-op counterpart of lock_stm for the fine-grained scheme.
static void unlock_stm(StgTRecHeader *trec STG_UNUSED) {
  TRACE("%p : unlock_stm()\n", trec);
}
+
// Acquire an individual TVar by CAS-ing the transaction record itself into
// current_value; the displaced value is returned to the caller.  The inner
// loop skips over values that are themselves TRec headers, i.e. TVars
// currently locked by some transaction, and the outer loop retries until
// our CAS wins the race.
static StgClosure *lock_tvar(StgTRecHeader *trec,
                             StgTVar *s STG_UNUSED) {
  StgClosure *result;
  TRACE("%p : lock_tvar(%p)\n", trec, s);
  do {
    do {
      // Spin while another transaction holds the TVar.
      result = s -> current_value;
    } while (GET_INFO(result) == &stg_TREC_HEADER_info);
  } while (cas(&(s -> current_value), result, trec) != result);
  return result;
}
+
// Release a TVar we hold (current_value must be our own trec) by storing
// the closure "c".  Under fine-grained locking the store itself is the
// unlock, so force_update is irrelevant here.
static void unlock_tvar(StgTRecHeader *trec STG_UNUSED,
                        StgTVar *s,
                        StgClosure *c,
                        StgBool force_update STG_UNUSED) {
  TRACE("%p : unlock_tvar(%p, %p)\n", trec, s, c);
  ASSERT(s -> current_value == trec);
  s -> current_value = c;
}
+
// Try to acquire a TVar iff it still holds "expected": a single CAS swaps
// our trec in, and the value the CAS observed tells us whether we won.
static StgBool cond_lock_tvar(StgTRecHeader *trec,
                              StgTVar *s,
                              StgClosure *expected) {
  StgClosure *result;
  TRACE("%p : cond_lock_tvar(%p, %p)\n", trec, s, expected);
  result = cas(&(s -> current_value), expected, trec);
  TRACE("%p : %s\n", trec, result ? "success" : "failure");
  return (result == expected);
}
+#endif
+
+/*......................................................................*/
+
+// Helper functions for thread blocking and unblocking
+
// Put a thread to sleep on STM: mark it BlockedOnSTM so the scheduler will
// not run it until unpark_tso wakes it.  The thread must not already be
// blocked on anything else.
static void park_tso(StgTSO *tso) {
  ASSERT(tso -> why_blocked == NotBlocked);
  tso -> why_blocked = BlockedOnSTM;
  tso -> block_info.closure = (StgClosure *) END_TSO_QUEUE;
  TRACE("park_tso on tso=%p\n", tso);
}
+
// Wake a thread previously parked by park_tso.  A thread may appear on
// several TVar wait queues, so it can receive redundant wake-ups; only the
// first one (while still BlockedOnSTM) actually unblocks it.
static void unpark_tso(Capability *cap, StgTSO *tso) {
  // We will continue unparking threads while they remain on one of the wait
  // queues: it's up to the thread itself to remove it from the wait queues
  // if it decides to do so when it is scheduled.
  if (tso -> why_blocked == BlockedOnSTM) {
    TRACE("unpark_tso on tso=%p\n", tso);
    unblockOne(cap,tso);
  } else {
    TRACE("spurious unpark_tso on tso=%p\n", tso);
  }
}
+
// Wake every thread on the wait queue of TVar "s" (called after an update
// is committed to the TVar, so waiters can re-run their transactions).
static void unpark_waiters_on(Capability *cap, StgTVar *s) {
  StgTVarWaitQueue *q;
  TRACE("unpark_waiters_on tvar=%p\n", s);
  for (q = s -> first_wait_queue_entry;
       q != END_STM_WAIT_QUEUE;
       q = q -> next_queue_entry) {
    unpark_tso(cap, q -> waiting_tso);
  }
}
+
+/*......................................................................*/
+
+// Helper functions for downstream allocation and initialization
+
// Allocate a fresh wait-queue entry on this capability's heap and record
// the waiting thread in it.  Queue links are set up later by the caller.
static StgTVarWaitQueue *new_stg_tvar_wait_queue(Capability *cap,
                                                 StgTSO *waiting_tso) {
  StgTVarWaitQueue *result;
  result = (StgTVarWaitQueue *)allocateLocal(cap, sizeofW(StgTVarWaitQueue));
  SET_HDR (result, &stg_TVAR_WAIT_QUEUE_info, CCS_SYSTEM);
  result -> waiting_tso = waiting_tso;
  return result;
}
+
// Allocate a fresh, empty chunk of transaction-record entries.
static StgTRecChunk *new_stg_trec_chunk(Capability *cap) {
  StgTRecChunk *result;
  result = (StgTRecChunk *)allocateLocal(cap, sizeofW(StgTRecChunk));
  SET_HDR (result, &stg_TREC_CHUNK_info, CCS_SYSTEM);
  result -> prev_chunk = END_STM_CHUNK_LIST;
  result -> next_entry_idx = 0;
  return result;
}
+
// Allocate a fresh transaction record nested inside "enclosing_trec"
// (NO_TREC for a top-level transaction).  A nested record inherits its
// parent's state so that a condemned parent condemns its children too.
static StgTRecHeader *new_stg_trec_header(Capability *cap,
                                          StgTRecHeader *enclosing_trec) {
  StgTRecHeader *result;
  result = (StgTRecHeader *) allocateLocal(cap, sizeofW(StgTRecHeader));
  SET_HDR (result, &stg_TREC_HEADER_info, CCS_SYSTEM);

  result -> enclosing_trec = enclosing_trec;
  result -> current_chunk = new_stg_trec_chunk(cap);

  if (enclosing_trec == NO_TREC) {
    result -> state = TREC_ACTIVE;
  } else {
    ASSERT(enclosing_trec -> state == TREC_ACTIVE ||
           enclosing_trec -> state == TREC_CONDEMNED);
    result -> state = enclosing_trec -> state;
  }

  return result;
}
+
+/*......................................................................*/
+
+// Allocation / deallocation functions that retain per-capability lists
+// of closures that can be re-used
+
// Get a wait-queue entry, reusing one from this capability's free list if
// possible, otherwise allocating a new one.  The free list is threaded
// through next_queue_entry.
static StgTVarWaitQueue *alloc_stg_tvar_wait_queue(Capability *cap,
                                                   StgTSO *waiting_tso) {
  StgTVarWaitQueue *result = NULL;
  if (cap -> free_tvar_wait_queues == END_STM_WAIT_QUEUE) {
    result = new_stg_tvar_wait_queue(cap, waiting_tso);
  } else {
    result = cap -> free_tvar_wait_queues;
    result -> waiting_tso = waiting_tso;
    cap -> free_tvar_wait_queues = result -> next_queue_entry;
  }
  return result;
}
+
// Return a wait-queue entry to the capability's free list.  When memory
// reuse is disabled this is a no-op and the GC reclaims the entry.
static void free_stg_tvar_wait_queue(Capability *cap,
                                     StgTVarWaitQueue *wq) {
#if defined(REUSE_MEMORY)
  wq -> next_queue_entry = cap -> free_tvar_wait_queues;
  cap -> free_tvar_wait_queues = wq;
#endif
}
+
// Get an empty TRec chunk, preferring the capability's free list (threaded
// through prev_chunk); reused chunks are re-initialized to empty.
static StgTRecChunk *alloc_stg_trec_chunk(Capability *cap) {
  StgTRecChunk *result = NULL;
  if (cap -> free_trec_chunks == END_STM_CHUNK_LIST) {
    result = new_stg_trec_chunk(cap);
  } else {
    result = cap -> free_trec_chunks;
    cap -> free_trec_chunks = result -> prev_chunk;
    result -> prev_chunk = END_STM_CHUNK_LIST;
    result -> next_entry_idx = 0;
  }
  return result;
}
+
// Return a TRec chunk to the capability's free list (no-op unless
// REUSE_MEMORY is enabled).
static void free_stg_trec_chunk(Capability *cap,
                                StgTRecChunk *c) {
#if defined(REUSE_MEMORY)
  c -> prev_chunk = cap -> free_trec_chunks;
  cap -> free_trec_chunks = c;
#endif
}
+
// Get a transaction record, reusing one from the capability's free list
// (threaded through enclosing_trec) when possible.  A reused record keeps
// its current_chunk but resets it to empty, and re-derives its state from
// the enclosing transaction exactly as new_stg_trec_header does.
static StgTRecHeader *alloc_stg_trec_header(Capability *cap,
                                            StgTRecHeader *enclosing_trec) {
  StgTRecHeader *result = NULL;
  if (cap -> free_trec_headers == NO_TREC) {
    result = new_stg_trec_header(cap, enclosing_trec);
  } else {
    result = cap -> free_trec_headers;
    cap -> free_trec_headers = result -> enclosing_trec;
    result -> enclosing_trec = enclosing_trec;
    result -> current_chunk -> next_entry_idx = 0;
    if (enclosing_trec == NO_TREC) {
      result -> state = TREC_ACTIVE;
    } else {
      ASSERT(enclosing_trec -> state == TREC_ACTIVE ||
             enclosing_trec -> state == TREC_CONDEMNED);
      result -> state = enclosing_trec -> state;
    }
  }
  return result;
}
+
// Return a transaction record to the capability's free list (no-op unless
// REUSE_MEMORY).  All chunks except the current one are freed individually;
// the record keeps its (now unlinked) current chunk so a later reuse via
// alloc_stg_trec_header starts with one empty chunk already attached.
static void free_stg_trec_header(Capability *cap,
                                 StgTRecHeader *trec) {
#if defined(REUSE_MEMORY)
  StgTRecChunk *chunk = trec -> current_chunk -> prev_chunk;
  while (chunk != END_STM_CHUNK_LIST) {
    StgTRecChunk *prev_chunk = chunk -> prev_chunk;
    free_stg_trec_chunk(cap, chunk);
    chunk = prev_chunk;
  }
  trec -> current_chunk -> prev_chunk = END_STM_CHUNK_LIST;
  trec -> enclosing_trec = cap -> free_trec_headers;
  cap -> free_trec_headers = trec;
#endif
}
+
+/*......................................................................*/
+
+// Helper functions for managing waiting lists
+
// Add "tso" to the wait queue of every TVar read by this (top-level,
// active) transaction, pushing each new entry at the head of the TVar's
// doubly-linked queue.  The queue entry is stashed in the TRec entry's
// new_value field so remove_wait_queue_entries_for_trec can find it later.
static void build_wait_queue_entries_for_trec(Capability *cap,
                                              StgTSO *tso,
                                              StgTRecHeader *trec) {
  ASSERT(trec != NO_TREC);
  ASSERT(trec -> enclosing_trec == NO_TREC);
  ASSERT(trec -> state == TREC_ACTIVE);

  TRACE("%p : build_wait_queue_entries_for_trec()\n", trec);

  FOR_EACH_ENTRY(trec, e, {
    StgTVar *s;
    StgTVarWaitQueue *q;
    StgTVarWaitQueue *fq;
    s = e -> tvar;
    TRACE("%p : adding tso=%p to wait queue for tvar=%p\n", trec, tso, s);
    // Under fine-grained locking the caller holds the TVar (current_value
    // is our trec); otherwise it must still hold the expected value.
    ACQ_ASSERT(s -> current_value == trec);
    NACQ_ASSERT(s -> current_value == e -> expected_value);
    fq = s -> first_wait_queue_entry;
    q = alloc_stg_tvar_wait_queue(cap, tso);
    q -> next_queue_entry = fq;
    q -> prev_queue_entry = END_STM_WAIT_QUEUE;
    if (fq != END_STM_WAIT_QUEUE) {
      fq -> prev_queue_entry = q;
    }
    s -> first_wait_queue_entry = q;
    // Remember our queue entry so it can be unlinked on wake-up/abort.
    e -> new_value = (StgClosure *) q;
  });
}
+
+static void remove_wait_queue_entries_for_trec(Capability *cap,
+ StgTRecHeader *trec) {
+ ASSERT(trec != NO_TREC);
+ ASSERT(trec -> enclosing_trec == NO_TREC);
+ ASSERT(trec -> state == TREC_WAITING ||
+ trec -> state == TREC_CONDEMNED);
+
+ TRACE("%p : remove_wait_queue_entries_for_trec()\n", trec);
+
+ FOR_EACH_ENTRY(trec, e, {
+ StgTVar *s;
+ StgTVarWaitQueue *pq;
+ StgTVarWaitQueue *nq;
+ StgTVarWaitQueue *q;
+ s = e -> tvar;
+ StgClosure *saw = lock_tvar(trec, s);
+ q = (StgTVarWaitQueue *) (e -> new_value);
+ TRACE("%p : removing tso=%p from wait queue for tvar=%p\n", trec, q -> waiting_tso, s);
+ ACQ_ASSERT(s -> current_value == trec);
+ nq = q -> next_queue_entry;
+ pq = q -> prev_queue_entry;
+ if (nq != END_STM_WAIT_QUEUE) {
+ nq -> prev_queue_entry = pq;
+ }
+ if (pq != END_STM_WAIT_QUEUE) {
+ pq -> next_queue_entry = nq;
+ } else {
+ ASSERT (s -> first_wait_queue_entry == q);
+ s -> first_wait_queue_entry = nq;
+ }
+ free_stg_tvar_wait_queue(cap, q);
+ unlock_tvar(trec, s, saw, FALSE);
+ });
+}
+
+/*......................................................................*/
+
// Reserve the next free TRec entry in transaction "t", allocating and
// linking a fresh chunk when the current one is full.
static TRecEntry *get_new_entry(Capability *cap,
                                StgTRecHeader *t) {
  TRecEntry *result;
  StgTRecChunk *c;
  int i;

  c = t -> current_chunk;
  i = c -> next_entry_idx;
  ASSERT(c != END_STM_CHUNK_LIST);

  if (i < TREC_CHUNK_NUM_ENTRIES) {
    // Continue to use current chunk
    result = &(c -> entries[i]);
    c -> next_entry_idx ++;
  } else {
    // Current chunk is full: allocate a fresh one
    StgTRecChunk *nc;
    nc = alloc_stg_trec_chunk(cap);
    nc -> prev_chunk = c;
    // Entry 0 of the new chunk is handed out below, so the next free
    // slot is index 1.
    nc -> next_entry_idx = 1;
    t -> current_chunk = nc;
    result = &(nc -> entries[0]);
  }

  return result;
}
+
+/*......................................................................*/
+
// Merge an (expected_value -> new_value) update on "tvar" into transaction
// record "t" — used when committing a nested transaction into its parent.
// If the parent already has an entry for the TVar the new value is folded
// into it; conflicting expected values condemn the parent.  Otherwise a
// fresh entry is appended.
static void merge_update_into(Capability *cap,
                              StgTRecHeader *t,
                              StgTVar *tvar,
                              StgClosure *expected_value,
                              StgClosure *new_value) {
  int found;

  // Look for an entry in this trec
  found = FALSE;
  FOR_EACH_ENTRY(t, e, {
    StgTVar *s;
    s = e -> tvar;
    if (s == tvar) {
      found = TRUE;
      if (e -> expected_value != expected_value) {
        // Must abort if the two entries start from different values
        TRACE("%p : entries inconsistent at %p (%p vs %p)\n",
              t, tvar, e -> expected_value, expected_value);
        t -> state = TREC_CONDEMNED;
      }
      e -> new_value = new_value;
      BREAK_FOR_EACH;
    }
  });

  if (!found) {
    // No entry so far in this trec
    TRecEntry *ne;
    ne = get_new_entry(cap, t);
    ne -> tvar = tvar;
    ne -> expected_value = expected_value;
    ne -> new_value = new_value;
  }
}
+
+/*......................................................................*/
+
+static StgBool entry_is_update(TRecEntry *e) {
+ StgBool result;
+ result = (e -> expected_value != e -> new_value);
+ return result;
+}
+
+#if defined(STM_FG_LOCKS)
// TRUE iff this entry only read its TVar (new value equals expected value).
// Complement of entry_is_update; used only under fine-grained locking.
static StgBool entry_is_read_only(TRecEntry *e) {
  StgBool result;
  result = (e -> expected_value == e -> new_value);
  return result;
}
+
// TRUE iff TVar "s" is currently locked by transaction "h" (under
// fine-grained locking a TVar is locked by storing the holder's trec
// in its current_value field).
static StgBool tvar_is_locked(StgTVar *s, StgTRecHeader *h) {
  StgClosure *c;
  StgBool result;
  c = s -> current_value;
  result = (c == (StgClosure *) h);
  return result;
}
+#endif
+
+// revert_ownership : release a lock on a TVar, storing back
+// the value that it held when the lock was acquired. "revert_all"
+// is set in stmWait and stmReWait when we acquired locks on all of
+// the TVars involved. "revert_all" is not set in commit operations
+// where we don't lock TVars that have been read from but not updated.
+
// Release every TVar lock held by "trec", writing back the value each TVar
// held when the lock was taken (e->expected_value).  With revert_all we
// release locks on all entries (wait paths lock everything); otherwise
// only update entries were locked.  Only fine-grained locking actually
// holds per-TVar locks, hence the #if.
static void revert_ownership(StgTRecHeader *trec STG_UNUSED,
                             StgBool revert_all STG_UNUSED) {
#if defined(STM_FG_LOCKS)
  FOR_EACH_ENTRY(trec, e, {
    if (revert_all || entry_is_update(e)) {
      StgTVar *s;
      s = e -> tvar;
      if (tvar_is_locked(s, trec)) {
        unlock_tvar(trec, s, e -> expected_value, TRUE);
      }
    }
  });
#endif
}
+
+/*......................................................................*/
+
+// validate_and_acquire_ownership : this performs the twin functions
+// of checking that the TVars referred to by entries in trec hold the
+// expected values and:
+//
+// - locking the TVar (on updated TVars during commit, or all TVars
+// during wait)
+//
+// - recording the identity of the TRec who wrote the value seen in the
+// TVar (on non-updated TVars during commit). These values are
+// stashed in the TRec entries and are then checked in check_read_only
+// to ensure that an atomic snapshot of all of these locations has been
+// seen.
+
// Check that every TVar in "trec" still holds its expected value, locking
// the TVars as we go: all of them when acquire_all is set (wait paths), or
// just the updated ones during commit.  Read-only entries (fine-grained
// locking only) are not locked; instead their version number is recorded
// for the later check_read_only pass — note the deliberate
// check / record-version / re-check sequence, which closes the race where
// the TVar is updated between reading its value and its version.
// On failure, or when retain_ownership is false, any locks taken are
// reverted before returning.  Returns TRUE iff validation succeeded.
static StgBool validate_and_acquire_ownership (StgTRecHeader *trec,
                                               int acquire_all,
                                               int retain_ownership) {
  StgBool result;

  if (shake()) {
    // Debug aid: randomly fail validation to exercise the retry paths.
    TRACE("%p : shake, pretending trec is invalid when it may not be\n", trec);
    return FALSE;
  }

  ASSERT ((trec -> state == TREC_ACTIVE) ||
          (trec -> state == TREC_WAITING) ||
          (trec -> state == TREC_CONDEMNED));
  result = !((trec -> state) == TREC_CONDEMNED);
  if (result) {
    FOR_EACH_ENTRY(trec, e, {
      StgTVar *s;
      s = e -> tvar;
      if (acquire_all || entry_is_update(e)) {
        TRACE("%p : trying to acquire %p\n", trec, s);
        if (!cond_lock_tvar(trec, s, e -> expected_value)) {
          TRACE("%p : failed to acquire %p\n", trec, s);
          result = FALSE;
          BREAK_FOR_EACH;
        }
      } else {
        // Read-only entry: only possible when a read phase is in use.
        ASSERT(use_read_phase);
        IF_STM_FG_LOCKS({
          TRACE("%p : will need to check %p\n", trec, s);
          if (s -> current_value != e -> expected_value) {
            TRACE("%p : doesn't match\n", trec);
            result = FALSE;
            BREAK_FOR_EACH;
          }
          e -> num_updates = s -> num_updates;
          // Re-check after sampling the version to catch an update that
          // raced between the two reads above.
          if (s -> current_value != e -> expected_value) {
            TRACE("%p : doesn't match (race)\n", trec);
            result = FALSE;
            BREAK_FOR_EACH;
          } else {
            TRACE("%p : need to check version %d\n", trec, e -> num_updates);
          }
        });
      }
    });
  }

  if ((!result) || (!retain_ownership)) {
    revert_ownership(trec, acquire_all);
  }

  return result;
}
+
+// check_read_only : check that we've seen an atomic snapshot of the
+// non-updated TVars accessed by a trec. This checks that the last TRec to
+// commit an update to the TVar is unchanged since the value was stashed in
+// validate_and_acquire_ownership. If no update is seen to any TVar then
+// all of them contained their expected values at the start of the call to
+// check_read_only.
+//
+// The paper "Concurrent programming without locks" (under submission), or
+// Keir Fraser's PhD dissertation "Practical lock-free programming" discuss
+// this kind of algorithm.
+
// Second half of the read phase: verify that no TVar read (but not
// updated) by "trec" has been committed to since its version number was
// recorded in validate_and_acquire_ownership.  Returns TRUE iff all
// read-only entries are unchanged, i.e. the reads form an atomic snapshot.
static StgBool check_read_only(StgTRecHeader *trec STG_UNUSED) {
  StgBool result = TRUE;

  ASSERT (use_read_phase);
  IF_STM_FG_LOCKS({
    FOR_EACH_ENTRY(trec, e, {
      StgTVar *s;
      s = e -> tvar;
      if (entry_is_read_only(e)) {
        TRACE("%p : check_read_only for TVar %p, saw %d\n", trec, s, e -> num_updates);
        if (s -> num_updates != e -> num_updates) {
          // ||s -> current_value != e -> expected_value) {
          TRACE("%p : mismatch\n", trec);
          result = FALSE;
          BREAK_FOR_EACH;
        }
      }
    });
  });

  return result;
}
+
+
+/************************************************************************/
+
+void stmPreGCHook() {
+ nat i;
+
+ lock_stm(NO_TREC);
+ TRACE("stmPreGCHook\n");
+ for (i = 0; i < n_capabilities; i ++) {
+ Capability *cap = &capabilities[i];
+ cap -> free_tvar_wait_queues = END_STM_WAIT_QUEUE;
+ cap -> free_trec_chunks = END_STM_CHUNK_LIST;
+ cap -> free_trec_headers = NO_TREC;
+ }
+ unlock_stm(NO_TREC);
+}
+
+/************************************************************************/
+
+// check_read_only relies on version numbers held in TVars' "num_updates"
+// fields not wrapping around while a transaction is committed. The version
+// number is incremented each time an update is committed to the TVar
+// This is unlikely to wrap around when 32-bit integers are used for the counts,
+// but to ensure correctness we maintain a shared count on the maximum
+// number of commit operations that may occur and check that this has
+// not increased by more than 2^32 during a commit.
+
+#define TOKEN_BATCH_SIZE 1024
+
+static volatile StgInt64 max_commits = 0;
+
+static volatile StgBool token_locked = FALSE;
+
+#if defined(THREADED_RTS)
// Grab a batch of commit tokens for this capability, bumping the shared
// max_commits counter under a simple CAS spinlock.  max_commits bounds how
// many commits can be in flight; see the wrap-around note above.
static void getTokenBatch(Capability *cap) {
  while (cas(&token_locked, FALSE, TRUE) == TRUE) { /* nothing */ }
  max_commits += TOKEN_BATCH_SIZE;
  cap -> transaction_tokens = TOKEN_BATCH_SIZE;
  token_locked = FALSE;
}
+
// Consume one commit token, refilling from the shared counter when this
// capability's local batch is exhausted.
static void getToken(Capability *cap) {
  if (cap -> transaction_tokens == 0) {
    getTokenBatch(cap);
  }
  cap -> transaction_tokens --;
}
+#else
// Non-threaded build: commit counting is unnecessary, so this is a no-op.
static void getToken(Capability *cap STG_UNUSED) {
  // Nothing
}
+#endif
+
+/*......................................................................*/
+
// Begin a transaction nested inside "outer" (NO_TREC at top level) and
// return its fresh record.  Takes a commit token first — see the
// commentary on max_commits above.
StgTRecHeader *stmStartTransaction(Capability *cap,
                                   StgTRecHeader *outer) {
  StgTRecHeader *t;
  TRACE("%p : stmStartTransaction with %d tokens\n",
        outer,
        cap -> transaction_tokens);

  getToken(cap);

  t = alloc_stg_trec_header(cap, outer);
  TRACE("%p : stmStartTransaction()=%p\n", outer, t);
  return t;
}
+
+/*......................................................................*/
+
// Abandon a transaction: if it was waiting, first remove it from all the
// TVar wait queues; then mark it aborted and recycle its record.
void stmAbortTransaction(Capability *cap,
                         StgTRecHeader *trec) {
  TRACE("%p : stmAbortTransaction\n", trec);
  ASSERT (trec != NO_TREC);
  ASSERT ((trec -> state == TREC_ACTIVE) ||
          (trec -> state == TREC_WAITING) ||
          (trec -> state == TREC_CONDEMNED));

  lock_stm(trec);
  if (trec -> state == TREC_WAITING) {
    ASSERT (trec -> enclosing_trec == NO_TREC);
    TRACE("%p : stmAbortTransaction aborting waiting transaction\n", trec);
    remove_wait_queue_entries_for_trec(cap, trec);
  }
  trec -> state = TREC_ABORTED;
  unlock_stm(trec);

  free_stg_trec_header(cap, trec);

  TRACE("%p : stmAbortTransaction done\n", trec);
}
+
+/*......................................................................*/
+
// Mark a transaction as condemned (doomed to fail validation) — e.g. when
// an asynchronous exception is delivered.  Unlike stmAbortTransaction the
// record is kept alive; the transaction will be torn down later.
void stmCondemnTransaction(Capability *cap,
                           StgTRecHeader *trec) {
  TRACE("%p : stmCondemnTransaction\n", trec);
  ASSERT (trec != NO_TREC);
  ASSERT ((trec -> state == TREC_ACTIVE) ||
          (trec -> state == TREC_WAITING) ||
          (trec -> state == TREC_CONDEMNED));

  lock_stm(trec);
  if (trec -> state == TREC_WAITING) {
    ASSERT (trec -> enclosing_trec == NO_TREC);
    TRACE("%p : stmCondemnTransaction condemning waiting transaction\n", trec);
    remove_wait_queue_entries_for_trec(cap, trec);
  }
  trec -> state = TREC_CONDEMNED;
  unlock_stm(trec);

  TRACE("%p : stmCondemnTransaction done\n", trec);
}
+
+/*......................................................................*/
+
+StgTRecHeader *stmGetEnclosingTRec(StgTRecHeader *trec) {
+ StgTRecHeader *outer;
+ TRACE("%p : stmGetEnclosingTRec\n", trec);
+ outer = trec -> enclosing_trec;
+ TRACE("%p : stmGetEnclosingTRec()=%p\n", trec, outer);
+ return outer;
+}
+
+/*......................................................................*/
+
// Validate "trec" and every transaction enclosing it, without retaining
// any TVar locks.  An invalid nest is condemned unless the innermost
// transaction is waiting (a waiting trec is handled by the wake-up path).
// Returns TRUE iff the whole nest is still valid.
StgBool stmValidateNestOfTransactions(StgTRecHeader *trec) {
  StgTRecHeader *t;
  StgBool result;

  TRACE("%p : stmValidateNestOfTransactions\n", trec);
  ASSERT(trec != NO_TREC);
  ASSERT((trec -> state == TREC_ACTIVE) ||
         (trec -> state == TREC_WAITING) ||
         (trec -> state == TREC_CONDEMNED));

  lock_stm(trec);

  t = trec;
  result = TRUE;
  while (t != NO_TREC) {
    result &= validate_and_acquire_ownership(t, TRUE, FALSE);
    t = t -> enclosing_trec;
  }

  if (!result && trec -> state != TREC_WAITING) {
    trec -> state = TREC_CONDEMNED;
  }

  unlock_stm(trec);

  TRACE("%p : stmValidateNestOfTransactions()=%d\n", trec, result);
  return result;
}
+
+/*......................................................................*/
+
// Attempt to commit a top-level transaction.  Steps: (1) validate and lock
// the updated TVars; (2) under a read phase, confirm an atomic snapshot of
// the read-only TVars, and abort if the global commit counter could have
// wrapped the 32-bit per-TVar version numbers since we started; (3) write
// the new values back, waking any waiters.  On any failure the acquired
// locks are reverted.  The trec is recycled either way; returns TRUE iff
// the commit succeeded.
StgBool stmCommitTransaction(Capability *cap, StgTRecHeader *trec) {
  int result;
  StgInt64 max_commits_at_start = max_commits;

  TRACE("%p : stmCommitTransaction()\n", trec);
  ASSERT (trec != NO_TREC);

  lock_stm(trec);

  ASSERT (trec -> enclosing_trec == NO_TREC);
  ASSERT ((trec -> state == TREC_ACTIVE) ||
          (trec -> state == TREC_CONDEMNED));

  result = validate_and_acquire_ownership(trec, (!use_read_phase), TRUE);
  if (result) {
    // We now know that all the updated locations hold their expected values.
    ASSERT (trec -> state == TREC_ACTIVE);

    if (use_read_phase) {
      TRACE("%p : doing read check\n", trec);
      result = check_read_only(trec);
      TRACE("%p : read-check %s\n", trec, result ? "succeeded" : "failed");

      // Guard against num_updates wrap-around: if more than 2^32 commits
      // could have happened since we started, the version check above is
      // unreliable, so fail conservatively and retry.
      StgInt64 max_commits_at_end = max_commits;
      StgInt64 max_concurrent_commits;
      max_concurrent_commits = ((max_commits_at_end - max_commits_at_start) +
                                (n_capabilities * TOKEN_BATCH_SIZE));
      if (((max_concurrent_commits >> 32) > 0) || shake()) {
        result = FALSE;
      }
    }

    if (result) {
      // We now know that all of the read-only locations held their expected
      // values at the end of the call to validate_and_acquire_ownership.
      // This forms the linearization point of the commit.

      FOR_EACH_ENTRY(trec, e, {
        StgTVar *s;
        s = e -> tvar;
        if (e -> new_value != e -> expected_value) {
          // Entry is an update: write the value back to the TVar, unlocking it if
          // necessary.

          ACQ_ASSERT(tvar_is_locked(s, trec));
          TRACE("%p : writing %p to %p, waking waiters\n", trec, e -> new_value, s);
          unpark_waiters_on(cap,s);
          IF_STM_FG_LOCKS({
            s -> num_updates ++;
          });
          unlock_tvar(trec, s, e -> new_value, TRUE);
        }
        ACQ_ASSERT(!tvar_is_locked(s, trec));
      });
    } else {
      revert_ownership(trec, FALSE);
    }
  }

  unlock_stm(trec);

  free_stg_trec_header(cap, trec);

  TRACE("%p : stmCommitTransaction()=%d\n", trec, result);

  return result;
}
+
+/*......................................................................*/
+
+StgBool stmCommitNestedTransaction(Capability *cap, StgTRecHeader *trec) {
+ StgTRecHeader *et;
+ int result;
+ ASSERT (trec != NO_TREC && trec -> enclosing_trec != NO_TREC);
+ TRACE("%p : stmCommitNestedTransaction() into %p\n", trec, trec -> enclosing_trec);
+ ASSERT ((trec -> state == TREC_ACTIVE) || (trec -> state == TREC_CONDEMNED));
+
+ lock_stm(trec);
+
+ et = trec -> enclosing_trec;
+ result = validate_and_acquire_ownership(trec, (!use_read_phase), TRUE);
+ if (result) {
+ // We now know that all the updated locations hold their expected values.
+
+ if (use_read_phase) {
+ TRACE("%p : doing read check\n", trec);
+ result = check_read_only(trec);
+ }
+ if (result) {
+ // We now know that all of the read-only locations held their exepcted values
+ // at the end of the call to validate_and_acquire_ownership. This forms the
+ // linearization point of the commit.
+
+ if (result) {
+ TRACE("%p : read-check succeeded\n", trec);
+ FOR_EACH_ENTRY(trec, e, {
+ // Merge each entry into the enclosing transaction record, release all
+ // locks.
+
+ StgTVar *s;
+ s = e -> tvar;
+ if (entry_is_update(e)) {
+ unlock_tvar(trec, s, e -> expected_value, FALSE);
+ }
+ merge_update_into(cap, et, s, e -> expected_value, e -> new_value);
+ ACQ_ASSERT(s -> current_value != trec);
+ });
+ } else {
+ revert_ownership(trec, FALSE);
+ }
+ }
+ }
+
+ unlock_stm(trec);
+
+ free_stg_trec_header(cap, trec);
+
+ TRACE("%p : stmCommitNestedTransaction()=%d\n", trec, result);
+
+ return result;
+}
+
+/*......................................................................*/
+
// Begin waiting (the "retry" primitive): validate the transaction while
// acquiring locks on ALL of its TVars, enqueue the thread on each TVar's
// wait queue, park it, and mark the trec TREC_WAITING.  The locks are NOT
// released here — stmWaitUnlock does that once the thread is safely
// asleep.  Returns FALSE (recycling the trec) if validation failed and the
// transaction must simply be retried.
StgBool stmWait(Capability *cap, StgTSO *tso, StgTRecHeader *trec) {
  int result;
  TRACE("%p : stmWait(%p)\n", trec, tso);
  ASSERT (trec != NO_TREC);
  ASSERT (trec -> enclosing_trec == NO_TREC);
  ASSERT ((trec -> state == TREC_ACTIVE) ||
          (trec -> state == TREC_CONDEMNED));

  lock_stm(trec);
  result = validate_and_acquire_ownership(trec, TRUE, TRUE);
  if (result) {
    // The transaction is valid so far so we can actually start waiting.
    // (Otherwise the transaction was not valid and the thread will have to
    // retry it).

    // Put ourselves to sleep.  We retain locks on all the TVars involved
    // until we are sound asleep : (a) on the wait queues, (b) BlockedOnSTM
    // in the TSO, (c) TREC_WAITING in the Trec.
    build_wait_queue_entries_for_trec(cap, tso, trec);
    park_tso(tso);
    trec -> state = TREC_WAITING;

    // We haven't released ownership of the transaction yet.  The TSO
    // has been put on the wait queue for the TVars it is waiting for,
    // but we haven't yet tidied up the TSO's stack and made it safe
    // to wake up the TSO.  Therefore, we must wait until the TSO is
    // safe to wake up before we release ownership - when all is well,
    // the runtime will call stmWaitUnlock() below, with the same
    // TRec.

  } else {
    unlock_stm(trec);
    free_stg_trec_header(cap, trec);
  }

  TRACE("%p : stmWait(%p)=%d\n", trec, tso, result);
  return result;
}
+
+
// Second half of stmWait: called once the parked TSO is safe to wake,
// releasing all the TVar locks stmWait retained and then the STM lock.
void
stmWaitUnlock(Capability *cap STG_UNUSED, StgTRecHeader *trec) {
  revert_ownership(trec, TRUE);
  unlock_stm(trec);
}
+
+/*......................................................................*/
+
// Called when a waiting thread is woken: re-validate its transaction.  If
// still valid the wake-up was spurious — the thread stays on the wait
// queues and is parked again (TRUE).  Otherwise the transaction must be
// re-run: unhook it from the wait queues (unless already condemned, in
// which case that was done at condemn time) and recycle the trec (FALSE).
StgBool stmReWait(Capability *cap, StgTSO *tso) {
  int result;
  StgTRecHeader *trec = tso->trec;

  TRACE("%p : stmReWait\n", trec);
  ASSERT (trec != NO_TREC);
  ASSERT (trec -> enclosing_trec == NO_TREC);
  ASSERT ((trec -> state == TREC_WAITING) ||
          (trec -> state == TREC_CONDEMNED));

  lock_stm(trec);
  result = validate_and_acquire_ownership(trec, TRUE, TRUE);
  TRACE("%p : validation %s\n", trec, result ? "succeeded" : "failed");
  if (result) {
    // The transaction remains valid -- do nothing because it is already on
    // the wait queues
    ASSERT (trec -> state == TREC_WAITING);
    park_tso(tso);
    revert_ownership(trec, TRUE);
  } else {
    // The transaction has become invalid.  We can now remove it from the wait
    // queues.
    if (trec -> state != TREC_CONDEMNED) {
      remove_wait_queue_entries_for_trec (cap, trec);
    }
    free_stg_trec_header(cap, trec);
  }
  unlock_stm(trec);

  TRACE("%p : stmReWait()=%d\n", trec, result);
  return result;
}
+
+/*......................................................................*/
+
// Search "trec" and each enclosing transaction for an entry on "tvar".
// Returns the entry (or NULL) and, via "in" when non-NULL, the trec the
// entry was found in — callers use that to distinguish their own entries
// from a parent's.
static TRecEntry *get_entry_for(StgTRecHeader *trec, StgTVar *tvar, StgTRecHeader **in) {
  TRecEntry *result = NULL;

  TRACE("%p : get_entry_for TVar %p\n", trec, tvar);
  ASSERT(trec != NO_TREC);

  do {
    FOR_EACH_ENTRY(trec, e, {
      if (e -> tvar == tvar) {
        result = e;
        if (in != NULL) {
          *in = trec;
        }
        BREAK_FOR_EACH;
      }
    });
    trec = trec -> enclosing_trec;
  } while (result == NULL && trec != NO_TREC);

  return result;
}
+
// Read a TVar's current value.  Under fine-grained locking a locked TVar
// holds the owning trec in current_value, so spin until we see a real
// closure rather than a TREC_HEADER.
static StgClosure *read_current_value(StgTRecHeader *trec STG_UNUSED, StgTVar *tvar) {
  StgClosure *result;
  result = tvar -> current_value;

#if defined(STM_FG_LOCKS)
  while (GET_INFO(result) == &stg_TREC_HEADER_info) {
    // TVar is locked by some committing transaction; wait it out.
    TRACE("%p : read_current_value(%p) saw %p\n", trec, tvar, result);
    result = tvar -> current_value;
  }
#endif

  TRACE("%p : read_current_value(%p)=%p\n", trec, tvar, result);
  return result;
}
+
+/*......................................................................*/
+
// The readTVar primitive.  Three cases: (1) our own trec already has an
// entry — return its new_value; (2) an enclosing trec has one — copy it
// into our trec so nested commits can detect conflicts, and return its
// new_value; (3) no entry anywhere — read the TVar and record a read-only
// entry (expected == new) in our trec.
StgClosure *stmReadTVar(Capability *cap,
                        StgTRecHeader *trec,
                        StgTVar *tvar) {
  StgTRecHeader *entry_in;
  StgClosure *result = NULL;
  TRecEntry *entry = NULL;
  TRACE("%p : stmReadTVar(%p)\n", trec, tvar);
  ASSERT (trec != NO_TREC);
  ASSERT (trec -> state == TREC_ACTIVE ||
          trec -> state == TREC_CONDEMNED);

  entry = get_entry_for(trec, tvar, &entry_in);

  if (entry != NULL) {
    if (entry_in == trec) {
      // Entry found in our trec
      result = entry -> new_value;
    } else {
      // Entry found in another trec
      TRecEntry *new_entry = get_new_entry(cap, trec);
      new_entry -> tvar = tvar;
      new_entry -> expected_value = entry -> expected_value;
      new_entry -> new_value = entry -> new_value;
      result = new_entry -> new_value;
    }
  } else {
    // No entry found
    StgClosure *current_value = read_current_value(trec, tvar);
    TRecEntry *new_entry = get_new_entry(cap, trec);
    new_entry -> tvar = tvar;
    new_entry -> expected_value = current_value;
    new_entry -> new_value = current_value;
    result = current_value;
  }

  TRACE("%p : stmReadTVar(%p)=%p\n", trec, tvar, result);
  return result;
}
+
+/*......................................................................*/
+
// The writeTVar primitive.  Mirrors stmReadTVar's three cases, but stores
// "new_value" into the entry: update our own entry in place, or copy a
// parent's entry (keeping its expected_value) into our trec, or create a
// fresh entry whose expected_value is the TVar's current value.
void stmWriteTVar(Capability *cap,
                  StgTRecHeader *trec,
                  StgTVar *tvar,
                  StgClosure *new_value) {

  StgTRecHeader *entry_in;
  TRecEntry *entry = NULL;
  TRACE("%p : stmWriteTVar(%p, %p)\n", trec, tvar, new_value);
  ASSERT (trec != NO_TREC);
  ASSERT (trec -> state == TREC_ACTIVE ||
          trec -> state == TREC_CONDEMNED);

  entry = get_entry_for(trec, tvar, &entry_in);

  if (entry != NULL) {
    if (entry_in == trec) {
      // Entry found in our trec
      entry -> new_value = new_value;
    } else {
      // Entry found in another trec
      TRecEntry *new_entry = get_new_entry(cap, trec);
      new_entry -> tvar = tvar;
      new_entry -> expected_value = entry -> expected_value;
      new_entry -> new_value = new_value;
    }
  } else {
    // No entry found
    StgClosure *current_value = read_current_value(trec, tvar);
    TRecEntry *new_entry = get_new_entry(cap, trec);
    new_entry -> tvar = tvar;
    new_entry -> expected_value = current_value;
    new_entry -> new_value = new_value;
  }

  TRACE("%p : stmWriteTVar done\n", trec);
}
+
+/*......................................................................*/
+
// Allocate a fresh TVar holding "new_value", with an empty wait queue.
// The version counter only exists (and is only needed) in threaded builds.
StgTVar *stmNewTVar(Capability *cap,
                    StgClosure *new_value) {
  StgTVar *result;
  result = (StgTVar *)allocateLocal(cap, sizeofW(StgTVar));
  SET_HDR (result, &stg_TVAR_info, CCS_SYSTEM);
  result -> current_value = new_value;
  result -> first_wait_queue_entry = END_STM_WAIT_QUEUE;
#if defined(THREADED_RTS)
  result -> num_updates = 0;
#endif
  return result;
}
+
+/*......................................................................*/
diff --git a/rts/Sanity.c b/rts/Sanity.c
new file mode 100644
index 0000000000..0e68a86ba7
--- /dev/null
+++ b/rts/Sanity.c
@@ -0,0 +1,948 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2006
+ *
+ * Sanity checking code for the heap and stack.
+ *
+ * Used when debugging: check that everything reasonable.
+ *
+ * - All things that are supposed to be pointers look like pointers.
+ *
+ * - Objects in text space are marked as static closures, those
+ * in the heap are dynamic.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+
+#ifdef DEBUG /* whole file */
+
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "BlockAlloc.h"
+#include "Sanity.h"
+#include "MBlock.h"
+#include "Storage.h"
+#include "Schedule.h"
+#include "Apply.h"
+
+/* -----------------------------------------------------------------------------
+ Forward decls.
+ -------------------------------------------------------------------------- */
+
+static void checkSmallBitmap ( StgPtr payload, StgWord bitmap, nat );
+static void checkLargeBitmap ( StgPtr payload, StgLargeBitmap*, nat );
+static void checkClosureShallow ( StgClosure * );
+
+/* -----------------------------------------------------------------------------
+ Check stack sanity
+ -------------------------------------------------------------------------- */
+
+/* Walk 'size' payload words under a small (single-word) bitmap.
+ * A clear bit marks a pointer word, which is shallow-checked as a
+ * closure; words with the bit set are non-pointers and are skipped.
+ *
+ * Fix: the original declared and assigned a local StgPtr p that was
+ * never read (dead store); it has been removed.
+ */
+static void
+checkSmallBitmap( StgPtr payload, StgWord bitmap, nat size )
+{
+    nat i;
+
+    for(i = 0; i < size; i++, bitmap >>= 1 ) {
+        if ((bitmap & 1) == 0) {
+            checkClosureShallow((StgClosure *)payload[i]);
+        }
+    }
+}
+
+/* As checkSmallBitmap, but for bitmaps wider than one word: the bits
+ * live in large_bitmap->bitmap[] and are consumed one word at a time.
+ * A clear bit marks a pointer word to be shallow-checked. */
+static void
+checkLargeBitmap( StgPtr payload, StgLargeBitmap* large_bitmap, nat size )
+{
+    StgWord bmp;
+    nat i, j;
+
+    i = 0;
+    for (bmp=0; i < size; bmp++) {
+        StgWord bitmap = large_bitmap->bitmap[bmp];
+        j = 0;
+        // scan at most one word's worth of bits per outer iteration
+        for(; i < size && j < BITS_IN(W_); j++, i++, bitmap >>= 1 ) {
+            if ((bitmap & 1) == 0) {
+                checkClosureShallow((StgClosure *)payload[i]);
+            }
+        }
+    }
+}
+
+/*
+ * Validate that p looks like a closure without descending into its
+ * payload.  Keeping this check shallow breaks the recursion that
+ * would otherwise arise between checking PAPs and checking stack
+ * chunks.
+ */
+static void
+checkClosureShallow( StgClosure* p )
+{
+    ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
+
+    // dynamic closures live in the heap; static ones outside it
+    if (HEAP_ALLOCED(p)) {
+        ASSERT(!closure_STATIC(p));
+    } else {
+        ASSERT(closure_STATIC(p));
+    }
+}
+
+// Check an individual stack object and return its size in words, so
+// the caller can step over it to the next frame.
+StgOffset
+checkStackFrame( StgPtr c )
+{
+    nat size;
+    const StgRetInfoTable* info;
+
+    info = get_ret_itbl((StgClosure *)c);
+
+    /* All activation records have 'bitmap' style layout info. */
+    switch (info->i.type) {
+    case RET_DYN: /* Dynamic bitmap: the mask is stored on the stack */
+    {
+        StgWord dyn;
+        StgPtr p;
+        StgRetDyn* r;
+
+        r = (StgRetDyn *)c;
+        dyn = r->liveness;
+
+        p = (P_)(r->payload);
+        checkSmallBitmap(p,RET_DYN_LIVENESS(r->liveness),RET_DYN_BITMAP_SIZE);
+        p += RET_DYN_BITMAP_SIZE + RET_DYN_NONPTR_REGS_SIZE;
+
+        // skip over the non-pointers
+        p += RET_DYN_NONPTRS(dyn);
+
+        // follow the ptr words
+        for (size = RET_DYN_PTRS(dyn); size > 0; size--) {
+            checkClosureShallow((StgClosure *)*p);
+            p++;
+        }
+
+        return sizeofW(StgRetDyn) + RET_DYN_BITMAP_SIZE +
+            RET_DYN_NONPTR_REGS_SIZE +
+            RET_DYN_NONPTRS(dyn) + RET_DYN_PTRS(dyn);
+    }
+
+    case UPDATE_FRAME:
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(((StgUpdateFrame*)c)->updatee));
+        /* fallthrough: after checking the updatee, an update frame is
+         * handled like the other small-bitmap frames below */
+    case ATOMICALLY_FRAME:
+    case CATCH_RETRY_FRAME:
+    case CATCH_STM_FRAME:
+    case CATCH_FRAME:
+      // small bitmap cases (<= 32 entries)
+    case STOP_FRAME:
+    case RET_SMALL:
+    case RET_VEC_SMALL:
+        size = BITMAP_SIZE(info->i.layout.bitmap);
+        checkSmallBitmap((StgPtr)c + 1,
+                         BITMAP_BITS(info->i.layout.bitmap), size);
+        return 1 + size;
+
+    case RET_BCO: {
+        StgBCO *bco;
+        nat size;  // NB: deliberately shadows the outer 'size'
+        // frame layout: return address, then the BCO, then the args
+        bco = (StgBCO *)*(c+1);
+        size = BCO_BITMAP_SIZE(bco);
+        checkLargeBitmap((StgPtr)c + 2, BCO_BITMAP(bco), size);
+        return 2 + size;
+    }
+
+    case RET_BIG: // large bitmap (> 32 entries)
+    case RET_VEC_BIG:
+        size = GET_LARGE_BITMAP(&info->i)->size;
+        checkLargeBitmap((StgPtr)c + 1, GET_LARGE_BITMAP(&info->i), size);
+        return 1 + size;
+
+    case RET_FUN:
+    {
+        StgFunInfoTable *fun_info;
+        StgRetFun *ret_fun;
+
+        ret_fun = (StgRetFun *)c;
+        fun_info = get_fun_itbl(ret_fun->fun);
+        size = ret_fun->size;
+        // which bitmap applies depends on the function's calling
+        // convention (fun_type)
+        switch (fun_info->f.fun_type) {
+        case ARG_GEN:
+            checkSmallBitmap((StgPtr)ret_fun->payload,
+                             BITMAP_BITS(fun_info->f.b.bitmap), size);
+            break;
+        case ARG_GEN_BIG:
+            checkLargeBitmap((StgPtr)ret_fun->payload,
+                             GET_FUN_LARGE_BITMAP(fun_info), size);
+            break;
+        default:
+            // standard argument pattern: bitmap comes from the
+            // stg_arg_bitmaps table
+            checkSmallBitmap((StgPtr)ret_fun->payload,
+                             BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]),
+                             size);
+            break;
+        }
+        return sizeofW(StgRetFun) + size;
+    }
+
+    default:
+        barf("checkStackFrame: weird activation record found on stack (%p %d).",c,info->i.type);
+    }
+}
+
+// Walk and sanity-check every stack frame in the half-open range
+// [sp, stack_end) — e.g. the section of stack between update frames.
+void
+checkStackChunk( StgPtr sp, StgPtr stack_end )
+{
+    StgPtr frame = sp;
+
+    while (frame < stack_end) {
+        frame += checkStackFrame( frame );
+    }
+    // ASSERT( frame == stack_end ); -- HWL
+}
+
+/* Check the payload of a PAP or AP closure against its function's
+ * argument bitmap; fun itself must look like a closure.  The bitmap
+ * used depends on the function's calling convention (fun_type).
+ *
+ * Fix: the original declared and assigned a local StgClosure *p that
+ * was never read (dead store); it has been removed.
+ */
+static void
+checkPAP (StgClosure *fun, StgClosure** payload, StgWord n_args)
+{
+    StgFunInfoTable *fun_info;
+
+    ASSERT(LOOKS_LIKE_CLOSURE_PTR(fun));
+    fun_info = get_fun_itbl(fun);
+
+    switch (fun_info->f.fun_type) {
+    case ARG_GEN:
+        // bitmap stored directly in the function's info table
+        checkSmallBitmap( (StgPtr)payload,
+                          BITMAP_BITS(fun_info->f.b.bitmap), n_args );
+        break;
+    case ARG_GEN_BIG:
+        checkLargeBitmap( (StgPtr)payload,
+                          GET_FUN_LARGE_BITMAP(fun_info),
+                          n_args );
+        break;
+    case ARG_BCO:
+        // byte-code objects carry their own bitmap
+        checkLargeBitmap( (StgPtr)payload,
+                          BCO_BITMAP(fun),
+                          n_args );
+        break;
+    default:
+        // standard argument pattern: bitmap from stg_arg_bitmaps
+        checkSmallBitmap( (StgPtr)payload,
+                          BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]),
+                          n_args );
+        break;
+    }
+}
+
+
+/* Sanity-check a single closure and return its size in words.
+ * Asserts that its info pointer and all payload pointers look
+ * plausible, and barfs on closure types that should never appear as
+ * heap objects here (stack frames, EVACUATED). */
+StgOffset
+checkClosure( StgClosure* p )
+{
+    const StgInfoTable *info;
+
+    ASSERT(LOOKS_LIKE_INFO_PTR(p->header.info));
+
+    /* Is it a static closure (i.e. in the data segment)? */
+    if (!HEAP_ALLOCED(p)) {
+        ASSERT(closure_STATIC(p));
+    } else {
+        ASSERT(!closure_STATIC(p));
+    }
+
+    info = get_itbl(p);
+    switch (info->type) {
+
+    case MVAR:
+    {
+        StgMVar *mvar = (StgMVar *)p;
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(mvar->head));
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(mvar->tail));
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(mvar->value));
+#if 0
+#if defined(PAR)
+        checkBQ((StgBlockingQueueElement *)mvar->head, p);
+#else
+        checkBQ(mvar->head, p);
+#endif
+#endif
+        return sizeofW(StgMVar);
+    }
+
+    case THUNK:
+    case THUNK_1_0:
+    case THUNK_0_1:
+    case THUNK_1_1:
+    case THUNK_0_2:
+    case THUNK_2_0:
+    {
+        // thunks have a distinct layout (StgThunk), hence the
+        // separate case from FUN/CONSTR below
+        nat i;
+        for (i = 0; i < info->layout.payload.ptrs; i++) {
+            ASSERT(LOOKS_LIKE_CLOSURE_PTR(((StgThunk *)p)->payload[i]));
+        }
+        return thunk_sizeW_fromITBL(info);
+    }
+
+    case FUN:
+    case FUN_1_0:
+    case FUN_0_1:
+    case FUN_1_1:
+    case FUN_0_2:
+    case FUN_2_0:
+    case CONSTR:
+    case CONSTR_1_0:
+    case CONSTR_0_1:
+    case CONSTR_1_1:
+    case CONSTR_0_2:
+    case CONSTR_2_0:
+    case IND_PERM:
+    case IND_OLDGEN:
+    case IND_OLDGEN_PERM:
+#ifdef TICKY_TICKY
+    case SE_BLACKHOLE:
+    case SE_CAF_BLACKHOLE:
+#endif
+    case BLACKHOLE:
+    case CAF_BLACKHOLE:
+    case STABLE_NAME:
+    case MUT_VAR_CLEAN:
+    case MUT_VAR_DIRTY:
+    case CONSTR_INTLIKE:
+    case CONSTR_CHARLIKE:
+    case CONSTR_STATIC:
+    case CONSTR_NOCAF_STATIC:
+    case THUNK_STATIC:
+    case FUN_STATIC:
+    {
+        // generic case: info table describes the pointer payload
+        nat i;
+        for (i = 0; i < info->layout.payload.ptrs; i++) {
+            ASSERT(LOOKS_LIKE_CLOSURE_PTR(p->payload[i]));
+        }
+        return sizeW_fromITBL(info);
+    }
+
+    case BCO: {
+        StgBCO *bco = (StgBCO *)p;
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(bco->instrs));
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(bco->literals));
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(bco->ptrs));
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(bco->itbls));
+        return bco_sizeW(bco);
+    }
+
+    case IND_STATIC: /* (1, 0) closure */
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(((StgIndStatic*)p)->indirectee));
+        return sizeW_fromITBL(info);
+
+    case WEAK:
+        /* deal with these specially - the info table isn't
+         * representative of the actual layout.
+         */
+    {   StgWeak *w = (StgWeak *)p;
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(w->key));
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(w->value));
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(w->finalizer));
+        if (w->link) {
+            ASSERT(LOOKS_LIKE_CLOSURE_PTR(w->link));
+        }
+        return sizeW_fromITBL(info);
+    }
+
+    case THUNK_SELECTOR:
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(((StgSelector *)p)->selectee));
+        return THUNK_SELECTOR_sizeW();
+
+    case IND:
+    {
+        /* we don't expect to see any of these after GC
+         * but they might appear during execution
+         */
+        StgInd *ind = (StgInd *)p;
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(ind->indirectee));
+        return sizeofW(StgInd);
+    }
+
+    case RET_BCO:
+    case RET_SMALL:
+    case RET_VEC_SMALL:
+    case RET_BIG:
+    case RET_VEC_BIG:
+    case RET_DYN:
+    case UPDATE_FRAME:
+    case STOP_FRAME:
+    case CATCH_FRAME:
+    case ATOMICALLY_FRAME:
+    case CATCH_RETRY_FRAME:
+    case CATCH_STM_FRAME:
+        // stack frames are checked by checkStackFrame, never here
+        barf("checkClosure: stack frame");
+
+    case AP:
+    {
+        StgAP* ap = (StgAP *)p;
+        checkPAP (ap->fun, ap->payload, ap->n_args);
+        return ap_sizeW(ap);
+    }
+
+    case PAP:
+    {
+        StgPAP* pap = (StgPAP *)p;
+        checkPAP (pap->fun, pap->payload, pap->n_args);
+        return pap_sizeW(pap);
+    }
+
+    case AP_STACK:
+    {
+        // the payload of an AP_STACK is a chunk of suspended stack
+        StgAP_STACK *ap = (StgAP_STACK *)p;
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(ap->fun));
+        checkStackChunk((StgPtr)ap->payload, (StgPtr)ap->payload + ap->size);
+        return ap_stack_sizeW(ap);
+    }
+
+    case ARR_WORDS:
+        // raw words, no pointers to check
+        return arr_words_sizeW((StgArrWords *)p);
+
+    case MUT_ARR_PTRS_CLEAN:
+    case MUT_ARR_PTRS_DIRTY:
+    case MUT_ARR_PTRS_FROZEN:
+    case MUT_ARR_PTRS_FROZEN0:
+    {
+        StgMutArrPtrs* a = (StgMutArrPtrs *)p;
+        nat i;
+        for (i = 0; i < a->ptrs; i++) {
+            ASSERT(LOOKS_LIKE_CLOSURE_PTR(a->payload[i]));
+        }
+        return mut_arr_ptrs_sizeW(a);
+    }
+
+    case TSO:
+        checkTSO((StgTSO *)p);
+        return tso_sizeW((StgTSO *)p);
+
+#if defined(PAR)
+
+    case BLOCKED_FETCH:
+        ASSERT(LOOKS_LIKE_GA(&(((StgBlockedFetch *)p)->ga)));
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR((((StgBlockedFetch *)p)->node)));
+        return sizeofW(StgBlockedFetch);  // see size used in evacuate()
+
+#ifdef DIST
+    case REMOTE_REF:
+        return sizeofW(StgFetchMe);
+#endif /*DIST */
+
+    case FETCH_ME:
+        ASSERT(LOOKS_LIKE_GA(((StgFetchMe *)p)->ga));
+        return sizeofW(StgFetchMe);  // see size used in evacuate()
+
+    case FETCH_ME_BQ:
+        checkBQ(((StgFetchMeBlockingQueue *)p)->blocking_queue, (StgClosure *)p);
+        return sizeofW(StgFetchMeBlockingQueue);  // see size used in evacuate()
+
+    case RBH:
+        /* In an RBH the BQ may be empty (ie END_BQ_QUEUE) but not NULL */
+        ASSERT(((StgRBH *)p)->blocking_queue!=NULL);
+        if (((StgRBH *)p)->blocking_queue!=END_BQ_QUEUE)
+            checkBQ(((StgRBH *)p)->blocking_queue, p);
+        ASSERT(LOOKS_LIKE_INFO_PTR(REVERT_INFOPTR(get_itbl((StgClosure *)p))));
+        return BLACKHOLE_sizeW();   // see size used in evacuate()
+        // sizeW_fromITBL(REVERT_INFOPTR(get_itbl((StgClosure *)p)));
+
+#endif
+
+    case TVAR_WAIT_QUEUE:
+    {
+        StgTVarWaitQueue *wq = (StgTVarWaitQueue *)p;
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(wq->next_queue_entry));
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(wq->prev_queue_entry));
+        return sizeofW(StgTVarWaitQueue);
+    }
+
+    case TVAR:
+    {
+        StgTVar *tv = (StgTVar *)p;
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(tv->current_value));
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(tv->first_wait_queue_entry));
+        return sizeofW(StgTVar);
+    }
+
+    case TREC_CHUNK:
+    {
+        // only entries up to next_entry_idx are initialised
+        nat i;
+        StgTRecChunk *tc = (StgTRecChunk *)p;
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(tc->prev_chunk));
+        for (i = 0; i < tc -> next_entry_idx; i ++) {
+            ASSERT(LOOKS_LIKE_CLOSURE_PTR(tc->entries[i].tvar));
+            ASSERT(LOOKS_LIKE_CLOSURE_PTR(tc->entries[i].expected_value));
+            ASSERT(LOOKS_LIKE_CLOSURE_PTR(tc->entries[i].new_value));
+        }
+        return sizeofW(StgTRecChunk);
+    }
+
+    case TREC_HEADER:
+    {
+        StgTRecHeader *trec = (StgTRecHeader *)p;
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(trec -> enclosing_trec));
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(trec -> current_chunk));
+        return sizeofW(StgTRecHeader);
+    }
+
+
+    case EVACUATED:
+        // forwarding pointers should all have been followed by now
+        barf("checkClosure: found EVACUATED closure %d",
+             info->type);
+    default:
+        barf("checkClosure (closure type %d)", info->type);
+    }
+}
+
+#if defined(PAR)
+
+/* Plausibility checks for PVM task ids, slot numbers and global
+ * addresses (GUM only). */
+
+#define PVM_PE_MASK    0xfffc0000
+#define MAX_PVM_PES    MAX_PES
+#define MAX_PVM_TIDS   MAX_PES
+#define MAX_SLOTS      100000
+
+rtsBool
+looks_like_tid(StgInt tid)
+{
+  // split a PVM tid into PE number (hi) and per-PE task id (lo)
+  StgInt hi = (tid & PVM_PE_MASK) >> 18;
+  StgInt lo = (tid & ~PVM_PE_MASK);
+  // NOTE(review): hi is bounded by MAX_PVM_TIDS rather than
+  // MAX_PVM_PES; the two are defined equal here so this is harmless,
+  // but confirm which limit is intended.
+  rtsBool ok = (hi != 0) && (lo < MAX_PVM_TIDS) && (hi < MAX_PVM_TIDS);
+  return ok;
+}
+
+rtsBool
+looks_like_slot(StgInt slot)
+{
+  /* if tid is known better use looks_like_ga!! */
+  rtsBool ok = slot<MAX_SLOTS;
+  // This refers only to the no. of slots on the current PE
+  // rtsBool ok = slot<=highest_slot();
+  return ok;
+}
+
+rtsBool
+looks_like_ga(globalAddr *ga)
+{
+  rtsBool is_tid = looks_like_tid((ga)->payload.gc.gtid);
+  // local GAs get the tighter per-PE bound; remote ones only the
+  // global MAX_SLOTS bound
+  rtsBool is_slot = ((ga)->payload.gc.gtid==mytid) ?
+             (ga)->payload.gc.slot<=highest_slot() :
+             (ga)->payload.gc.slot<MAX_SLOTS;
+  rtsBool ok = is_tid && is_slot;
+  return ok;
+}
+
+#endif
+
+
+/* -----------------------------------------------------------------------------
+ Check Heap Sanity
+
+ After garbage collection, the live heap is in a state where we can
+ run through and check that all the pointers point to the right
+ place. This function starts at a given position and sanity-checks
+ all the objects in the remainder of the chain.
+ -------------------------------------------------------------------------- */
+
+/* Sanity-check every closure in the chain of blocks starting at bd.
+ * A no-op under THREADED_RTS (see comment below). */
+void
+checkHeap(bdescr *bd)
+{
+    StgPtr p;
+
+#if defined(THREADED_RTS)
+    // heap sanity checking doesn't work with SMP, because we can't
+    // zero the slop (see Updates.h).
+    return;
+#endif
+
+    for (; bd != NULL; bd = bd->link) {
+        p = bd->start;
+        while (p < bd->free) {
+            nat size = checkClosure((StgClosure *)p);
+            /* This is the smallest size of closure that can live in the heap */
+            ASSERT( size >= MIN_PAYLOAD_SIZE + sizeofW(StgHeader) );
+            p += size;
+
+            /* skip over slop: slop words hold small values or other
+             * non-info-pointer garbage, never a valid info pointer */
+            while (p < bd->free &&
+                   (*p < 0x1000 || !LOOKS_LIKE_INFO_PTR((void*)*p))) { p++; }
+        }
+    }
+}
+
+#if defined(PAR)
+/*
+  Check heap between start and end. Used after unpacking graphs.
+*/
+void
+checkHeapChunk(StgPtr start, StgPtr end)
+{
+    extern globalAddr *LAGAlookup(StgClosure *addr);
+    StgPtr p;
+    nat size;
+
+    for (p=start; p<end; p+=size) {
+        ASSERT(LOOKS_LIKE_INFO_PTR((void*)*p));
+        if (get_itbl((StgClosure*)p)->type == FETCH_ME &&
+            *(p+1) == 0x0000eeee /* ie. unpack garbage (see SetGAandCommonUp) */) {
+            /* if it's a FM created during unpack and commoned up, it's not global */
+            ASSERT(LAGAlookup((StgClosure*)p)==NULL);
+            size = sizeofW(StgFetchMe);
+        } else if (get_itbl((StgClosure*)p)->type == IND) {
+            *(p+2) = 0x0000ee11; /* mark slop in IND as garbage */
+            size = sizeofW(StgInd);
+        } else {
+            size = checkClosure((StgClosure *)p);
+            /* This is the smallest size of closure that can live in the heap. */
+            ASSERT( size >= MIN_PAYLOAD_SIZE + sizeofW(StgHeader) );
+        }
+    }
+}
+#else /* !PAR */
+/* Sanity-check every closure in the half-open range [start, end). */
+void
+checkHeapChunk(StgPtr start, StgPtr end)
+{
+    StgPtr p;
+    nat size;
+
+    for (p=start; p<end; p+=size) {
+        ASSERT(LOOKS_LIKE_INFO_PTR((void*)*p));
+        size = checkClosure((StgClosure *)p);
+        /* This is the smallest size of closure that can live in the heap. */
+        ASSERT( size >= MIN_PAYLOAD_SIZE + sizeofW(StgHeader) );
+    }
+}
+#endif
+
+/* Sanity-check the first closure in each block of the chain bd
+ * (used for chains of large objects, one object per block). */
+void
+checkChain(bdescr *bd)
+{
+    bdescr *b;
+
+    for (b = bd; b != NULL; b = b->link) {
+        checkClosure((StgClosure *)b->start);
+    }
+}
+
+/* Sanity-check a TSO: stack bounds, (PAR only) blocking state, and
+ * every frame on its stack.  Relocated TSOs are followed to their new
+ * incarnation; killed TSOs are skipped entirely. */
+void
+checkTSO(StgTSO *tso)
+{
+    StgPtr sp = tso->sp;
+    StgPtr stack = tso->stack;
+    StgOffset stack_size = tso->stack_size;
+    StgPtr stack_end = stack + stack_size;
+
+    if (tso->what_next == ThreadRelocated) {
+        // this TSO has moved; check the new copy instead
+        checkTSO(tso->link);
+        return;
+    }
+
+    if (tso->what_next == ThreadKilled) {
+        /* The garbage collector doesn't bother following any pointers
+         * from dead threads, so don't check sanity here.
+         */
+        return;
+    }
+
+    ASSERT(stack <= sp && sp < stack_end);
+
+#if defined(PAR)
+    ASSERT(tso->par.magic==TSO_MAGIC);
+
+    switch (tso->why_blocked) {
+    case BlockedOnGA:
+        checkClosureShallow(tso->block_info.closure);
+        ASSERT(/* Can't be a FETCH_ME because *this* closure is on its BQ */
+               get_itbl(tso->block_info.closure)->type==FETCH_ME_BQ);
+        break;
+    case BlockedOnGA_NoSend:
+        checkClosureShallow(tso->block_info.closure);
+        ASSERT(get_itbl(tso->block_info.closure)->type==FETCH_ME_BQ);
+        break;
+    case BlockedOnBlackHole:
+        checkClosureShallow(tso->block_info.closure);
+        ASSERT(get_itbl(tso->block_info.closure)->type==BLACKHOLE ||
+               get_itbl(tso->block_info.closure)->type==RBH);
+        break;
+    case BlockedOnRead:
+    case BlockedOnWrite:
+    case BlockedOnDelay:
+#if defined(mingw32_HOST_OS)
+    case BlockedOnDoProc:
+#endif
+        /* isOnBQ(blocked_queue) */
+        break;
+    case BlockedOnException:
+        /* isOnSomeBQ(tso) */
+        ASSERT(get_itbl(tso->block_info.tso)->type==TSO);
+        break;
+    case BlockedOnMVar:
+        ASSERT(get_itbl(tso->block_info.closure)->type==MVAR);
+        break;
+    case BlockedOnSTM:
+        ASSERT(tso->block_info.closure == END_TSO_QUEUE);
+        break;
+    default:
+        /*
+          Could check other values of why_blocked but I am more
+          lazy than paranoid (bad combination) -- HWL
+        */
+    }
+
+    /* if the link field is non-nil it must point to one of these
+       three closure types */
+    ASSERT(tso->link == END_TSO_QUEUE ||
+           get_itbl(tso->link)->type == TSO ||
+           get_itbl(tso->link)->type == BLOCKED_FETCH ||
+           get_itbl(tso->link)->type == CONSTR);
+#endif
+
+    checkStackChunk(sp, stack_end);
+}
+
+#if defined(GRAN)
+/* Sanity-check every runnable TSO on every PE's run queue. */
+void
+checkTSOsSanity(void) {
+    nat i, tsos;
+    StgTSO *tso;
+
+    debugBelch("Checking sanity of all runnable TSOs:");
+
+    for (i=0, tsos=0; i<RtsFlags.GranFlags.proc; i++) {
+        for (tso=run_queue_hds[i]; tso!=END_TSO_QUEUE; tso=tso->link) {
+            debugBelch("TSO %p on PE %d ...", tso, i);
+            checkTSO(tso);
+            debugBelch("OK, ");
+            tsos++;
+        }
+    }
+
+    debugBelch(" checked %d TSOs on %d PEs; ok\n", tsos, RtsFlags.GranFlags.proc);
+}
+
+
+// still GRAN only
+
+/* Check the integrity of one PE's run queue.  All checks are ASSERTs,
+ * so success is reported simply by completing; returns rtsTrue.
+ * Fix: the function is declared rtsBool but previously fell off the
+ * end without returning a value, which is undefined behaviour if the
+ * result is used. */
+rtsBool
+checkThreadQSanity (PEs proc, rtsBool check_TSO_too)
+{
+    StgTSO *tso, *prev;
+
+    /* the NIL value for TSOs is END_TSO_QUEUE; thus, finding NULL is an error */
+    ASSERT(run_queue_hds[proc]!=NULL);
+    ASSERT(run_queue_tls[proc]!=NULL);
+    /* if either head or tail is NIL then the other one must be NIL, too */
+    ASSERT(run_queue_hds[proc]!=END_TSO_QUEUE || run_queue_tls[proc]==END_TSO_QUEUE);
+    ASSERT(run_queue_tls[proc]!=END_TSO_QUEUE || run_queue_hds[proc]==END_TSO_QUEUE);
+    for (tso=run_queue_hds[proc], prev=END_TSO_QUEUE;
+         tso!=END_TSO_QUEUE;
+         prev=tso, tso=tso->link) {
+        ASSERT((prev!=END_TSO_QUEUE || tso==run_queue_hds[proc]) &&
+               (prev==END_TSO_QUEUE || prev->link==tso));
+        if (check_TSO_too)
+            checkTSO(tso);
+    }
+    ASSERT(prev==run_queue_tls[proc]);
+    return rtsTrue;
+}
+
+/* Check the run queues of all PEs; returns rtsTrue on completion.
+ * Fix: previously fell off the end of a non-void function. */
+rtsBool
+checkThreadQsSanity (rtsBool check_TSO_too)
+{
+    PEs p;
+
+    for (p=0; p<RtsFlags.GranFlags.proc; p++)
+        checkThreadQSanity(p, check_TSO_too);
+    return rtsTrue;
+}
+#endif /* GRAN */
+
+/*
+  Walk the global list of all threads, asserting that every element
+  still looks like an evacuated TSO.  Optionally run the full TSO
+  sanity check on each one as well.
+*/
+void
+checkGlobalTSOList (rtsBool checkTSOs)
+{
+    extern StgTSO *all_threads;
+    StgTSO *t;
+
+    for (t = all_threads; t != END_TSO_QUEUE; t = t->global_link) {
+        ASSERT(LOOKS_LIKE_CLOSURE_PTR(t));
+        ASSERT(get_itbl(t)->type == TSO);
+        if (checkTSOs) {
+            checkTSO(t);
+        }
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ Check mutable list sanity.
+ -------------------------------------------------------------------------- */
+
+/* Check a generation's mutable list: every entry must either be a
+ * static closure or live in generation 'gen'. */
+void
+checkMutableList( bdescr *mut_bd, nat gen )
+{
+    bdescr *bd;
+    StgPtr q;
+
+    for (bd = mut_bd; bd != NULL; bd = bd->link) {
+        for (q = bd->start; q < bd->free; q++) {
+            StgClosure *c = (StgClosure *)*q;
+            ASSERT(!HEAP_ALLOCED(c) || Bdescr((P_)c)->gen_no == gen);
+        }
+    }
+}
+
+/*
+  Check the static objects list: sanity-check each object and follow
+  its per-type static link field to the next one.
+
+  Fix: the barf message misspelled the function name
+  ("checkStaticObjetcs").
+*/
+void
+checkStaticObjects ( StgClosure* static_objects )
+{
+    StgClosure *p = static_objects;
+    StgInfoTable *info;
+
+    while (p != END_OF_STATIC_LIST) {
+        checkClosure(p);
+        info = get_itbl(p);
+        switch (info->type) {
+        case IND_STATIC:
+        {
+            StgClosure *indirectee = ((StgIndStatic *)p)->indirectee;
+
+            ASSERT(LOOKS_LIKE_CLOSURE_PTR(indirectee));
+            ASSERT(LOOKS_LIKE_INFO_PTR(indirectee->header.info));
+            p = *IND_STATIC_LINK((StgClosure *)p);
+            break;
+        }
+
+        case THUNK_STATIC:
+            p = *THUNK_STATIC_LINK((StgClosure *)p);
+            break;
+
+        case FUN_STATIC:
+            p = *FUN_STATIC_LINK((StgClosure *)p);
+            break;
+
+        case CONSTR_STATIC:
+            p = *STATIC_LINK(info,(StgClosure *)p);
+            break;
+
+        default:
+            barf("checkStaticObjects: strange closure %p (%s)",
+                 p, info_type(p));
+        }
+    }
+}
+
+/*
+  Check the sanity of a blocking queue starting at bqe with closure being
+  the closure holding the blocking queue.
+  Note that in GUM we can have several different closure types in a
+  blocking queue
+*/
+#if defined(PAR)
+void
+checkBQ (StgBlockingQueueElement *bqe, StgClosure *closure)
+{
+    rtsBool end = rtsFalse;
+    StgInfoTable *info = get_itbl(closure);
+
+    // only these closure types carry a blocking queue in GUM
+    ASSERT(info->type == MVAR || info->type == FETCH_ME_BQ || info->type == RBH);
+
+    do {
+        switch (get_itbl(bqe)->type) {
+        case BLOCKED_FETCH:
+        case TSO:
+            checkClosure((StgClosure *)bqe);
+            bqe = bqe->link;
+            end = (bqe==END_BQ_QUEUE);
+            break;
+
+        case CONSTR:
+            // a CONSTR terminates the queue
+            checkClosure((StgClosure *)bqe);
+            end = rtsTrue;
+            break;
+
+        default:
+            barf("checkBQ: strange closure %d in blocking queue for closure %p (%s)\n",
+                 get_itbl(bqe)->type, closure, info_type(closure));
+        }
+    } while (!end);
+}
+#elif defined(GRAN)
+void
+checkBQ (StgTSO *bqe, StgClosure *closure)
+{
+    rtsBool end = rtsFalse;
+    StgInfoTable *info = get_itbl(closure);
+
+    // in GranSim only MVars carry a blocking queue here
+    ASSERT(info->type == MVAR);
+
+    do {
+        switch (get_itbl(bqe)->type) {
+        case BLOCKED_FETCH:
+        case TSO:
+            checkClosure((StgClosure *)bqe);
+            bqe = bqe->link;
+            end = (bqe==END_BQ_QUEUE);
+            break;
+
+        default:
+            barf("checkBQ: strange closure %d in blocking queue for closure %p (%s)\n",
+                 get_itbl(bqe)->type, closure, info_type(closure));
+        }
+    } while (!end);
+}
+#endif
+
+
+
+/*
+ This routine checks the sanity of the LAGA and GALA tables. They are
+ implemented as lists through one hash table, LAtoGALAtable, because entries
+ in both tables have the same structure:
+ - the LAGA table maps local addresses to global addresses; it starts
+ with liveIndirections
+ - the GALA table maps global addresses to local addresses; it starts
+ with liveRemoteGAs
+*/
+
+#if defined(PAR)
+#include "Hash.h"
+
+/* hidden in parallel/Global.c; only accessed for testing here */
+extern GALA *liveIndirections;
+extern GALA *liveRemoteGAs;
+extern HashTable *LAtoGALAtable;
+
+/* Walk the LAGA list (liveIndirections) and the GALA list
+ * (liveRemoteGAs), checking each entry's hash-table mapping, the info
+ * pointer of its local closure, and that the list has no direct
+ * self-loops; optionally run checkClosure on the local closures. */
+void
+checkLAGAtable(rtsBool check_closures)
+{
+    GALA *gala, *gala0;
+    nat n=0, m=0; // debugging
+
+    for (gala = liveIndirections; gala != NULL; gala = gala->next) {
+        n++;
+        gala0 = lookupHashTable(LAtoGALAtable, (StgWord) gala->la);
+        // a preferred entry must be the one the hash table maps to
+        ASSERT(!gala->preferred || gala == gala0);
+        ASSERT(LOOKS_LIKE_INFO_PTR(((StgClosure *)gala->la)->header.info));
+        ASSERT(gala->next!=gala); // detect direct loops
+        if ( check_closures ) {
+            checkClosure((StgClosure *)gala->la);
+        }
+    }
+
+    for (gala = liveRemoteGAs; gala != NULL; gala = gala->next) {
+        m++;
+        gala0 = lookupHashTable(LAtoGALAtable, (StgWord) gala->la);
+        ASSERT(!gala->preferred || gala == gala0);
+        ASSERT(LOOKS_LIKE_INFO_PTR(((StgClosure *)gala->la)->header.info));
+        ASSERT(gala->next!=gala); // detect direct loops
+        /*
+        if ( check_closures ) {
+            checkClosure((StgClosure *)gala->la);
+        }
+        */
+    }
+}
+#endif
+
+#endif /* DEBUG */
diff --git a/rts/Sanity.h b/rts/Sanity.h
new file mode 100644
index 0000000000..8cf3f9e52e
--- /dev/null
+++ b/rts/Sanity.h
@@ -0,0 +1,56 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-1999
+ *
+ * Prototypes for functions in Sanity.c
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef SANITY_H
+#define SANITY_H
+/* Fix: the include guard tested SANITY_H but never defined it, so the
+ * guard was ineffective against multiple inclusion.  Also removed a
+ * duplicate checkHeapChunk prototype from the PAR section (it is
+ * already declared unconditionally above). */
+
+#ifdef DEBUG
+
+# if defined(PAR)
+# define PVM_PE_MASK    0xfffc0000
+# define MAX_PVM_PES    MAX_PES
+# define MAX_PVM_TIDS   MAX_PES
+# define MAX_SLOTS      100000
+# endif
+
+/* debugging routines */
+extern void checkHeap      ( bdescr *bd );
+extern void checkHeapChunk ( StgPtr start, StgPtr end );
+extern void checkChain     ( bdescr *bd );
+extern void checkTSO       ( StgTSO* tso );
+extern void checkGlobalTSOList ( rtsBool checkTSOs );
+extern void checkStaticObjects ( StgClosure* static_objects );
+extern void checkStackChunk    ( StgPtr sp, StgPtr stack_end );
+extern StgOffset checkStackFrame ( StgPtr sp );
+extern StgOffset checkClosure    ( StgClosure* p );
+
+extern void checkMutableList ( bdescr *bd, nat gen );
+
+#if defined(GRAN)
+extern void checkTSOsSanity(void);
+extern rtsBool checkThreadQSanity (PEs proc, rtsBool check_TSO_too);
+extern rtsBool checkThreadQsSanity (rtsBool check_TSO_too);
+#endif
+
+#if defined(PAR)
+extern void checkBQ (StgBlockingQueueElement *bqe, StgClosure *closure);
+#else
+extern void checkBQ (StgTSO *bqe, StgClosure *closure);
+#endif
+
+#if defined(PAR)
+extern void checkLAGAtable(rtsBool check_closures);
+#endif
+
+/* test whether an object is already on update list */
+extern rtsBool isBlackhole( StgTSO* tso, StgClosure* p );
+
+#endif /* DEBUG */
+
+#endif /* SANITY_H */
+
diff --git a/rts/Schedule.c b/rts/Schedule.c
new file mode 100644
index 0000000000..52fd4d5df6
--- /dev/null
+++ b/rts/Schedule.c
@@ -0,0 +1,4589 @@
+/* ---------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * The scheduler and thread-related functionality
+ *
+ * --------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "SchedAPI.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "BlockAlloc.h"
+#include "OSThreads.h"
+#include "Storage.h"
+#include "StgRun.h"
+#include "Hooks.h"
+#include "Schedule.h"
+#include "StgMiscClosures.h"
+#include "Interpreter.h"
+#include "Exception.h"
+#include "Printer.h"
+#include "RtsSignals.h"
+#include "Sanity.h"
+#include "Stats.h"
+#include "STM.h"
+#include "Timer.h"
+#include "Prelude.h"
+#include "ThreadLabels.h"
+#include "LdvProfile.h"
+#include "Updates.h"
+#ifdef PROFILING
+#include "Proftimer.h"
+#include "ProfHeap.h"
+#endif
+#if defined(GRAN) || defined(PARALLEL_HASKELL)
+# include "GranSimRts.h"
+# include "GranSim.h"
+# include "ParallelRts.h"
+# include "Parallel.h"
+# include "ParallelDebug.h"
+# include "FetchMe.h"
+# include "HLC.h"
+#endif
+#include "Sparks.h"
+#include "Capability.h"
+#include "Task.h"
+#include "AwaitEvent.h"
+#if defined(mingw32_HOST_OS)
+#include "win32/IOManager.h"
+#endif
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#ifdef HAVE_ERRNO_H
+#include <errno.h>
+#endif
+
+// Turn off inlining when debugging - it obfuscates things
+#ifdef DEBUG
+# undef STATIC_INLINE
+# define STATIC_INLINE static
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Global variables
+ * -------------------------------------------------------------------------- */
+
+#if defined(GRAN)
+
+StgTSO* ActiveTSO = NULL; /* for assigning system costs; GranSim-Light only */
+/* rtsTime TimeOfNextEvent, EndOfTimeSlice; now in GranSim.c */
+
+/*
+ In GranSim we have a runnable and a blocked queue for each processor.
+ In order to minimise code changes new arrays run_queue_hds/tls
+ are created. run_queue_hd is then a short cut (macro) for
+ run_queue_hds[CurrentProc] (see GranSim.h).
+ -- HWL
+*/
+StgTSO *run_queue_hds[MAX_PROC], *run_queue_tls[MAX_PROC];
+StgTSO *blocked_queue_hds[MAX_PROC], *blocked_queue_tls[MAX_PROC];
+StgTSO *ccalling_threadss[MAX_PROC];
+/* We use the same global list of threads (all_threads) in GranSim as in
+ the std RTS (i.e. we are cheating). However, we don't use this list in
+ the GranSim specific code at the moment (so we are only potentially
+ cheating). */
+
+#else /* !GRAN */
+
+#if !defined(THREADED_RTS)
+// Blocked/sleeping threads
+StgTSO *blocked_queue_hd = NULL;
+StgTSO *blocked_queue_tl = NULL;
+StgTSO *sleeping_queue = NULL; // perhaps replace with a hash table?
+#endif
+
+/* Threads blocked on blackholes.
+ * LOCK: sched_mutex+capability, or all capabilities
+ */
+StgTSO *blackhole_queue = NULL;
+#endif
+
+/* The blackhole_queue should be checked for threads to wake up. See
+ * Schedule.h for more thorough comment.
+ * LOCK: none (doesn't matter if we miss an update)
+ */
+rtsBool blackholes_need_checking = rtsFalse;
+
+/* Linked list of all threads.
+ * Used for detecting garbage collected threads.
+ * LOCK: sched_mutex+capability, or all capabilities
+ */
+StgTSO *all_threads = NULL;
+
+/* flag set by signal handler to precipitate a context switch
+ * LOCK: none (just an advisory flag)
+ */
+int context_switch = 0;
+
+/* flag that tracks whether we have done any execution in this time slice.
+ * LOCK: currently none, perhaps we should lock (but needs to be
+ * updated in the fast path of the scheduler).
+ */
+nat recent_activity = ACTIVITY_YES;
+
+/* if this flag is set as well, give up execution
+ * LOCK: none (changes once, from false->true)
+ */
+rtsBool sched_state = SCHED_RUNNING;
+
+/* Next thread ID to allocate.
+ * LOCK: sched_mutex
+ */
+static StgThreadID next_thread_id = 1;
+
+/* The smallest stack size that makes any sense is:
+ * RESERVED_STACK_WORDS (so we can get back from the stack overflow)
+ * + sizeofW(StgStopFrame) (the stg_stop_thread_info frame)
+ * + 1 (the closure to enter)
+ * + 1 (stg_ap_v_ret)
+ * + 1 (spare slot req'd by stg_ap_v_ret)
+ *
+ * A thread with this stack will bomb immediately with a stack
+ * overflow, which will increase its stack size.
+ */
+#define MIN_STACK_WORDS (RESERVED_STACK_WORDS + sizeofW(StgStopFrame) + 3)
+
+#if defined(GRAN)
+StgTSO *CurrentTSO;
+#endif
+
+/* This is used in `TSO.h' and gcc 2.96 insists that this variable actually
+ * exists - earlier gccs apparently didn't.
+ * -= chak
+ */
+StgTSO dummy_tso;
+
+/*
+ * Set to TRUE when entering a shutdown state (via shutdownHaskellAndExit()) --
+ * in an MT setting, needed to signal that a worker thread shouldn't hang around
+ * in the scheduler when it is out of work.
+ */
+rtsBool shutting_down_scheduler = rtsFalse;
+
+/*
+ * This mutex protects most of the global scheduler data in
+ * the THREADED_RTS runtime.
+ */
+#if defined(THREADED_RTS)
+Mutex sched_mutex;
+#endif
+
+#if defined(PARALLEL_HASKELL)
+StgTSO *LastTSO;
+rtsTime TimeOfLastYield;
+rtsBool emitSchedule = rtsTrue;
+#endif
+
+/* -----------------------------------------------------------------------------
+ * static function prototypes
+ * -------------------------------------------------------------------------- */
+
+static Capability *schedule (Capability *initialCapability, Task *task);
+
+//
+// These function all encapsulate parts of the scheduler loop, and are
+// abstracted only to make the structure and control flow of the
+// scheduler clearer.
+//
+static void schedulePreLoop (void);
+#if defined(THREADED_RTS)
+static void schedulePushWork(Capability *cap, Task *task);
+#endif
+static void scheduleStartSignalHandlers (Capability *cap);
+static void scheduleCheckBlockedThreads (Capability *cap);
+static void scheduleCheckWakeupThreads(Capability *cap USED_IF_NOT_THREADS);
+static void scheduleCheckBlackHoles (Capability *cap);
+static void scheduleDetectDeadlock (Capability *cap, Task *task);
+#if defined(GRAN)
+static StgTSO *scheduleProcessEvent(rtsEvent *event);
+#endif
+#if defined(PARALLEL_HASKELL)
+static StgTSO *scheduleSendPendingMessages(void);
+static void scheduleActivateSpark(void);
+static rtsBool scheduleGetRemoteWork(rtsBool *receivedFinish);
+#endif
+#if defined(PAR) || defined(GRAN)
+static void scheduleGranParReport(void);
+#endif
+static void schedulePostRunThread(void);
+static rtsBool scheduleHandleHeapOverflow( Capability *cap, StgTSO *t );
+static void scheduleHandleStackOverflow( Capability *cap, Task *task,
+ StgTSO *t);
+static rtsBool scheduleHandleYield( Capability *cap, StgTSO *t,
+ nat prev_what_next );
+static void scheduleHandleThreadBlocked( StgTSO *t );
+static rtsBool scheduleHandleThreadFinished( Capability *cap, Task *task,
+ StgTSO *t );
+static rtsBool scheduleDoHeapProfile(rtsBool ready_to_gc);
+static Capability *scheduleDoGC(Capability *cap, Task *task,
+ rtsBool force_major,
+ void (*get_roots)(evac_fn));
+
+static void unblockThread(Capability *cap, StgTSO *tso);
+static rtsBool checkBlackHoles(Capability *cap);
+static void AllRoots(evac_fn evac);
+
+static StgTSO *threadStackOverflow(Capability *cap, StgTSO *tso);
+
+static void raiseAsync_(Capability *cap, StgTSO *tso, StgClosure *exception,
+ rtsBool stop_at_atomically, StgPtr stop_here);
+
+static void deleteThread (Capability *cap, StgTSO *tso);
+static void deleteAllThreads (Capability *cap);
+
+#ifdef DEBUG
+static void printThreadBlockage(StgTSO *tso);
+static void printThreadStatus(StgTSO *tso);
+void printThreadQueue(StgTSO *tso);
+#endif
+
+#if defined(PARALLEL_HASKELL)
+StgTSO * createSparkThread(rtsSpark spark);
+StgTSO * activateSpark (rtsSpark spark);
+#endif
+
+#ifdef DEBUG
+static char *whatNext_strs[] = {
+ "(unknown)",
+ "ThreadRunGHC",
+ "ThreadInterpret",
+ "ThreadKilled",
+ "ThreadRelocated",
+ "ThreadComplete"
+};
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Putting a thread on the run queue: different scheduling policies
+ * -------------------------------------------------------------------------- */
+
+STATIC_INLINE void
+addToRunQueue( Capability *cap, StgTSO *t )
+{
+#if defined(PARALLEL_HASKELL)
+ if (RtsFlags.ParFlags.doFairScheduling) {
+ // this does round-robin scheduling; good for concurrency
+ appendToRunQueue(cap,t);
+ } else {
+ // this does unfair scheduling; good for parallelism
+ pushOnRunQueue(cap,t);
+ }
+#else
+ // this does round-robin scheduling; good for concurrency
+ appendToRunQueue(cap,t);
+#endif
+}
+
+/* ---------------------------------------------------------------------------
+ Main scheduling loop.
+
+ We use round-robin scheduling, each thread returning to the
+ scheduler loop when one of these conditions is detected:
+
+ * out of heap space
+ * timer expires (thread yields)
+ * thread blocks
+ * thread ends
+ * stack overflow
+
+ GRAN version:
+ In a GranSim setup this loop iterates over the global event queue.
+ This revolves around the global event queue, which determines what
+ to do next. Therefore, it's more complicated than either the
+ concurrent or the parallel (GUM) setup.
+
+ GUM version:
+ GUM iterates over incoming messages.
+ It starts with nothing to do (thus CurrentTSO == END_TSO_QUEUE),
+ and sends out a fish whenever it has nothing to do; in-between
+ doing the actual reductions (shared code below) it processes the
+ incoming messages and deals with delayed operations
+ (see PendingFetches).
+ This is not the ugliest code you could imagine, but it's bloody close.
+
+ ------------------------------------------------------------------------ */
+
+static Capability *
+schedule (Capability *initialCapability, Task *task)
+{
+ StgTSO *t;
+ Capability *cap;
+ StgThreadReturnCode ret;
+#if defined(GRAN)
+ rtsEvent *event;
+#elif defined(PARALLEL_HASKELL)
+ StgTSO *tso;
+ GlobalTaskId pe;
+ rtsBool receivedFinish = rtsFalse;
+# if defined(DEBUG)
+ nat tp_size, sp_size; // stats only
+# endif
+#endif
+ nat prev_what_next;
+ rtsBool ready_to_gc;
+#if defined(THREADED_RTS)
+ rtsBool first = rtsTrue;
+#endif
+
+ cap = initialCapability;
+
+ // Pre-condition: this task owns initialCapability.
+ // The sched_mutex is *NOT* held
+ // NB. on return, we still hold a capability.
+
+ IF_DEBUG(scheduler,
+ sched_belch("### NEW SCHEDULER LOOP (task: %p, cap: %p)",
+ task, initialCapability);
+ );
+
+ schedulePreLoop();
+
+ // -----------------------------------------------------------
+ // Scheduler loop starts here:
+
+#if defined(PARALLEL_HASKELL)
+#define TERMINATION_CONDITION (!receivedFinish)
+#elif defined(GRAN)
+#define TERMINATION_CONDITION ((event = get_next_event()) != (rtsEvent*)NULL)
+#else
+#define TERMINATION_CONDITION rtsTrue
+#endif
+
+ while (TERMINATION_CONDITION) {
+
+#if defined(GRAN)
+ /* Choose the processor with the next event */
+ CurrentProc = event->proc;
+ CurrentTSO = event->tso;
+#endif
+
+#if defined(THREADED_RTS)
+ if (first) {
+ // don't yield the first time, we want a chance to run this
+ // thread for a bit, even if there are others banging at the
+ // door.
+ first = rtsFalse;
+ ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
+ } else {
+ // Yield the capability to higher-priority tasks if necessary.
+ yieldCapability(&cap, task);
+ }
+#endif
+
+#if defined(THREADED_RTS)
+ schedulePushWork(cap,task);
+#endif
+
+ // Check whether we have re-entered the RTS from Haskell without
+ // going via suspendThread()/resumeThread (i.e. a 'safe' foreign
+ // call).
+ if (cap->in_haskell) {
+ errorBelch("schedule: re-entered unsafely.\n"
+ " Perhaps a 'foreign import unsafe' should be 'safe'?");
+ stg_exit(EXIT_FAILURE);
+ }
+
+ // The interruption / shutdown sequence.
+ //
+ // In order to cleanly shut down the runtime, we want to:
+ // * make sure that all main threads return to their callers
+ // with the state 'Interrupted'.
+ // * clean up all OS threads associated with the runtime
+ // * free all memory etc.
+ //
+ // So the sequence for ^C goes like this:
+ //
+ // * ^C handler sets sched_state := SCHED_INTERRUPTING and
+ // arranges for some Capability to wake up
+ //
+ // * all threads in the system are halted, and the zombies are
+ // placed on the run queue for cleaning up. We acquire all
+ // the capabilities in order to delete the threads, this is
+ // done by scheduleDoGC() for convenience (because GC already
+ // needs to acquire all the capabilities). We can't kill
+ // threads involved in foreign calls.
+ //
+ // * sched_state := SCHED_INTERRUPTED
+ //
+ // * somebody calls shutdownHaskell(), which calls exitScheduler()
+ //
+ // * sched_state := SCHED_SHUTTING_DOWN
+ //
+ // * all workers exit when the run queue on their capability
+ // drains. All main threads will also exit when their TSO
+ // reaches the head of the run queue and they can return.
+ //
+ // * eventually all Capabilities will shut down, and the RTS can
+ // exit.
+ //
+ // * We might be left with threads blocked in foreign calls,
+ // we should really attempt to kill these somehow (TODO);
+
+ switch (sched_state) {
+ case SCHED_RUNNING:
+ break;
+ case SCHED_INTERRUPTING:
+ IF_DEBUG(scheduler, sched_belch("SCHED_INTERRUPTING"));
+#if defined(THREADED_RTS)
+ discardSparksCap(cap);
+#endif
+ /* scheduleDoGC() deletes all the threads */
+ cap = scheduleDoGC(cap,task,rtsFalse,GetRoots);
+ break;
+ case SCHED_INTERRUPTED:
+ IF_DEBUG(scheduler, sched_belch("SCHED_INTERRUPTED"));
+ break;
+ case SCHED_SHUTTING_DOWN:
+ IF_DEBUG(scheduler, sched_belch("SCHED_SHUTTING_DOWN"));
+ // If we are a worker, just exit. If we're a bound thread
+ // then we will exit below when we've removed our TSO from
+ // the run queue.
+ if (task->tso == NULL && emptyRunQueue(cap)) {
+ return cap;
+ }
+ break;
+ default:
+ barf("sched_state: %d", sched_state);
+ }
+
+#if defined(THREADED_RTS)
+ // If the run queue is empty, take a spark and turn it into a thread.
+ {
+ if (emptyRunQueue(cap)) {
+ StgClosure *spark;
+ spark = findSpark(cap);
+ if (spark != NULL) {
+ IF_DEBUG(scheduler,
+ sched_belch("turning spark of closure %p into a thread",
+ (StgClosure *)spark));
+ createSparkThread(cap,spark);
+ }
+ }
+ }
+#endif // THREADED_RTS
+
+ scheduleStartSignalHandlers(cap);
+
+ // Only check the black holes here if we've nothing else to do.
+ // During normal execution, the black hole list only gets checked
+ // at GC time, to avoid repeatedly traversing this possibly long
+ // list each time around the scheduler.
+ if (emptyRunQueue(cap)) { scheduleCheckBlackHoles(cap); }
+
+ scheduleCheckWakeupThreads(cap);
+
+ scheduleCheckBlockedThreads(cap);
+
+ scheduleDetectDeadlock(cap,task);
+#if defined(THREADED_RTS)
+ cap = task->cap; // reload cap, it might have changed
+#endif
+
+ // Normally, the only way we can get here with no threads to
+ // run is if a keyboard interrupt received during
+ // scheduleCheckBlockedThreads() or scheduleDetectDeadlock().
+ // Additionally, it is not fatal for the
+ // threaded RTS to reach here with no threads to run.
+ //
+ // win32: might be here due to awaitEvent() being abandoned
+ // as a result of a console event having been delivered.
+ if ( emptyRunQueue(cap) ) {
+#if !defined(THREADED_RTS) && !defined(mingw32_HOST_OS)
+ ASSERT(sched_state >= SCHED_INTERRUPTING);
+#endif
+ continue; // nothing to do
+ }
+
+#if defined(PARALLEL_HASKELL)
+ scheduleSendPendingMessages();
+ if (emptyRunQueue(cap) && scheduleActivateSpark())
+ continue;
+
+#if defined(SPARKS)
+ ASSERT(next_fish_to_send_at==0); // i.e. no delayed fishes left!
+#endif
+
+ /* If we still have no work we need to send a FISH to get a spark
+ from another PE */
+ if (emptyRunQueue(cap)) {
+ if (!scheduleGetRemoteWork(&receivedFinish)) continue;
+ ASSERT(rtsFalse); // should not happen at the moment
+ }
+ // from here: non-empty run queue.
+ // TODO: merge above case with this, only one call processMessages() !
+ if (PacketsWaiting()) { /* process incoming messages, if
+ any pending... only in else
+ because getRemoteWork waits for
+ messages as well */
+ receivedFinish = processMessages();
+ }
+#endif
+
+#if defined(GRAN)
+ scheduleProcessEvent(event);
+#endif
+
+ //
+ // Get a thread to run
+ //
+ t = popRunQueue(cap);
+
+#if defined(GRAN) || defined(PAR)
+ scheduleGranParReport(); // some kind of debugging output
+#else
+ // Sanity check the thread we're about to run. This can be
+ // expensive if there is lots of thread switching going on...
+ IF_DEBUG(sanity,checkTSO(t));
+#endif
+
+#if defined(THREADED_RTS)
+ // Check whether we can run this thread in the current task.
+ // If not, we have to pass our capability to the right task.
+ {
+ Task *bound = t->bound;
+
+ if (bound) {
+ if (bound == task) {
+ IF_DEBUG(scheduler,
+ sched_belch("### Running thread %d in bound thread",
+ t->id));
+ // yes, the Haskell thread is bound to the current native thread
+ } else {
+ IF_DEBUG(scheduler,
+ sched_belch("### thread %d bound to another OS thread",
+ t->id));
+ // no, bound to a different Haskell thread: pass to that thread
+ pushOnRunQueue(cap,t);
+ continue;
+ }
+ } else {
+ // The thread we want to run is unbound.
+ if (task->tso) {
+ IF_DEBUG(scheduler,
+ sched_belch("### this OS thread cannot run thread %d", t->id));
+ // no, the current native thread is bound to a different
+ // Haskell thread, so pass it to any worker thread
+ pushOnRunQueue(cap,t);
+ continue;
+ }
+ }
+ }
+#endif
+
+ cap->r.rCurrentTSO = t;
+
+ /* context switches are initiated by the timer signal, unless
+ * the user specified "context switch as often as possible", with
+ * +RTS -C0
+ */
+ if (RtsFlags.ConcFlags.ctxtSwitchTicks == 0
+ && !emptyThreadQueues(cap)) {
+ context_switch = 1;
+ }
+
+run_thread:
+
+ IF_DEBUG(scheduler, sched_belch("-->> running thread %ld %s ...",
+ (long)t->id, whatNext_strs[t->what_next]));
+
+#if defined(PROFILING)
+ startHeapProfTimer();
+#endif
+
+ // ----------------------------------------------------------------------
+ // Run the current thread
+
+ ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
+ ASSERT(t->cap == cap);
+
+ prev_what_next = t->what_next;
+
+ errno = t->saved_errno;
+ cap->in_haskell = rtsTrue;
+
+ dirtyTSO(t);
+
+ recent_activity = ACTIVITY_YES;
+
+ switch (prev_what_next) {
+
+ case ThreadKilled:
+ case ThreadComplete:
+ /* Thread already finished, return to scheduler. */
+ ret = ThreadFinished;
+ break;
+
+ case ThreadRunGHC:
+ {
+ StgRegTable *r;
+ r = StgRun((StgFunPtr) stg_returnToStackTop, &cap->r);
+ cap = regTableToCapability(r);
+ ret = r->rRet;
+ break;
+ }
+
+ case ThreadInterpret:
+ cap = interpretBCO(cap);
+ ret = cap->r.rRet;
+ break;
+
+ default:
+ barf("schedule: invalid what_next field");
+ }
+
+ cap->in_haskell = rtsFalse;
+
+ // The TSO might have moved, eg. if it re-entered the RTS and a GC
+ // happened. So find the new location:
+ t = cap->r.rCurrentTSO;
+
+ // We have run some Haskell code: there might be blackhole-blocked
+ // threads to wake up now.
+ // Lock-free test here should be ok, we're just setting a flag.
+ if ( blackhole_queue != END_TSO_QUEUE ) {
+ blackholes_need_checking = rtsTrue;
+ }
+
+ // And save the current errno in this thread.
+ // XXX: possibly bogus for SMP because this thread might already
+ // be running again, see code below.
+ t->saved_errno = errno;
+
+#if defined(THREADED_RTS)
+ // If ret is ThreadBlocked, and this Task is bound to the TSO that
+ // blocked, we are in limbo - the TSO is now owned by whatever it
+ // is blocked on, and may in fact already have been woken up,
+ // perhaps even on a different Capability. It may be the case
+ // that task->cap != cap. We better yield this Capability
+ // immediately and return to normality.
+ if (ret == ThreadBlocked) {
+ IF_DEBUG(scheduler,
+ sched_belch("--<< thread %d (%s) stopped: blocked\n",
+ t->id, whatNext_strs[t->what_next]));
+ continue;
+ }
+#endif
+
+ ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
+ ASSERT(t->cap == cap);
+
+ // ----------------------------------------------------------------------
+
+ // Costs for the scheduler are assigned to CCS_SYSTEM
+#if defined(PROFILING)
+ stopHeapProfTimer();
+ CCCS = CCS_SYSTEM;
+#endif
+
+#if defined(THREADED_RTS)
+ IF_DEBUG(scheduler,debugBelch("sched (task %p): ", (void *)(unsigned long)(unsigned int)osThreadId()););
+#elif !defined(GRAN) && !defined(PARALLEL_HASKELL)
+ IF_DEBUG(scheduler,debugBelch("sched: "););
+#endif
+
+ schedulePostRunThread();
+
+ ready_to_gc = rtsFalse;
+
+ switch (ret) {
+ case HeapOverflow:
+ ready_to_gc = scheduleHandleHeapOverflow(cap,t);
+ break;
+
+ case StackOverflow:
+ scheduleHandleStackOverflow(cap,task,t);
+ break;
+
+ case ThreadYielding:
+ if (scheduleHandleYield(cap, t, prev_what_next)) {
+ // shortcut for switching between compiler/interpreter:
+ goto run_thread;
+ }
+ break;
+
+ case ThreadBlocked:
+ scheduleHandleThreadBlocked(t);
+ break;
+
+ case ThreadFinished:
+ if (scheduleHandleThreadFinished(cap, task, t)) return cap;
+ ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
+ break;
+
+ default:
+ barf("schedule: invalid thread return code %d", (int)ret);
+ }
+
+ if (scheduleDoHeapProfile(ready_to_gc)) { ready_to_gc = rtsFalse; }
+ if (ready_to_gc) {
+ cap = scheduleDoGC(cap,task,rtsFalse,GetRoots);
+ }
+ } /* end of while() */
+
+ IF_PAR_DEBUG(verbose,
+ debugBelch("== Leaving schedule() after having received Finish\n"));
+}
+
+/* ----------------------------------------------------------------------------
+ * Setting up the scheduler loop
+ * ------------------------------------------------------------------------- */
+
+static void
+schedulePreLoop(void)
+{
+#if defined(GRAN)
+ /* set up first event to get things going */
+ /* ToDo: assign costs for system setup and init MainTSO ! */
+ new_event(CurrentProc, CurrentProc, CurrentTime[CurrentProc],
+ ContinueThread,
+ CurrentTSO, (StgClosure*)NULL, (rtsSpark*)NULL);
+
+ IF_DEBUG(gran,
+ debugBelch("GRAN: Init CurrentTSO (in schedule) = %p\n",
+ CurrentTSO);
+ G_TSO(CurrentTSO, 5));
+
+ if (RtsFlags.GranFlags.Light) {
+ /* Save current time; GranSim Light only */
+ CurrentTSO->gran.clock = CurrentTime[CurrentProc];
+ }
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ * schedulePushWork()
+ *
+ * Push work to other Capabilities if we have some.
+ * -------------------------------------------------------------------------- */
+
+#if defined(THREADED_RTS)
+static void
+schedulePushWork(Capability *cap USED_IF_THREADS,
+ Task *task USED_IF_THREADS)
+{
+ Capability *free_caps[n_capabilities], *cap0;
+ nat i, n_free_caps;
+
+ // migration can be turned off with +RTS -qg
+ if (!RtsFlags.ParFlags.migrate) return;
+
+ // Check whether we have more threads on our run queue, or sparks
+ // in our pool, that we could hand to another Capability.
+ if ((emptyRunQueue(cap) || cap->run_queue_hd->link == END_TSO_QUEUE)
+ && sparkPoolSizeCap(cap) < 2) {
+ return;
+ }
+
+ // First grab as many free Capabilities as we can.
+ for (i=0, n_free_caps=0; i < n_capabilities; i++) {
+ cap0 = &capabilities[i];
+ if (cap != cap0 && tryGrabCapability(cap0,task)) {
+ if (!emptyRunQueue(cap0) || cap->returning_tasks_hd != NULL) {
+ // it already has some work, we just grabbed it at
+ // the wrong moment. Or maybe it's deadlocked!
+ releaseCapability(cap0);
+ } else {
+ free_caps[n_free_caps++] = cap0;
+ }
+ }
+ }
+
+ // we now have n_free_caps free capabilities stashed in
+ // free_caps[]. Share our run queue equally with them. This is
+ // probably the simplest thing we could do; improvements we might
+ // want to do include:
+ //
+ // - giving high priority to moving relatively new threads, on
+ // the grounds that they haven't had time to build up a
+ // working set in the cache on this CPU/Capability.
+ //
+ // - giving low priority to moving long-lived threads
+
+ if (n_free_caps > 0) {
+ StgTSO *prev, *t, *next;
+ rtsBool pushed_to_all;
+
+ IF_DEBUG(scheduler, sched_belch("excess threads on run queue and %d free capabilities, sharing...", n_free_caps));
+
+ i = 0;
+ pushed_to_all = rtsFalse;
+
+ if (cap->run_queue_hd != END_TSO_QUEUE) {
+ prev = cap->run_queue_hd;
+ t = prev->link;
+ prev->link = END_TSO_QUEUE;
+ for (; t != END_TSO_QUEUE; t = next) {
+ next = t->link;
+ t->link = END_TSO_QUEUE;
+ if (t->what_next == ThreadRelocated
+ || t->bound == task // don't move my bound thread
+ || tsoLocked(t)) { // don't move a locked thread
+ prev->link = t;
+ prev = t;
+ } else if (i == n_free_caps) {
+ pushed_to_all = rtsTrue;
+ i = 0;
+ // keep one for us
+ prev->link = t;
+ prev = t;
+ } else {
+ IF_DEBUG(scheduler, sched_belch("pushing thread %d to capability %d", t->id, free_caps[i]->no));
+ appendToRunQueue(free_caps[i],t);
+ if (t->bound) { t->bound->cap = free_caps[i]; }
+ t->cap = free_caps[i];
+ i++;
+ }
+ }
+ cap->run_queue_tl = prev;
+ }
+
+ // If there are some free capabilities that we didn't push any
+ // threads to, then try to push a spark to each one.
+ if (!pushed_to_all) {
+ StgClosure *spark;
+ // i is the next free capability to push to
+ for (; i < n_free_caps; i++) {
+ if (emptySparkPoolCap(free_caps[i])) {
+ spark = findSpark(cap);
+ if (spark != NULL) {
+ IF_DEBUG(scheduler, sched_belch("pushing spark %p to capability %d", spark, free_caps[i]->no));
+ newSpark(&(free_caps[i]->r), spark);
+ }
+ }
+ }
+ }
+
+ // release the capabilities
+ for (i = 0; i < n_free_caps; i++) {
+ task->cap = free_caps[i];
+ releaseCapability(free_caps[i]);
+ }
+ }
+ task->cap = cap; // reset to point to our Capability.
+}
+#endif
+
+/* ----------------------------------------------------------------------------
+ * Start any pending signal handlers
+ * ------------------------------------------------------------------------- */
+
+#if defined(RTS_USER_SIGNALS) && (!defined(THREADED_RTS) || defined(mingw32_HOST_OS))
+static void
+scheduleStartSignalHandlers(Capability *cap)
+{
+ if (signals_pending()) { // safe outside the lock
+ startSignalHandlers(cap);
+ }
+}
+#else
+static void
+scheduleStartSignalHandlers(Capability *cap STG_UNUSED)
+{
+}
+#endif
+
+/* ----------------------------------------------------------------------------
+ * Check for blocked threads that can be woken up.
+ * ------------------------------------------------------------------------- */
+
+static void
+scheduleCheckBlockedThreads(Capability *cap USED_IF_NOT_THREADS)
+{
+#if !defined(THREADED_RTS)
+ //
+ // Check whether any waiting threads need to be woken up. If the
+ // run queue is empty, and there are no other tasks running, we
+ // can wait indefinitely for something to happen.
+ //
+ if ( !emptyQueue(blocked_queue_hd) || !emptyQueue(sleeping_queue) )
+ {
+ awaitEvent( emptyRunQueue(cap) && !blackholes_need_checking );
+ }
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------
+ * Check for threads woken up by other Capabilities
+ * ------------------------------------------------------------------------- */
+
+static void
+scheduleCheckWakeupThreads(Capability *cap USED_IF_THREADS)
+{
+#if defined(THREADED_RTS)
+ // Any threads that were woken up by other Capabilities get
+ // appended to our run queue.
+ if (!emptyWakeupQueue(cap)) {
+ ACQUIRE_LOCK(&cap->lock);
+ if (emptyRunQueue(cap)) {
+ cap->run_queue_hd = cap->wakeup_queue_hd;
+ cap->run_queue_tl = cap->wakeup_queue_tl;
+ } else {
+ cap->run_queue_tl->link = cap->wakeup_queue_hd;
+ cap->run_queue_tl = cap->wakeup_queue_tl;
+ }
+ cap->wakeup_queue_hd = cap->wakeup_queue_tl = END_TSO_QUEUE;
+ RELEASE_LOCK(&cap->lock);
+ }
+#endif
+}
+
+/* ----------------------------------------------------------------------------
+ * Check for threads blocked on BLACKHOLEs that can be woken up
+ * ------------------------------------------------------------------------- */
+static void
+scheduleCheckBlackHoles (Capability *cap)
+{
+ if ( blackholes_need_checking ) // check without the lock first
+ {
+ ACQUIRE_LOCK(&sched_mutex);
+ if ( blackholes_need_checking ) {
+ checkBlackHoles(cap);
+ blackholes_need_checking = rtsFalse;
+ }
+ RELEASE_LOCK(&sched_mutex);
+ }
+}
+
+/* ----------------------------------------------------------------------------
+ * Detect deadlock conditions and attempt to resolve them.
+ * ------------------------------------------------------------------------- */
+
+static void
+scheduleDetectDeadlock (Capability *cap, Task *task)
+{
+
+#if defined(PARALLEL_HASKELL)
+ // ToDo: add deadlock detection in GUM (similar to THREADED_RTS) -- HWL
+ return;
+#endif
+
+ /*
+ * Detect deadlock: when we have no threads to run, there are no
+ * threads blocked, waiting for I/O, or sleeping, and all the
+ * other tasks are waiting for work, we must have a deadlock of
+ * some description.
+ */
+ if ( emptyThreadQueues(cap) )
+ {
+#if defined(THREADED_RTS)
+ /*
+ * In the threaded RTS, we only check for deadlock if there
+ * has been no activity in a complete timeslice. This means
+ * we won't eagerly start a full GC just because we don't have
+ * any threads to run currently.
+ */
+ if (recent_activity != ACTIVITY_INACTIVE) return;
+#endif
+
+ IF_DEBUG(scheduler, sched_belch("deadlocked, forcing major GC..."));
+
+ // Garbage collection can release some new threads due to
+ // either (a) finalizers or (b) threads resurrected because
+ // they are unreachable and will therefore be sent an
+ // exception. Any threads thus released will be immediately
+ // runnable.
+ cap = scheduleDoGC (cap, task, rtsTrue/*force major GC*/, GetRoots);
+
+ recent_activity = ACTIVITY_DONE_GC;
+
+ if ( !emptyRunQueue(cap) ) return;
+
+#if defined(RTS_USER_SIGNALS) && (!defined(THREADED_RTS) || defined(mingw32_HOST_OS))
+ /* If we have user-installed signal handlers, then wait
+ * for signals to arrive rather then bombing out with a
+ * deadlock.
+ */
+ if ( anyUserHandlers() ) {
+ IF_DEBUG(scheduler,
+ sched_belch("still deadlocked, waiting for signals..."));
+
+ awaitUserSignals();
+
+ if (signals_pending()) {
+ startSignalHandlers(cap);
+ }
+
+ // either we have threads to run, or we were interrupted:
+ ASSERT(!emptyRunQueue(cap) || sched_state >= SCHED_INTERRUPTING);
+ }
+#endif
+
+#if !defined(THREADED_RTS)
+ /* Probably a real deadlock. Send the current main thread the
+ * Deadlock exception.
+ */
+ if (task->tso) {
+ switch (task->tso->why_blocked) {
+ case BlockedOnSTM:
+ case BlockedOnBlackHole:
+ case BlockedOnException:
+ case BlockedOnMVar:
+ raiseAsync(cap, task->tso, (StgClosure *)NonTermination_closure);
+ return;
+ default:
+ barf("deadlock: main thread blocked in a strange way");
+ }
+ }
+ return;
+#endif
+ }
+}
+
+/* ----------------------------------------------------------------------------
+ * Process an event (GRAN only)
+ * ------------------------------------------------------------------------- */
+
+#if defined(GRAN)
+/*
+ * scheduleProcessEvent (GranSim only)
+ *
+ * Main event dispatcher for the GranSim simulator.  Advances simulated
+ * time to the event's timestamp, lets idle PEs issue FindWork/MoveSpark
+ * events, then dispatches on the event type.  Every event kind except
+ * ContinueThread is handled by a do_the_*() helper and loops back via
+ * "goto next_thread"; ContinueThread falls through to actually run a
+ * thread, which is popped off the run queue and made current.
+ *
+ * NOTE(review): declared as returning StgTSO* (presumably the selected
+ * thread t) but control falls off the end without a return statement --
+ * undefined behaviour if the caller uses the result.  There is also no
+ * "next_thread" label in this function, so the gotos below cannot
+ * compile as written.  Both suggest this GRAN code is bit-rotted;
+ * TODO confirm against an actual GRAN build before relying on it.
+ */
+static StgTSO *
+scheduleProcessEvent(rtsEvent *event)
+{
+ StgTSO *t;
+
+ if (RtsFlags.GranFlags.Light)
+ GranSimLight_enter_system(event, &ActiveTSO); // adjust ActiveTSO etc
+
+ /* adjust time based on time-stamp */
+ if (event->time > CurrentTime[CurrentProc] &&
+ event->evttype != ContinueThread)
+ CurrentTime[CurrentProc] = event->time;
+
+ /* Deal with the idle PEs (may issue FindWork or MoveSpark events) */
+ if (!RtsFlags.GranFlags.Light)
+ handleIdlePEs();
+
+ IF_DEBUG(gran, debugBelch("GRAN: switch by event-type\n"));
+
+ /* main event dispatcher in GranSim */
+ switch (event->evttype) {
+ /* Should just be continuing execution */
+ case ContinueThread:
+ IF_DEBUG(gran, debugBelch("GRAN: doing ContinueThread\n"));
+ /* ToDo: check assertion
+ ASSERT(run_queue_hd != (StgTSO*)NULL &&
+ run_queue_hd != END_TSO_QUEUE);
+ */
+ /* Ignore ContinueThreads for fetching threads (if synchr comm) */
+ if (!RtsFlags.GranFlags.DoAsyncFetch &&
+ procStatus[CurrentProc]==Fetching) {
+ debugBelch("ghuH: Spurious ContinueThread while Fetching ignored; TSO %d (%p) [PE %d]\n",
+ CurrentTSO->id, CurrentTSO, CurrentProc);
+ goto next_thread;
+ }
+ /* Ignore ContinueThreads for completed threads */
+ if (CurrentTSO->what_next == ThreadComplete) {
+ debugBelch("ghuH: found a ContinueThread event for completed thread %d (%p) [PE %d] (ignoring ContinueThread)\n",
+ CurrentTSO->id, CurrentTSO, CurrentProc);
+ goto next_thread;
+ }
+ /* Ignore ContinueThreads for threads that are being migrated */
+ if (PROCS(CurrentTSO)==Nowhere) {
+ debugBelch("ghuH: trying to run the migrating TSO %d (%p) [PE %d] (ignoring ContinueThread)\n",
+ CurrentTSO->id, CurrentTSO, CurrentProc);
+ goto next_thread;
+ }
+ /* The thread should be at the beginning of the run queue */
+ if (CurrentTSO!=run_queue_hds[CurrentProc]) {
+ debugBelch("ghuH: TSO %d (%p) [PE %d] is not at the start of the run_queue when doing a ContinueThread\n",
+ CurrentTSO->id, CurrentTSO, CurrentProc);
+ break; // run the thread anyway
+ }
+ /*
+ new_event(proc, proc, CurrentTime[proc],
+ FindWork,
+ (StgTSO*)NULL, (StgClosure*)NULL, (rtsSpark*)NULL);
+ goto next_thread;
+ */ /* Catches superfluous CONTINUEs -- should be unnecessary */
+ break; // now actually run the thread; DaH Qu'vam yImuHbej
+
+ case FetchNode:
+ do_the_fetchnode(event);
+ goto next_thread; /* handle next event in event queue */
+
+ case GlobalBlock:
+ do_the_globalblock(event);
+ goto next_thread; /* handle next event in event queue */
+
+ case FetchReply:
+ do_the_fetchreply(event);
+ goto next_thread; /* handle next event in event queue */
+
+ case UnblockThread: /* Move from the blocked queue to the tail of */
+ do_the_unblock(event);
+ goto next_thread; /* handle next event in event queue */
+
+ case ResumeThread: /* Move from the blocked queue to the tail of */
+ /* the runnable queue ( i.e. Qu' SImqa'lu') */
+ event->tso->gran.blocktime +=
+ CurrentTime[CurrentProc] - event->tso->gran.blockedat;
+ do_the_startthread(event);
+ goto next_thread; /* handle next event in event queue */
+
+ case StartThread:
+ do_the_startthread(event);
+ goto next_thread; /* handle next event in event queue */
+
+ case MoveThread:
+ do_the_movethread(event);
+ goto next_thread; /* handle next event in event queue */
+
+ case MoveSpark:
+ do_the_movespark(event);
+ goto next_thread; /* handle next event in event queue */
+
+ case FindWork:
+ do_the_findwork(event);
+ goto next_thread; /* handle next event in event queue */
+
+ default:
+ barf("Illegal event type %u\n", event->evttype);
+ } /* switch */
+
+ /* This point was scheduler_loop in the old RTS */
+
+ IF_DEBUG(gran, debugBelch("GRAN: after main switch\n"));
+
+ TimeOfLastEvent = CurrentTime[CurrentProc];
+ TimeOfNextEvent = get_time_of_next_event();
+ IgnoreEvents=(TimeOfNextEvent==0); // HWL HACK
+ // CurrentTSO = ThreadQueueHd;
+
+ IF_DEBUG(gran, debugBelch("GRAN: time of next event is: %ld\n",
+ TimeOfNextEvent));
+
+ if (RtsFlags.GranFlags.Light)
+ GranSimLight_leave_system(event, &ActiveTSO);
+
+ EndOfTimeSlice = CurrentTime[CurrentProc]+RtsFlags.GranFlags.time_slice;
+
+ IF_DEBUG(gran,
+ debugBelch("GRAN: end of time-slice is %#lx\n", EndOfTimeSlice));
+
+ /* in a GranSim setup the TSO stays on the run queue */
+ t = CurrentTSO;
+ /* Take a thread from the run queue. */
+ POP_RUN_QUEUE(t); // take_off_run_queue(t);
+
+ IF_DEBUG(gran,
+ debugBelch("GRAN: About to run current thread, which is\n");
+ G_TSO(t,5));
+
+ context_switch = 0; // turned on via GranYield, checking events and time slice
+
+ IF_DEBUG(gran,
+ DumpGranEvent(GR_SCHEDULE, t));
+
+ procStatus[CurrentProc] = Busy;
+ /* NOTE(review): t is selected above but never returned; see header note */
+}
+#endif // GRAN
+
+/* ----------------------------------------------------------------------------
+ * Send pending messages (PARALLEL_HASKELL only)
+ * ------------------------------------------------------------------------- */
+
+#if defined(PARALLEL_HASKELL)
+/*
+ * scheduleSendPendingMessages (PARALLEL_HASKELL only)
+ *
+ * Flush outgoing communication: process any pending fetches (PAR builds)
+ * and, when message buffering is enabled, send away message packets that
+ * have become too old.
+ *
+ * NOTE(review): declared as returning StgTSO* but never returns a value,
+ * and the locals pool/spark/t are unused.  The return type should
+ * probably be void -- looks bit-rotted; TODO confirm in a PAR build.
+ */
+static StgTSO *
+scheduleSendPendingMessages(void)
+{
+ StgSparkPool *pool;
+ rtsSpark spark;
+ StgTSO *t;
+
+# if defined(PAR) // global Mem.Mgmt., omit for now
+ if (PendingFetches != END_BF_QUEUE) {
+ processFetches();
+ }
+# endif
+
+ if (RtsFlags.ParFlags.BufferTime) {
+ // if we use message buffering, we must send away all message
+ // packets which have become too old...
+ sendOldBuffers();
+ }
+}
+#endif
+
+/* ----------------------------------------------------------------------------
+ * Activate spark threads (PARALLEL_HASKELL only)
+ * ------------------------------------------------------------------------- */
+
+#if defined(PARALLEL_HASKELL)
+/*
+ * scheduleActivateSpark (PARALLEL_HASKELL only)
+ *
+ * Called when the run queue is empty: try to turn a local spark into a
+ * thread and add it to the run queue, from where the scheduler loop will
+ * pick it up on its next iteration.
+ *
+ * NOTE(review): declared void, yet every exit path does
+ * "return rtsTrue/rtsFalse" -- that cannot compile.  The variables
+ * pool, spark and tso are also used without local declarations.
+ * Presumably they were locals before this code was split out of
+ * schedule(); TODO confirm the intended signature (rtsBool?) against
+ * the callers.
+ */
+static void
+scheduleActivateSpark(void)
+{
+#if defined(SPARKS)
+ ASSERT(emptyRunQueue());
+/* We get here if the run queue is empty and want some work.
+ We try to turn a spark into a thread, and add it to the run queue,
+ from where it will be picked up in the next iteration of the scheduler
+ loop.
+*/
+
+ /* :-[ no local threads => look out for local sparks */
+ /* the spark pool for the current PE */
+ pool = &(cap.r.rSparks); // JB: cap = (old) MainCap
+ if (advisory_thread_count < RtsFlags.ParFlags.maxThreads &&
+ pool->hd < pool->tl) {
+ /*
+ * ToDo: add GC code check that we really have enough heap afterwards!!
+ * Old comment:
+ * If we're here (no runnable threads) and we have pending
+ * sparks, we must have a space problem. Get enough space
+ * to turn one of those pending sparks into a
+ * thread...
+ */
+
+ spark = findSpark(rtsFalse); /* get a spark */
+ if (spark != (rtsSpark) NULL) {
+ tso = createThreadFromSpark(spark); /* turn the spark into a thread */
+ IF_PAR_DEBUG(fish, // schedule,
+ debugBelch("==== schedule: Created TSO %d (%p); %d threads active\n",
+ tso->id, tso, advisory_thread_count));
+
+ if (tso==END_TSO_QUEUE) { /* failed to activate spark->back to loop */
+ IF_PAR_DEBUG(fish, // schedule,
+ debugBelch("==^^ failed to create thread from spark @ %lx\n",
+ spark));
+ return rtsFalse; /* failed to generate a thread */
+ } /* otherwise fall through & pick-up new tso */
+ } else {
+ IF_PAR_DEBUG(fish, // schedule,
+ debugBelch("==^^ no local sparks (spark pool contains only NFs: %d)\n",
+ spark_queue_len(pool)));
+ return rtsFalse; /* failed to generate a thread */
+ }
+ return rtsTrue; /* success in generating a thread */
+ } else { /* no more threads permitted or pool empty */
+ return rtsFalse; /* failed to generateThread */
+ }
+#else
+ tso = NULL; // avoid compiler warning only
+ return rtsFalse; /* dummy in non-PAR setup */
+#endif // SPARKS
+}
+#endif // PARALLEL_HASKELL
+
+/* ----------------------------------------------------------------------------
+ * Get work from a remote node (PARALLEL_HASKELL only)
+ * ------------------------------------------------------------------------- */
+
+#if defined(PARALLEL_HASKELL)
+/*
+ * scheduleGetRemoteWork (PARALLEL_HASKELL only)
+ *
+ * Called when there is no local work at all: flush pending message
+ * buffers and then either (no SPARKS, i.e. Eden) block waiting for
+ * incoming messages, or (SPARKS) send out a FISH message asking other
+ * PEs for work, rate-limited by fishDelay.  *receivedFinish is set when
+ * a finish message arrives during message processing.
+ *
+ * Always returns rtsFalse, so the scheduler loop simply continues with
+ * its next iteration (see rationale comment at the end).
+ *
+ * NOTE(review): `delay` and `next_fish_to_send_at` are used without
+ * local declarations here -- presumably file-scope in PAR builds;
+ * TODO confirm they are declared elsewhere in this file.
+ */
+static rtsBool
+scheduleGetRemoteWork(rtsBool *receivedFinish)
+{
+ ASSERT(emptyRunQueue());
+
+ if (RtsFlags.ParFlags.BufferTime) {
+ IF_PAR_DEBUG(verbose,
+ debugBelch("...send all pending data,"));
+ {
+ nat i;
+ for (i=1; i<=nPEs; i++)
+ sendImmediately(i); // send all messages away immediately
+ }
+ }
+# ifndef SPARKS
+ //++EDEN++ idle() , i.e. send all buffers, wait for work
+ // suppress fishing in EDEN... just look for incoming messages
+ // (blocking receive)
+ IF_PAR_DEBUG(verbose,
+ debugBelch("...wait for incoming messages...\n"));
+ *receivedFinish = processMessages(); // blocking receive...
+
+ // and reenter scheduling loop after having received something
+ // (return rtsFalse below)
+
+# else /* activate SPARKS machinery */
+/* We get here, if we have no work, tried to activate a local spark, but still
+ have no work. We try to get a remote spark, by sending a FISH message.
+ Thread migration should be added here, and triggered when a sequence of
+ fishes returns without work. */
+ delay = (RtsFlags.ParFlags.fishDelay!=0ll ? RtsFlags.ParFlags.fishDelay : 0ll);
+
+ /* =8-[ no local sparks => look for work on other PEs */
+ /*
+ * We really have absolutely no work. Send out a fish
+ * (there may be some out there already), and wait for
+ * something to arrive. We clearly can't run any threads
+ * until a SCHEDULE or RESUME arrives, and so that's what
+ * we're hoping to see. (Of course, we still have to
+ * respond to other types of messages.)
+ */
+ rtsTime now = msTime() /*CURRENT_TIME*/;
+ IF_PAR_DEBUG(verbose,
+ debugBelch("-- now=%ld\n", now));
+ IF_PAR_DEBUG(fish, // verbose,
+ if (outstandingFishes < RtsFlags.ParFlags.maxFishes &&
+ (last_fish_arrived_at!=0 &&
+ last_fish_arrived_at+delay > now)) {
+ debugBelch("--$$ <%llu> delaying FISH until %llu (last fish %llu, delay %llu)\n",
+ now, last_fish_arrived_at+delay,
+ last_fish_arrived_at,
+ delay);
+ });
+
+ if (outstandingFishes < RtsFlags.ParFlags.maxFishes &&
+ advisory_thread_count < RtsFlags.ParFlags.maxThreads) { // send a FISH, but when?
+ if (last_fish_arrived_at==0 ||
+ (last_fish_arrived_at+delay <= now)) { // send FISH now!
+ /* outstandingFishes is set in sendFish, processFish;
+ avoid flooding system with fishes via delay */
+ next_fish_to_send_at = 0;
+ } else {
+ /* ToDo: this should be done in the main scheduling loop to avoid the
+ busy wait here; not so bad if fish delay is very small */
+ int iq = 0; // DEBUGGING -- HWL
+ next_fish_to_send_at = last_fish_arrived_at+delay; // remember when to send
+ /* send a fish when ready, but process messages that arrive in the meantime */
+ do {
+ if (PacketsWaiting()) {
+ iq++; // DEBUGGING
+ *receivedFinish = processMessages();
+ }
+ now = msTime();
+ } while (!*receivedFinish || now<next_fish_to_send_at);
+ // JB: This means the fish could become obsolete, if we receive
+ // work. Better check for work again?
+ // last line: while (!receivedFinish || !haveWork || now<...)
+ // next line: if (receivedFinish || haveWork )
+
+ if (*receivedFinish) // no need to send a FISH if we are finishing anyway
+ return rtsFalse; // NB: this will leave scheduler loop
+ // immediately after return!
+
+ IF_PAR_DEBUG(fish, // verbose,
+ debugBelch("--$$ <%llu> sent delayed fish (%d processMessages); active/total threads=%d/%d\n",now,iq,run_queue_len(),advisory_thread_count));
+
+ }
+
+ // JB: IMHO, this should all be hidden inside sendFish(...)
+ /* pe = choosePE();
+ sendFish(pe, thisPE, NEW_FISH_AGE, NEW_FISH_HISTORY,
+ NEW_FISH_HUNGER);
+
+ // Global statistics: count no. of fishes
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.tot_fish_mess++;
+ }
+ */
+
+ /* delayed fishes must have been sent by now! */
+ next_fish_to_send_at = 0;
+ }
+
+ *receivedFinish = processMessages();
+# endif /* SPARKS */
+
+ return rtsFalse;
+ /* NB: this function always returns rtsFalse, meaning the scheduler
+ loop continues with the next iteration;
+ rationale:
+ return code means success in finding work; we enter this function
+ if there is no local work, thus have to send a fish which takes
+ time until it arrives with work; in the meantime we should process
+ messages in the main loop;
+ */
+}
+#endif // PARALLEL_HASKELL
+
+/* ----------------------------------------------------------------------------
+ * PAR/GRAN: Report stats & debugging info(?)
+ * ------------------------------------------------------------------------- */
+
+#if defined(PAR) || defined(GRAN)
+/*
+ * scheduleGranParReport (PAR/GRAN only)
+ *
+ * Take the next thread off the run queue and emit scheduling statistics
+ * and debug output for it; when full ParStats logging is on, write a
+ * GR_SCHEDULE event to the log if we are switching to a different
+ * (spark-generated) TSO or an emit was forced via emitSchedule.
+ *
+ * NOTE(review): `t`, `pool`, `emitSchedule` and `LastTSO` are used
+ * without local declarations -- presumably file-scope in PAR/GRAN
+ * builds; TODO confirm.
+ */
+static void
+scheduleGranParReport(void)
+{
+ ASSERT(run_queue_hd != END_TSO_QUEUE);
+
+ /* Take a thread from the run queue, if we have work */
+ POP_RUN_QUEUE(t); // take_off_run_queue(END_TSO_QUEUE);
+
+ /* If this TSO has got its outport closed in the meantime,
+ * it mustn't be run. Instead, we have to clean it up as if it was finished.
+ * It has to be marked as TH_DEAD for this purpose.
+ * If it is TH_TERM instead, it is supposed to have finished in the normal way.
+
+JB: TODO: investigate whether state change field could be nuked
+ entirely and replaced by the normal tso state (whatnext
+ field). All we want to do is to kill tsos from outside.
+ */
+
+ /* ToDo: write something to the log-file
+ if (RTSflags.ParFlags.granSimStats && !sameThread)
+ DumpGranEvent(GR_SCHEDULE, RunnableThreadsHd);
+
+ CurrentTSO = t;
+ */
+ /* the spark pool for the current PE */
+ pool = &(cap.r.rSparks); // cap = (old) MainCap
+
+ IF_DEBUG(scheduler,
+ debugBelch("--=^ %d threads, %d sparks on [%#x]\n",
+ run_queue_len(), spark_queue_len(pool), CURRENT_PROC));
+
+ IF_PAR_DEBUG(fish,
+ debugBelch("--=^ %d threads, %d sparks on [%#x]\n",
+ run_queue_len(), spark_queue_len(pool), CURRENT_PROC));
+
+ if (RtsFlags.ParFlags.ParStats.Full &&
+ (t->par.sparkname != (StgInt)0) && // only log spark generated threads
+ (emitSchedule || // forced emit
+ (t && LastTSO && t->id != LastTSO->id))) {
+ /*
+ we are running a different TSO, so write a schedule event to log file
+ NB: If we use fair scheduling we also have to write a deschedule
+ event for LastTSO; with unfair scheduling we know that the
+ previous tso has blocked whenever we switch to another tso, so
+ we don't need it in GUM for now
+ */
+ IF_PAR_DEBUG(fish, // schedule,
+ debugBelch("____ scheduling spark generated thread %d (%lx) (%lx) via a forced emit\n",t->id,t,t->par.sparkname));
+
+ DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC,
+ GR_SCHEDULE, t, (StgClosure *)NULL, 0, 0);
+ emitSchedule = rtsFalse;
+ }
+}
+#endif
+
+/* ----------------------------------------------------------------------------
+ * After running a thread...
+ * ------------------------------------------------------------------------- */
+
+/*
+ * schedulePostRunThread
+ *
+ * Bookkeeping after a thread returns to the scheduler.  In sequential
+ * and THREADED builds this is a no-op; in PAR/GRAN/EDEN builds it
+ * records LastTSO / yield time and updates per-return-code global
+ * statistics (heap overflows, stack overflows, yields, blocks).
+ *
+ * NOTE(review): `t` and `ret` are not parameters or locals here --
+ * presumably file-scope variables in the PAR/GRAN configurations;
+ * TODO confirm they exist in those builds.
+ */
+static void
+schedulePostRunThread(void)
+{
+#if defined(PAR)
+ /* HACK 675: if the last thread didn't yield, make sure to print a
+ SCHEDULE event to the log file when StgRunning the next thread, even
+ if it is the same one as before */
+ LastTSO = t;
+ TimeOfLastYield = CURRENT_TIME;
+#endif
+
+ /* some statistics gathering in the parallel case */
+
+#if defined(GRAN) || defined(PAR) || defined(EDEN)
+ switch (ret) {
+ case HeapOverflow:
+# if defined(GRAN)
+ IF_DEBUG(gran, DumpGranEvent(GR_DESCHEDULE, t));
+ globalGranStats.tot_heapover++;
+# elif defined(PAR)
+ globalParStats.tot_heapover++;
+# endif
+ break;
+
+ case StackOverflow:
+# if defined(GRAN)
+ IF_DEBUG(gran,
+ DumpGranEvent(GR_DESCHEDULE, t));
+ globalGranStats.tot_stackover++;
+# elif defined(PAR)
+ // IF_DEBUG(par,
+ // DumpGranEvent(GR_DESCHEDULE, t);
+ globalParStats.tot_stackover++;
+# endif
+ break;
+
+ case ThreadYielding:
+# if defined(GRAN)
+ IF_DEBUG(gran,
+ DumpGranEvent(GR_DESCHEDULE, t));
+ globalGranStats.tot_yields++;
+# elif defined(PAR)
+ // IF_DEBUG(par,
+ // DumpGranEvent(GR_DESCHEDULE, t);
+ globalParStats.tot_yields++;
+# endif
+ break;
+
+ case ThreadBlocked:
+# if defined(GRAN)
+ IF_DEBUG(scheduler,
+ debugBelch("--<< thread %ld (%p; %s) stopped, blocking on node %p [PE %d] with BQ: ",
+ t->id, t, whatNext_strs[t->what_next], t->block_info.closure,
+ (t->block_info.closure==(StgClosure*)NULL ? 99 : where_is(t->block_info.closure)));
+ if (t->block_info.closure!=(StgClosure*)NULL)
+ print_bq(t->block_info.closure);
+ debugBelch("\n"));
+
+ // ??? needed; should emit block before
+ IF_DEBUG(gran,
+ DumpGranEvent(GR_DESCHEDULE, t));
+ prune_eventq(t, (StgClosure *)NULL); // prune ContinueThreads for t
+ /*
+ ngoq Dogh!
+ ASSERT(procStatus[CurrentProc]==Busy ||
+ ((procStatus[CurrentProc]==Fetching) &&
+ (t->block_info.closure!=(StgClosure*)NULL)));
+ if (run_queue_hds[CurrentProc] == END_TSO_QUEUE &&
+ !(!RtsFlags.GranFlags.DoAsyncFetch &&
+ procStatus[CurrentProc]==Fetching))
+ procStatus[CurrentProc] = Idle;
+ */
+# elif defined(PAR)
+//++PAR++ blockThread() writes the event (change?)
+# endif
+ break;
+
+ case ThreadFinished:
+ break;
+
+ default:
+ barf("parGlobalStats: unknown return code");
+ break;
+ }
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ * Handle a thread that returned to the scheduler with ThreadHeapOverflow
+ * -------------------------------------------------------------------------- */
+
+/*
+ * scheduleHandleHeapOverflow
+ *
+ * cap: capability whose nursery the thread was allocating from.
+ * t:   the thread that stopped with HeapOverflow.
+ *
+ * If the thread merely asked for a large block (rHpAlloc > BLOCK_SIZE)
+ * and the nursery is not nearly full, allocate the block group, splice
+ * it in front of the current nursery block, push the thread back on the
+ * run queue and return rtsFalse (no GC needed).  Otherwise push the
+ * thread back and return rtsTrue: the caller performs the GC at the end
+ * of the schedule() loop.
+ */
+static rtsBool
+scheduleHandleHeapOverflow( Capability *cap, StgTSO *t )
+{
+ // did the task ask for a large block?
+ if (cap->r.rHpAlloc > BLOCK_SIZE) {
+ // if so, get one and push it on the front of the nursery.
+ bdescr *bd;
+ lnat blocks;
+
+ blocks = (lnat)BLOCK_ROUND_UP(cap->r.rHpAlloc) / BLOCK_SIZE;
+
+ IF_DEBUG(scheduler,
+ debugBelch("--<< thread %ld (%s) stopped: requesting a large block (size %ld)\n",
+ (long)t->id, whatNext_strs[t->what_next], blocks));
+
+ // don't do this if the nursery is (nearly) full, we'll GC first.
+ if (cap->r.rCurrentNursery->link != NULL ||
+ cap->r.rNursery->n_blocks == 1) { // paranoia to prevent infinite loop
+ // if the nursery has only one block.
+
+ // NOTE: ACQUIRE/RELEASE_SM_LOCK are statement macros (no ';').
+ ACQUIRE_SM_LOCK
+ bd = allocGroup( blocks );
+ RELEASE_SM_LOCK
+ cap->r.rNursery->n_blocks += blocks;
+
+ // link the new group into the list
+ bd->link = cap->r.rCurrentNursery;
+ bd->u.back = cap->r.rCurrentNursery->u.back;
+ if (cap->r.rCurrentNursery->u.back != NULL) {
+ cap->r.rCurrentNursery->u.back->link = bd;
+ } else {
+#if !defined(THREADED_RTS)
+ ASSERT(g0s0->blocks == cap->r.rCurrentNursery &&
+ g0s0 == cap->r.rNursery);
+#endif
+ cap->r.rNursery->blocks = bd;
+ }
+ cap->r.rCurrentNursery->u.back = bd;
+
+ // initialise it as a nursery block. We initialise the
+ // step, gen_no, and flags field of *every* sub-block in
+ // this large block, because this is easier than making
+ // sure that we always find the block head of a large
+ // block whenever we call Bdescr() (eg. evacuate() and
+ // isAlive() in the GC would both have to do this, at
+ // least).
+ {
+ bdescr *x;
+ for (x = bd; x < bd + blocks; x++) {
+ x->step = cap->r.rNursery;
+ x->gen_no = 0;
+ x->flags = 0;
+ }
+ }
+
+ // This assert can be a killer if the app is doing lots
+ // of large block allocations.
+ IF_DEBUG(sanity, checkNurserySanity(cap->r.rNursery));
+
+ // now update the nursery to point to the new block
+ cap->r.rCurrentNursery = bd;
+
+ // we might be unlucky and have another thread get on the
+ // run queue before us and steal the large block, but in that
+ // case the thread will just end up requesting another large
+ // block.
+ pushOnRunQueue(cap,t);
+ return rtsFalse; /* not actually GC'ing */
+ }
+ }
+
+ IF_DEBUG(scheduler,
+ debugBelch("--<< thread %ld (%s) stopped: HeapOverflow\n",
+ (long)t->id, whatNext_strs[t->what_next]));
+#if defined(GRAN)
+ ASSERT(!is_on_queue(t,CurrentProc));
+#elif defined(PARALLEL_HASKELL)
+ /* Currently we emit a DESCHEDULE event before GC in GUM.
+ ToDo: either add separate event to distinguish SYSTEM time from rest
+ or just nuke this DESCHEDULE (and the following SCHEDULE) */
+ if (0 && RtsFlags.ParFlags.ParStats.Full) {
+ DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC,
+ GR_DESCHEDULE, t, (StgClosure *)NULL, 0, 0);
+ emitSchedule = rtsTrue;
+ }
+#endif
+
+ pushOnRunQueue(cap,t);
+ return rtsTrue;
+ /* actual GC is done at the end of the while loop in schedule() */
+}
+
+/* -----------------------------------------------------------------------------
+ * Handle a thread that returned to the scheduler with ThreadStackOverflow
+ * -------------------------------------------------------------------------- */
+
+/*
+ * scheduleHandleStackOverflow
+ *
+ * Enlarge the stack of thread t (threadStackOverflow may relocate the
+ * TSO), fix up the Task's TSO pointer if t was this task's bound
+ * thread, and push the (possibly new) TSO back on the run queue.
+ */
+static void
+scheduleHandleStackOverflow (Capability *cap, Task *task, StgTSO *t)
+{
+ IF_DEBUG(scheduler,debugBelch("--<< thread %ld (%s) stopped, StackOverflow\n",
+ (long)t->id, whatNext_strs[t->what_next]));
+ /* just adjust the stack for this thread, then pop it back
+ * on the run queue.
+ */
+ {
+ /* enlarge the stack */
+ StgTSO *new_t = threadStackOverflow(cap, t);
+
+ /* The TSO attached to this Task may have moved, so update the
+ * pointer to it.
+ */
+ if (task->tso == t) {
+ task->tso = new_t;
+ }
+ pushOnRunQueue(cap,new_t);
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Handle a thread that returned to the scheduler with ThreadYielding
+ * -------------------------------------------------------------------------- */
+
+/*
+ * scheduleHandleYield
+ *
+ * Handle a thread that stopped with ThreadYielding.  Resets the
+ * context_switch flag and puts t back on the run queue.
+ *
+ * Returns rtsTrue if the thread should be run again immediately
+ * (it only stopped to switch evaluators, what_next changed), in which
+ * case it is NOT put back on the run queue; rtsFalse otherwise.
+ */
+static rtsBool
+scheduleHandleYield( Capability *cap, StgTSO *t, nat prev_what_next )
+{
+ // Reset the context switch flag. We don't do this just before
+ // running the thread, because that would mean we would lose ticks
+ // during GC, which can lead to unfair scheduling (a thread hogs
+ // the CPU because the tick always arrives during GC). This way
+ // penalises threads that do a lot of allocation, but that seems
+ // better than the alternative.
+ context_switch = 0;
+
+ /* put the thread back on the run queue. Then, if we're ready to
+ * GC, check whether this is the last task to stop. If so, wake
+ * up the GC thread. getThread will block during a GC until the
+ * GC is finished.
+ */
+ IF_DEBUG(scheduler,
+ if (t->what_next != prev_what_next) {
+ debugBelch("--<< thread %ld (%s) stopped to switch evaluators\n",
+ (long)t->id, whatNext_strs[t->what_next]);
+ } else {
+ debugBelch("--<< thread %ld (%s) stopped, yielding\n",
+ (long)t->id, whatNext_strs[t->what_next]);
+ }
+ );
+
+ IF_DEBUG(sanity,
+ //debugBelch("&& Doing sanity check on yielding TSO %ld.", t->id);
+ checkTSO(t));
+ ASSERT(t->link == END_TSO_QUEUE);
+
+ // Shortcut if we're just switching evaluators: don't bother
+ // doing stack squeezing (which can be expensive), just run the
+ // thread.
+ if (t->what_next != prev_what_next) {
+ return rtsTrue;
+ }
+
+#if defined(GRAN)
+ ASSERT(!is_on_queue(t,CurrentProc));
+
+ IF_DEBUG(sanity,
+ //debugBelch("&& Doing sanity check on all ThreadQueues (and their TSOs).");
+ checkThreadQsSanity(rtsTrue));
+
+#endif
+
+ addToRunQueue(cap,t);
+
+#if defined(GRAN)
+ /* add a ContinueThread event to actually process the thread */
+ new_event(CurrentProc, CurrentProc, CurrentTime[CurrentProc],
+ ContinueThread,
+ t, (StgClosure*)NULL, (rtsSpark*)NULL);
+ IF_GRAN_DEBUG(bq,
+ debugBelch("GRAN: eventq and runnableq after adding yielded thread to queue again:\n");
+ G_EVENTQ(0);
+ G_CURR_THREADQ(0));
+#endif
+ return rtsFalse;
+}
+
+/* -----------------------------------------------------------------------------
+ * Handle a thread that returned to the scheduler with ThreadBlocked
+ * -------------------------------------------------------------------------- */
+
+/*
+ * scheduleHandleThreadBlocked
+ *
+ * Handle a thread that stopped with ThreadBlocked.  In GRAN/PAR builds
+ * this emits deschedule events and (PAR) calls blockThread(); in the
+ * standard builds there is nothing to do -- the blocked thread has
+ * already tidied its stack and queued itself wherever it needs to be,
+ * so only debug output is produced.
+ */
+static void
+scheduleHandleThreadBlocked( StgTSO *t
+#if !defined(GRAN) && !defined(DEBUG)
+ STG_UNUSED
+#endif
+ )
+{
+#if defined(GRAN)
+ IF_DEBUG(scheduler,
+ debugBelch("--<< thread %ld (%p; %s) stopped, blocking on node %p [PE %d] with BQ: \n",
+ t->id, t, whatNext_strs[t->what_next], t->block_info.closure, (t->block_info.closure==(StgClosure*)NULL ? 99 : where_is(t->block_info.closure)));
+ if (t->block_info.closure!=(StgClosure*)NULL) print_bq(t->block_info.closure));
+
+ // ??? needed; should emit block before
+ IF_DEBUG(gran,
+ DumpGranEvent(GR_DESCHEDULE, t));
+ prune_eventq(t, (StgClosure *)NULL); // prune ContinueThreads for t
+ /*
+ ngoq Dogh!
+ ASSERT(procStatus[CurrentProc]==Busy ||
+ ((procStatus[CurrentProc]==Fetching) &&
+ (t->block_info.closure!=(StgClosure*)NULL)));
+ if (run_queue_hds[CurrentProc] == END_TSO_QUEUE &&
+ !(!RtsFlags.GranFlags.DoAsyncFetch &&
+ procStatus[CurrentProc]==Fetching))
+ procStatus[CurrentProc] = Idle;
+ */
+#elif defined(PAR)
+ IF_DEBUG(scheduler,
+ debugBelch("--<< thread %ld (%p; %s) stopped, blocking on node %p with BQ: \n",
+ t->id, t, whatNext_strs[t->what_next], t->block_info.closure));
+ IF_PAR_DEBUG(bq,
+
+ if (t->block_info.closure!=(StgClosure*)NULL)
+ print_bq(t->block_info.closure));
+
+ /* Send a fetch (if BlockedOnGA) and dump event to log file */
+ blockThread(t);
+
+ /* whatever we schedule next, we must log that schedule */
+ emitSchedule = rtsTrue;
+
+#else /* !GRAN */
+
+ // We don't need to do anything. The thread is blocked, and it
+ // has tidied up its stack and placed itself on whatever queue
+ // it needs to be on.
+
+#if !defined(THREADED_RTS)
+ ASSERT(t->why_blocked != NotBlocked);
+ // This might not be true under THREADED_RTS: we don't have
+ // exclusive access to this TSO, so someone might have
+ // woken it up by now. This actually happens: try
+ // conc023 +RTS -N2.
+#endif
+
+ IF_DEBUG(scheduler,
+ debugBelch("--<< thread %d (%s) stopped: ",
+ t->id, whatNext_strs[t->what_next]);
+ printThreadBlockage(t);
+ debugBelch("\n"));
+
+ /* Only for dumping event to log file
+ ToDo: do I need this in GranSim, too?
+ blockThread(t);
+ */
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ * Handle a thread that returned to the scheduler with ThreadFinished
+ * -------------------------------------------------------------------------- */
+
+/*
+ * scheduleHandleThreadFinished
+ *
+ * Handle a thread that stopped with ThreadFinished (including threads
+ * that killed themselves with an uncaught exception).  If t is the
+ * bound thread of `task`, store the result/status into the Task and
+ * return rtsTrue, which tells schedule() to return to the caller.
+ * Returns rtsFalse when the scheduler should simply carry on.
+ */
+static rtsBool
+scheduleHandleThreadFinished (Capability *cap STG_UNUSED, Task *task, StgTSO *t)
+{
+ /* Need to check whether this was a main thread, and if so,
+ * return with the return value.
+ *
+ * We also end up here if the thread kills itself with an
+ * uncaught exception, see Exception.cmm.
+ */
+ IF_DEBUG(scheduler,debugBelch("--++ thread %d (%s) finished\n",
+ t->id, whatNext_strs[t->what_next]));
+
+#if defined(GRAN)
+ endThread(t, CurrentProc); // clean-up the thread
+#elif defined(PARALLEL_HASKELL)
+ /* For now all are advisory -- HWL */
+ //if(t->priority==AdvisoryPriority) ??
+ advisory_thread_count--; // JB: Caution with this counter, buggy!
+
+# if defined(DIST)
+ if(t->dist.priority==RevalPriority)
+ FinishReval(t);
+# endif
+
+# if defined(EDENOLD)
+ // the thread could still have an outport... (BUG)
+ if (t->eden.outport != -1) {
+ // delete the outport for the tso which has finished...
+ IF_PAR_DEBUG(eden_ports,
+ debugBelch("WARNING: Scheduler removes outport %d for TSO %d.\n",
+ t->eden.outport, t->id));
+ deleteOPT(t);
+ }
+ // thread still in the process (HEAVY BUG! since outport has just been closed...)
+ if (t->eden.epid != -1) {
+ IF_PAR_DEBUG(eden_ports,
+ debugBelch("WARNING: Scheduler removes TSO %d from process %d .\n",
+ t->id, t->eden.epid));
+ removeTSOfromProcess(t);
+ }
+# endif
+
+# if defined(PAR)
+ if (RtsFlags.ParFlags.ParStats.Full &&
+ !RtsFlags.ParFlags.ParStats.Suppressed)
+ DumpEndEvent(CURRENT_PROC, t, rtsFalse /* not mandatory */);
+
+ // t->par only contains statistics: left out for now...
+ IF_PAR_DEBUG(fish,
+ debugBelch("**** end thread: ended sparked thread %d (%lx); sparkname: %lx\n",
+ t->id,t,t->par.sparkname));
+# endif
+#endif // PARALLEL_HASKELL
+
+ //
+ // Check whether the thread that just completed was a bound
+ // thread, and if so return with the result.
+ //
+ // There is an assumption here that all thread completion goes
+ // through this point; we need to make sure that if a thread
+ // ends up in the ThreadKilled state, that it stays on the run
+ // queue so it can be dealt with here.
+ //
+
+ if (t->bound) {
+
+ if (t->bound != task) {
+#if !defined(THREADED_RTS)
+ // Must be a bound thread that is not the topmost one. Leave
+ // it on the run queue until the stack has unwound to the
+ // point where we can deal with this. Leaving it on the run
+ // queue also ensures that the garbage collector knows about
+ // this thread and its return value (it gets dropped from the
+ // all_threads list so there's no other way to find it).
+ appendToRunQueue(cap,t);
+ return rtsFalse;
+#else
+ // this cannot happen in the threaded RTS, because a
+ // bound thread can only be run by the appropriate Task.
+ barf("finished bound thread that isn't mine");
+#endif
+ }
+
+ ASSERT(task->tso == t);
+
+ if (t->what_next == ThreadComplete) {
+ if (task->ret) {
+ // NOTE: return val is tso->sp[1] (see StgStartup.hc)
+ *(task->ret) = (StgClosure *)task->tso->sp[1];
+ }
+ task->stat = Success;
+ } else {
+ if (task->ret) {
+ *(task->ret) = NULL;
+ }
+ if (sched_state >= SCHED_INTERRUPTING) {
+ task->stat = Interrupted;
+ } else {
+ task->stat = Killed;
+ }
+ }
+#ifdef DEBUG
+ removeThreadLabel((StgWord)task->tso->id);
+#endif
+ return rtsTrue; // tells schedule() to return
+ }
+
+ return rtsFalse;
+}
+
+/* -----------------------------------------------------------------------------
+ * Perform a heap census, if PROFILING
+ * -------------------------------------------------------------------------- */
+
+/*
+ * scheduleDoHeapProfile
+ *
+ * PROFILING builds only (no-op otherwise): if a heap census is due --
+ * either requested via performHeapProfile, or on every GC when
+ * +RTS -i0 heap profiling is active -- run a major GC followed by the
+ * census.  Returns rtsTrue when it already GC'd (so the caller must
+ * not GC again), rtsFalse otherwise.
+ */
+static rtsBool
+scheduleDoHeapProfile( rtsBool ready_to_gc STG_UNUSED )
+{
+#if defined(PROFILING)
+ // When we have +RTS -i0 and we're heap profiling, do a census at
+ // every GC. This lets us get repeatable runs for debugging.
+ if (performHeapProfile ||
+ (RtsFlags.ProfFlags.profileInterval==0 &&
+ RtsFlags.ProfFlags.doHeapProfile && ready_to_gc)) {
+
+ // checking black holes is necessary before GC, otherwise
+ // there may be threads that are unreachable except by the
+ // blackhole queue, which the GC will consider to be
+ // deadlocked.
+ scheduleCheckBlackHoles(&MainCapability);
+
+ IF_DEBUG(scheduler, sched_belch("garbage collecting before heap census"));
+ GarbageCollect(GetRoots, rtsTrue);
+
+ IF_DEBUG(scheduler, sched_belch("performing heap census"));
+ heapCensus();
+
+ performHeapProfile = rtsFalse;
+ return rtsTrue; // true <=> we already GC'd
+ }
+#endif
+ return rtsFalse;
+}
+
+/* -----------------------------------------------------------------------------
+ * Perform a garbage collection if necessary
+ * -------------------------------------------------------------------------- */
+
+/*
+ * scheduleDoGC
+ *
+ * Perform a garbage collection.  In THREADED_RTS the calling task first
+ * wins a CAS on the function-static waiting_for_gc flag (losers just
+ * yield their capability until the GC is over), then grabs *all*
+ * capabilities so no Haskell code can be running.  Before the GC it
+ * aborts invalid STM transactions and, when interrupting, deletes all
+ * threads.  Afterwards the stashed capabilities are released.
+ *
+ * cap may be NULL (task has no capability); returns the capability the
+ * task holds afterwards -- NOTE it may differ from the one passed in,
+ * because yieldCapability can migrate the task.
+ */
+static Capability *
+scheduleDoGC (Capability *cap, Task *task USED_IF_THREADS,
+ rtsBool force_major, void (*get_roots)(evac_fn))
+{
+ StgTSO *t;
+#ifdef THREADED_RTS
+ static volatile StgWord waiting_for_gc;
+ rtsBool was_waiting;
+ nat i;
+#endif
+
+#ifdef THREADED_RTS
+ // In order to GC, there must be no threads running Haskell code.
+ // Therefore, the GC thread needs to hold *all* the capabilities,
+ // and release them after the GC has completed.
+ //
+ // This seems to be the simplest way: previous attempts involved
+ // making all the threads with capabilities give up their
+ // capabilities and sleep except for the *last* one, which
+ // actually did the GC. But it's quite hard to arrange for all
+ // the other tasks to sleep and stay asleep.
+ //
+
+ was_waiting = cas(&waiting_for_gc, 0, 1);
+ if (was_waiting) {
+ do {
+ IF_DEBUG(scheduler, sched_belch("someone else is trying to GC..."));
+ if (cap) yieldCapability(&cap,task);
+ } while (waiting_for_gc);
+ return cap; // NOTE: task->cap might have changed here
+ }
+
+ for (i=0; i < n_capabilities; i++) {
+ IF_DEBUG(scheduler, sched_belch("ready_to_gc, grabbing all the capabilies (%d/%d)", i, n_capabilities));
+ if (cap != &capabilities[i]) {
+ Capability *pcap = &capabilities[i];
+ // we better hope this task doesn't get migrated to
+ // another Capability while we're waiting for this one.
+ // It won't, because load balancing happens while we have
+ // all the Capabilities, but even so it's a slightly
+ // unsavoury invariant.
+ task->cap = pcap;
+ context_switch = 1;
+ waitForReturnCapability(&pcap, task);
+ if (pcap != &capabilities[i]) {
+ barf("scheduleDoGC: got the wrong capability");
+ }
+ }
+ }
+
+ waiting_for_gc = rtsFalse;
+#endif
+
+ /* Kick any transactions which are invalid back to their
+ * atomically frames. When next scheduled they will try to
+ * commit, this commit will fail and they will retry.
+ */
+ {
+ StgTSO *next;
+
+ for (t = all_threads; t != END_TSO_QUEUE; t = next) {
+ if (t->what_next == ThreadRelocated) {
+ next = t->link;
+ } else {
+ next = t->global_link;
+ if (t -> trec != NO_TREC && t -> why_blocked == NotBlocked) {
+ if (!stmValidateNestOfTransactions (t -> trec)) {
+ IF_DEBUG(stm, sched_belch("trec %p found wasting its time", t));
+
+ // strip the stack back to the
+ // ATOMICALLY_FRAME, aborting the (nested)
+ // transaction, and saving the stack of any
+ // partially-evaluated thunks on the heap.
+ raiseAsync_(&capabilities[0], t, NULL, rtsTrue, NULL);
+
+#ifdef REG_R1
+ ASSERT(get_itbl((StgClosure *)t->sp)->type == ATOMICALLY_FRAME);
+#endif
+ }
+ }
+ }
+ }
+ }
+
+ // so this happens periodically:
+ if (cap) scheduleCheckBlackHoles(cap);
+
+ IF_DEBUG(scheduler, printAllThreads());
+
+ /*
+ * We now have all the capabilities; if we're in an interrupting
+ * state, then we should take the opportunity to delete all the
+ * threads in the system.
+ */
+ if (sched_state >= SCHED_INTERRUPTING) {
+ deleteAllThreads(&capabilities[0]);
+ sched_state = SCHED_INTERRUPTED;
+ }
+
+ /* everybody back, start the GC.
+ * Could do it in this thread, or signal a condition var
+ * to do it in another thread. Either way, we need to
+ * broadcast on gc_pending_cond afterward.
+ */
+#if defined(THREADED_RTS)
+ IF_DEBUG(scheduler,sched_belch("doing GC"));
+#endif
+ GarbageCollect(get_roots, force_major);
+
+#if defined(THREADED_RTS)
+ // release our stash of capabilities.
+ for (i = 0; i < n_capabilities; i++) {
+ if (cap != &capabilities[i]) {
+ task->cap = &capabilities[i];
+ releaseCapability(&capabilities[i]);
+ }
+ }
+ if (cap) {
+ task->cap = cap;
+ } else {
+ task->cap = NULL;
+ }
+#endif
+
+#if defined(GRAN)
+ /* add a ContinueThread event to continue execution of current thread */
+ new_event(CurrentProc, CurrentProc, CurrentTime[CurrentProc],
+ ContinueThread,
+ t, (StgClosure*)NULL, (rtsSpark*)NULL);
+ IF_GRAN_DEBUG(bq,
+ debugBelch("GRAN: eventq and runnableq after Garbage collection:\n\n");
+ G_EVENTQ(0);
+ G_CURR_THREADQ(0));
+#endif /* GRAN */
+
+ return cap;
+}
+
+/* ---------------------------------------------------------------------------
+ * rtsSupportsBoundThreads(): is the RTS built to support bound threads?
+ * used by Control.Concurrent for error checking.
+ * ------------------------------------------------------------------------- */
+
+StgBool
+rtsSupportsBoundThreads(void)
+{
+#if defined(THREADED_RTS)
+ // Bound threads need a dedicated OS thread per Task, which only
+ // the threaded RTS provides.
+ return rtsTrue;
+#else
+ return rtsFalse;
+#endif
+}
+
+/* ---------------------------------------------------------------------------
+ * isThreadBound(tso): check whether tso is bound to an OS thread.
+ * ------------------------------------------------------------------------- */
+
+StgBool
+isThreadBound(StgTSO* tso USED_IF_THREADS)
+{
+#if defined(THREADED_RTS)
+ // tso->bound is the Task bound to this TSO (set in
+ // scheduleWaitThread()); non-NULL means the thread is bound.
+ return (tso->bound != NULL);
+#endif
+ // Non-threaded RTS: there are no bound threads.
+ return rtsFalse;
+}
+
+/* ---------------------------------------------------------------------------
+ * Singleton fork(). Do not copy any running threads.
+ * ------------------------------------------------------------------------- */
+
+#if !defined(mingw32_HOST_OS)
+#define FORKPROCESS_PRIMOP_SUPPORTED
+#endif
+
+#ifdef FORKPROCESS_PRIMOP_SUPPORTED
+static void
+deleteThread_(Capability *cap, StgTSO *tso);
+#endif
+// forkProcess: implementation of the forkProcess# primop.
+//
+// In the parent we simply return the child's pid. In the child, all
+// OS threads other than the forker are gone, so we must delete every
+// Haskell thread and discard every Task before running the given
+// action. Returns the child pid in the parent; never returns in the
+// child (it exits via stg_exit).
+StgInt
+forkProcess(HsStablePtr *entry
+#ifndef FORKPROCESS_PRIMOP_SUPPORTED
+ STG_UNUSED
+#endif
+ )
+{
+#ifdef FORKPROCESS_PRIMOP_SUPPORTED
+ Task *task;
+ pid_t pid;
+ StgTSO* t,*next;
+ Capability *cap;
+
+#if defined(THREADED_RTS)
+ // Refuse to fork with more than one Capability: we only clean up
+ // the single Capability we hold below.
+ if (RtsFlags.ParFlags.nNodes > 1) {
+ errorBelch("forking not supported with +RTS -N<n> greater than 1");
+ stg_exit(EXIT_FAILURE);
+ }
+#endif
+
+ IF_DEBUG(scheduler,sched_belch("forking!"));
+
+ // ToDo: for SMP, we should probably acquire *all* the capabilities
+ cap = rts_lock();
+
+ pid = fork();
+
+ if (pid) { // parent
+
+ // just return the pid
+ rts_unlock(cap);
+ return pid;
+
+ } else { // child
+
+ // Now, all OS threads except the thread that forked are
+ // stopped. We need to stop all Haskell threads, including
+ // those involved in foreign calls. Also we need to delete
+ // all Tasks, because they correspond to OS threads that are
+ // now gone.
+
+ for (t = all_threads; t != END_TSO_QUEUE; t = next) {
+ if (t->what_next == ThreadRelocated) {
+ // relocated TSOs chain via t->link, not global_link
+ next = t->link;
+ } else {
+ next = t->global_link;
+ // don't allow threads to catch the ThreadKilled
+ // exception, but we do want to raiseAsync() because these
+ // threads may be evaluating thunks that we need later.
+ deleteThread_(cap,t);
+ }
+ }
+
+ // Empty the run queue. It seems tempting to let all the
+ // killed threads stay on the run queue as zombies to be
+ // cleaned up later, but some of them correspond to bound
+ // threads for which the corresponding Task does not exist.
+ cap->run_queue_hd = END_TSO_QUEUE;
+ cap->run_queue_tl = END_TSO_QUEUE;
+
+ // Any suspended C-calling Tasks are no more, their OS threads
+ // don't exist now:
+ cap->suspended_ccalling_tasks = NULL;
+
+ // Empty the all_threads list. Otherwise, the garbage
+ // collector may attempt to resurrect some of these threads.
+ all_threads = END_TSO_QUEUE;
+
+ // Wipe the task list, except the current Task.
+ ACQUIRE_LOCK(&sched_mutex);
+ for (task = all_tasks; task != NULL; task=task->all_link) {
+ if (task != cap->running_task) {
+ discardTask(task);
+ }
+ }
+ RELEASE_LOCK(&sched_mutex);
+
+#if defined(THREADED_RTS)
+ // Wipe our spare workers list, they no longer exist. New
+ // workers will be created if necessary.
+ cap->spare_workers = NULL;
+ cap->returning_tasks_hd = NULL;
+ cap->returning_tasks_tl = NULL;
+#endif
+
+ cap = rts_evalStableIO(cap, entry, NULL); // run the action
+ rts_checkSchedStatus("forkProcess",cap);
+
+ rts_unlock(cap);
+ hs_exit(); // clean up and exit
+ stg_exit(EXIT_SUCCESS);
+ }
+#else /* !FORKPROCESS_PRIMOP_SUPPORTED */
+ barf("forkProcess#: primop not supported on this platform, sorry!\n");
+ return -1;
+#endif
+}
+
+/* ---------------------------------------------------------------------------
+ * Delete all the threads in the system
+ * ------------------------------------------------------------------------- */
+
+// Delete every thread on the all_threads list by sending it an
+// asynchronous ThreadKilled (via deleteThread). Used when shutting
+// down or interrupting the system (see scheduleDoGC).
+static void
+deleteAllThreads ( Capability *cap )
+{
+ StgTSO* t, *next;
+ IF_DEBUG(scheduler,sched_belch("deleting all threads"));
+ for (t = all_threads; t != END_TSO_QUEUE; t = next) {
+ if (t->what_next == ThreadRelocated) {
+ // relocated TSOs chain via t->link; skip to the new TSO
+ next = t->link;
+ } else {
+ next = t->global_link;
+ deleteThread(cap,t);
+ }
+ }
+
+ // The run queue now contains a bunch of ThreadKilled threads. We
+ // must not throw these away: the main thread(s) will be in there
+ // somewhere, and the main scheduler loop has to deal with it.
+ // Also, the run queue is the only thing keeping these threads from
+ // being GC'd, and we don't want the "main thread has been GC'd" panic.
+
+#if !defined(THREADED_RTS)
+ ASSERT(blocked_queue_hd == END_TSO_QUEUE);
+ ASSERT(sleeping_queue == END_TSO_QUEUE);
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ Managing the suspended_ccalling_tasks list.
+ Locks required: sched_mutex
+ -------------------------------------------------------------------------- */
+
+// Push task onto the front of cap's doubly-linked list of suspended
+// C-calling tasks. The caller holds cap->lock (see suspendThread()).
+STATIC_INLINE void
+suspendTask (Capability *cap, Task *task)
+{
+ // task must not already be on any suspended list
+ ASSERT(task->next == NULL && task->prev == NULL);
+ task->next = cap->suspended_ccalling_tasks;
+ task->prev = NULL;
+ if (cap->suspended_ccalling_tasks) {
+ cap->suspended_ccalling_tasks->prev = task;
+ }
+ cap->suspended_ccalling_tasks = task;
+}
+
+// Unlink task from cap's doubly-linked suspended_ccalling_tasks list
+// (inverse of suspendTask); called on return from a safe foreign call.
+STATIC_INLINE void
+recoverSuspendedTask (Capability *cap, Task *task)
+{
+ if (task->prev) {
+ task->prev->next = task->next;
+ } else {
+ // task had no predecessor, so it must be the list head
+ ASSERT(cap->suspended_ccalling_tasks == task);
+ cap->suspended_ccalling_tasks = task->next;
+ }
+ if (task->next) {
+ task->next->prev = task->prev;
+ }
+ task->next = task->prev = NULL;
+}
+
+/* ---------------------------------------------------------------------------
+ * Suspending & resuming Haskell threads.
+ *
+ * When making a "safe" call to C (aka _ccall_GC), the task gives back
+ * its capability before calling the C function. This allows another
+ * task to pick up the capability and carry on running Haskell
+ * threads. It also means that if the C call blocks, it won't lock
+ * the whole system.
+ *
+ * The Haskell thread making the C call is put to sleep for the
+ * duration of the call, on the suspended_ccalling_threads queue. We
+ * give out a token to the task, which it can use to resume the thread
+ * on return from the C function.
+ * ------------------------------------------------------------------------- */
+
+// Called when a Haskell thread makes a "safe" foreign call: park the
+// current TSO, hand the Capability back so other tasks can run, and
+// return an opaque token (the Task) that resumeThread() accepts.
+void *
+suspendThread (StgRegTable *reg)
+{
+ Capability *cap;
+ int saved_errno = errno; // preserve errno across RTS bookkeeping
+ StgTSO *tso;
+ Task *task;
+
+ /* assume that *reg is a pointer to the StgRegTable part of a Capability.
+ */
+ cap = regTableToCapability(reg);
+
+ task = cap->running_task;
+ tso = cap->r.rCurrentTSO;
+
+ IF_DEBUG(scheduler,
+ sched_belch("thread %d did a safe foreign call", cap->r.rCurrentTSO->id));
+
+ // XXX this might not be necessary --SDM
+ tso->what_next = ThreadRunGHC;
+
+ threadPaused(cap,tso);
+
+ if(tso->blocked_exceptions == NULL) {
+ // exceptions currently unblocked: allow async exceptions to be
+ // delivered while blocked on the C call
+ tso->why_blocked = BlockedOnCCall;
+ tso->blocked_exceptions = END_TSO_QUEUE;
+ } else {
+ tso->why_blocked = BlockedOnCCall_NoUnblockExc;
+ }
+
+ // Hand back capability
+ task->suspended_tso = tso;
+
+ ACQUIRE_LOCK(&cap->lock);
+
+ suspendTask(cap,task);
+ cap->in_haskell = rtsFalse;
+ releaseCapability_(cap);
+
+ RELEASE_LOCK(&cap->lock);
+
+#if defined(THREADED_RTS)
+ /* Preparing to leave the RTS, so ensure there's a native thread/task
+ waiting to take over.
+ */
+ IF_DEBUG(scheduler, sched_belch("thread %d: leaving RTS", tso->id));
+#endif
+
+ errno = saved_errno;
+ return task; // opaque token, passed back to resumeThread()
+}
+
+// Re-enter the RTS after a safe foreign call: task_ is the opaque
+// token from suspendThread(). Reacquires a Capability, restores the
+// suspended TSO as the current thread, and returns the register table
+// for the STG machine to continue with.
+StgRegTable *
+resumeThread (void *task_)
+{
+ StgTSO *tso;
+ Capability *cap;
+ int saved_errno = errno; // preserve errno across RTS bookkeeping
+ Task *task = task_;
+
+ cap = task->cap;
+ // Wait for permission to re-enter the RTS with the result.
+ waitForReturnCapability(&cap,task);
+ // we might be on a different capability now... but if so, our
+ // entry on the suspended_ccalling_tasks list will also have been
+ // migrated.
+
+ // Remove the thread from the suspended list
+ recoverSuspendedTask(cap,task);
+
+ tso = task->suspended_tso;
+ task->suspended_tso = NULL;
+ tso->link = END_TSO_QUEUE;
+ IF_DEBUG(scheduler, sched_belch("thread %d: re-entering RTS", tso->id));
+
+ if (tso->why_blocked == BlockedOnCCall) {
+ // wake anything that blocked on us while we were in C land
+ awakenBlockedQueue(cap,tso->blocked_exceptions);
+ tso->blocked_exceptions = NULL;
+ }
+
+ /* Reset blocking status */
+ tso->why_blocked = NotBlocked;
+
+ cap->r.rCurrentTSO = tso;
+ cap->in_haskell = rtsTrue;
+ errno = saved_errno;
+
+ /* We might have GC'd, mark the TSO dirty again */
+ dirtyTSO(tso);
+
+ IF_DEBUG(sanity, checkTSO(tso));
+
+ return &cap->r;
+}
+
+/* ---------------------------------------------------------------------------
+ * Comparing Thread ids.
+ *
+ * This is used from STG land in the implementation of the
+ * instances of Eq/Ord for ThreadIds.
+ * ------------------------------------------------------------------------ */
+
+// Three-way compare of two thread ids; returns -1/0/1 in the usual
+// qsort-style convention. Used by the Eq/Ord instances for ThreadId.
+int
+cmp_thread(StgPtr tso1, StgPtr tso2)
+{
+ StgThreadID id1 = ((StgTSO *)tso1)->id;
+ StgThreadID id2 = ((StgTSO *)tso2)->id;
+
+ if (id1 < id2) return (-1);
+ if (id1 > id2) return 1;
+ return 0;
+}
+
+/* ---------------------------------------------------------------------------
+ * Fetching the ThreadID from an StgTSO.
+ *
+ * This is used in the implementation of Show for ThreadIds.
+ * ------------------------------------------------------------------------ */
+// Return the numeric id of a TSO; used by Show for ThreadId.
+int
+rts_getThreadId(StgPtr tso)
+{
+ return ((StgTSO *)tso)->id;
+}
+
+#ifdef DEBUG
+// Attach a debug label to a thread (DEBUG builds only). The label is
+// copied; updateThreadLabel takes ownership of the copy.
+void
+labelThread(StgPtr tso, char *label)
+{
+ int len;
+ void *buf;
+
+ /* Caveat: Once set, you can only set the thread name to "" */
+ len = strlen(label)+1; // +1 so the NUL terminator is copied too
+ buf = stgMallocBytes(len * sizeof(char), "Schedule.c:labelThread()");
+ strncpy(buf,label,len);
+ /* Update will free the old memory for us */
+ updateThreadLabel(((StgTSO *)tso)->id,buf);
+}
+#endif /* DEBUG */
+
+/* ---------------------------------------------------------------------------
+ Create a new thread.
+
+ The new thread starts with the given stack size. Before the
+ scheduler can run, however, this thread needs to have a closure
+ (and possibly some arguments) pushed on its stack. See
+ pushClosure() in Schedule.h.
+
+ createGenThread() and createIOThread() (in SchedAPI.h) are
+ convenient packaged versions of this function.
+
+ currently pri (priority) is only used in a GRAN setup -- HWL
+ ------------------------------------------------------------------------ */
+// createThread: allocate and initialise a new TSO with the given total
+// size (in words, including the TSO header). The thread is linked onto
+// all_threads but NOT put on a run queue; callers push a closure and
+// then schedule it separately (see scheduleThread / pushClosure).
+#if defined(GRAN)
+/* currently pri (priority) is only used in a GRAN setup -- HWL */
+StgTSO *
+createThread(nat size, StgInt pri)
+#else
+StgTSO *
+createThread(Capability *cap, nat size)
+#endif
+{
+ StgTSO *tso;
+ nat stack_size;
+
+ /* sched_mutex is *not* required */
+
+ /* First check whether we should create a thread at all */
+#if defined(PARALLEL_HASKELL)
+ /* check that no more than RtsFlags.ParFlags.maxThreads threads are created */
+ if (advisory_thread_count >= RtsFlags.ParFlags.maxThreads) {
+ threadsIgnored++;
+ debugBelch("{createThread}Daq ghuH: refusing to create another thread; no more than %d threads allowed (currently %d)\n",
+ RtsFlags.ParFlags.maxThreads, advisory_thread_count);
+ return END_TSO_QUEUE;
+ }
+ threadsCreated++;
+#endif
+
+#if defined(GRAN)
+ ASSERT(!RtsFlags.GranFlags.Light || CurrentProc==0);
+#endif
+
+ // ToDo: check whether size = stack_size - TSO_STRUCT_SIZEW
+
+ /* catch ridiculously small stack sizes */
+ if (size < MIN_STACK_WORDS + TSO_STRUCT_SIZEW) {
+ size = MIN_STACK_WORDS + TSO_STRUCT_SIZEW;
+ }
+
+ // the stack is whatever is left after the TSO struct itself
+ stack_size = size - TSO_STRUCT_SIZEW;
+
+ tso = (StgTSO *)allocateLocal(cap, size);
+ TICK_ALLOC_TSO(stack_size, 0);
+
+ SET_HDR(tso, &stg_TSO_info, CCS_SYSTEM);
+#if defined(GRAN)
+ SET_GRAN_HDR(tso, ThisPE);
+#endif
+
+ // Always start with the compiled code evaluator
+ tso->what_next = ThreadRunGHC;
+
+ tso->why_blocked = NotBlocked;
+ tso->blocked_exceptions = NULL;
+ tso->flags = TSO_DIRTY;
+
+ tso->saved_errno = 0;
+ tso->bound = NULL;
+ tso->cap = cap;
+
+ tso->stack_size = stack_size;
+ tso->max_stack_size = round_to_mblocks(RtsFlags.GcFlags.maxStkSize)
+ - TSO_STRUCT_SIZEW;
+ tso->sp = (P_)&(tso->stack) + stack_size; // stack grows downwards
+
+ tso->trec = NO_TREC;
+
+#ifdef PROFILING
+ tso->prof.CCCS = CCS_MAIN;
+#endif
+
+ /* put a stop frame on the stack */
+ tso->sp -= sizeofW(StgStopFrame);
+ SET_HDR((StgClosure*)tso->sp,(StgInfoTable *)&stg_stop_thread_info,CCS_SYSTEM);
+ tso->link = END_TSO_QUEUE;
+
+ // ToDo: check this
+#if defined(GRAN)
+ /* uses more flexible routine in GranSim */
+ insertThread(tso, CurrentProc);
+#else
+ /* In a non-GranSim setup the pushing of a TSO onto the runq is separated
+ * from its creation
+ */
+#endif
+
+#if defined(GRAN)
+ if (RtsFlags.GranFlags.GranSimStats.Full)
+ DumpGranEvent(GR_START,tso);
+#elif defined(PARALLEL_HASKELL)
+ if (RtsFlags.ParFlags.ParStats.Full)
+ DumpGranEvent(GR_STARTQ,tso);
+ /* HACK to avoid SCHEDULE
+ LastTSO = tso; */
+#endif
+
+ /* Link the new thread on the global thread list.
+ */
+ ACQUIRE_LOCK(&sched_mutex);
+ tso->id = next_thread_id++; // while we have the mutex
+ tso->global_link = all_threads;
+ all_threads = tso;
+ RELEASE_LOCK(&sched_mutex);
+
+#if defined(DIST)
+ tso->dist.priority = MandatoryPriority; //by default that is...
+#endif
+
+#if defined(GRAN)
+ tso->gran.pri = pri;
+# if defined(DEBUG)
+ tso->gran.magic = TSO_MAGIC; // debugging only
+# endif
+ tso->gran.sparkname = 0;
+ tso->gran.startedat = CURRENT_TIME;
+ tso->gran.exported = 0;
+ tso->gran.basicblocks = 0;
+ tso->gran.allocs = 0;
+ tso->gran.exectime = 0;
+ tso->gran.fetchtime = 0;
+ tso->gran.fetchcount = 0;
+ tso->gran.blocktime = 0;
+ tso->gran.blockcount = 0;
+ tso->gran.blockedat = 0;
+ tso->gran.globalsparks = 0;
+ tso->gran.localsparks = 0;
+ if (RtsFlags.GranFlags.Light)
+ tso->gran.clock = Now; /* local clock */
+ else
+ tso->gran.clock = 0;
+
+ IF_DEBUG(gran,printTSO(tso));
+#elif defined(PARALLEL_HASKELL)
+# if defined(DEBUG)
+ tso->par.magic = TSO_MAGIC; // debugging only
+# endif
+ tso->par.sparkname = 0;
+ tso->par.startedat = CURRENT_TIME;
+ tso->par.exported = 0;
+ tso->par.basicblocks = 0;
+ tso->par.allocs = 0;
+ tso->par.exectime = 0;
+ tso->par.fetchtime = 0;
+ tso->par.fetchcount = 0;
+ tso->par.blocktime = 0;
+ tso->par.blockcount = 0;
+ tso->par.blockedat = 0;
+ tso->par.globalsparks = 0;
+ tso->par.localsparks = 0;
+#endif
+
+#if defined(GRAN)
+ globalGranStats.tot_threads_created++;
+ globalGranStats.threads_created_on_PE[CurrentProc]++;
+ globalGranStats.tot_sq_len += spark_queue_len(CurrentProc);
+ globalGranStats.tot_sq_probes++;
+#elif defined(PARALLEL_HASKELL)
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ //debugBelch("Creating thread %d @ %11.2f\n", tso->id, usertime());
+ globalParStats.tot_threads_created++;
+ }
+#endif
+
+#if defined(GRAN)
+ IF_GRAN_DEBUG(pri,
+ sched_belch("==__ schedule: Created TSO %d (%p);",
+ CurrentProc, tso, tso->id));
+#elif defined(PARALLEL_HASKELL)
+ IF_PAR_DEBUG(verbose,
+ sched_belch("==__ schedule: Created TSO %d (%p); %d threads active",
+ (long)tso->id, tso, advisory_thread_count));
+#else
+ IF_DEBUG(scheduler,sched_belch("created thread %ld, stack size = %lx words",
+ (long)tso->id, (long)tso->stack_size));
+#endif
+ return tso;
+}
+
+#if defined(PAR)
+/* RFP:
+ all parallel thread creation calls should fall through the following routine.
+*/
+// Turn a spark into a runnable thread (PAR builds). Returns the new
+// TSO, or END_TSO_QUEUE if the thread limit has been reached.
+// NOTE(review): calls createThread with a single argument, which does
+// not match the visible non-GRAN signature createThread(Capability *,
+// nat) above — presumably PAR builds use a different prototype; verify.
+StgTSO *
+createThreadFromSpark(rtsSpark spark)
+{ StgTSO *tso;
+ ASSERT(spark != (rtsSpark)NULL);
+// JB: TAKE CARE OF THIS COUNTER! BUGGY
+ if (advisory_thread_count >= RtsFlags.ParFlags.maxThreads)
+ { threadsIgnored++;
+ barf("{createSparkThread}Daq ghuH: refusing to create another thread; no more than %d threads allowed (currently %d)",
+ RtsFlags.ParFlags.maxThreads, advisory_thread_count);
+ return END_TSO_QUEUE;
+ }
+ else
+ { threadsCreated++;
+ tso = createThread(RtsFlags.GcFlags.initialStkSize);
+ if (tso==END_TSO_QUEUE)
+ barf("createSparkThread: Cannot create TSO");
+#if defined(DIST)
+ tso->priority = AdvisoryPriority;
+#endif
+ pushClosure(tso,spark);
+ addToRunQueue(tso);
+ advisory_thread_count++; // JB: TAKE CARE OF THIS COUNTER! BUGGY
+ }
+ return tso;
+}
+#endif
+
+/*
+ Turn a spark into a thread.
+ ToDo: fix for SMP (needs to acquire SCHED_MUTEX!)
+*/
+#if 0
+// activateSpark: turn a spark into a thread. Currently compiled out
+// (enclosing #if 0); kept for reference only.
+StgTSO *
+activateSpark (rtsSpark spark)
+{
+ StgTSO *tso;
+
+ tso = createSparkThread(spark);
+ if (RtsFlags.ParFlags.ParStats.Full) {
+ //ASSERT(run_queue_hd == END_TSO_QUEUE); // I think ...
+ IF_PAR_DEBUG(verbose,
+ debugBelch("==^^ activateSpark: turning spark of closure %p (%s) into a thread\n",
+ (StgClosure *)spark, info_type((StgClosure *)spark)));
+ }
+ // ToDo: fwd info on local/global spark to thread -- HWL
+ // tso->gran.exported = spark->exported;
+ // tso->gran.locked = !spark->global;
+ // tso->gran.sparkname = spark->name;
+
+ return tso;
+}
+#endif
+
+/* ---------------------------------------------------------------------------
+ * scheduleThread()
+ *
+ * scheduleThread puts a thread on the end of the runnable queue.
+ * This will usually be done immediately after a thread is created.
+ * The caller of scheduleThread must create the thread using e.g.
+ * createThread and push an appropriate closure
+ * on this thread's stack before the scheduler is invoked.
+ * ------------------------------------------------------------------------ */
+
+// Put tso on the end of cap's run queue, making it runnable.
+void
+scheduleThread(Capability *cap, StgTSO *tso)
+{
+ // The thread goes at the *end* of the run-queue, to avoid possible
+ // starvation of any threads already on the queue.
+ appendToRunQueue(cap,tso);
+}
+
+// Make tso runnable on a specific Capability (cpu modulo the number
+// of Capabilities), pinning it there via TSO_LOCKED. Falls back to
+// the local run queue in non-threaded builds.
+void
+scheduleThreadOn(Capability *cap, StgWord cpu USED_IF_THREADS, StgTSO *tso)
+{
+#if defined(THREADED_RTS)
+ tso->flags |= TSO_LOCKED; // we requested explicit affinity; don't
+ // move this thread from now on.
+ cpu %= RtsFlags.ParFlags.nNodes;
+ if (cpu == cap->no) {
+ // already on the right Capability: just enqueue locally
+ appendToRunQueue(cap,tso);
+ } else {
+ Capability *target_cap = &capabilities[cpu];
+ if (tso->bound) {
+ // keep the bound Task's notion of its Capability in sync
+ tso->bound->cap = target_cap;
+ }
+ tso->cap = target_cap;
+ wakeupThreadOnCapability(target_cap,tso);
+ }
+#else
+ appendToRunQueue(cap,tso);
+#endif
+}
+
+// Bind tso to the current Task, enqueue it, and run the scheduler
+// until the bound thread completes. *ret receives the result (via
+// task->ret); returns the Capability we end up holding.
+Capability *
+scheduleWaitThread (StgTSO* tso, /*[out]*/HaskellObj* ret, Capability *cap)
+{
+ Task *task;
+
+ // We already created/initialised the Task
+ task = cap->running_task;
+
+ // This TSO is now a bound thread; make the Task and TSO
+ // point to each other.
+ tso->bound = task;
+ tso->cap = cap;
+
+ task->tso = tso;
+ task->ret = ret;
+ task->stat = NoStatus;
+
+ appendToRunQueue(cap,tso);
+
+ IF_DEBUG(scheduler, sched_belch("new bound thread (%d)", tso->id));
+
+#if defined(GRAN)
+ /* GranSim specific init */
+ // NOTE(review): `m` is not declared in this function; this GRAN
+ // branch looks stale — confirm before enabling GRAN builds.
+ CurrentTSO = m->tso; // the TSO to run
+ procStatus[MainProc] = Busy; // status of main PE
+ CurrentProc = MainProc; // PE to run it on
+#endif
+
+ cap = schedule(cap,task);
+
+ ASSERT(task->stat != NoStatus);
+ ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
+
+ IF_DEBUG(scheduler, sched_belch("bound thread (%d) finished", task->tso->id));
+ return cap;
+}
+
+/* ----------------------------------------------------------------------------
+ * Starting Tasks
+ * ------------------------------------------------------------------------- */
+
+#if defined(THREADED_RTS)
+// Entry point for a worker OS thread (threaded RTS): pick up the
+// Capability assigned by startWorkerTask(), run the scheduler, then
+// release the Capability and retire the Task on exit.
+void
+workerStart(Task *task)
+{
+ Capability *cap;
+
+ // See startWorkerTask().
+ ACQUIRE_LOCK(&task->lock);
+ cap = task->cap;
+ RELEASE_LOCK(&task->lock);
+
+ // set the thread-local pointer to the Task:
+ taskEnter(task);
+
+ // schedule() runs without a lock.
+ cap = schedule(cap,task);
+
+ // On exit from schedule(), we have a Capability.
+ releaseCapability(cap);
+ taskStop(task);
+}
+#endif
+
+/* ---------------------------------------------------------------------------
+ * initScheduler()
+ *
+ * Initialise the scheduler. This resets all the queues - if the
+ * queues contained any threads, they'll be garbage collected at the
+ * next pass.
+ *
+ * ------------------------------------------------------------------------ */
+
+void
+initScheduler(void)
+{
+#if defined(GRAN)
+ nat i;
+ // NOTE(review): blackhole_queue is indexed here but assigned as a
+ // scalar below (blackhole_queue = END_TSO_QUEUE) — the GRAN branch
+ // looks inconsistent; confirm before enabling GRAN builds.
+ for (i=0; i<=MAX_PROC; i++) {
+ run_queue_hds[i] = END_TSO_QUEUE;
+ run_queue_tls[i] = END_TSO_QUEUE;
+ blocked_queue_hds[i] = END_TSO_QUEUE;
+ blocked_queue_tls[i] = END_TSO_QUEUE;
+ ccalling_threadss[i] = END_TSO_QUEUE;
+ blackhole_queue[i] = END_TSO_QUEUE;
+ sleeping_queue = END_TSO_QUEUE;
+ }
+#elif !defined(THREADED_RTS)
+ blocked_queue_hd = END_TSO_QUEUE;
+ blocked_queue_tl = END_TSO_QUEUE;
+ sleeping_queue = END_TSO_QUEUE;
+#endif
+
+ blackhole_queue = END_TSO_QUEUE;
+ all_threads = END_TSO_QUEUE;
+
+ context_switch = 0;
+ sched_state = SCHED_RUNNING;
+
+ // convert the context-switch interval from time to timer ticks
+ RtsFlags.ConcFlags.ctxtSwitchTicks =
+ RtsFlags.ConcFlags.ctxtSwitchTime / TICK_MILLISECS;
+
+#if defined(THREADED_RTS)
+ /* Initialise the mutex and condition variables used by
+ * the scheduler. */
+ initMutex(&sched_mutex);
+#endif
+
+ ACQUIRE_LOCK(&sched_mutex);
+
+ /* A capability holds the state a native thread needs in
+ * order to execute STG code. At least one capability is
+ * floating around (only THREADED_RTS builds have more than one).
+ */
+ initCapabilities();
+
+ initTaskManager();
+
+#if defined(THREADED_RTS) || defined(PARALLEL_HASKELL)
+ initSparkPools();
+#endif
+
+#if defined(THREADED_RTS)
+ /*
+ * Eagerly start one worker to run each Capability, except for
+ * Capability 0. The idea is that we're probably going to start a
+ * bound thread on Capability 0 pretty soon, so we don't want a
+ * worker task hogging it.
+ */
+ {
+ nat i;
+ Capability *cap;
+ for (i = 1; i < n_capabilities; i++) {
+ cap = &capabilities[i];
+ ACQUIRE_LOCK(&cap->lock);
+ startWorkerTask(cap, workerStart);
+ RELEASE_LOCK(&cap->lock);
+ }
+ }
+#endif
+
+ RELEASE_LOCK(&sched_mutex);
+}
+
+// Shut the scheduler down: kill remaining Haskell threads (via a
+// final GC in the INTERRUPTING state), then shut down every
+// Capability and stop the task manager.
+void
+exitScheduler( void )
+{
+ Task *task = NULL;
+
+#if defined(THREADED_RTS)
+ // we need a bound Task to hold Capabilities during shutdown
+ ACQUIRE_LOCK(&sched_mutex);
+ task = newBoundTask();
+ RELEASE_LOCK(&sched_mutex);
+#endif
+
+ // If we haven't killed all the threads yet, do it now.
+ if (sched_state < SCHED_INTERRUPTED) {
+ sched_state = SCHED_INTERRUPTING;
+ // scheduleDoGC() deletes all threads when interrupting
+ scheduleDoGC(NULL,task,rtsFalse,GetRoots);
+ }
+ sched_state = SCHED_SHUTTING_DOWN;
+
+#if defined(THREADED_RTS)
+ {
+ nat i;
+
+ for (i = 0; i < n_capabilities; i++) {
+ shutdownCapability(&capabilities[i], task);
+ }
+ boundTaskExiting(task);
+ stopTaskManager();
+ }
+#endif
+}
+
+/* ---------------------------------------------------------------------------
+ Where are the roots that we know about?
+
+ - all the threads on the runnable queue
+ - all the threads on the blocked queue
+ - all the threads on the sleeping queue
+ - all the threads currently executing a _ccall_GC
+ - all the "main threads"
+
+ ------------------------------------------------------------------------ */
+
+/* This has to be protected either by the scheduler monitor, or by the
+ garbage collection monitor (probably the latter).
+ KH @ 25/10/99
+*/
+
+// GetRoots: apply evac to every GC root the scheduler knows about:
+// the run/wakeup queues of each Capability, the suspended C-calling
+// TSOs, the (non-threaded) blocked/sleeping queues, the spark pools
+// and the signal handlers.
+void
+GetRoots( evac_fn evac )
+{
+ nat i;
+ Capability *cap;
+ Task *task;
+
+#if defined(GRAN)
+ for (i=0; i<=RtsFlags.GranFlags.proc; i++) {
+ if ((run_queue_hds[i] != END_TSO_QUEUE) && ((run_queue_hds[i] != NULL)))
+ evac((StgClosure **)&run_queue_hds[i]);
+ if ((run_queue_tls[i] != END_TSO_QUEUE) && ((run_queue_tls[i] != NULL)))
+ evac((StgClosure **)&run_queue_tls[i]);
+
+ if ((blocked_queue_hds[i] != END_TSO_QUEUE) && ((blocked_queue_hds[i] != NULL)))
+ evac((StgClosure **)&blocked_queue_hds[i]);
+ if ((blocked_queue_tls[i] != END_TSO_QUEUE) && ((blocked_queue_tls[i] != NULL)))
+ evac((StgClosure **)&blocked_queue_tls[i]);
+ if ((ccalling_threadss[i] != END_TSO_QUEUE) && ((ccalling_threadss[i] != NULL)))
+ evac((StgClosure **)&ccalling_threadss[i]); // fix: was ccalling_threads[i] (undeclared; see initScheduler)
+ }
+
+ markEventQueue();
+
+#else /* !GRAN */
+
+ for (i = 0; i < n_capabilities; i++) {
+ cap = &capabilities[i];
+ evac((StgClosure **)(void *)&cap->run_queue_hd);
+ evac((StgClosure **)(void *)&cap->run_queue_tl);
+#if defined(THREADED_RTS)
+ evac((StgClosure **)(void *)&cap->wakeup_queue_hd);
+ evac((StgClosure **)(void *)&cap->wakeup_queue_tl);
+#endif
+ // TSOs suspended during safe foreign calls are only reachable
+ // from their Task, so mark them explicitly here.
+ for (task = cap->suspended_ccalling_tasks; task != NULL;
+ task=task->next) {
+ IF_DEBUG(scheduler,sched_belch("evac'ing suspended TSO %d", task->suspended_tso->id));
+ evac((StgClosure **)(void *)&task->suspended_tso);
+ }
+
+ }
+
+
+#if !defined(THREADED_RTS)
+ evac((StgClosure **)(void *)&blocked_queue_hd);
+ evac((StgClosure **)(void *)&blocked_queue_tl);
+ evac((StgClosure **)(void *)&sleeping_queue);
+#endif
+#endif
+
+ // evac((StgClosure **)&blackhole_queue);
+
+#if defined(THREADED_RTS) || defined(PARALLEL_HASKELL) || defined(GRAN)
+ markSparkQueue(evac);
+#endif
+
+#if defined(RTS_USER_SIGNALS)
+ // mark the signal handlers (signals should be already blocked)
+ markSignalHandlers(evac);
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ performGC
+
+ This is the interface to the garbage collector from Haskell land.
+ We provide this so that external C code can allocate and garbage
+ collect when called from Haskell via _ccall_GC.
+
+ It might be useful to provide an interface whereby the programmer
+ can specify more roots (ToDo).
+
+ This needs to be protected by the GC condition variable above. KH.
+ -------------------------------------------------------------------------- */
+
+static void (*extra_roots)(evac_fn);
+
+// Common implementation for performGC / performMajorGC /
+// performGCWithRoots: enter the scheduler's GC path, creating a
+// temporary bound Task if the caller doesn't already have one.
+static void
+performGC_(rtsBool force_major, void (*get_roots)(evac_fn))
+{
+ Task *task = myTask();
+
+ if (task == NULL) {
+ // not called from a known Task: make one for the duration
+ ACQUIRE_LOCK(&sched_mutex);
+ task = newBoundTask();
+ RELEASE_LOCK(&sched_mutex);
+ scheduleDoGC(NULL,task,force_major, get_roots);
+ boundTaskExiting(task);
+ } else {
+ scheduleDoGC(NULL,task,force_major, get_roots);
+ }
+}
+
+// External entry point: request a (minor) garbage collection.
+void
+performGC(void)
+{
+ performGC_(rtsFalse, GetRoots);
+}
+
+// External entry point: request a major garbage collection.
+void
+performMajorGC(void)
+{
+ performGC_(rtsTrue, GetRoots);
+}
+
+// Root-marking function combining the scheduler's roots with the
+// user-supplied extra_roots (set by performGCWithRoots).
+static void
+AllRoots(evac_fn evac)
+{
+ GetRoots(evac); // the scheduler's roots
+ extra_roots(evac); // the user's roots
+}
+
+// External entry point: GC with additional user-supplied roots.
+// Stashes get_roots in the file-scope extra_roots for AllRoots().
+void
+performGCWithRoots(void (*get_roots)(evac_fn))
+{
+ extra_roots = get_roots;
+ performGC_(rtsFalse, AllRoots);
+}
+
+/* -----------------------------------------------------------------------------
+ Stack overflow
+
+ If the thread has reached its maximum stack size, then raise the
+ StackOverflow exception in the offending thread. Otherwise
+ relocate the TSO into a larger chunk of memory and adjust its stack
+ size appropriately.
+ -------------------------------------------------------------------------- */
+
+// Handle a stack overflow on tso: either raise StackOverflow in the
+// thread (if it is already at its maximum stack size) or copy it into
+// a larger TSO, leaving the old one marked ThreadRelocated with a
+// forwarding link. Returns the TSO to continue with.
+static StgTSO *
+threadStackOverflow(Capability *cap, StgTSO *tso)
+{
+ nat new_stack_size, stack_words;
+ lnat new_tso_size;
+ StgPtr new_sp;
+ StgTSO *dest;
+
+ IF_DEBUG(sanity,checkTSO(tso));
+ if (tso->stack_size >= tso->max_stack_size) {
+
+ IF_DEBUG(gc,
+ debugBelch("@@ threadStackOverflow of TSO %ld (%p): stack too large (now %ld; max is %ld)\n",
+ (long)tso->id, tso, (long)tso->stack_size, (long)tso->max_stack_size);
+ /* If we're debugging, just print out the top of the stack */
+ printStackChunk(tso->sp, stg_min(tso->stack+tso->stack_size,
+ tso->sp+64)));
+
+ /* Send this thread the StackOverflow exception */
+ raiseAsync(cap, tso, (StgClosure *)stackOverflow_closure);
+ return tso;
+ }
+
+ /* Try to double the current stack size. If that takes us over the
+ * maximum stack size for this thread, then use the maximum instead.
+ * Finally round up so the TSO ends up as a whole number of blocks.
+ */
+ new_stack_size = stg_min(tso->stack_size * 2, tso->max_stack_size);
+ new_tso_size = (lnat)BLOCK_ROUND_UP(new_stack_size * sizeof(W_) +
+ TSO_STRUCT_SIZE)/sizeof(W_);
+ new_tso_size = round_to_mblocks(new_tso_size); /* Be MBLOCK-friendly */
+ new_stack_size = new_tso_size - TSO_STRUCT_SIZEW;
+
+ IF_DEBUG(scheduler, sched_belch("increasing stack size from %ld words to %d.\n", (long)tso->stack_size, new_stack_size));
+
+ dest = (StgTSO *)allocate(new_tso_size);
+ TICK_ALLOC_TSO(new_stack_size,0);
+
+ /* copy the TSO block and the old stack into the new area */
+ memcpy(dest,tso,TSO_STRUCT_SIZE);
+ stack_words = tso->stack + tso->stack_size - tso->sp;
+ // place the live stack at the *top* of the new, larger area
+ new_sp = (P_)dest + new_tso_size - stack_words;
+ memcpy(new_sp, tso->sp, stack_words * sizeof(W_));
+
+ /* relocate the stack pointers... */
+ dest->sp = new_sp;
+ dest->stack_size = new_stack_size;
+
+ /* Mark the old TSO as relocated. We have to check for relocated
+ * TSOs in the garbage collector and any primops that deal with TSOs.
+ *
+ * It's important to set the sp value to just beyond the end
+ * of the stack, so we don't attempt to scavenge any part of the
+ * dead TSO's stack.
+ */
+ tso->what_next = ThreadRelocated;
+ tso->link = dest;
+ tso->sp = (P_)&(tso->stack[tso->stack_size]);
+ tso->why_blocked = NotBlocked;
+
+ IF_PAR_DEBUG(verbose,
+ debugBelch("@@ threadStackOverflow of TSO %d (now at %p): stack size increased to %ld\n",
+ tso->id, tso, tso->stack_size);
+ /* If we're debugging, just print out the top of the stack */
+ printStackChunk(tso->sp, stg_min(tso->stack+tso->stack_size,
+ tso->sp+64)));
+
+ IF_DEBUG(sanity,checkTSO(tso));
+#if 0
+ IF_DEBUG(scheduler,printTSO(dest));
+#endif
+
+ return dest;
+}
+
+/* ---------------------------------------------------------------------------
+ Wake up a queue that was blocked on some resource.
+ ------------------------------------------------------------------------ */
+
+// unblockCount: bookkeeping performed when a blocking-queue element
+// is woken. The GRAN variant is a no-op; the PARALLEL_HASKELL variant
+// logs a RESUME event and charges block/fetch time to the TSO based
+// on the closure type it was blocked on.
+#if defined(GRAN)
+STATIC_INLINE void
+unblockCount ( StgBlockingQueueElement *bqe, StgClosure *node )
+{
+}
+#elif defined(PARALLEL_HASKELL)
+STATIC_INLINE void
+unblockCount ( StgBlockingQueueElement *bqe, StgClosure *node )
+{
+ /* write RESUME events to log file and
+ update blocked and fetch time (depending on type of the orig closure) */
+ if (RtsFlags.ParFlags.ParStats.Full) {
+ DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC,
+ GR_RESUMEQ, ((StgTSO *)bqe), ((StgTSO *)bqe)->block_info.closure,
+ 0, 0 /* spark_queue_len(ADVISORY_POOL) */);
+ if (emptyRunQueue())
+ emitSchedule = rtsTrue;
+
+ switch (get_itbl(node)->type) {
+ case FETCH_ME_BQ:
+ // blocked on a remote fetch: charge the wait to fetchtime
+ ((StgTSO *)bqe)->par.fetchtime += CURRENT_TIME-((StgTSO *)bqe)->par.blockedat;
+ break;
+ case RBH:
+ case FETCH_ME:
+ case BLACKHOLE_BQ:
+ // blocked on evaluation: charge the wait to blocktime
+ ((StgTSO *)bqe)->par.blocktime += CURRENT_TIME-((StgTSO *)bqe)->par.blockedat;
+ break;
+#ifdef DIST
+ case MVAR:
+ break;
+#endif
+ default:
+ barf("{unblockOne}Daq Qagh: unexpected closure in blocking queue");
+ }
+ }
+}
+#endif
+
+#if defined(GRAN)
+// unblockOne (GRAN variant): wake one element of a blocking queue by
+// posting a ResumeThread (local) or UnblockThread (remote) event.
+// NOTE(review): declared to return StgBlockingQueueElement * but no
+// return statement is present — falling off the end is undefined
+// behaviour if callers use the result; confirm the intended value
+// (likely the next queue element) before enabling GRAN builds.
+StgBlockingQueueElement *
+unblockOne(StgBlockingQueueElement *bqe, StgClosure *node)
+{
+ StgTSO *tso;
+ PEs node_loc, tso_loc;
+
+ node_loc = where_is(node); // should be lifted out of loop
+ tso = (StgTSO *)bqe; // wastes an assignment to get the type right
+ tso_loc = where_is((StgClosure *)tso);
+ if (IS_LOCAL_TO(PROCS(node),tso_loc)) { // TSO is local
+ /* !fake_fetch => TSO is on CurrentProc is same as IS_LOCAL_TO */
+ ASSERT(CurrentProc!=node_loc || tso_loc==CurrentProc);
+ CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.lunblocktime;
+ // insertThread(tso, node_loc);
+ new_event(tso_loc, tso_loc, CurrentTime[CurrentProc],
+ ResumeThread,
+ tso, node, (rtsSpark*)NULL);
+ tso->link = END_TSO_QUEUE; // overwrite link just to be sure
+ // len_local++;
+ // len++;
+ } else { // TSO is remote (actually should be FMBQ)
+ CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.mpacktime +
+ RtsFlags.GranFlags.Costs.gunblocktime +
+ RtsFlags.GranFlags.Costs.latency;
+ new_event(tso_loc, CurrentProc, CurrentTime[CurrentProc],
+ UnblockThread,
+ tso, node, (rtsSpark*)NULL);
+ tso->link = END_TSO_QUEUE; // overwrite link just to be sure
+ // len++;
+ }
+ /* the thread-queue-overhead is accounted for in either Resume or UnblockThread */
+ IF_GRAN_DEBUG(bq,
+ debugBelch(" %s TSO %d (%p) [PE %d] (block_info.closure=%p) (next=%p) ,",
+ (node_loc==tso_loc ? "Local" : "Global"),
+ tso->id, tso, CurrentProc, tso->block_info.closure, tso->link));
+ tso->block_info.closure = NULL;
+ IF_DEBUG(scheduler,debugBelch("-- Waking up thread %ld (%p)\n",
+ tso->id, tso));
+}
+#elif defined(PARALLEL_HASKELL)
/* Wake one element of a blocking queue (PARALLEL_HASKELL variant).
 *
 * A TSO goes back on the run queue; a BLOCKED_FETCH goes onto the
 * PendingFetches list.  Returns the successor of 'bqe' in the queue.
 *
 * NOTE(review): in the DEBUG CONSTR case (an RBHSave closure) and, in a
 * non-DEBUG build, for any unexpected closure type, 'next' is returned
 * uninitialized — presumably callers stop walking before that happens;
 * verify against awakenBlockedQueue's loop condition.
 */
StgBlockingQueueElement *
unblockOne(StgBlockingQueueElement *bqe, StgClosure *node)
{
    StgBlockingQueueElement *next;

    switch (get_itbl(bqe)->type) {
    case TSO:
        ASSERT(((StgTSO *)bqe)->why_blocked != NotBlocked);
        /* if it's a TSO just push it onto the run_queue */
        next = bqe->link;
        ((StgTSO *)bqe)->link = END_TSO_QUEUE; // debugging?
        APPEND_TO_RUN_QUEUE((StgTSO *)bqe);
        threadRunnable();
        unblockCount(bqe, node);
        /* reset blocking status after dumping event */
        ((StgTSO *)bqe)->why_blocked = NotBlocked;
        break;

    case BLOCKED_FETCH:
        /* if it's a BLOCKED_FETCH put it on the PendingFetches list */
        next = bqe->link;
        bqe->link = (StgBlockingQueueElement *)PendingFetches;
        PendingFetches = (StgBlockedFetch *)bqe;
        break;

# if defined(DEBUG)
        /* can ignore this case in a non-debugging setup;
           see comments on RBHSave closures above */
    case CONSTR:
        /* check that the closure is an RBHSave closure */
        ASSERT(get_itbl((StgClosure *)bqe) == &stg_RBH_Save_0_info ||
               get_itbl((StgClosure *)bqe) == &stg_RBH_Save_1_info ||
               get_itbl((StgClosure *)bqe) == &stg_RBH_Save_2_info);
        break;

    default:
        barf("{unblockOne}Daq Qagh: Unexpected IP (%#lx; %s) in blocking queue at %#lx\n",
             get_itbl((StgClosure *)bqe), info_type((StgClosure *)bqe),
             (StgClosure *)bqe);
# endif
    }
    IF_PAR_DEBUG(bq, debugBelch(", %p (%s)\n", bqe, info_type((StgClosure*)bqe)));
    return next;
}
+#endif
+
/* Wake a single blocked TSO and make it runnable.
 *
 * Unlinks 'tso' from whatever chain it heads (returning its successor so
 * callers can walk a queue), clears its blocked status, and puts it on a
 * run queue.  In the threaded RTS the thread is either migrated to the
 * current Capability (if it already lives here, or it is unlocked and
 * wakeupMigrate is set) or handed back to the Capability it was last on.
 */
StgTSO *
unblockOne(Capability *cap, StgTSO *tso)
{
    StgTSO *next;

    ASSERT(get_itbl(tso)->type == TSO);
    ASSERT(tso->why_blocked != NotBlocked);

    // detach from the queue before requeueing
    tso->why_blocked = NotBlocked;
    next = tso->link;
    tso->link = END_TSO_QUEUE;

#if defined(THREADED_RTS)
    if (tso->cap == cap || (!tsoLocked(tso) && RtsFlags.ParFlags.wakeupMigrate)) {
        // We are waking up this thread on the current Capability, which
        // might involve migrating it from the Capability it was last on.
        if (tso->bound) {
            // keep the bound-task record consistent with the migration
            ASSERT(tso->bound->cap == tso->cap);
            tso->bound->cap = cap;
        }
        tso->cap = cap;
        appendToRunQueue(cap,tso);
        // we're holding a newly woken thread, make sure we context switch
        // quickly so we can migrate it if necessary.
        context_switch = 1;
    } else {
        // we'll try to wake it up on the Capability it was last on.
        wakeupThreadOnCapability(tso->cap, tso);
    }
#else
    appendToRunQueue(cap,tso);
    context_switch = 1;
#endif

    IF_DEBUG(scheduler,sched_belch("waking up thread %ld on cap %d", (long)tso->id, tso->cap->no));
    return next;
}
+
+
+#if defined(GRAN)
/* Wake an entire blocking queue (GranSim variant).
 *
 * Walks the queue rooted at 'q', waking each TSO via unblockOne.  If the
 * node was (in simulation terms) on another PE, a "fake fetch" first
 * adds the current PE to the node's bitmask.  If the queue belonged to
 * an RBH, the trailing RBHSave closure's payload is copied back into the
 * node's header.  Gathers queue-length statistics when enabled.
 */
void
awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node)
{
    StgBlockingQueueElement *bqe;
    PEs node_loc;
    nat len = 0;

    IF_GRAN_DEBUG(bq,
                  debugBelch("##-_ AwBQ for node %p on PE %d @ %ld by TSO %d (%p): \n", \
                             node, CurrentProc, CurrentTime[CurrentProc],
                             CurrentTSO->id, CurrentTSO));

    node_loc = where_is(node);

    ASSERT(q == END_BQ_QUEUE ||
           get_itbl(q)->type == TSO ||   // q is either a TSO or an RBHSave
           get_itbl(q)->type == CONSTR); // closure (type constructor)
    ASSERT(is_unique(node));

    /* FAKE FETCH: magically copy the node to the tso's proc;
       no Fetch necessary because in reality the node should not have been
       moved to the other PE in the first place
    */
    if (CurrentProc!=node_loc) {
        IF_GRAN_DEBUG(bq,
                      debugBelch("## node %p is on PE %d but CurrentProc is %d (TSO %d); assuming fake fetch and adjusting bitmask (old: %#x)\n",
                                 node, node_loc, CurrentProc, CurrentTSO->id,
                                 // CurrentTSO, where_is(CurrentTSO),
                                 node->header.gran.procs));
        node->header.gran.procs = (node->header.gran.procs) | PE_NUMBER(CurrentProc);
        IF_GRAN_DEBUG(bq,
                      debugBelch("## new bitmask of node %p is %#x\n",
                                 node, node->header.gran.procs));
        if (RtsFlags.GranFlags.GranSimStats.Global) {
            globalGranStats.tot_fake_fetches++;
        }
    }

    bqe = q;
    // ToDo: check: ASSERT(CurrentProc==node_loc);
    while (get_itbl(bqe)->type==TSO) { // q != END_TSO_QUEUE) {
        //next = bqe->link;
        /*
          bqe points to the current element in the queue
          next points to the next element in the queue
        */
        //tso = (StgTSO *)bqe; // wastes an assignment to get the type right
        //tso_loc = where_is(tso);
        len++;
        bqe = unblockOne(bqe, node);
    }

    /* if this is the BQ of an RBH, we have to put back the info ripped out of
       the closure to make room for the anchor of the BQ */
    if (bqe!=END_BQ_QUEUE) {
        ASSERT(get_itbl(node)->type == RBH && get_itbl(bqe)->type == CONSTR);
        /*
          ASSERT((info_ptr==&RBH_Save_0_info) ||
          (info_ptr==&RBH_Save_1_info) ||
          (info_ptr==&RBH_Save_2_info));
        */
        /* cf. convertToRBH in RBH.c for writing the RBHSave closure */
        ((StgRBH *)node)->blocking_queue = (StgBlockingQueueElement *)((StgRBHSave *)bqe)->payload[0];
        ((StgRBH *)node)->mut_link       = (StgMutClosure *)((StgRBHSave *)bqe)->payload[1];

        IF_GRAN_DEBUG(bq,
                      debugBelch("## Filled in RBH_Save for %p (%s) at end of AwBQ\n",
                                 node, info_type(node)));
    }

    /* statistics gathering */
    if (RtsFlags.GranFlags.GranSimStats.Global) {
        // globalGranStats.tot_bq_processing_time += bq_processing_time;
        globalGranStats.tot_bq_len += len;      // total length of all bqs awakened
        // globalGranStats.tot_bq_len_local += len_local; // same for local TSOs only
        globalGranStats.tot_awbq++;             // total no. of bqs awakened
    }
    IF_GRAN_DEBUG(bq,
                  debugBelch("## BQ Stats of %p: [%d entries] %s\n",
                             node, len, (bqe!=END_BQ_QUEUE) ? "RBH" : ""));
}
+#elif defined(PARALLEL_HASKELL)
/* Wake an entire blocking queue (PARALLEL_HASKELL variant).
 *
 * Walks the queue rooted at 'q', waking each TSO/BLOCKED_FETCH via
 * unblockOne; stops at the first element of any other type (END_BQ_QUEUE
 * or an RBHSave CONSTR).
 */
void
awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node)
{
    StgBlockingQueueElement *bqe;

    IF_PAR_DEBUG(verbose,
                 debugBelch("##-_ AwBQ for node %p on [%x]: \n",
                            node, mytid));
#ifdef DIST
    //RFP
    if(get_itbl(q)->type == CONSTR || q==END_BQ_QUEUE) {
        IF_PAR_DEBUG(verbose, debugBelch("## ... nothing to unblock so lets just return. RFP (BUG?)\n"));
        return;
    }
#endif

    ASSERT(q == END_BQ_QUEUE ||
           get_itbl(q)->type == TSO ||
           get_itbl(q)->type == BLOCKED_FETCH ||
           get_itbl(q)->type == CONSTR);

    bqe = q;
    while (get_itbl(bqe)->type==TSO ||
           get_itbl(bqe)->type==BLOCKED_FETCH) {
        bqe = unblockOne(bqe, node);
    }
}
+
+#else /* !GRAN && !PARALLEL_HASKELL */
+
+void
+awakenBlockedQueue(Capability *cap, StgTSO *tso)
+{
+ if (tso == NULL) return; // hack; see bug #1235728, and comments in
+ // Exception.cmm
+ while (tso != END_TSO_QUEUE) {
+ tso = unblockOne(cap,tso);
+ }
+}
+#endif
+
+/* ---------------------------------------------------------------------------
+ Interrupt execution
+ - usually called inside a signal handler so it mustn't do anything fancy.
+ ------------------------------------------------------------------------ */
+
/* Ask the RTS to stop running Haskell threads.
 *
 * Usually called from a signal handler (see the comment block above), so
 * it only sets flags — sched_state first, then the context-switch flag —
 * and, in the threaded RTS, prods all Capabilities so sleeping workers
 * notice the change.
 */
void
interruptStgRts(void)
{
    sched_state = SCHED_INTERRUPTING;
    context_switch = 1;
#if defined(THREADED_RTS)
    prodAllCapabilities();
#endif
}
+
+/* -----------------------------------------------------------------------------
+ Unblock a thread
+
+ This is for use when we raise an exception in another thread, which
+ may be blocked.
+ This has nothing to do with the UnblockThread event in GranSim. -- HWL
+ -------------------------------------------------------------------------- */
+
+#if defined(GRAN) || defined(PARALLEL_HASKELL)
+/*
+ NB: only the type of the blocking queue is different in GranSim and GUM
+ the operations on the queue-elements are the same
+ long live polymorphism!
+
+ Locks: sched_mutex is held upon entry and exit.
+
+*/
/* Remove 'tso' from whatever blocking queue it is on and make it
 * runnable (GranSim/PARALLEL_HASKELL variant, operating on
 * StgBlockingQueueElement queues).
 *
 * Each case does an in-place unlink from the relevant queue (MVar,
 * blackhole BQ, exception queue, I/O queues, sleeping queue), then falls
 * to 'done:', which resets the TSO's blocking state and pushes it on the
 * run queue.  Locks: sched_mutex is held upon entry and exit (see the
 * comment block above).
 */
static void
unblockThread(Capability *cap, StgTSO *tso)
{
    StgBlockingQueueElement *t, **last;

    switch (tso->why_blocked) {

    case NotBlocked:
        return;  /* not blocked */

    case BlockedOnSTM:
        // Be careful: nothing to do here!  We tell the scheduler that the thread
        // is runnable and we leave it to the stack-walking code to abort the
        // transaction while unwinding the stack.  We should perhaps have a debugging
        // test to make sure that this really happens and that the 'zombie' transaction
        // does not get committed.
        goto done;

    case BlockedOnMVar:
        ASSERT(get_itbl(tso->block_info.closure)->type == MVAR);
        {
            // unlink from the MVar's waiting list, fixing up 'tail' if we
            // removed the last element
            StgBlockingQueueElement *last_tso = END_BQ_QUEUE;
            StgMVar *mvar = (StgMVar *)(tso->block_info.closure);

            last = (StgBlockingQueueElement **)&mvar->head;
            for (t = (StgBlockingQueueElement *)mvar->head;
                 t != END_BQ_QUEUE;
                 last = &t->link, last_tso = t, t = t->link) {
                if (t == (StgBlockingQueueElement *)tso) {
                    *last = (StgBlockingQueueElement *)tso->link;
                    if (mvar->tail == tso) {
                        mvar->tail = (StgTSO *)last_tso;
                    }
                    goto done;
                }
            }
            barf("unblockThread (MVAR): TSO not found");
        }

    case BlockedOnBlackHole:
        ASSERT(get_itbl(tso->block_info.closure)->type == BLACKHOLE_BQ);
        {
            // unlink from the blackhole's blocking queue
            StgBlockingQueue *bq = (StgBlockingQueue *)(tso->block_info.closure);

            last = &bq->blocking_queue;
            for (t = bq->blocking_queue;
                 t != END_BQ_QUEUE;
                 last = &t->link, t = t->link) {
                if (t == (StgBlockingQueueElement *)tso) {
                    *last = (StgBlockingQueueElement *)tso->link;
                    goto done;
                }
            }
            barf("unblockThread (BLACKHOLE): TSO not found");
        }

    case BlockedOnException:
        {
            // unlink from the target thread's blocked_exceptions queue,
            // following ThreadRelocated forwarding first
            StgTSO *target  = tso->block_info.tso;

            ASSERT(get_itbl(target)->type == TSO);

            if (target->what_next == ThreadRelocated) {
                target = target->link;
                ASSERT(get_itbl(target)->type == TSO);
            }

            ASSERT(target->blocked_exceptions != NULL);

            last = (StgBlockingQueueElement **)&target->blocked_exceptions;
            for (t = (StgBlockingQueueElement *)target->blocked_exceptions;
                 t != END_BQ_QUEUE;
                 last = &t->link, t = t->link) {
                ASSERT(get_itbl(t)->type == TSO);
                if (t == (StgBlockingQueueElement *)tso) {
                    *last = (StgBlockingQueueElement *)tso->link;
                    goto done;
                }
            }
            barf("unblockThread (Exception): TSO not found");
        }

    case BlockedOnRead:
    case BlockedOnWrite:
#if defined(mingw32_HOST_OS)
    case BlockedOnDoProc:
#endif
        {
            /* take TSO off blocked_queue */
            StgBlockingQueueElement *prev = NULL;
            for (t = (StgBlockingQueueElement *)blocked_queue_hd; t != END_BQ_QUEUE;
                 prev = t, t = t->link) {
                if (t == (StgBlockingQueueElement *)tso) {
                    if (prev == NULL) {
                        blocked_queue_hd = (StgTSO *)t->link;
                        if ((StgBlockingQueueElement *)blocked_queue_tl == t) {
                            blocked_queue_tl = END_TSO_QUEUE;
                        }
                    } else {
                        prev->link = t->link;
                        if ((StgBlockingQueueElement *)blocked_queue_tl == t) {
                            blocked_queue_tl = (StgTSO *)prev;
                        }
                    }
#if defined(mingw32_HOST_OS)
                    /* (Cooperatively) signal that the worker thread should abort
                     * the request.
                     */
                    abandonWorkRequest(tso->block_info.async_result->reqID);
#endif
                    goto done;
                }
            }
            barf("unblockThread (I/O): TSO not found");
        }

    case BlockedOnDelay:
        {
            /* take TSO off sleeping_queue */
            StgBlockingQueueElement *prev = NULL;
            for (t = (StgBlockingQueueElement *)sleeping_queue; t != END_BQ_QUEUE;
                 prev = t, t = t->link) {
                if (t == (StgBlockingQueueElement *)tso) {
                    if (prev == NULL) {
                        sleeping_queue = (StgTSO *)t->link;
                    } else {
                        prev->link = t->link;
                    }
                    goto done;
                }
            }
            barf("unblockThread (delay): TSO not found");
        }

    default:
        barf("unblockThread");
    }

done:
    // common exit: clear blocking state and make the thread runnable
    tso->link = END_TSO_QUEUE;
    tso->why_blocked = NotBlocked;
    tso->block_info.closure = NULL;
    pushOnRunQueue(cap,tso);
}
+#else
/* Remove 'tso' from whatever blocking queue it is on and make it
 * runnable (standard RTS variant).
 *
 * Each case unlinks the TSO in place from the relevant queue (MVar
 * waiting list, global blackhole queue, a target thread's
 * blocked_exceptions, or — in the non-threaded RTS — the I/O and
 * sleeping queues), then jumps to 'done:', which clears the blocking
 * state, appends the TSO to this Capability's run queue, and migrates
 * the TSO (and its bound task, if any) to this Capability.
 */
static void
unblockThread(Capability *cap, StgTSO *tso)
{
    StgTSO *t, **last;

    /* To avoid locking unnecessarily. */
    if (tso->why_blocked == NotBlocked) {
        return;
    }

    switch (tso->why_blocked) {

    case BlockedOnSTM:
        // Be careful: nothing to do here!  We tell the scheduler that the thread
        // is runnable and we leave it to the stack-walking code to abort the
        // transaction while unwinding the stack.  We should perhaps have a debugging
        // test to make sure that this really happens and that the 'zombie' transaction
        // does not get committed.
        goto done;

    case BlockedOnMVar:
        ASSERT(get_itbl(tso->block_info.closure)->type == MVAR);
        {
            // unlink from the MVar's waiting list, fixing up 'tail' if we
            // removed the last element
            StgTSO *last_tso = END_TSO_QUEUE;
            StgMVar *mvar = (StgMVar *)(tso->block_info.closure);

            last = &mvar->head;
            for (t = mvar->head; t != END_TSO_QUEUE;
                 last = &t->link, last_tso = t, t = t->link) {
                if (t == tso) {
                    *last = tso->link;
                    if (mvar->tail == tso) {
                        mvar->tail = last_tso;
                    }
                    goto done;
                }
            }
            barf("unblockThread (MVAR): TSO not found");
        }

    case BlockedOnBlackHole:
        {
            // unlink from the global blackhole queue
            last = &blackhole_queue;
            for (t = blackhole_queue; t != END_TSO_QUEUE;
                 last = &t->link, t = t->link) {
                if (t == tso) {
                    *last = tso->link;
                    goto done;
                }
            }
            barf("unblockThread (BLACKHOLE): TSO not found");
        }

    case BlockedOnException:
        {
            // unlink from the target thread's blocked_exceptions queue,
            // following any chain of ThreadRelocated forwardings first
            StgTSO *target  = tso->block_info.tso;

            ASSERT(get_itbl(target)->type == TSO);

            while (target->what_next == ThreadRelocated) {
                target = target->link;
                ASSERT(get_itbl(target)->type == TSO);
            }

            ASSERT(target->blocked_exceptions != NULL);

            last = &target->blocked_exceptions;
            for (t = target->blocked_exceptions; t != END_TSO_QUEUE;
                 last = &t->link, t = t->link) {
                ASSERT(get_itbl(t)->type == TSO);
                if (t == tso) {
                    *last = tso->link;
                    goto done;
                }
            }
            barf("unblockThread (Exception): TSO not found");
        }

#if !defined(THREADED_RTS)
    case BlockedOnRead:
    case BlockedOnWrite:
#if defined(mingw32_HOST_OS)
    case BlockedOnDoProc:
#endif
        {
            // take TSO off blocked_queue, maintaining the tail pointer
            StgTSO *prev = NULL;
            for (t = blocked_queue_hd; t != END_TSO_QUEUE;
                 prev = t, t = t->link) {
                if (t == tso) {
                    if (prev == NULL) {
                        blocked_queue_hd = t->link;
                        if (blocked_queue_tl == t) {
                            blocked_queue_tl = END_TSO_QUEUE;
                        }
                    } else {
                        prev->link = t->link;
                        if (blocked_queue_tl == t) {
                            blocked_queue_tl = prev;
                        }
                    }
#if defined(mingw32_HOST_OS)
                    /* (Cooperatively) signal that the worker thread should abort
                     * the request.
                     */
                    abandonWorkRequest(tso->block_info.async_result->reqID);
#endif
                    goto done;
                }
            }
            barf("unblockThread (I/O): TSO not found");
        }

    case BlockedOnDelay:
        {
            // take TSO off sleeping_queue
            StgTSO *prev = NULL;
            for (t = sleeping_queue; t != END_TSO_QUEUE;
                 prev = t, t = t->link) {
                if (t == tso) {
                    if (prev == NULL) {
                        sleeping_queue = t->link;
                    } else {
                        prev->link = t->link;
                    }
                    goto done;
                }
            }
            barf("unblockThread (delay): TSO not found");
        }
#endif

    default:
        barf("unblockThread");
    }

done:
    // common exit: clear blocking state and make the thread runnable here
    tso->link = END_TSO_QUEUE;
    tso->why_blocked = NotBlocked;
    tso->block_info.closure = NULL;
    appendToRunQueue(cap,tso);

    // We might have just migrated this TSO to our Capability:
    if (tso->bound) {
        tso->bound->cap = cap;
    }
    tso->cap = cap;
}
+#endif
+
+/* -----------------------------------------------------------------------------
+ * checkBlackHoles()
+ *
+ * Check the blackhole_queue for threads that can be woken up. We do
+ * this periodically: before every GC, and whenever the run queue is
+ * empty.
+ *
+ * An elegant solution might be to just wake up all the blocked
+ * threads with awakenBlockedQueue occasionally: they'll go back to
+ * sleep again if the object is still a BLACKHOLE. Unfortunately this
+ * doesn't give us a way to tell whether we've actually managed to
+ * wake up any threads, so we would be busy-waiting.
+ *
+ * -------------------------------------------------------------------------- */
+
/* Walk the global blackhole_queue, waking any thread whose blackhole has
 * since been updated (i.e. its block_info closure is no longer a
 * BLACKHOLE/CAF_BLACKHOLE).  Woken threads are spliced out of the queue
 * via the 'prev' back-pointer and unblockOne's returned successor.
 * Returns rtsTrue iff at least one thread was woken.  Caller must hold
 * sched_mutex (asserted below).
 */
static rtsBool
checkBlackHoles (Capability *cap)
{
    StgTSO **prev, *t;
    rtsBool any_woke_up = rtsFalse;
    StgHalfWord type;

    // blackhole_queue is global:
    ASSERT_LOCK_HELD(&sched_mutex);

    IF_DEBUG(scheduler, sched_belch("checking threads blocked on black holes"));

    // ASSUMES: sched_mutex
    prev = &blackhole_queue;
    t = blackhole_queue;
    while (t != END_TSO_QUEUE) {
        ASSERT(t->why_blocked == BlockedOnBlackHole);
        type = get_itbl(t->block_info.closure)->type;
        if (type != BLACKHOLE && type != CAF_BLACKHOLE) {
            // closure has been updated: wake the thread and splice it out
            IF_DEBUG(sanity,checkTSO(t));
            t = unblockOne(cap, t);
            // urk, the threads migrate to the current capability
            // here, but we'd like to keep them on the original one.
            *prev = t;
            any_woke_up = rtsTrue;
        } else {
            prev = &t->link;
            t = t->link;
        }
    }

    return any_woke_up;
}
+
+/* -----------------------------------------------------------------------------
+ * raiseAsync()
+ *
+ * The following function implements the magic for raising an
+ * asynchronous exception in an existing thread.
+ *
+ * We first remove the thread from any queue on which it might be
+ * blocked. The possible blockages are MVARs and BLACKHOLE_BQs.
+ *
+ * We strip the stack down to the innermost CATCH_FRAME, building
+ * thunks in the heap for all the active computations, so they can
+ * be restarted if necessary. When we reach a CATCH_FRAME, we build
+ * an application of the handler to the exception, and push it on
+ * the top of the stack.
+ *
+ * How exactly do we save all the active computations? We create an
+ * AP_STACK for every UpdateFrame on the stack. Entering one of these
+ * AP_STACKs pushes everything from the corresponding update frame
+ * upwards onto the stack. (Actually, it pushes everything up to the
 * next update frame plus a pointer to the next AP_STACK object.)
+ * Entering the next AP_STACK object pushes more onto the stack until we
+ * reach the last AP_STACK object - at which point the stack should look
+ * exactly as it did when we killed the TSO and we can continue
+ * execution by entering the closure on top of the stack.
+ *
+ * We can also kill a thread entirely - this happens if either (a) the
+ * exception passed to raiseAsync is NULL, or (b) there's no
+ * CATCH_FRAME on the stack. In either case, we strip the entire
+ * stack and replace the thread with a zombie.
+ *
+ * ToDo: in THREADED_RTS mode, this function is only safe if either
+ * (a) we hold all the Capabilities (eg. in GC, or if there is only
+ * one Capability), or (b) we own the Capability that the TSO is
+ * currently blocked on or on the run queue of.
+ *
+ * -------------------------------------------------------------------------- */
+
/* Raise the given exception asynchronously in 'tso' (see the comment
 * block above for the full description).  A NULL 'exception' kills the
 * thread outright.  Thin wrapper over raiseAsync_ with no atomically
 * stop-point and no stop_here limit.
 */
void
raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception)
{
    raiseAsync_(cap, tso, exception, rtsFalse, NULL);
}
+
/* Strip 'tso's stack down to (but not past) 'stop_here', saving the
 * suspended computation in AP_STACK closures, without raising any
 * exception (NULL is passed as the exception to raiseAsync_).
 */
void
suspendComputation(Capability *cap, StgTSO *tso, StgPtr stop_here)
{
    raiseAsync_(cap, tso, NULL, rtsFalse, stop_here);
}
+
/* Worker for raiseAsync/suspendComputation; see the long comment block
 * above for the overall algorithm.  Walks the stack frame by frame:
 * UPDATE_FRAMEs are converted into AP_STACK closures (freezing the
 * computation), a CATCH_FRAME receives the exception, a STOP_FRAME kills
 * the thread, and ATOMICALLY/CATCH_RETRY frames condemn or abort the
 * enclosing STM transaction.  Stops early at 'stop_here' if non-NULL.
 */
static void
raiseAsync_(Capability *cap, StgTSO *tso, StgClosure *exception,
            rtsBool stop_at_atomically, StgPtr stop_here)
{
    StgRetInfoTable *info;
    StgPtr sp, frame;
    nat i;

    // Thread already dead?
    if (tso->what_next == ThreadComplete || tso->what_next == ThreadKilled) {
        return;
    }

    IF_DEBUG(scheduler,
             sched_belch("raising exception in thread %ld.", (long)tso->id));

    // Remove it from any blocking queues
    unblockThread(cap,tso);

    // mark it dirty; we're about to change its stack.
    dirtyTSO(tso);

    sp = tso->sp;

    // The stack freezing code assumes there's a closure pointer on
    // the top of the stack, so we have to arrange that this is the case...
    //
    if (sp[0] == (W_)&stg_enter_info) {
        sp++;
    } else {
        sp--;
        sp[0] = (W_)&stg_dummy_ret_closure;
    }

    frame = sp + 1;
    while (stop_here == NULL || frame < stop_here) {

        // 1. Let the top of the stack be the "current closure"
        //
        // 2. Walk up the stack until we find either an UPDATE_FRAME or a
        // CATCH_FRAME.
        //
        // 3. If it's an UPDATE_FRAME, then make an AP_STACK containing the
        // current closure applied to the chunk of stack up to (but not
        // including) the update frame.  This closure becomes the "current
        // closure".  Go back to step 2.
        //
        // 4. If it's a CATCH_FRAME, then leave the exception handler on
        // top of the stack applied to the exception.
        //
        // 5. If it's a STOP_FRAME, then kill the thread.
        //
        // NB: if we pass an ATOMICALLY_FRAME then abort the associated
        // transaction

        info = get_ret_itbl((StgClosure *)frame);

        switch (info->i.type) {

        case UPDATE_FRAME:
        {
            StgAP_STACK * ap;
            nat words;

            // First build an AP_STACK consisting of the stack chunk above the
            // current update frame, with the top word on the stack as the
            // fun field.
            //
            words = frame - sp - 1;
            ap = (StgAP_STACK *)allocateLocal(cap,AP_STACK_sizeW(words));

            ap->size = words;
            ap->fun  = (StgClosure *)sp[0];
            sp++;
            for(i=0; i < (nat)words; ++i) {
                ap->payload[i] = (StgClosure *)*sp++;
            }

            SET_HDR(ap,&stg_AP_STACK_info,
                    ((StgClosure *)frame)->header.prof.ccs /* ToDo */);
            TICK_ALLOC_UP_THK(words+1,0);

            IF_DEBUG(scheduler,
                     debugBelch("sched: Updating ");
                     printPtr((P_)((StgUpdateFrame *)frame)->updatee);
                     debugBelch(" with ");
                     printObj((StgClosure *)ap);
                );

            // Replace the updatee with an indirection
            //
            // Warning: if we're in a loop, more than one update frame on
            // the stack may point to the same object.  Be careful not to
            // overwrite an IND_OLDGEN in this case, because we'll screw
            // up the mutable lists.  To be on the safe side, don't
            // overwrite any kind of indirection at all.  See also
            // threadSqueezeStack in GC.c, where we have to make a similar
            // check.
            //
            if (!closure_IND(((StgUpdateFrame *)frame)->updatee)) {
                // revert the black hole
                UPD_IND_NOLOCK(((StgUpdateFrame *)frame)->updatee,
                               (StgClosure *)ap);
            }
            sp += sizeofW(StgUpdateFrame) - 1;
            sp[0] = (W_)ap; // push onto stack
            frame = sp + 1;
            continue; //no need to bump frame
        }

        case STOP_FRAME:
            // We've stripped the entire stack, the thread is now dead.
            tso->what_next = ThreadKilled;
            tso->sp = frame + sizeofW(StgStopFrame);
            return;

        case CATCH_FRAME:
            // If we find a CATCH_FRAME, and we've got an exception to raise,
            // then build the THUNK raise(exception), and leave it on
            // top of the CATCH_FRAME ready to enter.
            //
        {
#ifdef PROFILING
            StgCatchFrame *cf = (StgCatchFrame *)frame;
#endif
            StgThunk *raise;

            if (exception == NULL) break;

            // we've got an exception to raise, so let's pass it to the
            // handler in this frame.
            //
            raise = (StgThunk *)allocateLocal(cap,sizeofW(StgThunk)+1);
            TICK_ALLOC_SE_THK(1,0);
            SET_HDR(raise,&stg_raise_info,cf->header.prof.ccs);
            raise->payload[0] = exception;

            // throw away the stack from Sp up to the CATCH_FRAME.
            //
            sp = frame - 1;

            /* Ensure that async exceptions are blocked now, so we don't get
             * a surprise exception before we get around to executing the
             * handler.
             */
            if (tso->blocked_exceptions == NULL) {
                tso->blocked_exceptions = END_TSO_QUEUE;
            }

            /* Put the newly-built THUNK on top of the stack, ready to execute
             * when the thread restarts.
             */
            sp[0] = (W_)raise;
            sp[-1] = (W_)&stg_enter_info;
            tso->sp = sp-1;
            tso->what_next = ThreadRunGHC;
            IF_DEBUG(sanity, checkTSO(tso));
            return;
        }

        case ATOMICALLY_FRAME:
            if (stop_at_atomically) {
                ASSERT(stmGetEnclosingTRec(tso->trec) == NO_TREC);
                stmCondemnTransaction(cap, tso -> trec);
#ifdef REG_R1
                tso->sp = frame;
#else
                // R1 is not a register: the return convention for IO in
                // this case puts the return value on the stack, so we
                // need to set up the stack to return to the atomically
                // frame properly...
                tso->sp = frame - 2;
                tso->sp[1] = (StgWord) &stg_NO_FINALIZER_closure; // why not?
                tso->sp[0] = (StgWord) &stg_ut_1_0_unreg_info;
#endif
                tso->what_next = ThreadRunGHC;
                return;
            }
            // Not stop_at_atomically... fall through and abort the
            // transaction.
            /* fallthrough */

        case CATCH_RETRY_FRAME:
            // IF we find an ATOMICALLY_FRAME then we abort the
            // current transaction and propagate the exception.  In
            // this case (unlike ordinary exceptions) we do not care
            // whether the transaction is valid or not because its
            // possible validity cannot have caused the exception
            // and will not be visible after the abort.
            IF_DEBUG(stm,
                     debugBelch("Found atomically block delivering async exception\n"));
            StgTRecHeader *trec = tso -> trec;
            StgTRecHeader *outer = stmGetEnclosingTRec(trec);
            stmAbortTransaction(cap, trec);
            tso -> trec = outer;
            break;

        default:
            break;
        }

        // move on to the next stack frame
        frame += stack_frame_sizeW((StgClosure *)frame);
    }

    // if we got here, then we stopped at stop_here
    ASSERT(stop_here != NULL);
}
+
+/* -----------------------------------------------------------------------------
+ Deleting threads
+
+ This is used for interruption (^C) and forking, and corresponds to
+ raising an exception but without letting the thread catch the
+ exception.
+ -------------------------------------------------------------------------- */
+
+static void
+deleteThread (Capability *cap, StgTSO *tso)
+{
+ if (tso->why_blocked != BlockedOnCCall &&
+ tso->why_blocked != BlockedOnCCall_NoUnblockExc) {
+ raiseAsync(cap,tso,NULL);
+ }
+}
+
+#ifdef FORKPROCESS_PRIMOP_SUPPORTED
+static void
+deleteThread_(Capability *cap, StgTSO *tso)
+{ // for forkProcess only:
+ // like deleteThread(), but we delete threads in foreign calls, too.
+
+ if (tso->why_blocked == BlockedOnCCall ||
+ tso->why_blocked == BlockedOnCCall_NoUnblockExc) {
+ unblockOne(cap,tso);
+ tso->what_next = ThreadKilled;
+ } else {
+ deleteThread(cap,tso);
+ }
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ raiseExceptionHelper
+
   This function is called by the raise# primitive, just so that we can
   move some of the tricky bits of raising an exception from C-- into
   C.  Who knows, it might be a useful reusable thing here too.
+ -------------------------------------------------------------------------- */
+
/* Helper for the raise# primitive (see comment block above).
 *
 * Walks 'tso's stack looking for a frame that handles the exception;
 * every UPDATE_FRAME passed on the way has its updatee overwritten with
 * an indirection to a shared 'raise# exception' thunk (built lazily, at
 * most once).  Leaves tso->sp pointing at the frame found and returns
 * its frame type (ATOMICALLY_FRAME / CATCH_FRAME / CATCH_STM_FRAME /
 * STOP_FRAME) so the C-- caller can dispatch on it.
 */
StgWord
raiseExceptionHelper (StgRegTable *reg, StgTSO *tso, StgClosure *exception)
{
    Capability *cap = regTableToCapability(reg);
    StgThunk *raise_closure = NULL;
    StgPtr p, next;
    StgRetInfoTable *info;
    //
    // This closure represents the expression 'raise# E' where E
    // is the exception raise.  It is used to overwrite all the
    // thunks which are currently under evaluation.
    //

    // OLD COMMENT (we don't have MIN_UPD_SIZE now):
    // LDV profiling: stg_raise_info has THUNK as its closure
    // type. Since a THUNK takes at least MIN_UPD_SIZE words in its
    // payload, MIN_UPD_SIZE is more appropriate than 1.  It seems that
    // 1 does not cause any problem unless profiling is performed.
    // However, when LDV profiling goes on, we need to linearly scan
    // small object pool, where raise_closure is stored, so we should
    // use MIN_UPD_SIZE.
    //
    // raise_closure = (StgClosure *)RET_STGCALL1(P_,allocate,
    //                                 sizeofW(StgClosure)+1);
    //

    //
    // Walk up the stack, looking for the catch frame.  On the way,
    // we update any closures pointed to from update frames with the
    // raise closure that we just built.
    //
    p = tso->sp;
    while(1) {
        info = get_ret_itbl((StgClosure *)p);
        next = p + stack_frame_sizeW((StgClosure *)p);
        switch (info->i.type) {

        case UPDATE_FRAME:
            // Only create raise_closure if we need to.
            if (raise_closure == NULL) {
                raise_closure =
                    (StgThunk *)allocateLocal(cap,sizeofW(StgThunk)+1);
                SET_HDR(raise_closure, &stg_raise_info, CCCS);
                raise_closure->payload[0] = exception;
            }
            UPD_IND(((StgUpdateFrame *)p)->updatee,(StgClosure *)raise_closure);
            p = next;
            continue;

        case ATOMICALLY_FRAME:
            IF_DEBUG(stm, debugBelch("Found ATOMICALLY_FRAME at %p\n", p));
            tso->sp = p;
            return ATOMICALLY_FRAME;

        case CATCH_FRAME:
            tso->sp = p;
            return CATCH_FRAME;

        case CATCH_STM_FRAME:
            IF_DEBUG(stm, debugBelch("Found CATCH_STM_FRAME at %p\n", p));
            tso->sp = p;
            return CATCH_STM_FRAME;

        case STOP_FRAME:
            tso->sp = p;
            return STOP_FRAME;

        case CATCH_RETRY_FRAME:
        default:
            p = next;
            continue;
        }
    }
}
+
+
+/* -----------------------------------------------------------------------------
+ findRetryFrameHelper
+
+ This function is called by the retry# primitive. It traverses the stack
+ leaving tso->sp referring to the frame which should handle the retry.
+
+ This should either be a CATCH_RETRY_FRAME (if the retry# is within an orElse#)
+ or should be a ATOMICALLY_FRAME (if the retry# reaches the top level).
+
+ We skip CATCH_STM_FRAMEs because retries are not considered to be exceptions,
+ despite the similar implementation.
+
+ We should not expect to see CATCH_FRAME or STOP_FRAME because those should
+ not be created within memory transactions.
+ -------------------------------------------------------------------------- */
+
+StgWord
+findRetryFrameHelper (StgTSO *tso)
+{
+ StgPtr p, next;
+ StgRetInfoTable *info;
+
+ p = tso -> sp;
+ while (1) {
+ info = get_ret_itbl((StgClosure *)p);
+ next = p + stack_frame_sizeW((StgClosure *)p);
+ switch (info->i.type) {
+
+ case ATOMICALLY_FRAME:
+ IF_DEBUG(stm, debugBelch("Found ATOMICALLY_FRAME at %p during retrry\n", p));
+ tso->sp = p;
+ return ATOMICALLY_FRAME;
+
+ case CATCH_RETRY_FRAME:
+ IF_DEBUG(stm, debugBelch("Found CATCH_RETRY_FRAME at %p during retrry\n", p));
+ tso->sp = p;
+ return CATCH_RETRY_FRAME;
+
+ case CATCH_STM_FRAME:
+ default:
+ ASSERT(info->i.type != CATCH_FRAME);
+ ASSERT(info->i.type != STOP_FRAME);
+ p = next;
+ continue;
+ }
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ resurrectThreads is called after garbage collection on the list of
+ threads found to be garbage. Each of these threads will be woken
+ up and sent a signal: BlockedOnDeadMVar if the thread was blocked
+ on an MVar, or NonTermination if the thread was blocked on a Black
+ Hole.
+
+ Locks: assumes we hold *all* the capabilities.
+ -------------------------------------------------------------------------- */
+
/* Called after GC with the list of threads found to be garbage (see the
 * comment block above).  Each is relinked onto all_threads and woken on
 * the Capability it was last on with an appropriate exception:
 * BlockedOnDeadMVar for MVar/exception blockage, NonTermination for a
 * black hole, BlockedIndefinitely for STM.  Locks: assumes we hold
 * *all* the capabilities.
 */
void
resurrectThreads (StgTSO *threads)
{
    StgTSO *tso, *next;
    Capability *cap;

    for (tso = threads; tso != END_TSO_QUEUE; tso = next) {
        next = tso->global_link;
        // put it back on the global thread list
        tso->global_link = all_threads;
        all_threads = tso;
        IF_DEBUG(scheduler, sched_belch("resurrecting thread %d", tso->id));

        // Wake up the thread on the Capability it was last on
        cap = tso->cap;

        switch (tso->why_blocked) {
        case BlockedOnMVar:
        case BlockedOnException:
            /* Called by GC - sched_mutex lock is currently held. */
            raiseAsync(cap, tso,(StgClosure *)BlockedOnDeadMVar_closure);
            break;
        case BlockedOnBlackHole:
            raiseAsync(cap, tso,(StgClosure *)NonTermination_closure);
            break;
        case BlockedOnSTM:
            raiseAsync(cap, tso,(StgClosure *)BlockedIndefinitely_closure);
            break;
        case NotBlocked:
            /* This might happen if the thread was blocked on a black hole
             * belonging to a thread that we've just woken up (raiseAsync
             * can wake up threads, remember...).
             */
            continue;
        default:
            barf("resurrectThreads: thread blocked in a strange way");
        }
    }
}
+
+/* ----------------------------------------------------------------------------
+ * Debugging: why is a thread blocked
+ * [Also provides useful information when debugging threaded programs
+ * at the Haskell source code level, so enable outside of DEBUG. --sof 7/02]
+ ------------------------------------------------------------------------- */
+
+#if DEBUG
/* Debug helper: print (without newline) a human-readable description of
 * why 'tso' is blocked, based on tso->why_blocked and the matching
 * member of tso->block_info.
 */
static void
printThreadBlockage(StgTSO *tso)
{
    switch (tso->why_blocked) {
    case BlockedOnRead:
        debugBelch("is blocked on read from fd %d", (int)(tso->block_info.fd));
        break;
    case BlockedOnWrite:
        debugBelch("is blocked on write to fd %d", (int)(tso->block_info.fd));
        break;
#if defined(mingw32_HOST_OS)
    case BlockedOnDoProc:
        debugBelch("is blocked on proc (request: %ld)", tso->block_info.async_result->reqID);
        break;
#endif
    case BlockedOnDelay:
        debugBelch("is blocked until %ld", (long)(tso->block_info.target));
        break;
    case BlockedOnMVar:
        debugBelch("is blocked on an MVar @ %p", tso->block_info.closure);
        break;
    case BlockedOnException:
        debugBelch("is blocked on delivering an exception to thread %d",
                   tso->block_info.tso->id);
        break;
    case BlockedOnBlackHole:
        debugBelch("is blocked on a black hole");
        break;
    case NotBlocked:
        debugBelch("is not blocked");
        break;
#if defined(PARALLEL_HASKELL)
    case BlockedOnGA:
        debugBelch("is blocked on global address; local FM_BQ is %p (%s)",
                   tso->block_info.closure, info_type(tso->block_info.closure));
        break;
    case BlockedOnGA_NoSend:
        debugBelch("is blocked on global address (no send); local FM_BQ is %p (%s)",
                   tso->block_info.closure, info_type(tso->block_info.closure));
        break;
#endif
    case BlockedOnCCall:
        debugBelch("is blocked on an external call");
        break;
    case BlockedOnCCall_NoUnblockExc:
        debugBelch("is blocked on an external call (exceptions were already blocked)");
        break;
    case BlockedOnSTM:
        debugBelch("is blocked on an STM operation");
        break;
    default:
        barf("printThreadBlockage: strange tso->why_blocked: %d for TSO %d (%d)",
             tso->why_blocked, tso->id, tso);
    }
}
+
+void
+printThreadStatus(StgTSO *t)
+{
+ debugBelch("\tthread %4d @ %p ", t->id, (void *)t);
+ {
+ void *label = lookupThreadLabel(t->id);
+ if (label) debugBelch("[\"%s\"] ",(char *)label);
+ }
+ if (t->what_next == ThreadRelocated) {
+ debugBelch("has been relocated...\n");
+ } else {
+ switch (t->what_next) {
+ case ThreadKilled:
+ debugBelch("has been killed");
+ break;
+ case ThreadComplete:
+ debugBelch("has completed");
+ break;
+ default:
+ printThreadBlockage(t);
+ }
+ debugBelch("\n");
+ }
+}
+
+void
+printAllThreads(void)
+{
+ StgTSO *t, *next;
+ nat i;
+ Capability *cap;
+
+# if defined(GRAN)
+ char time_string[TIME_STR_LEN], node_str[NODE_STR_LEN];
+ ullong_format_string(TIME_ON_PROC(CurrentProc),
+ time_string, rtsFalse/*no commas!*/);
+
+ debugBelch("all threads at [%s]:\n", time_string);
+# elif defined(PARALLEL_HASKELL)
+ char time_string[TIME_STR_LEN], node_str[NODE_STR_LEN];
+ ullong_format_string(CURRENT_TIME,
+ time_string, rtsFalse/*no commas!*/);
+
+ debugBelch("all threads at [%s]:\n", time_string);
+# else
+ debugBelch("all threads:\n");
+# endif
+
+ for (i = 0; i < n_capabilities; i++) {
+ cap = &capabilities[i];
+ debugBelch("threads on capability %d:\n", cap->no);
+ for (t = cap->run_queue_hd; t != END_TSO_QUEUE; t = t->link) {
+ printThreadStatus(t);
+ }
+ }
+
+ debugBelch("other threads:\n");
+ for (t = all_threads; t != END_TSO_QUEUE; t = next) {
+ if (t->why_blocked != NotBlocked) {
+ printThreadStatus(t);
+ }
+ if (t->what_next == ThreadRelocated) {
+ next = t->link;
+ } else {
+ next = t->global_link;
+ }
+ }
+}
+
+// useful from gdb
+void
+printThreadQueue(StgTSO *t)
+{
+ nat i = 0;
+ for (; t != END_TSO_QUEUE; t = t->link) {
+ printThreadStatus(t);
+ i++;
+ }
+ debugBelch("%d threads on queue\n", i);
+}
+
+/*
+ Print a whole blocking queue attached to node (debugging only).
+*/
+# if defined(PARALLEL_HASKELL)
+void
+print_bq (StgClosure *node)
+{
+ StgBlockingQueueElement *bqe;
+ StgTSO *tso;
+ rtsBool end;
+
+ debugBelch("## BQ of closure %p (%s): ",
+ node, info_type(node));
+
+ /* should cover all closures that may have a blocking queue */
+ ASSERT(get_itbl(node)->type == BLACKHOLE_BQ ||
+ get_itbl(node)->type == FETCH_ME_BQ ||
+ get_itbl(node)->type == RBH ||
+ get_itbl(node)->type == MVAR);
+
+ ASSERT(node!=(StgClosure*)NULL); // sanity check
+
+ print_bqe(((StgBlockingQueue*)node)->blocking_queue);
+}
+
+/*
+ Print a whole blocking queue starting with the element bqe.
+*/
+void
+print_bqe (StgBlockingQueueElement *bqe)
+{
+ rtsBool end;
+
+ /*
+ NB: In a parallel setup a BQ of an RBH must end with an RBH_Save closure;
+ */
+ for (end = (bqe==END_BQ_QUEUE);
+ !end; // iterate until bqe points to a CONSTR
+ end = (get_itbl(bqe)->type == CONSTR) || (bqe->link==END_BQ_QUEUE),
+ bqe = end ? END_BQ_QUEUE : bqe->link) {
+ ASSERT(bqe != END_BQ_QUEUE); // sanity check
+ ASSERT(bqe != (StgBlockingQueueElement *)NULL); // sanity check
+ /* types of closures that may appear in a blocking queue */
+ ASSERT(get_itbl(bqe)->type == TSO ||
+ get_itbl(bqe)->type == BLOCKED_FETCH ||
+ get_itbl(bqe)->type == CONSTR);
+ /* only BQs of an RBH end with an RBH_Save closure */
+ //ASSERT(get_itbl(bqe)->type != CONSTR || get_itbl(node)->type == RBH);
+
+ switch (get_itbl(bqe)->type) {
+ case TSO:
+ debugBelch(" TSO %u (%x),",
+ ((StgTSO *)bqe)->id, ((StgTSO *)bqe));
+ break;
+ case BLOCKED_FETCH:
+ debugBelch(" BF (node=%p, ga=((%x, %d, %x)),",
+ ((StgBlockedFetch *)bqe)->node,
+ ((StgBlockedFetch *)bqe)->ga.payload.gc.gtid,
+ ((StgBlockedFetch *)bqe)->ga.payload.gc.slot,
+ ((StgBlockedFetch *)bqe)->ga.weight);
+ break;
+ case CONSTR:
+ debugBelch(" %s (IP %p),",
+ (get_itbl(bqe) == &stg_RBH_Save_0_info ? "RBH_Save_0" :
+ get_itbl(bqe) == &stg_RBH_Save_1_info ? "RBH_Save_1" :
+ get_itbl(bqe) == &stg_RBH_Save_2_info ? "RBH_Save_2" :
+ "RBH_Save_?"), get_itbl(bqe));
+ break;
+ default:
+ barf("Unexpected closure type %s in blocking queue", // of %p (%s)",
+ info_type((StgClosure *)bqe)); // , node, info_type(node));
+ break;
+ }
+ } /* for */
+ debugBelch("\n");
+}
+# elif defined(GRAN)
+void
+print_bq (StgClosure *node)
+{
+ StgBlockingQueueElement *bqe;
+ PEs node_loc, tso_loc;
+ rtsBool end;
+
+ /* should cover all closures that may have a blocking queue */
+ ASSERT(get_itbl(node)->type == BLACKHOLE_BQ ||
+ get_itbl(node)->type == FETCH_ME_BQ ||
+ get_itbl(node)->type == RBH);
+
+ ASSERT(node!=(StgClosure*)NULL); // sanity check
+ node_loc = where_is(node);
+
+ debugBelch("## BQ of closure %p (%s) on [PE %d]: ",
+ node, info_type(node), node_loc);
+
+ /*
+ NB: In a parallel setup a BQ of an RBH must end with an RBH_Save closure;
+ */
+ for (bqe = ((StgBlockingQueue*)node)->blocking_queue, end = (bqe==END_BQ_QUEUE);
+ !end; // iterate until bqe points to a CONSTR
+ end = (get_itbl(bqe)->type == CONSTR) || (bqe->link==END_BQ_QUEUE), bqe = end ? END_BQ_QUEUE : bqe->link) {
+ ASSERT(bqe != END_BQ_QUEUE); // sanity check
+ ASSERT(bqe != (StgBlockingQueueElement *)NULL); // sanity check
+ /* types of closures that may appear in a blocking queue */
+ ASSERT(get_itbl(bqe)->type == TSO ||
+ get_itbl(bqe)->type == CONSTR);
+ /* only BQs of an RBH end with an RBH_Save closure */
+ ASSERT(get_itbl(bqe)->type != CONSTR || get_itbl(node)->type == RBH);
+
+ tso_loc = where_is((StgClosure *)bqe);
+ switch (get_itbl(bqe)->type) {
+ case TSO:
+ debugBelch(" TSO %d (%p) on [PE %d],",
+ ((StgTSO *)bqe)->id, (StgTSO *)bqe, tso_loc);
+ break;
+ case CONSTR:
+ debugBelch(" %s (IP %p),",
+ (get_itbl(bqe) == &stg_RBH_Save_0_info ? "RBH_Save_0" :
+ get_itbl(bqe) == &stg_RBH_Save_1_info ? "RBH_Save_1" :
+ get_itbl(bqe) == &stg_RBH_Save_2_info ? "RBH_Save_2" :
+ "RBH_Save_?"), get_itbl(bqe));
+ break;
+ default:
+ barf("Unexpected closure type %s in blocking queue of %p (%s)",
+ info_type((StgClosure *)bqe), node, info_type(node));
+ break;
+ }
+ } /* for */
+ debugBelch("\n");
+}
+# endif
+
+#if defined(PARALLEL_HASKELL)
+static nat
+run_queue_len(void)
+{
+ nat i;
+ StgTSO *tso;
+
+ for (i=0, tso=run_queue_hd;
+ tso != END_TSO_QUEUE;
+ i++, tso=tso->link) {
+ /* nothing */
+ }
+
+ return i;
+}
+#endif
+
+void
+sched_belch(char *s, ...)
+{
+ va_list ap;
+ va_start(ap,s);
+#ifdef THREADED_RTS
+ debugBelch("sched (task %p): ", (void *)(unsigned long)(unsigned int)osThreadId());
+#elif defined(PARALLEL_HASKELL)
+ debugBelch("== ");
+#else
+ debugBelch("sched: ");
+#endif
+ vdebugBelch(s, ap);
+ debugBelch("\n");
+ va_end(ap);
+}
+
+#endif /* DEBUG */
diff --git a/rts/Schedule.h b/rts/Schedule.h
new file mode 100644
index 0000000000..37b07941f4
--- /dev/null
+++ b/rts/Schedule.h
@@ -0,0 +1,332 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-2005
+ *
+ * Prototypes for functions in Schedule.c
+ * (RTS internal scheduler interface)
+ *
+ * -------------------------------------------------------------------------*/
+
+#ifndef SCHEDULE_H
+#define SCHEDULE_H
+
+#include "OSThreads.h"
+#include "Capability.h"
+
+/* initScheduler(), exitScheduler()
+ * Called from STG : no
+ * Locks assumed : none
+ */
+void initScheduler (void);
+void exitScheduler (void);
+
+// Place a new thread on the run queue of the current Capability
+void scheduleThread (Capability *cap, StgTSO *tso);
+
+// Place a new thread on the run queue of a specified Capability
+// (cap is the currently owned Capability, cpu is the number of
+// the desired Capability).
+void scheduleThreadOn(Capability *cap, StgWord cpu, StgTSO *tso);
+
+/* awakenBlockedQueue()
+ *
+ * Takes a pointer to the beginning of a blocked TSO queue, and
+ * wakes up the entire queue.
+ * Called from STG : yes
+ * Locks assumed : none
+ */
+#if defined(GRAN)
+void awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node);
+#elif defined(PAR)
+void awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node);
+#else
+void awakenBlockedQueue (Capability *cap, StgTSO *tso);
+#endif
+
+/* unblockOne()
+ *
+ * Put the specified thread on the run queue of the given Capability.
+ * Called from STG : yes
+ * Locks assumed : we own the Capability.
+ */
+StgTSO * unblockOne(Capability *cap, StgTSO *tso);
+
+/* raiseAsync()
+ *
+ * Raises an exception asynchronously in the specified thread.
+ *
+ * Called from STG : yes
+ * Locks assumed : none
+ */
+void raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception);
+
+/* suspendComputation()
+ *
+ * A variant of raiseAsync(), this strips the stack of the specified
+ * thread down to the stop_here point, leaving a current closure on
+ * top of the stack at [stop_here - 1].
+ */
+void suspendComputation(Capability *cap, StgTSO *tso, StgPtr stop_here);
+
+/* raiseExceptionHelper */
+StgWord raiseExceptionHelper (StgRegTable *reg, StgTSO *tso, StgClosure *exception);
+
+/* findRetryFrameHelper */
+StgWord findRetryFrameHelper (StgTSO *tso);
+
+/* GetRoots(evac_fn f)
+ *
+ * Call f() for each root known to the scheduler.
+ *
+ * Called from STG : NO
+ * Locks assumed : ????
+ */
+void GetRoots(evac_fn);
+
+/* workerStart()
+ *
+ * Entry point for a new worker task.
+ * Called from STG : NO
+ * Locks assumed : none
+ */
+void workerStart(Task *task);
+
+#if defined(GRAN)
+void awaken_blocked_queue(StgBlockingQueueElement *q, StgClosure *node);
+void unlink_from_bq(StgTSO* tso, StgClosure* node);
+void initThread(StgTSO *tso, nat stack_size, StgInt pri);
+#elif defined(PAR)
+nat run_queue_len(void);
+void awaken_blocked_queue(StgBlockingQueueElement *q, StgClosure *node);
+void initThread(StgTSO *tso, nat stack_size);
+#else
+char *info_type(StgClosure *closure); // dummy
+char *info_type_by_ip(StgInfoTable *ip); // dummy
+void awaken_blocked_queue(StgTSO *q);
+void initThread(StgTSO *tso, nat stack_size);
+#endif
+
+/* Context switch flag.
+ * Locks required : none (conflicts are harmless)
+ */
+extern int RTS_VAR(context_switch);
+
+/* The state of the scheduler. This is used to control the sequence
+ * of events during shutdown, and when the runtime is interrupted
+ * using ^C.
+ */
+#define SCHED_RUNNING 0 /* running as normal */
+#define SCHED_INTERRUPTING 1 /* ^C detected, before threads are deleted */
+#define SCHED_INTERRUPTED 2 /* ^C detected, after threads deleted */
+#define SCHED_SHUTTING_DOWN 3 /* final shutdown */
+
+extern rtsBool RTS_VAR(sched_state);
+
+/*
+ * flag that tracks whether we have done any execution in this time slice.
+ */
+#define ACTIVITY_YES 0 /* there has been activity in the current slice */
+#define ACTIVITY_MAYBE_NO 1 /* no activity in the current slice */
+#define ACTIVITY_INACTIVE 2 /* a complete slice has passed with no activity */
+#define ACTIVITY_DONE_GC 3 /* like 2, but we've done a GC too */
+
+/* Recent activity flag.
+ * Locks required : Transition from MAYBE_NO to INACTIVE
+ * happens in the timer signal, so it is atomic. Transition from
+ * INACTIVE to DONE_GC happens under sched_mutex. No lock required
+ * to set it to ACTIVITY_YES.
+ */
+extern nat recent_activity;
+
+/* Thread queues.
+ * Locks required : sched_mutex
+ *
+ * In GranSim we have one run/blocked_queue per PE.
+ */
+#if defined(GRAN)
+// run_queue_hds defined in GranSim.h
+#else
+extern StgTSO *RTS_VAR(blackhole_queue);
+#if !defined(THREADED_RTS)
+extern StgTSO *RTS_VAR(blocked_queue_hd), *RTS_VAR(blocked_queue_tl);
+extern StgTSO *RTS_VAR(sleeping_queue);
+#endif
+#endif
+
+/* Linked list of all threads.
+ * Locks required : sched_mutex
+ */
+extern StgTSO *RTS_VAR(all_threads);
+
+/* Set to rtsTrue if there are threads on the blackhole_queue, and
+ * it is possible that one or more of them may be available to run.
+ * This flag is set to rtsFalse after we've checked the queue, and
+ * set to rtsTrue just before we run some Haskell code. It is used
+ * to decide whether we should yield the Capability or not.
+ * Locks required : none (see scheduleCheckBlackHoles()).
+ */
+extern rtsBool blackholes_need_checking;
+
+#if defined(THREADED_RTS)
+extern Mutex RTS_VAR(sched_mutex);
+#endif
+
+StgBool isThreadBound(StgTSO *tso);
+
+SchedulerStatus rts_mainLazyIO(HaskellObj p, /*out*/HaskellObj *ret);
+
+/* Called by shutdown_handler(). */
+void interruptStgRts (void);
+
+nat run_queue_len (void);
+
+void resurrectThreads (StgTSO *);
+
+void printAllThreads(void);
+
+/* debugging only
+ */
+#ifdef DEBUG
+void print_bq (StgClosure *node);
+#endif
+#if defined(PAR)
+void print_bqe (StgBlockingQueueElement *bqe);
+#endif
+
+void labelThread(StgPtr tso, char *label);
+
+/* -----------------------------------------------------------------------------
+ * Some convenient macros/inline functions...
+ */
+
+#if !IN_STG_CODE
+
+/* END_TSO_QUEUE and friends now defined in includes/StgMiscClosures.h */
+
+/* Add a thread to the end of the run queue.
+ * NOTE: tso->link should be END_TSO_QUEUE before calling this macro.
+ * ASSUMES: cap->running_task is the current task.
+ */
+STATIC_INLINE void
+appendToRunQueue (Capability *cap, StgTSO *tso)
+{
+ ASSERT(tso->link == END_TSO_QUEUE);
+ if (cap->run_queue_hd == END_TSO_QUEUE) {
+ cap->run_queue_hd = tso;
+ } else {
+ cap->run_queue_tl->link = tso;
+ }
+ cap->run_queue_tl = tso;
+}
+
+/* Push a thread on the beginning of the run queue. Used for
+ * newly awakened threads, so they get run as soon as possible.
+ * ASSUMES: cap->running_task is the current task.
+ */
+STATIC_INLINE void
+pushOnRunQueue (Capability *cap, StgTSO *tso)
+{
+ tso->link = cap->run_queue_hd;
+ cap->run_queue_hd = tso;
+ if (cap->run_queue_tl == END_TSO_QUEUE) {
+ cap->run_queue_tl = tso;
+ }
+}
+
+/* Pop the first thread off the runnable queue.
+ */
+STATIC_INLINE StgTSO *
+popRunQueue (Capability *cap)
+{
+ StgTSO *t = cap->run_queue_hd;
+ ASSERT(t != END_TSO_QUEUE);
+ cap->run_queue_hd = t->link;
+ t->link = END_TSO_QUEUE;
+ if (cap->run_queue_hd == END_TSO_QUEUE) {
+ cap->run_queue_tl = END_TSO_QUEUE;
+ }
+ return t;
+}
+
+/* Add a thread to the end of the blocked queue.
+ */
+#if !defined(THREADED_RTS)
+STATIC_INLINE void
+appendToBlockedQueue(StgTSO *tso)
+{
+ ASSERT(tso->link == END_TSO_QUEUE);
+ if (blocked_queue_hd == END_TSO_QUEUE) {
+ blocked_queue_hd = tso;
+ } else {
+ blocked_queue_tl->link = tso;
+ }
+ blocked_queue_tl = tso;
+}
+#endif
+
+#if defined(THREADED_RTS)
+STATIC_INLINE void
+appendToWakeupQueue (Capability *cap, StgTSO *tso)
+{
+ ASSERT(tso->link == END_TSO_QUEUE);
+ if (cap->wakeup_queue_hd == END_TSO_QUEUE) {
+ cap->wakeup_queue_hd = tso;
+ } else {
+ cap->wakeup_queue_tl->link = tso;
+ }
+ cap->wakeup_queue_tl = tso;
+}
+#endif
+
+/* Check whether various thread queues are empty
+ */
+STATIC_INLINE rtsBool
+emptyQueue (StgTSO *q)
+{
+ return (q == END_TSO_QUEUE);
+}
+
+STATIC_INLINE rtsBool
+emptyRunQueue(Capability *cap)
+{
+ return emptyQueue(cap->run_queue_hd);
+}
+
+#if defined(THREADED_RTS)
+STATIC_INLINE rtsBool
+emptyWakeupQueue(Capability *cap)
+{
+ return emptyQueue(cap->wakeup_queue_hd);
+}
+#endif
+
+#if !defined(THREADED_RTS)
+#define EMPTY_BLOCKED_QUEUE() (emptyQueue(blocked_queue_hd))
+#define EMPTY_SLEEPING_QUEUE() (emptyQueue(sleeping_queue))
+#endif
+
+STATIC_INLINE rtsBool
+emptyThreadQueues(Capability *cap)
+{
+ return emptyRunQueue(cap)
+#if !defined(THREADED_RTS)
+ && EMPTY_BLOCKED_QUEUE() && EMPTY_SLEEPING_QUEUE()
+#endif
+ ;
+}
+
+#ifdef DEBUG
+void sched_belch(char *s, ...)
+ GNU_ATTRIBUTE(format (printf, 1, 2));
+#endif
+
+#endif /* !IN_STG_CODE */
+
+STATIC_INLINE void
+dirtyTSO (StgTSO *tso)
+{
+ tso->flags |= TSO_DIRTY;
+}
+
+#endif /* SCHEDULE_H */
+
diff --git a/rts/Sparks.c b/rts/Sparks.c
new file mode 100644
index 0000000000..615d832e33
--- /dev/null
+++ b/rts/Sparks.c
@@ -0,0 +1,881 @@
+/* ---------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2000-2006
+ *
+ * Sparking support for PARALLEL_HASKELL and THREADED_RTS versions of the RTS.
+ *
+ * -------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "Schedule.h"
+#include "SchedAPI.h"
+#include "Storage.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "ParTicky.h"
+# if defined(PARALLEL_HASKELL)
+# include "ParallelRts.h"
+# include "GranSimRts.h" // for GR_...
+# elif defined(GRAN)
+# include "GranSimRts.h"
+# endif
+#include "Sparks.h"
+
+#if defined(THREADED_RTS) || defined(PARALLEL_HASKELL)
+
+static INLINE_ME void bump_hd (StgSparkPool *p)
+{ p->hd++; if (p->hd == p->lim) p->hd = p->base; }
+
+static INLINE_ME void bump_tl (StgSparkPool *p)
+{ p->tl++; if (p->tl == p->lim) p->tl = p->base; }
+
+/* -----------------------------------------------------------------------------
+ *
+ * Initialising spark pools.
+ *
+ * -------------------------------------------------------------------------- */
+
+static void
+initSparkPool(StgSparkPool *pool)
+{
+ pool->base = stgMallocBytes(RtsFlags.ParFlags.maxLocalSparks
+ * sizeof(StgClosure *),
+ "initSparkPools");
+ pool->lim = pool->base + RtsFlags.ParFlags.maxLocalSparks;
+ pool->hd = pool->base;
+ pool->tl = pool->base;
+}
+
+void
+initSparkPools( void )
+{
+#ifdef THREADED_RTS
+ /* walk over the capabilities, allocating a spark pool for each one */
+ nat i;
+ for (i = 0; i < n_capabilities; i++) {
+ initSparkPool(&capabilities[i].r.rSparks);
+ }
+#else
+ /* allocate a single spark pool */
+ initSparkPool(&MainCapability.r.rSparks);
+#endif
+}
+
+/* -----------------------------------------------------------------------------
+ *
+ * findSpark: find a spark on the current Capability that we can fork
+ * into a thread.
+ *
+ * -------------------------------------------------------------------------- */
+
+StgClosure *
+findSpark (Capability *cap)
+{
+ StgSparkPool *pool;
+ StgClosure *spark;
+
+ pool = &(cap->r.rSparks);
+ ASSERT_SPARK_POOL_INVARIANTS(pool);
+
+ while (pool->hd != pool->tl) {
+ spark = *pool->hd;
+ bump_hd(pool);
+ if (closure_SHOULD_SPARK(spark)) {
+#ifdef GRAN
+ if (RtsFlags.ParFlags.ParStats.Sparks)
+ DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC,
+ GR_STEALING, ((StgTSO *)NULL), spark,
+ 0, 0 /* spark_queue_len(ADVISORY_POOL) */);
+#endif
+ return spark;
+ }
+ }
+ // spark pool is now empty
+ return NULL;
+}
+
+/* -----------------------------------------------------------------------------
+ * Mark all nodes pointed to by sparks in the spark queues (for GC) Does an
+ * implicit slide i.e. after marking all sparks are at the beginning of the
+ * spark pool and the spark pool only contains sparkable closures
+ * -------------------------------------------------------------------------- */
+
+void
+markSparkQueue (evac_fn evac)
+{
+ StgClosure **sparkp, **to_sparkp;
+ nat i, n, pruned_sparks; // stats only
+ StgSparkPool *pool;
+ Capability *cap;
+
+ PAR_TICKY_MARK_SPARK_QUEUE_START();
+
+ n = 0;
+ pruned_sparks = 0;
+ for (i = 0; i < n_capabilities; i++) {
+ cap = &capabilities[i];
+ pool = &(cap->r.rSparks);
+
+ ASSERT_SPARK_POOL_INVARIANTS(pool);
+
+#if defined(PARALLEL_HASKELL)
+ // stats only
+ n = 0;
+ pruned_sparks = 0;
+#endif
+
+ sparkp = pool->hd;
+ to_sparkp = pool->hd;
+ while (sparkp != pool->tl) {
+ ASSERT(to_sparkp<=sparkp);
+ ASSERT(*sparkp!=NULL);
+ ASSERT(LOOKS_LIKE_CLOSURE_PTR(((StgClosure *)*sparkp)));
+ // ToDo?: statistics gathering here (also for GUM!)
+ if (closure_SHOULD_SPARK(*sparkp)) {
+ evac(sparkp);
+ *to_sparkp++ = *sparkp;
+ n++;
+ } else {
+ pruned_sparks++;
+ }
+ sparkp++;
+ if (sparkp == pool->lim) {
+ sparkp = pool->base;
+ }
+ }
+ pool->tl = to_sparkp;
+
+ PAR_TICKY_MARK_SPARK_QUEUE_END(n);
+
+#if defined(PARALLEL_HASKELL)
+ IF_DEBUG(scheduler,
+ debugBelch("markSparkQueue: marked %d sparks and pruned %d sparks on [%x]",
+ n, pruned_sparks, mytid));
+#else
+ IF_DEBUG(scheduler,
+ debugBelch("markSparkQueue: marked %d sparks and pruned %d sparks\n",
+ n, pruned_sparks));
+#endif
+
+ IF_DEBUG(scheduler,
+ debugBelch("markSparkQueue: new spark queue len=%d; (hd=%p; tl=%p)\n",
+ sparkPoolSize(pool), pool->hd, pool->tl));
+
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ *
+ * Turn a spark into a real thread
+ *
+ * -------------------------------------------------------------------------- */
+
+void
+createSparkThread (Capability *cap, StgClosure *p)
+{
+ StgTSO *tso;
+
+ tso = createGenThread (cap, RtsFlags.GcFlags.initialStkSize, p);
+ appendToRunQueue(cap,tso);
+}
+
+/* -----------------------------------------------------------------------------
+ *
+ * Create a new spark
+ *
+ * -------------------------------------------------------------------------- */
+
+#define DISCARD_NEW
+
+StgInt
+newSpark (StgRegTable *reg, StgClosure *p)
+{
+ StgSparkPool *pool = &(reg->rSparks);
+
+ ASSERT_SPARK_POOL_INVARIANTS(pool);
+
+ if (closure_SHOULD_SPARK(p)) {
+#ifdef DISCARD_NEW
+ StgClosure **new_tl;
+ new_tl = pool->tl + 1;
+ if (new_tl == pool->lim) { new_tl = pool->base; }
+ if (new_tl != pool->hd) {
+ *pool->tl = p;
+ pool->tl = new_tl;
+ } else if (!closure_SHOULD_SPARK(*pool->hd)) {
+ // if the old closure is not sparkable, discard it and
+ // keep the new one. Otherwise, keep the old one.
+ *pool->tl = p;
+ bump_hd(pool);
+ }
+#else /* DISCARD OLD */
+ *pool->tl = p;
+ bump_tl(pool);
+ if (pool->tl == pool->hd) { bump_hd(pool); }
+#endif
+ }
+
+ ASSERT_SPARK_POOL_INVARIANTS(pool);
+ return 1;
+}
+
+#else
+
+StgInt
+newSpark (StgRegTable *reg STG_UNUSED, StgClosure *p STG_UNUSED)
+{
+ /* nothing */
+ return 1;
+}
+
+#endif /* PARALLEL_HASKELL || THREADED_RTS */
+
+
+/* -----------------------------------------------------------------------------
+ *
+ * GRAN & PARALLEL_HASKELL stuff beyond here.
+ *
+ * -------------------------------------------------------------------------- */
+
+#if defined(PARALLEL_HASKELL) || defined(GRAN)
+
+static void slide_spark_pool( StgSparkPool *pool );
+
+rtsBool
+add_to_spark_queue( StgClosure *closure, StgSparkPool *pool )
+{
+ if (pool->tl == pool->lim)
+ slide_spark_pool(pool);
+
+ if (closure_SHOULD_SPARK(closure) &&
+ pool->tl < pool->lim) {
+ *(pool->tl++) = closure;
+
+#if defined(PARALLEL_HASKELL)
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ // debugBelch("Creating spark for %x @ %11.2f\n", closure, usertime());
+ globalParStats.tot_sparks_created++;
+ }
+#endif
+ return rtsTrue;
+ } else {
+#if defined(PARALLEL_HASKELL)
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ //debugBelch("Ignoring spark for %x @ %11.2f\n", closure, usertime());
+ globalParStats.tot_sparks_ignored++;
+ }
+#endif
+ return rtsFalse;
+ }
+}
+
+static void
+slide_spark_pool( StgSparkPool *pool )
+{
+ StgClosure **sparkp, **to_sparkp;
+
+ sparkp = pool->hd;
+ to_sparkp = pool->base;
+ while (sparkp < pool->tl) {
+ ASSERT(to_sparkp<=sparkp);
+ ASSERT(*sparkp!=NULL);
+ ASSERT(LOOKS_LIKE_GHC_INFO((*sparkp)->header.info));
+
+ if (closure_SHOULD_SPARK(*sparkp)) {
+ *to_sparkp++ = *sparkp++;
+ } else {
+ sparkp++;
+ }
+ }
+ pool->hd = pool->base;
+ pool->tl = to_sparkp;
+}
+
+void
+disposeSpark(spark)
+StgClosure *spark;
+{
+#if !defined(THREADED_RTS)
+ Capability *cap;
+ StgSparkPool *pool;
+
+ cap = &MainRegTable;
+ pool = &(cap->rSparks);
+ ASSERT(pool->hd <= pool->tl && pool->tl <= pool->lim);
+#endif
+ ASSERT(spark != (StgClosure *)NULL);
+ /* Do nothing */
+}
+
+
+#elif defined(GRAN)
+
+/*
+ Search the spark queue of the proc in event for a spark that's worth
+ turning into a thread
+ (was gimme_spark in the old RTS)
+*/
+void
+findLocalSpark (rtsEvent *event, rtsBool *found_res, rtsSparkQ *spark_res)
+{
+ PEs proc = event->proc, /* proc to search for work */
+ creator = event->creator; /* proc that requested work */
+ StgClosure* node;
+ rtsBool found;
+ rtsSparkQ spark_of_non_local_node = NULL,
+ spark_of_non_local_node_prev = NULL,
+ low_priority_spark = NULL,
+ low_priority_spark_prev = NULL,
+ spark = NULL, prev = NULL;
+
+ /* Choose a spark from the local spark queue */
+ prev = (rtsSpark*)NULL;
+ spark = pending_sparks_hds[proc];
+ found = rtsFalse;
+
+ // ToDo: check this code & implement local sparking !! -- HWL
+ while (!found && spark != (rtsSpark*)NULL)
+ {
+ ASSERT((prev!=(rtsSpark*)NULL || spark==pending_sparks_hds[proc]) &&
+ (prev==(rtsSpark*)NULL || prev->next==spark) &&
+ (spark->prev==prev));
+ node = spark->node;
+ if (!closure_SHOULD_SPARK(node))
+ {
+ IF_GRAN_DEBUG(checkSparkQ,
+ debugBelch("^^ pruning spark %p (node %p) in gimme_spark",
+ spark, node));
+
+ if (RtsFlags.GranFlags.GranSimStats.Sparks)
+ DumpRawGranEvent(proc, (PEs)0, SP_PRUNED,(StgTSO*)NULL,
+ spark->node, spark->name, spark_queue_len(proc));
+
+ ASSERT(spark != (rtsSpark*)NULL);
+ ASSERT(SparksAvail>0);
+ --SparksAvail;
+
+ ASSERT(prev==(rtsSpark*)NULL || prev->next==spark);
+ spark = delete_from_sparkq (spark, proc, rtsTrue);
+ if (spark != (rtsSpark*)NULL)
+ prev = spark->prev;
+ continue;
+ }
+ /* -- node should eventually be sparked */
+ else if (RtsFlags.GranFlags.PreferSparksOfLocalNodes &&
+ !IS_LOCAL_TO(PROCS(node),CurrentProc))
+ {
+ barf("Local sparking not yet implemented");
+
+ /* Remember first low priority spark */
+ if (spark_of_non_local_node==(rtsSpark*)NULL) {
+ spark_of_non_local_node_prev = prev;
+ spark_of_non_local_node = spark;
+ }
+
+ if (spark->next == (rtsSpark*)NULL) {
+ /* ASSERT(spark==SparkQueueTl); just for testing */
+ prev = spark_of_non_local_node_prev;
+ spark = spark_of_non_local_node;
+ found = rtsTrue;
+ break;
+ }
+
+# if defined(GRAN) && defined(GRAN_CHECK)
+ /* Should never happen; just for testing
+ if (spark==pending_sparks_tl) {
+ debugBelch("ReSchedule: Last spark != SparkQueueTl\n");
+ stg_exit(EXIT_FAILURE);
+ } */
+# endif
+ prev = spark;
+ spark = spark->next;
+ ASSERT(SparksAvail>0);
+ --SparksAvail;
+ continue;
+ }
+ else if ( RtsFlags.GranFlags.DoPrioritySparking ||
+ (spark->gran_info >= RtsFlags.GranFlags.SparkPriority2) )
+ {
+ if (RtsFlags.GranFlags.DoPrioritySparking)
+ barf("Priority sparking not yet implemented");
+
+ found = rtsTrue;
+ }
+#if 0
+ else /* only used if SparkPriority2 is defined */
+ {
+ /* ToDo: fix the code below and re-integrate it */
+ /* Remember first low priority spark */
+ if (low_priority_spark==(rtsSpark*)NULL) {
+ low_priority_spark_prev = prev;
+ low_priority_spark = spark;
+ }
+
+ if (spark->next == (rtsSpark*)NULL) {
+ /* ASSERT(spark==spark_queue_tl); just for testing */
+ prev = low_priority_spark_prev;
+ spark = low_priority_spark;
+ found = rtsTrue; /* take low pri spark => rc is 2 */
+ break;
+ }
+
+ /* Should never happen; just for testing
+ if (spark==pending_sparks_tl) {
+ debugBelch("ReSchedule: Last spark != SparkQueueTl\n");
+ stg_exit(EXIT_FAILURE);
+ break;
+ } */
+ prev = spark;
+ spark = spark->next;
+
+ IF_GRAN_DEBUG(pri,
+ debugBelch("++ Ignoring spark of priority %u (SparkPriority=%u); node=%p; name=%u\n",
+ spark->gran_info, RtsFlags.GranFlags.SparkPriority,
+ spark->node, spark->name);)
+ }
+#endif
+ } /* while (spark!=NULL && !found) */
+
+ *spark_res = spark;
+ *found_res = found;
+}
+
+/*
+ Turn the spark into a thread.
+ In GranSim this basically means scheduling a StartThread event for the
+ node pointed to by the spark at some point in the future.
+ (was munch_spark in the old RTS)
+*/
+rtsBool
+activateSpark (rtsEvent *event, rtsSparkQ spark)
+{
+ PEs proc = event->proc, /* proc to search for work */
+ creator = event->creator; /* proc that requested work */
+ StgTSO* tso;
+ StgClosure* node;
+ rtsTime spark_arrival_time;
+
+ /*
+ We've found a node on PE proc requested by PE creator.
+ If proc==creator we can turn the spark into a thread immediately;
+ otherwise we schedule a MoveSpark event on the requesting PE
+ */
+
+ /* DaH Qu' yIchen */
+ if (proc!=creator) {
+
+ /* only possible if we simulate GUM style fishing */
+ ASSERT(RtsFlags.GranFlags.Fishing);
+
+ /* Message packing costs for sending a Fish; qeq jabbI'ID */
+ CurrentTime[proc] += RtsFlags.GranFlags.Costs.mpacktime;
+
+ if (RtsFlags.GranFlags.GranSimStats.Sparks)
+ DumpRawGranEvent(proc, (PEs)0, SP_EXPORTED,
+ (StgTSO*)NULL, spark->node,
+ spark->name, spark_queue_len(proc));
+
+ /* time of the spark arrival on the remote PE */
+ spark_arrival_time = CurrentTime[proc] + RtsFlags.GranFlags.Costs.latency;
+
+ new_event(creator, proc, spark_arrival_time,
+ MoveSpark,
+ (StgTSO*)NULL, spark->node, spark);
+
+ CurrentTime[proc] += RtsFlags.GranFlags.Costs.mtidytime;
+
+ } else { /* proc==creator i.e. turn the spark into a thread */
+
+ if ( RtsFlags.GranFlags.GranSimStats.Global &&
+ spark->gran_info < RtsFlags.GranFlags.SparkPriority2 ) {
+
+ globalGranStats.tot_low_pri_sparks++;
+ IF_GRAN_DEBUG(pri,
+ debugBelch("++ No high priority spark available; low priority (%u) spark chosen: node=%p; name=%u\n",
+ spark->gran_info,
+ spark->node, spark->name));
+ }
+
+ CurrentTime[proc] += RtsFlags.GranFlags.Costs.threadcreatetime;
+
+ node = spark->node;
+
+# if 0
+ /* ToDo: fix the GC interface and move to StartThread handling-- HWL */
+ if (GARBAGE COLLECTION IS NECESSARY) {
+ /* Some kind of backoff needed here in case there's too little heap */
+# if defined(GRAN_CHECK) && defined(GRAN)
+ if (RtsFlags.GcFlags.giveStats)
+ fprintf(RtsFlags.GcFlags.statsFile,"***** vIS Qu' chen veQ boSwI'; spark=%p, node=%p; name=%u\n",
+ /* (found==2 ? "no hi pri spark" : "hi pri spark"), */
+ spark, node, spark->name);
+# endif
+ new_event(CurrentProc, CurrentProc, CurrentTime[CurrentProc]+1,
+ FindWork,
+ (StgTSO*)NULL, (StgClosure*)NULL, (rtsSpark*)NULL);
+ barf("//// activateSpark: out of heap ; ToDo: call GarbageCollect()");
+ GarbageCollect(GetRoots, rtsFalse);
+ // HWL old: ReallyPerformThreadGC(TSO_HS+TSO_CTS_SIZE,rtsFalse);
+ // HWL old: SAVE_Hp -= TSO_HS+TSO_CTS_SIZE;
+ spark = NULL;
+ return; /* was: continue; */ /* to the next event, eventually */
+ }
+# endif
+
+ if (RtsFlags.GranFlags.GranSimStats.Sparks)
+ DumpRawGranEvent(CurrentProc,(PEs)0,SP_USED,(StgTSO*)NULL,
+ spark->node, spark->name,
+ spark_queue_len(CurrentProc));
+
+ new_event(proc, proc, CurrentTime[proc],
+ StartThread,
+ END_TSO_QUEUE, node, spark); // (rtsSpark*)NULL);
+
+ procStatus[proc] = Starting;
+ }
+}
+
+/* -------------------------------------------------------------------------
+ This is the main point where handling granularity information comes into
+ play.
+ ------------------------------------------------------------------------- */
+
+/* RAND() below yields a pseudo-random priority in [1..MAX_RAND_PRI]. */
+#define MAX_RAND_PRI    100
+
+/*
+  Granularity info transformers.
+  Applied to the GRAN_INFO field of a spark to derive its priority;
+  which one is used is selected by the Random/Inverse/IgnorePriorities
+  RTS flags (see newSpark below).
+*/
+STATIC_INLINE nat  ID(nat x) { return(x); };       /* identity: keep gran_info as-is */
+STATIC_INLINE nat INV(nat x) { return(-x); };      /* NOTE(review): nat is unsigned, so -x wraps around;
+                                                      presumably intended only as an order-inverting key -- confirm */
+STATIC_INLINE nat IGNORE(nat x) { return (0); };   /* drop priority information entirely */
+STATIC_INLINE nat  RAND(nat x) { return ((random() % MAX_RAND_PRI) + 1); }  /* x is unused */
+
+/* NB: size_info and par_info are currently unused (what a shame!) -- HWL */
+/*
+  Allocate and initialise a new spark for closure node.  The spark's
+  priority is derived from gran_info via one of the transformers above.
+  Returns NULL (allocating nothing) when the derived priority falls
+  below RtsFlags.GranFlags.SparkPriority.  The caller owns the returned
+  spark; free it with disposeSpark().
+*/
+rtsSpark *
+newSpark(node,name,gran_info,size_info,par_info,local)
+StgClosure *node;
+nat name, gran_info, size_info, par_info, local;
+{
+  nat pri;
+  rtsSpark *newspark;
+
+  /* choose the priority transformer; plain ID is the default */
+  pri = RtsFlags.GranFlags.RandomPriorities ? RAND(gran_info) :
+        RtsFlags.GranFlags.InversePriorities ? INV(gran_info) :
+        RtsFlags.GranFlags.IgnorePriorities ? IGNORE(gran_info) :
+        ID(gran_info);
+
+  /* discard sparks whose priority is below the configured threshold */
+  if ( RtsFlags.GranFlags.SparkPriority!=0 &&
+       pri<RtsFlags.GranFlags.SparkPriority ) {
+    IF_GRAN_DEBUG(pri,
+                  debugBelch(",, NewSpark: Ignoring spark of priority %u (SparkPriority=%u); node=%#x; name=%u\n",
+                             pri, RtsFlags.GranFlags.SparkPriority, node, name));
+    return ((rtsSpark*)NULL);
+  }
+
+  newspark = (rtsSpark*) stgMallocBytes(sizeof(rtsSpark), "NewSpark");
+  newspark->prev = newspark->next = (rtsSpark*)NULL;
+  newspark->node = node;
+  /* name==1 means "inherit the spark name of the current TSO" */
+  newspark->name = (name==1) ? CurrentTSO->gran.sparkname : name;
+  newspark->gran_info = pri;
+  newspark->global = !local;      /* Check that with parAt, parAtAbs !!*/
+
+  if (RtsFlags.GranFlags.GranSimStats.Global) {
+    globalGranStats.tot_sparks_created++;
+    globalGranStats.sparks_created_on_PE[CurrentProc]++;
+  }
+
+  return(newspark);
+}
+
+void
+disposeSpark(spark)
+rtsSpark *spark;
+{
+  /* Free a single (non-NULL) spark previously allocated by newSpark. */
+  ASSERT(spark!=NULL);
+  stgFree(spark);
+}
+
+void
+disposeSparkQ(spark)
+rtsSparkQ spark;
+{
+  /* Free an entire spark queue, following the next links.
+     Iterative rather than recursive (as it used to be) so that a long
+     queue cannot overflow the C stack. */
+  while (spark != NULL) {
+    rtsSpark *next = spark->next;
+
+# ifdef GRAN_CHECK
+    if (SparksAvail < 0) {
+      /* print the spark itself, not the address of the local variable */
+      debugBelch("disposeSparkQ: SparksAvail<0 after disposing sparkq @ %p\n", spark);
+      print_spark(spark);
+    }
+# endif
+
+    stgFree(spark);
+    spark = next;
+  }
+}
+
+/*
+ With PrioritySparking add_to_spark_queue performs an insert sort to keep
+ the spark queue sorted. Otherwise the spark is just added to the end of
+ the queue.
+*/
+
+void
+add_to_spark_queue(spark)
+rtsSpark *spark;
+{
+ rtsSpark *prev = NULL, *next = NULL;
+ nat count = 0;
+ rtsBool found = rtsFalse;
+
+ if ( spark == (rtsSpark *)NULL ) {
+ return;
+ }
+
+ if (RtsFlags.GranFlags.DoPrioritySparking && (spark->gran_info != 0) ) {
+ /* Priority sparking is enabled i.e. spark queues must be sorted */
+
+ for (prev = NULL, next = pending_sparks_hd, count=0;
+ (next != NULL) &&
+ !(found = (spark->gran_info >= next->gran_info));
+ prev = next, next = next->next, count++)
+ {}
+
+ } else { /* 'utQo' */
+ /* Priority sparking is disabled */
+
+ found = rtsFalse; /* to add it at the end */
+
+ }
+
+ if (found) {
+ /* next points to the first spark with a gran_info smaller than that
+ of spark; therefore, add spark before next into the spark queue */
+ spark->next = next;
+ if ( next == NULL ) {
+ pending_sparks_tl = spark;
+ } else {
+ next->prev = spark;
+ }
+ spark->prev = prev;
+ if ( prev == NULL ) {
+ pending_sparks_hd = spark;
+ } else {
+ prev->next = spark;
+ }
+ } else { /* (RtsFlags.GranFlags.DoPrioritySparking && !found) || !DoPrioritySparking */
+ /* add the spark at the end of the spark queue */
+ spark->next = NULL;
+ spark->prev = pending_sparks_tl;
+ if (pending_sparks_hd == NULL)
+ pending_sparks_hd = spark;
+ else
+ pending_sparks_tl->next = spark;
+ pending_sparks_tl = spark;
+ }
+ ++SparksAvail;
+
+ /* add costs for search in priority sparking */
+ if (RtsFlags.GranFlags.DoPrioritySparking) {
+ CurrentTime[CurrentProc] += count * RtsFlags.GranFlags.Costs.pri_spark_overhead;
+ }
+
+ IF_GRAN_DEBUG(checkSparkQ,
+ debugBelch("++ Spark stats after adding spark %p (node %p) to queue on PE %d",
+ spark, spark->node, CurrentProc);
+ print_sparkq_stats());
+
+# if defined(GRAN_CHECK)
+ if (RtsFlags.GranFlags.Debug.checkSparkQ) {
+ for (prev = NULL, next = pending_sparks_hd;
+ (next != NULL);
+ prev = next, next = next->next)
+ {}
+ if ( (prev!=NULL) && (prev!=pending_sparks_tl) )
+ debugBelch("SparkQ inconsistency after adding spark %p: (PE %u) pending_sparks_tl (%p) not end of queue (%p)\n",
+ spark,CurrentProc,
+ pending_sparks_tl, prev);
+ }
+# endif
+
+# if defined(GRAN_CHECK)
+ /* Check if the sparkq is still sorted. Just for testing, really! */
+ if ( RtsFlags.GranFlags.Debug.checkSparkQ &&
+ RtsFlags.GranFlags.Debug.pri ) {
+ rtsBool sorted = rtsTrue;
+ rtsSpark *prev, *next;
+
+ if (pending_sparks_hd == NULL ||
+ pending_sparks_hd->next == NULL ) {
+ /* just 1 elem => ok */
+ } else {
+ for (prev = pending_sparks_hd,
+ next = pending_sparks_hd->next;
+ (next != NULL) ;
+ prev = next, next = next->next) {
+ sorted = sorted &&
+ (prev->gran_info >= next->gran_info);
+ }
+ }
+ if (!sorted) {
+ debugBelch("ghuH: SPARKQ on PE %d is not sorted:\n",
+ CurrentProc);
+ print_sparkq(CurrentProc);
+ }
+ }
+# endif
+}
+
+nat
+spark_queue_len(proc)
+PEs proc;
+{
+  /* Return the number of sparks currently queued on PE proc.
+     'last' trails the scan and is only needed for the sanity check. */
+  rtsSpark *last = NULL;
+  rtsSpark *s = pending_sparks_hds[proc];
+  nat n = 0;
+
+  while (s != NULL) {
+    last = s;
+    s = s->next;
+    n++;
+  }
+
+# if defined(GRAN_CHECK)
+  /* the last element we saw must be the recorded tail of the queue */
+  if ( RtsFlags.GranFlags.Debug.checkSparkQ )
+    if ( (last!=NULL) && (last!=pending_sparks_tls[proc]) )
+      debugBelch("ERROR in spark_queue_len: (PE %u) pending_sparks_tl (%p) not end of queue (%p)\n",
+              proc, pending_sparks_tls[proc], last);
+# endif
+
+  return (n);
+}
+
+/*
+ Take spark out of the spark queue on PE p and nuke the spark. Adjusts
+ hd and tl pointers of the spark queue. Returns a pointer to the next
+ spark in the queue.
+*/
+rtsSpark *
+delete_from_sparkq (spark, p, dispose_too) /* unlink and dispose spark */
+rtsSpark *spark;
+PEs p;
+rtsBool dispose_too;
+{
+ rtsSpark *new_spark;
+
+ if (spark==NULL)
+ barf("delete_from_sparkq: trying to delete NULL spark\n");
+
+# if defined(GRAN_CHECK)
+ if ( RtsFlags.GranFlags.Debug.checkSparkQ ) {
+ debugBelch("## |%p:%p| (%p)<-spark=%p->(%p) <-(%p)\n",
+ pending_sparks_hd, pending_sparks_tl,
+ spark->prev, spark, spark->next,
+ (spark->next==NULL ? 0 : spark->next->prev));
+ }
+# endif
+
+ if (spark->prev==NULL) {
+ /* spark is first spark of queue => adjust hd pointer */
+ ASSERT(pending_sparks_hds[p]==spark);
+ pending_sparks_hds[p] = spark->next;
+ } else {
+ spark->prev->next = spark->next;
+ }
+ if (spark->next==NULL) {
+ ASSERT(pending_sparks_tls[p]==spark);
+ /* spark is first spark of queue => adjust tl pointer */
+ pending_sparks_tls[p] = spark->prev;
+ } else {
+ spark->next->prev = spark->prev;
+ }
+ new_spark = spark->next;
+
+# if defined(GRAN_CHECK)
+ if ( RtsFlags.GranFlags.Debug.checkSparkQ ) {
+ debugBelch("## |%p:%p| (%p)<-spark=%p->(%p) <-(%p); spark=%p will be deleted NOW \n",
+ pending_sparks_hd, pending_sparks_tl,
+ spark->prev, spark, spark->next,
+ (spark->next==NULL ? 0 : spark->next->prev), spark);
+ }
+# endif
+
+ if (dispose_too)
+ disposeSpark(spark);
+
+ return new_spark;
+}
+
+/* Mark all nodes pointed to by sparks in the spark queues (for GC) */
+void
+markSparkQueue(void)
+{
+  StgClosure *MarkRoot(StgClosure *root); // prototype
+  PEs p;
+  rtsSpark *sp;
+
+  /* Walk every PE's spark queue, treating each sparked closure as a GC
+     root and updating the node pointer with the (possibly moved)
+     address returned by MarkRoot. */
+  for (p=0; p<RtsFlags.GranFlags.proc; p++)
+    for (sp=pending_sparks_hds[p]; sp!=NULL; sp=sp->next) {
+      ASSERT(sp->node!=NULL);
+      ASSERT(LOOKS_LIKE_GHC_INFO(sp->node->header.info));
+      // ToDo?: statistics gathering here (also for GUM!)
+      sp->node = (StgClosure *)MarkRoot(sp->node);
+    }
+  IF_DEBUG(gc,
+           debugBelch("@@ markSparkQueue: spark statistics at start of GC:");
+           print_sparkq_stats());
+}
+
+void
+print_spark(spark)
+rtsSpark *spark;
+{
+  /* Debug pretty-printer for a single spark; NULL prints as NIL.
+     The node address is pre-formatted into str so that a NULL node can
+     be rendered as "______" in the same column. */
+  char str[32];  /* was 16: "%#6lx" of a 64-bit pointer needs up to 18 chars + NUL */
+
+  if (spark==NULL) {
+    debugBelch("Spark: NIL\n");
+    return;
+  } else {
+    sprintf(str,
+              ((spark->node==NULL) ? "______" : "%#6lx"),
+              stgCast(StgPtr,spark->node));
+
+    debugBelch("Spark: Node %8s, Name %#6x, Global %5s, Creator %5x, Prev %6p, Next %6p\n",
+            str, spark->name,
+            ((spark->global)==rtsTrue?"True":"False"), spark->creator,
+            spark->prev, spark->next);
+  }
+}
+
+void
+print_sparkq(proc)
+PEs proc;
+{
+  /* Dump every spark in the queue of PE proc, head first. */
+  rtsSpark *s = pending_sparks_hds[proc];
+
+  debugBelch("Spark Queue of PE %d with root at %p:\n", proc, s);
+  while (s != (rtsSpark*)NULL) {
+    print_spark(s);
+    s = s->next;
+  }
+}
+
+/*
+ Print a statistics of all spark queues.
+*/
+void
+print_sparkq_stats(void)
+{
+ PEs p;
+
+ debugBelch("SparkQs: [");
+ for (p=0; p<RtsFlags.GranFlags.proc; p++)
+ debugBelch(", PE %d: %d", p, spark_queue_len(p));
+ debugBelch("\n");
+}
+
+#endif
diff --git a/rts/Sparks.h b/rts/Sparks.h
new file mode 100644
index 0000000000..77d280bea8
--- /dev/null
+++ b/rts/Sparks.h
@@ -0,0 +1,104 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2000-2006
+ *
+ * Sparking support for GRAN, PAR and THREADED_RTS versions of the RTS.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef SPARKS_H
+#define SPARKS_H
+
+#if !defined(GRAN)
+StgInt newSpark (StgRegTable *reg, StgClosure *p);
+#endif
+
+#if defined(PARALLEL_HASKELL) || defined(THREADED_RTS)
+StgClosure * findSpark (Capability *cap);
+void initSparkPools (void);
+void markSparkQueue (evac_fn evac);
+void createSparkThread (Capability *cap, StgClosure *p);
+
+INLINE_HEADER void discardSparks (StgSparkPool *pool);
+INLINE_HEADER nat sparkPoolSize (StgSparkPool *pool);
+INLINE_HEADER rtsBool emptySparkPool (StgSparkPool *pool);
+
+INLINE_HEADER void discardSparksCap (Capability *cap);
+INLINE_HEADER nat sparkPoolSizeCap (Capability *cap);
+INLINE_HEADER rtsBool emptySparkPoolCap (Capability *cap);
+#endif
+
+#if defined(PARALLEL_HASKELL)
+StgTSO *activateSpark (rtsSpark spark) ;
+rtsBool add_to_spark_queue( StgClosure *closure, StgSparkPool *pool );
+void markSparkQueue( void );
+nat spark_queue_len( StgSparkPool *pool );
+void disposeSpark( StgClosure *spark );
+#endif
+
+#if defined(GRAN)
+void findLocalSpark (rtsEvent *event, rtsBool *found_res, rtsSparkQ *spark_res);
+rtsBool activateSpark (rtsEvent *event, rtsSparkQ spark);
+rtsSpark *newSpark(StgClosure *node, nat name, nat gran_info,
+ nat size_info, nat par_info, nat local);
+void add_to_spark_queue(rtsSpark *spark);
+rtsSpark *delete_from_sparkq (rtsSpark *spark, PEs p, rtsBool dispose_too);
+void disposeSpark(rtsSpark *spark);
+void disposeSparkQ(rtsSparkQ spark);
+void print_spark(rtsSpark *spark);
+void print_sparkq(PEs proc);
+void print_sparkq_stats(void);
+nat spark_queue_len(PEs proc);
+void markSparkQueue(void);
+#endif
+
+/* -----------------------------------------------------------------------------
+ * PRIVATE below here
+ * -------------------------------------------------------------------------- */
+
+#if defined(PARALLEL_HASKELL) || defined(THREADED_RTS)
+
+INLINE_HEADER rtsBool
+emptySparkPool (StgSparkPool *pool)
+{
+    // The pool is a circular buffer: empty exactly when head == tail.
+    return (pool->hd == pool->tl);
+}
+
+INLINE_HEADER rtsBool
+emptySparkPoolCap (Capability *cap)
+{ return emptySparkPool(&cap->r.rSparks); }
+
+INLINE_HEADER nat
+sparkPoolSize (StgSparkPool *pool)
+{
+    // Number of sparks in the pool: hd chases tl around the circular
+    // buffer delimited by [base, lim).
+    if (pool->hd <= pool->tl) {
+	return (pool->tl - pool->hd);
+	// was (pool->hd - pool->tl): that is <= 0, which wraps to a huge
+	// value under the unsigned nat type whenever the pool is non-empty
+    } else {
+	return (pool->lim - pool->hd + pool->tl - pool->base);
+    }
+}
+
+INLINE_HEADER nat
+sparkPoolSizeCap (Capability *cap)
+{ return sparkPoolSize(&cap->r.rSparks); }
+
+INLINE_HEADER void
+discardSparks (StgSparkPool *pool)
+{
+    // Drop all queued sparks by snapping the head forward to the tail.
+    pool->hd = pool->tl;
+}
+
+INLINE_HEADER void
+discardSparksCap (Capability *cap)
+{ discardSparks(&cap->r.rSparks); }
+// no 'return' of a void expression: that is a C constraint violation
+// (only accepted as a GNU/C++ extension)
+
+
+#elif defined(THREADED_RTS)
+
+/* NOTE(review): this branch is unreachable -- THREADED_RTS is already
+   matched by the #if at the top of this section, so presumably this was
+   meant to be a plain #else; as written, builds that are neither
+   PARALLEL_HASKELL nor THREADED_RTS get no emptySparkPoolCap at all.
+   Confirm against the build configurations. */
+INLINE_HEADER rtsBool
+emptySparkPoolCap (Capability *cap STG_UNUSED)
+{ return rtsTrue; }
+
+#endif
+
+#endif /* SPARKS_H */
diff --git a/rts/Stable.c b/rts/Stable.c
new file mode 100644
index 0000000000..a4db5cd749
--- /dev/null
+++ b/rts/Stable.c
@@ -0,0 +1,460 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2002
+ *
+ * Stable names and stable pointers.
+ *
+ * ---------------------------------------------------------------------------*/
+
+// Make static versions of inline functions in Stable.h:
+#define RTS_STABLE_C
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "Hash.h"
+#include "RtsUtils.h"
+#include "OSThreads.h"
+#include "Storage.h"
+#include "RtsAPI.h"
+#include "RtsFlags.h"
+#include "OSThreads.h"
+
+/* Comment from ADR's implementation in old RTS:
+
+ This file (together with @ghc/runtime/storage/PerformIO.lhc@ and a
+ small change in @HpOverflow.lc@) consists of the changes in the
+ runtime system required to implement "Stable Pointers". But we're
+ getting a bit ahead of ourselves --- what is a stable pointer and what
+ is it used for?
+
+ When Haskell calls C, it normally just passes over primitive integers,
+ floats, bools, strings, etc. This doesn't cause any problems at all
+ for garbage collection because the act of passing them makes a copy
+ from the heap, stack or wherever they are onto the C-world stack.
+ However, if we were to pass a heap object such as a (Haskell) @String@
+ and a garbage collection occurred before we finished using it, we'd run
+ into problems since the heap object might have been moved or even
+ deleted.
+
+ So, if a C call is able to cause a garbage collection or we want to
+ store a pointer to a heap object between C calls, we must be careful
+ when passing heap objects. Our solution is to keep a table of all
+ objects we've given to the C-world and to make sure that the garbage
+ collector collects these objects --- updating the table as required to
+ make sure we can still find the object.
+
+
+ Of course, all this rather begs the question: why would we want to
+ pass a boxed value?
+
+ One very good reason is to preserve laziness across the language
+ interface. Rather than evaluating an integer or a string because it
+ {\em might\/} be required by the C function, we can wait until the C
+ function actually wants the value and then force an evaluation.
+
+ Another very good reason (the motivating reason!) is that the C code
+ might want to execute an object of sort $IO ()$ for the side-effects
+ it will produce. For example, this is used when interfacing to an X
+ widgets library to allow a direct implementation of callbacks.
+
+
+ The @makeStablePointer :: a -> IO (StablePtr a)@ function
+ converts a value into a stable pointer. It is part of the @PrimIO@
+ monad, because we want to be sure we don't allocate one twice by
+ accident, and then only free one of the copies.
+
+ \begin{verbatim}
+ makeStablePtr# :: a -> State# RealWorld -> (# RealWorld, a #)
+ freeStablePtr# :: StablePtr# a -> State# RealWorld -> State# RealWorld
+ deRefStablePtr# :: StablePtr# a -> State# RealWorld ->
+ (# State# RealWorld, a #)
+ \end{verbatim}
+
+ There may be additional functions on the C side to allow evaluation,
+ application, etc of a stable pointer.
+
+*/
+
+snEntry *stable_ptr_table = NULL;
+static snEntry *stable_ptr_free = NULL;
+
+static unsigned int SPT_size = 0;
+
+#ifdef THREADED_RTS
+static Mutex stable_mutex;
+#endif
+
+/* This hash table maps Haskell objects to stable names, so that every
+ * call to lookupStableName on a given object will return the same
+ * stable name.
+ *
+ * OLD COMMENTS about reference counting follow. The reference count
+ * in a stable name entry is now just a counter.
+ *
+ * Reference counting
+ * ------------------
+ * A plain stable name entry has a zero reference count, which means
+ * the entry will dissappear when the object it points to is
+ * unreachable. For stable pointers, we need an entry that sticks
+ * around and keeps the object it points to alive, so each stable name
+ * entry has an associated reference count.
+ *
+ * A stable pointer has a weighted reference count N attached to it
+ * (actually in its upper 5 bits), which represents the weight
+ * 2^(N-1). The stable name entry keeps a 32-bit reference count, which
+ * represents any weight between 1 and 2^32 (represented as zero).
+ * When the weight is 2^32, the stable name table owns "all" of the
+ * stable pointers to this object, and the entry can be garbage
+ * collected if the object isn't reachable.
+ *
+ * A new stable pointer is given the weight log2(W/2), where W is the
+ * weight stored in the table entry. The new weight in the table is W
+ * - 2^log2(W/2).
+ *
+ * A stable pointer can be "split" into two stable pointers, by
+ * dividing the weight by 2 and giving each pointer half.
+ * When freeing a stable pointer, the weight of the pointer is added
+ * to the weight stored in the table entry.
+ * */
+
+static HashTable *addrToStableHash = NULL;
+
+#define INIT_SPT_SIZE 64
+
+STATIC_INLINE void
+initFreeList(snEntry *table, nat n, snEntry *free)
+{
+  /* Thread the n entries of table into a free list (linked through the
+     addr field), working backwards so the list ends up in address order
+     with 'free' (the previous free-list head, or NULL) appended at the
+     tail.  Side effect: resets the global stable_ptr_free to the first
+     entry of table. */
+  snEntry *p;
+
+  for (p = table + n - 1; p >= table; p--) {
+    p->addr = (P_)free;
+    p->old = NULL;
+    p->ref = 0;
+    p->sn_obj = NULL;
+    free = p;
+  }
+  stable_ptr_free = table;
+}
+
+void
+initStablePtrTable(void)
+{
+  /* Idempotent: a non-zero SPT_size means the table already exists. */
+  if (SPT_size > 0)
+    return;
+
+  SPT_size = INIT_SPT_SIZE;
+  stable_ptr_table = stgMallocBytes(SPT_size * sizeof(snEntry),
+                                    "initStablePtrTable");
+
+  /* we don't use index 0 in the stable name table, because that
+   * would conflict with the hash table lookup operations which
+   * return NULL if an entry isn't found in the hash table.
+   */
+  initFreeList(stable_ptr_table+1,INIT_SPT_SIZE-1,NULL);
+  addrToStableHash = allocHashTable();
+
+#ifdef THREADED_RTS
+  initMutex(&stable_mutex);
+#endif
+}
+
+/*
+ * get at the real stuff...remove indirections.
+ *
+ * ToDo: move to a better home.
+ */
+static
+StgClosure*
+removeIndirections(StgClosure* p)
+{
+ StgClosure* q = p;
+
+ while (get_itbl(q)->type == IND ||
+ get_itbl(q)->type == IND_STATIC ||
+ get_itbl(q)->type == IND_OLDGEN ||
+ get_itbl(q)->type == IND_PERM ||
+ get_itbl(q)->type == IND_OLDGEN_PERM ) {
+ q = ((StgInd *)q)->indirectee;
+ }
+ return q;
+}
+
+/* Core of lookupStableName.  Must be called with the table initialised
+ * and (in the threaded RTS) stable_mutex held. */
+static StgWord
+lookupStableName_(StgPtr p)
+{
+  StgWord sn;
+  void* sn_tmp;
+
+  if (stable_ptr_free == NULL) {
+    enlargeStablePtrTable();
+  }
+
+  /* removing indirections increases the likelihood
+   * of finding a match in the stable name hash table.
+   */
+  p = (StgPtr)removeIndirections((StgClosure*)p);
+
+  sn_tmp = lookupHashTable(addrToStableHash,(W_)p);
+  sn = (StgWord)sn_tmp;
+
+  if (sn != 0) {
+    /* already have a stable name for this object: reuse it */
+    ASSERT(stable_ptr_table[sn].addr == p);
+    IF_DEBUG(stable,debugBelch("cached stable name %ld at %p\n",sn,p));
+    return sn;
+  } else {
+    /* pop a fresh entry off the free list; its index is the stable name */
+    sn = stable_ptr_free - stable_ptr_table;
+    stable_ptr_free = (snEntry*)(stable_ptr_free->addr);
+    stable_ptr_table[sn].ref = 0;
+    stable_ptr_table[sn].addr = p;
+    stable_ptr_table[sn].sn_obj = NULL;
+    /* IF_DEBUG(stable,debugBelch("new stable name %d at %p\n",sn,p)); */
+
+    /* add the new stable name to the hash table */
+    insertHashTable(addrToStableHash, (W_)p, (void *)sn);
+
+    return sn;
+  }
+}
+
+/* Return the stable name (table index) for closure p, allocating a new
+ * entry if p has none yet.  Thread-safe via stable_mutex. */
+StgWord
+lookupStableName(StgPtr p)
+{
+  StgWord res;
+
+  initStablePtrTable();
+  ACQUIRE_LOCK(&stable_mutex);
+  res = lookupStableName_(p);
+  RELEASE_LOCK(&stable_mutex);
+  return res;
+}
+
+/* Return entry sn to the free list.  Only legal once its StableName
+ * object is dead (sn_obj == NULL); also removes the address from the
+ * addr->stable-name hash table if the target is still recorded. */
+STATIC_INLINE void
+freeStableName(snEntry *sn)
+{
+  ASSERT(sn->sn_obj == NULL);
+  if (sn->addr != NULL) {
+      removeHashTable(addrToStableHash, (W_)sn->addr, NULL);
+  }
+  sn->addr = (P_)stable_ptr_free;   /* link into the free list */
+  stable_ptr_free = sn;
+}
+
+/* Create (or reuse) a stable pointer for p.  A stable pointer is just a
+ * stable-name table index with a positive reference count, which keeps
+ * the target alive across GCs.  Pair with freeStablePtr. */
+StgStablePtr
+getStablePtr(StgPtr p)
+{
+  StgWord sn;
+
+  initStablePtrTable();
+  ACQUIRE_LOCK(&stable_mutex);
+  sn = lookupStableName_(p);
+  stable_ptr_table[sn].ref++;
+  RELEASE_LOCK(&stable_mutex);
+  return (StgStablePtr)(sn);
+}
+
+/* Drop one reference to the stable pointer sp. */
+void
+freeStablePtr(StgStablePtr sp)
+{
+    snEntry *sn;
+
+    initStablePtrTable();
+    ACQUIRE_LOCK(&stable_mutex);
+
+    sn = &stable_ptr_table[(StgWord)sp];
+
+    ASSERT((StgWord)sp < SPT_size  &&  sn->addr != NULL  &&  sn->ref > 0);
+
+    sn->ref--;
+
+    // If this entry has no StableName attached, then just free it
+    // immediately.  This is important; it might be a while before the
+    // next major GC which actually collects the entry.
+    if (sn->sn_obj == NULL && sn->ref == 0) {
+	freeStableName(sn);
+    }
+
+    RELEASE_LOCK(&stable_mutex);
+}
+
+/* Double the stable-pointer table and thread the new upper half onto
+ * the free list.  Existing indices stay valid because the table is
+ * reallocated in place (entries keep their offsets). */
+void
+enlargeStablePtrTable(void)
+{
+  nat old_SPT_size = SPT_size;
+
+  // 2nd and subsequent times
+  SPT_size *= 2;
+  stable_ptr_table =
+    stgReallocBytes(stable_ptr_table,
+		    SPT_size * sizeof(snEntry),
+		    "enlargeStablePtrTable");
+
+  initFreeList(stable_ptr_table + old_SPT_size, old_SPT_size, NULL);
+}
+
+/* -----------------------------------------------------------------------------
+ * Treat stable pointers as roots for the garbage collector.
+ *
+ * A stable pointer is any stable name entry with a ref > 0. We'll
+ * take the opportunity to zero the "keep" flags at the same time.
+ * -------------------------------------------------------------------------- */
+
+void
+markStablePtrTable(evac_fn evac)
+{
+    /* Evacuate every entry with ref > 0 (i.e. every stable *pointer*);
+       also snapshot each live entry's address into 'old' so
+       updateStablePtrTable can tell later whether the object moved. */
+    snEntry *p, *end_stable_ptr_table;
+    StgPtr q;
+
+    end_stable_ptr_table = &stable_ptr_table[SPT_size];
+
+    // Mark all the stable *pointers* (not stable names).
+    // _starting_ at index 1; index 0 is unused.
+    for (p = stable_ptr_table+1; p < end_stable_ptr_table; p++) {
+	q = p->addr;
+
+	// Internal pointers are free slots.  If q == NULL, it's a
+	// stable name where the object has been GC'd, but the
+	// StableName object (sn_obj) is still alive.
+	if (q && (q < (P_)stable_ptr_table || q >= (P_)end_stable_ptr_table)) {
+
+	    // save the current addr away: we need to be able to tell
+	    // whether the objects moved in order to be able to update
+	    // the hash table later.
+	    p->old = p->addr;
+
+	    // if the ref is non-zero, treat addr as a root
+	    if (p->ref != 0) {
+		evac((StgClosure **)&p->addr);
+	    }
+	}
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ * Thread the stable pointer table for compacting GC.
+ *
+ * Here we must call the supplied evac function for each pointer into
+ * the heap from the stable pointer table, because the compacting
+ * collector may move the object it points to.
+ * -------------------------------------------------------------------------- */
+
+void
+threadStablePtrTable( evac_fn evac )
+{
+    /* For compacting GC: thread both the StableName object pointer and
+       the target address of every live entry, since the compactor may
+       move either.  Free slots (in-table addr links) are skipped. */
+    snEntry *p, *end_stable_ptr_table;
+    StgPtr q;
+
+    end_stable_ptr_table = &stable_ptr_table[SPT_size];
+
+    for (p = stable_ptr_table+1; p < end_stable_ptr_table; p++) {
+
+	if (p->sn_obj != NULL) {
+	    evac((StgClosure **)&p->sn_obj);
+	}
+
+	q = p->addr;
+	if (q && (q < (P_)stable_ptr_table || q >= (P_)end_stable_ptr_table)) {
+	    evac((StgClosure **)&p->addr);
+	}
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ * Garbage collect any dead entries in the stable pointer table.
+ *
+ * A dead entry has:
+ *
+ * - a zero reference count
+ * - a dead sn_obj
+ *
+ * Both of these conditions must be true in order to re-use the stable
+ * name table entry. We can re-use stable name table entries for live
+ * heap objects, as long as the program has no StableName objects that
+ * refer to the entry.
+ * -------------------------------------------------------------------------- */
+
+void
+gcStablePtrTable( void )
+{
+    /* Reclaim entries whose StableName object is dead and whose ref
+       count is zero; for surviving zero-ref entries, update addr with
+       the target's post-GC location (NULL if the target died). */
+    snEntry *p, *end_stable_ptr_table;
+    StgPtr q;
+
+    end_stable_ptr_table = &stable_ptr_table[SPT_size];
+
+    // NOTE: _starting_ at index 1; index 0 is unused.
+    for (p = stable_ptr_table + 1; p < end_stable_ptr_table; p++) {
+
+	// Update the pointer to the StableName object, if there is one
+	if (p->sn_obj != NULL) {
+	    p->sn_obj = isAlive(p->sn_obj);
+	}
+
+	// Internal pointers are free slots.  If q == NULL, it's a
+	// stable name where the object has been GC'd, but the
+	// StableName object (sn_obj) is still alive.
+	q = p->addr;
+	if (q && (q < (P_)stable_ptr_table || q >= (P_)end_stable_ptr_table)) {
+
+	    // StableNames only:
+	    if (p->ref == 0) {
+		if (p->sn_obj == NULL) {
+		    // StableName object is dead
+		    freeStableName(p);
+		    IF_DEBUG(stable, debugBelch("GC'd Stable name %ld\n",
+						p - stable_ptr_table));
+		    continue;
+
+		} else {
+		    p->addr = (StgPtr)isAlive((StgClosure *)p->addr);
+		    IF_DEBUG(stable, debugBelch("Stable name %ld still alive at %p, ref %ld\n", p - stable_ptr_table, p->addr, p->ref));
+		}
+	    }
+	}
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ * Update the StablePtr/StableName hash table
+ *
+ * The boolean argument 'full' indicates that a major collection is
+ * being done, so we might as well throw away the hash table and build
+ * a new one. For a minor collection, we just re-hash the elements
+ * that changed.
+ * -------------------------------------------------------------------------- */
+
+void
+updateStablePtrTable(rtsBool full)
+{
+    /* Bring addrToStableHash back in sync with the table after GC,
+       using the 'old' addresses saved by markStablePtrTable. */
+    snEntry *p, *end_stable_ptr_table;
+
+    if (full && addrToStableHash != NULL) {
+	/* major GC: cheaper to rebuild the hash table from scratch */
+	freeHashTable(addrToStableHash,NULL);
+	addrToStableHash = allocHashTable();
+    }
+
+    end_stable_ptr_table = &stable_ptr_table[SPT_size];
+
+    // NOTE: _starting_ at index 1; index 0 is unused.
+    for (p = stable_ptr_table + 1; p < end_stable_ptr_table; p++) {
+
+	if (p->addr == NULL) {
+	    if (p->old != NULL) {
+		// The target has been garbage collected.  Remove its
+		// entry from the hash table.
+		removeHashTable(addrToStableHash, (W_)p->old, NULL);
+		p->old = NULL;
+	    }
+	}
+	else if (p->addr < (P_)stable_ptr_table
+		 || p->addr >= (P_)end_stable_ptr_table) {
+	    // Target still alive, Re-hash this stable name
+	    if (full) {
+		insertHashTable(addrToStableHash, (W_)p->addr,
+				(void *)(p - stable_ptr_table));
+	    } else if (p->addr != p->old) {
+		// minor GC and the object moved: re-key its entry
+		removeHashTable(addrToStableHash, (W_)p->old, NULL);
+		insertHashTable(addrToStableHash, (W_)p->addr,
+				(void *)(p - stable_ptr_table));
+	    }
+	}
+    }
+}
diff --git a/rts/Stats.c b/rts/Stats.c
new file mode 100644
index 0000000000..28d09bdbed
--- /dev/null
+++ b/rts/Stats.c
@@ -0,0 +1,632 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Statistics and timing-related functions.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "MBlock.h"
+#include "Schedule.h"
+#include "Stats.h"
+#include "ParTicky.h" /* ToDo: move into Rts.h */
+#include "Profiling.h"
+#include "Storage.h"
+#include "GetTime.h"
+
+/* huh? */
+#define BIG_STRING_LEN 512
+
+#define TICK_TO_DBL(t) ((double)(t) / TICKS_PER_SECOND)
+
+static Ticks ElapsedTimeStart = 0;
+
+static Ticks InitUserTime = 0;
+static Ticks InitElapsedTime = 0;
+static Ticks InitElapsedStamp = 0;
+
+static Ticks MutUserTime = 0;
+static Ticks MutElapsedTime = 0;
+static Ticks MutElapsedStamp = 0;
+
+static Ticks ExitUserTime = 0;
+static Ticks ExitElapsedTime = 0;
+
+static ullong GC_tot_alloc = 0;
+static ullong GC_tot_copied = 0;
+static ullong GC_tot_scavd_copied = 0;
+
+static Ticks GC_start_time = 0, GC_tot_time = 0; /* User GC Time */
+static Ticks GCe_start_time = 0, GCe_tot_time = 0; /* Elapsed GC time */
+
+#ifdef PROFILING
+static Ticks RP_start_time = 0, RP_tot_time = 0; /* retainer prof user time */
+static Ticks RPe_start_time = 0, RPe_tot_time = 0; /* retainer prof elap time */
+
+static Ticks HC_start_time, HC_tot_time = 0; // heap census prof user time
+static Ticks HCe_start_time, HCe_tot_time = 0; // heap census prof elap time
+#endif
+
+#ifdef PROFILING
+#define PROF_VAL(x) (x)
+#else
+#define PROF_VAL(x) 0
+#endif
+
+static lnat MaxResidency = 0; // in words; for stats only
+static lnat AvgResidency = 0;
+static lnat ResidencySamples = 0; // for stats only
+
+static lnat GC_start_faults = 0, GC_end_faults = 0;
+
+static Ticks *GC_coll_times;
+
+static void statsPrintf( char *s, ... )
+ GNUC3_ATTRIBUTE(format (printf, 1, 2));
+
+static void statsFlush( void );
+static void statsClose( void );
+
+/* Total elapsed (wall-clock) time spent in GC so far. */
+Ticks stat_getElapsedGCTime(void)
+{
+    return GCe_tot_time;
+}
+
+/* mut_user_time_during_GC() and mut_user_time()
+ *
+ * The former function can be used to get the current mutator time
+ * *during* a GC, i.e. between stat_startGC and stat_endGC. This is
+ * used in the heap profiler for accurately time stamping the heap
+ * sample.
+ *
+ * ATTENTION: mut_user_time_during_GC() relies on GC_start_time being
+ * defined in stat_startGC() - to minimise system calls,
+ * GC_start_time is, however, only defined when really needed (check
+ * stat_startGC() for details)
+ */
+double
+mut_user_time_during_GC( void )
+{
+  /* Mutator CPU time as of the start of the current GC: the CPU time at
+     GC entry minus everything previously accounted to GC (and, when
+     profiling, to retainer/heap-census profiling).  Only meaningful
+     between stat_startGC and stat_endGC -- see the comment above about
+     when GC_start_time is defined. */
+  return TICK_TO_DBL(GC_start_time - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time));
+}
+
+double
+mut_user_time( void )
+{
+  /* Current mutator CPU time: total process CPU time minus GC (and
+     profiling) overheads. */
+  Ticks user;
+  user = getProcessCPUTime();
+  return TICK_TO_DBL(user - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time));
+}
+
+#ifdef PROFILING
+/*
+  mut_user_time_during_RP() is similar to mut_user_time_during_GC();
+  it returns the MUT time during retainer profiling.
+  The same is for mut_user_time_during_HC();
+ */
+double
+mut_user_time_during_RP( void )
+{
+  return TICK_TO_DBL(RP_start_time - GC_tot_time - RP_tot_time - HC_tot_time);
+}
+
+double
+mut_user_time_during_heap_census( void )
+{
+  return TICK_TO_DBL(HC_start_time - GC_tot_time - RP_tot_time - HC_tot_time);
+}
+#endif /* PROFILING */
+
+void
+initStats(void)
+{
+    /* Print the column headers for per-GC statistics (verbose mode
+       only) and allocate the per-generation GC time accumulators. */
+    nat i;
+
+    if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
+	statsPrintf("    Alloc    Copied     Live    GC    GC     TOT     TOT  Page Flts\n");
+	statsPrintf("    bytes     bytes     bytes  user  elap    user    elap\n");
+    }
+    GC_coll_times =
+	(Ticks *)stgMallocBytes(
+	    sizeof(Ticks)*RtsFlags.GcFlags.generations,
+	    "initStats");
+    for (i = 0; i < RtsFlags.GcFlags.generations; i++) {
+	GC_coll_times[i] = 0;
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ Initialisation time...
+ -------------------------------------------------------------------------- */
+
+void
+stat_startInit(void)
+{
+    /* Record the wall-clock timestamp at RTS startup; it is the
+       baseline for all elapsed-time figures reported later. */
+    ElapsedTimeStart = getProcessElapsedTime();
+}
+
+void
+stat_endInit(void)
+{
+    /* Record CPU and elapsed time consumed by RTS initialisation. */
+    Ticks user, elapsed;
+
+    getProcessTimes(&user, &elapsed);
+
+    InitUserTime = user;
+    InitElapsedStamp = elapsed;
+    /* clamp at zero -- the elapsed clock can appear to go backwards
+       (presumably timer granularity or clock adjustment; that is why
+       this guard exists) */
+    if (ElapsedTimeStart > elapsed) {
+	InitElapsedTime = 0;
+    } else {
+	InitElapsedTime = elapsed - ElapsedTimeStart;
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ stat_startExit and stat_endExit
+
+ These two measure the time taken in shutdownHaskell().
+ -------------------------------------------------------------------------- */
+
+void
+stat_startExit(void)
+{
+    /* Called at the start of shutdownHaskell(): close off the mutator
+       accounting period.  Mutator time = total so far minus init, GC
+       and profiling overheads. */
+    Ticks user, elapsed;
+
+    getProcessTimes(&user, &elapsed);
+
+    MutElapsedStamp = elapsed;
+    MutElapsedTime = elapsed - GCe_tot_time -
+	PROF_VAL(RPe_tot_time + HCe_tot_time) - InitElapsedStamp;
+    if (MutElapsedTime < 0) { MutElapsedTime = 0; }	/* sometimes -0.00 */
+
+    MutUserTime = user - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time) - InitUserTime;
+    if (MutUserTime < 0) { MutUserTime = 0; }
+}
+
+void
+stat_endExit(void)
+{
+    /* Called at the end of shutdownHaskell(): attribute everything
+       since stat_startExit to the exit phase, clamping at zero. */
+    Ticks user, elapsed;
+
+    getProcessTimes(&user, &elapsed);
+
+    ExitUserTime = user - MutUserTime - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time) - InitUserTime;
+    ExitElapsedTime = elapsed - MutElapsedStamp;
+    if (ExitUserTime < 0) {
+	ExitUserTime = 0;
+    }
+    if (ExitElapsedTime < 0) {
+	ExitElapsedTime = 0;
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ Called at the beginning of each GC
+ -------------------------------------------------------------------------- */
+
+/* non-zero while the " GC " marker printed by -B2 is on screen, so
+   stat_endGC knows to rub it out again */
+static nat rub_bell = 0;
+
+/* initialise global variables needed during GC
+ *
+ * * GC_start_time is read in mut_user_time_during_GC(), which in turn is
+ *   needed if either PROFILING or DEBUGing is enabled
+ */
+void
+stat_startGC(void)
+{
+    nat bell = RtsFlags.GcFlags.ringBell;
+
+    if (bell) {
+	if (bell > 1) {
+	    debugBelch(" GC ");
+	    rub_bell = 1;
+	} else {
+	    debugBelch("\007");
+	}
+    }
+
+#if defined(PROFILING) || defined(DEBUG)
+    GC_start_time = getProcessCPUTime();  // needed in mut_user_time_during_GC()
+#endif
+
+    if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
+#if !defined(PROFILING) && !defined(DEBUG)
+	/* not already taken above: only pay for the system call when
+	   statistics were actually requested */
+        GC_start_time = getProcessCPUTime();
+#endif
+	GCe_start_time = getProcessElapsedTime();
+	if (RtsFlags.GcFlags.giveStats) {
+	    GC_start_faults = getPageFaults();
+	}
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ Called at the end of each GC
+ -------------------------------------------------------------------------- */
+
+/* Accumulate per-GC statistics: CPU/elapsed time for this collection,
+ * bytes allocated/copied/live (all in words, scaled by sizeof(W_) when
+ * printed), and residency samples on major GCs. With -S (verbose stats)
+ * also prints one line per GC. */
+void
+stat_endGC (lnat alloc, lnat live, lnat copied,
+ lnat scavd_copied, lnat gen)
+{
+ if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
+ Ticks time, etime, gc_time, gc_etime;
+
+ getProcessTimes(&time, &etime);
+ gc_time = time - GC_start_time;
+ gc_etime = etime - GCe_start_time;
+
+ if (RtsFlags.GcFlags.giveStats == VERBOSE_GC_STATS) {
+ nat faults = getPageFaults();
+
+ statsPrintf("%9ld %9ld %9ld",
+ alloc*sizeof(W_), (copied+scavd_copied)*sizeof(W_),
+ live*sizeof(W_));
+ statsPrintf(" %5.2f %5.2f %7.2f %7.2f %4ld %4ld (Gen: %2ld)\n",
+ TICK_TO_DBL(gc_time),
+ TICK_TO_DBL(gc_etime),
+ TICK_TO_DBL(time),
+ TICK_TO_DBL(etime - ElapsedTimeStart),
+ faults - GC_start_faults,
+ GC_start_faults - GC_end_faults,
+ gen);
+
+ GC_end_faults = faults;
+ statsFlush();
+ }
+
+ /* per-generation accumulated GC CPU time (printed by stat_exit) */
+ GC_coll_times[gen] += gc_time;
+
+ GC_tot_copied += (ullong) copied;
+ GC_tot_scavd_copied += (ullong) scavd_copied;
+ GC_tot_alloc += (ullong) alloc;
+ GC_tot_time += gc_time;
+ GCe_tot_time += gc_etime;
+
+#if defined(THREADED_RTS)
+ {
+ /* also charge the GC time to the Task that performed it, if any */
+ Task *task;
+ if ((task = myTask()) != NULL) {
+ task->gc_time += gc_time;
+ task->gc_etime += gc_etime;
+ }
+ }
+#endif
+
+ if (gen == RtsFlags.GcFlags.generations-1) { /* major GC? */
+ if (live > MaxResidency) {
+ MaxResidency = live;
+ }
+ ResidencySamples++;
+ AvgResidency += live;
+ }
+ }
+
+ /* undo the " GC " marker printed by stat_startGC with bell > 1 */
+ if (rub_bell) {
+ debugBelch("\b\b\b \b\b\b");
+ rub_bell = 0;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Called at the beginning of each Retainer Profiling
+ -------------------------------------------------------------------------- */
+#ifdef PROFILING
+/* Record the user/elapsed timestamps at the start of a retainer-profiling
+ * pass; stat_endRP() subtracts these to accumulate RP_tot_time/RPe_tot_time. */
+void
+stat_startRP(void)
+{
+ Ticks user, elapsed;
+ getProcessTimes( &user, &elapsed );
+
+ RP_start_time = user;
+ RPe_start_time = elapsed;
+}
+#endif /* PROFILING */
+
+/* -----------------------------------------------------------------------------
+ Called at the end of each Retainer Profiling
+ -------------------------------------------------------------------------- */
+
+#ifdef PROFILING
+/* Close off one retainer-profiling pass: accumulate its user/elapsed time
+ * into the RP totals and write a summary line to the profile file.
+ * The extra DEBUG_RETAINER parameters report stack high-water marks. */
+void
+stat_endRP(
+ nat retainerGeneration,
+#ifdef DEBUG_RETAINER
+ nat maxCStackSize,
+ int maxStackSize,
+#endif
+ double averageNumVisit)
+{
+ Ticks user, elapsed;
+ getProcessTimes( &user, &elapsed );
+
+ RP_tot_time += user - RP_start_time;
+ RPe_tot_time += elapsed - RPe_start_time;
+
+ fprintf(prof_file, "Retainer Profiling: %d, at %f seconds\n",
+ retainerGeneration, mut_user_time_during_RP());
+#ifdef DEBUG_RETAINER
+ fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
+ /* NOTE(review): maxStackSize is an int printed with %u — works on common
+ * ABIs but %d would match the declared type; confirm intent. */
+ fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize);
+#endif
+ fprintf(prof_file, "\tAverage number of visits per object = %f\n", averageNumVisit);
+}
+#endif /* PROFILING */
+
+/* -----------------------------------------------------------------------------
+ Called at the beginning of each heap census
+ -------------------------------------------------------------------------- */
+#ifdef PROFILING
+/* Record the user/elapsed timestamps at the start of a heap census;
+ * stat_endHeapCensus() subtracts these to accumulate HC totals. */
+void
+stat_startHeapCensus(void)
+{
+ Ticks user, elapsed;
+ getProcessTimes( &user, &elapsed );
+
+ HC_start_time = user;
+ HCe_start_time = elapsed;
+}
+#endif /* PROFILING */
+
+/* -----------------------------------------------------------------------------
+ Called at the end of each heap census
+ -------------------------------------------------------------------------- */
+#ifdef PROFILING
+/* Close off one heap census: add its duration to the heap-census (PROF)
+ * time totals reported by stat_exit(). */
+void
+stat_endHeapCensus(void)
+{
+ Ticks user, elapsed;
+ getProcessTimes( &user, &elapsed );
+
+ HC_tot_time += user - HC_start_time;
+ HCe_tot_time += elapsed - HCe_start_time;
+}
+#endif /* PROFILING */
+
+/* -----------------------------------------------------------------------------
+ Called at the end of execution
+
+ NOTE: number of allocations is not entirely accurate: it doesn't
+ take into account the few bytes at the end of the heap that
+ were left unused when the heap-check failed.
+ -------------------------------------------------------------------------- */
+
+/* Print the final statistics report, according to the -s/-S/-t flag level
+ * (SUMMARY_GC_STATS / VERBOSE_GC_STATS / ONELINE_GC_STATS), then flush and
+ * close the stats file. 'alloc' is the allocation (in words) made since
+ * the last GC, added to GC_tot_alloc for the grand total. */
+void
+stat_exit(int alloc)
+{
+ if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
+
+ char temp[BIG_STRING_LEN];
+ Ticks time;
+ Ticks etime;
+ nat g, total_collections = 0;
+
+ getProcessTimes( &time, &etime );
+ etime -= ElapsedTimeStart;
+
+ GC_tot_alloc += alloc;
+
+ /* Count total garbage collections */
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++)
+ total_collections += generations[g].collections;
+
+ /* avoid divide by zero if time is measured as 0.00 seconds -- SDM
+ * (NOTE(review): Ticks compared against the double literal 0.0 —
+ * harmless, but an integer 0 would match the type) */
+ if (time == 0.0) time = 1;
+ if (etime == 0.0) etime = 1;
+
+ if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
+ /* final line of the -S table, covering allocation since last GC */
+ statsPrintf("%9ld %9.9s %9.9s", (lnat)alloc*sizeof(W_), "", "");
+ statsPrintf(" %5.2f %5.2f\n\n", 0.0, 0.0);
+ }
+
+ if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
+ /* byte totals, comma-formatted into temp[] */
+ ullong_format_string(GC_tot_alloc*sizeof(W_),
+ temp, rtsTrue/*commas*/);
+ statsPrintf("%11s bytes allocated in the heap\n", temp);
+
+ ullong_format_string(GC_tot_copied*sizeof(W_),
+ temp, rtsTrue/*commas*/);
+ statsPrintf("%11s bytes copied during GC (scavenged)\n", temp);
+
+ ullong_format_string(GC_tot_scavd_copied*sizeof(W_),
+ temp, rtsTrue/*commas*/);
+ statsPrintf("%11s bytes copied during GC (not scavenged)\n", temp);
+
+ /* residency is only sampled on major GCs (see stat_endGC) */
+ if ( ResidencySamples > 0 ) {
+ ullong_format_string(MaxResidency*sizeof(W_),
+ temp, rtsTrue/*commas*/);
+ statsPrintf("%11s bytes maximum residency (%ld sample(s))\n",
+ temp, ResidencySamples);
+ }
+ statsPrintf("\n");
+
+ /* Print garbage collections in each gen */
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ statsPrintf("%11d collections in generation %d (%6.2fs)\n",
+ generations[g].collections, g,
+ TICK_TO_DBL(GC_coll_times[g]));
+ }
+
+ statsPrintf("\n%11ld Mb total memory in use\n\n",
+ mblocks_allocated * MBLOCK_SIZE / (1024 * 1024));
+
+#if defined(THREADED_RTS)
+ {
+ /* per-Task MUT/GC breakdown, walking the all_tasks list */
+ nat i;
+ Task *task;
+ for (i = 0, task = all_tasks;
+ task != NULL;
+ i++, task = task->all_link) {
+ statsPrintf(" Task %2d %-8s : MUT time: %6.2fs (%6.2fs elapsed)\n"
+ " GC time: %6.2fs (%6.2fs elapsed)\n\n",
+ i,
+ (task->tso == NULL) ? "(worker)" : "(bound)",
+ TICK_TO_DBL(task->mut_time),
+ TICK_TO_DBL(task->mut_etime),
+ TICK_TO_DBL(task->gc_time),
+ TICK_TO_DBL(task->gc_etime));
+ }
+ }
+#endif
+
+ /* per-phase time breakdown (INIT/MUT/GC[/RP/PROF]/EXIT/Total) */
+ statsPrintf(" INIT time %6.2fs (%6.2fs elapsed)\n",
+ TICK_TO_DBL(InitUserTime), TICK_TO_DBL(InitElapsedTime));
+ statsPrintf(" MUT time %6.2fs (%6.2fs elapsed)\n",
+ TICK_TO_DBL(MutUserTime), TICK_TO_DBL(MutElapsedTime));
+ statsPrintf(" GC time %6.2fs (%6.2fs elapsed)\n",
+ TICK_TO_DBL(GC_tot_time), TICK_TO_DBL(GCe_tot_time));
+#ifdef PROFILING
+ statsPrintf(" RP time %6.2fs (%6.2fs elapsed)\n",
+ TICK_TO_DBL(RP_tot_time), TICK_TO_DBL(RPe_tot_time));
+ statsPrintf(" PROF time %6.2fs (%6.2fs elapsed)\n",
+ TICK_TO_DBL(HC_tot_time), TICK_TO_DBL(HCe_tot_time));
+#endif
+ statsPrintf(" EXIT time %6.2fs (%6.2fs elapsed)\n",
+ TICK_TO_DBL(ExitUserTime), TICK_TO_DBL(ExitElapsedTime));
+ statsPrintf(" Total time %6.2fs (%6.2fs elapsed)\n\n",
+ TICK_TO_DBL(time), TICK_TO_DBL(etime));
+ statsPrintf(" %%GC time %5.1f%% (%.1f%% elapsed)\n\n",
+ TICK_TO_DBL(GC_tot_time)*100/TICK_TO_DBL(time),
+ TICK_TO_DBL(GCe_tot_time)*100/TICK_TO_DBL(etime));
+
+ /* allocation rate = total bytes allocated / mutator CPU time,
+ * guarding against a zero mutator time */
+ if (time - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time) == 0)
+ ullong_format_string(0, temp, rtsTrue/*commas*/);
+ else
+ ullong_format_string(
+ (ullong)((GC_tot_alloc*sizeof(W_))/
+ TICK_TO_DBL(time - GC_tot_time -
+ PROF_VAL(RP_tot_time + HC_tot_time))),
+ temp, rtsTrue/*commas*/);
+
+ statsPrintf(" Alloc rate %s bytes per MUT second\n\n", temp);
+
+ statsPrintf(" Productivity %5.1f%% of total user, %.1f%% of total elapsed\n\n",
+ TICK_TO_DBL(time - GC_tot_time -
+ PROF_VAL(RP_tot_time + HC_tot_time) - InitUserTime) * 100
+ / TICK_TO_DBL(time),
+ TICK_TO_DBL(time - GC_tot_time -
+ PROF_VAL(RP_tot_time + HC_tot_time) - InitUserTime) * 100
+ / TICK_TO_DBL(etime));
+ }
+
+ if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
+ /* print the long long separately to avoid bugginess on mingwin (2001-07-02, mingw-0.5) */
+ statsPrintf("<<ghc: %llu bytes, ", GC_tot_alloc*(ullong)sizeof(W_));
+ statsPrintf("%d GCs, %ld/%ld avg/max bytes residency (%ld samples), %luM in use, %.2f INIT (%.2f elapsed), %.2f MUT (%.2f elapsed), %.2f GC (%.2f elapsed) :ghc>>\n",
+ total_collections,
+ ResidencySamples == 0 ? 0 :
+ AvgResidency*sizeof(W_)/ResidencySamples,
+ MaxResidency*sizeof(W_),
+ ResidencySamples,
+ (unsigned long)(mblocks_allocated * MBLOCK_SIZE / (1024L * 1024L)),
+ TICK_TO_DBL(InitUserTime), TICK_TO_DBL(InitElapsedTime),
+ TICK_TO_DBL(MutUserTime), TICK_TO_DBL(MutElapsedTime),
+ TICK_TO_DBL(GC_tot_time), TICK_TO_DBL(GCe_tot_time));
+ }
+
+ statsFlush();
+ statsClose();
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ stat_describe_gens
+
+ Produce some detailed info on the state of the generational GC.
+ -------------------------------------------------------------------------- */
+#ifdef DEBUG
+/* Debug dump (DEBUG builds only): for every generation, print the mutable
+ * list size, and for every step the block count, approximate live bytes,
+ * and the number of large objects. */
+void
+statDescribeGens(void)
+{
+ nat g, s, mut, lge;
+ lnat live;
+ bdescr *bd;
+ step *step; /* NOTE(review): variable shadows the 'step' type name */
+
+ debugBelch(
+" Gen Steps Max Mutable Step Blocks Live Large\n"
+" Blocks Closures Objects\n");
+
+ mut = 0;
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ /* size of this generation's mutable list, in words */
+ for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
+ mut += bd->free - bd->start;
+ }
+
+ debugBelch("%8d %8d %8d %9d", g, generations[g].n_steps,
+ generations[g].max_blocks, mut);
+
+ for (s = 0; s < generations[g].n_steps; s++) {
+ step = &generations[g].steps[s];
+ live = 0; /* NOTE(review): immediately overwritten below */
+ for (bd = step->large_objects, lge = 0; bd; bd = bd->link) {
+ lge++;
+ }
+ live = step->n_large_blocks * BLOCK_SIZE;
+ bd = step->blocks;
+ // This live figure will be slightly less than the "live" figure
+ // given by +RTS -Sstderr, because we don't count the
+ // slop at the end of each block.
+ for (; bd; bd = bd->link) {
+ live += (bd->free - bd->start) * sizeof(W_);
+ }
+ if (s != 0) {
+ /* align continuation rows under the Step column */
+ debugBelch("%36s","");
+ }
+ debugBelch("%6d %8d %8d %8d\n", s, step->n_blocks,
+ live, lge);
+ }
+ }
+ debugBelch("\n");
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ Stats available via a programmatic interface, so eg. GHCi can time
+ each compilation and expression evaluation.
+ -------------------------------------------------------------------------- */
+
+/* Total bytes allocated so far (words * word size), for the programmatic
+ * stats interface (e.g. GHCi timing). */
+extern HsInt64 getAllocations( void )
+{ return (HsInt64)total_allocated * sizeof(W_); }
+
+/* -----------------------------------------------------------------------------
+ Dumping stuff in the stats file, or via the debug message interface
+ -------------------------------------------------------------------------- */
+
+/* printf-style output routed to the stats file if one was given with
+ * +RTS --stats-file, otherwise to the debug/error channel. */
+static void
+statsPrintf( char *s, ... )
+{
+ FILE *sf = RtsFlags.GcFlags.statsFile;
+ va_list ap;
+
+ va_start(ap,s);
+ if (sf == NULL) {
+ vdebugBelch(s,ap);
+ } else {
+ vfprintf(sf, s, ap);
+ }
+ va_end(ap);
+}
+
+/* Flush the stats file, if any (output to debugBelch needs no flushing here). */
+static void
+statsFlush( void )
+{
+ FILE *sf = RtsFlags.GcFlags.statsFile;
+ if (sf != NULL) {
+ fflush(sf);
+ }
+}
+
+/* Close the stats file, if any; called once from stat_exit(). */
+static void
+statsClose( void )
+{
+ FILE *sf = RtsFlags.GcFlags.statsFile;
+ if (sf != NULL) {
+ fclose(sf);
+ }
+}
diff --git a/rts/Stats.h b/rts/Stats.h
new file mode 100644
index 0000000000..20bc0155ad
--- /dev/null
+++ b/rts/Stats.h
@@ -0,0 +1,56 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Statistics and timing-related functions.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef STATS_H
+#define STATS_H
+
+#include "GetTime.h"
+
+void stat_startInit(void);
+void stat_endInit(void);
+
+void stat_startGC(void);
+void stat_endGC (lnat alloc, lnat live,
+ lnat copied, lnat scavd_copied, lnat gen);
+
+#ifdef PROFILING
+void stat_startRP(void);
+void stat_endRP(nat,
+#ifdef DEBUG_RETAINER
+ nat, int,
+#endif
+ double);
+#endif /* PROFILING */
+
+#if defined(PROFILING) || defined(DEBUG)
+void stat_startHeapCensus(void);
+void stat_endHeapCensus(void);
+#endif
+
+void stat_startExit(void);
+void stat_endExit(void);
+
+void stat_exit(int alloc);
+void stat_workerStop(void);
+
+void initStats(void);
+
+double mut_user_time_during_GC(void);
+double mut_user_time(void);
+
+#ifdef PROFILING
+double mut_user_time_during_RP(void);
+double mut_user_time_during_heap_census(void);
+#endif /* PROFILING */
+
+void statDescribeGens( void );
+HsInt64 getAllocations( void );
+
+Ticks stat_getElapsedGCTime(void);
+
+#endif /* STATS_H */
diff --git a/rts/StgCRun.c b/rts/StgCRun.c
new file mode 100644
index 0000000000..c1afc16559
--- /dev/null
+++ b/rts/StgCRun.c
@@ -0,0 +1,897 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2003
+ *
+ * STG-to-C glue.
+ *
+ * To run an STG function from C land, call
+ *
+ * rv = StgRun(f,BaseReg);
+ *
+ * where "f" is the STG function to call, and BaseReg is the address of the
+ * RegTable for this run (we might have separate RegTables if we're running
+ * multiple threads on an SMP machine).
+ *
+ * In the end, "f" must JMP to StgReturn (defined below),
+ * passing the return-value "rv" in R1,
+ * to return to the caller of StgRun returning "rv" in
+ * the whatever way C returns a value.
+ *
+ * NOTE: StgRun/StgReturn do *NOT* load or store Hp or any
+ * other registers (other than saving the C callee-saves
+ * registers). Instead, the called function "f" must do that
+ * in STG land.
+ *
+ * GCC will have assumed that pushing/popping of C-stack frames is
+ * going on when it generated its code, and used stack space
+ * accordingly. However, we actually {\em post-process away} all
+ * such stack-framery (see \tr{ghc/driver/ghc-asm.lprl}). Things will
+ * be OK however, if we initially make sure there are
+ * @RESERVED_C_STACK_BYTES@ on the C-stack to begin with, for local
+ * variables.
+ *
+ * -------------------------------------------------------------------------- */
+
+#include "PosixSource.h"
+
+
+/*
+ * We define the following (unused) global register variables, because for
+ * some reason gcc generates sub-optimal code for StgRun() on the Alpha
+ * (unnecessarily saving extra registers on the stack) if we don't.
+ *
+ * Why do it at the top of this file, rather than near StgRun() below? Because
+ * gcc doesn't let us define global register variables after any function
+ * definition has been read. Any point after #include "Stg.h" would be too
+ * late.
+ *
+ * We define alpha_EXTRA_CAREFUL here to save $s6, $f8 and $f9 -- registers
+ * that we don't use but which are callee-save registers. The __divq() routine
+ * in libc.a clobbers $s6.
+ */
+#include "ghcconfig.h"
+#ifdef alpha_HOST_ARCH
+#define alpha_EXTRA_CAREFUL
+register long fake_ra __asm__("$26");
+register long fake_gp __asm__("$29");
+#ifdef alpha_EXTRA_CAREFUL
+register long fake_s6 __asm__("$15");
+register double fake_f8 __asm__("$f8");
+register double fake_f9 __asm__("$f9");
+#endif
+#endif
+
+/* include Stg.h first because we want real machine regs in here: we
+ * have to get the value of R1 back from Stg land to C land intact.
+ */
+#include "Stg.h"
+#include "Rts.h"
+#include "StgRun.h"
+#include "RtsFlags.h"
+#include "OSThreads.h"
+#include "Capability.h"
+
+#ifdef DEBUG
+#include "RtsUtils.h"
+#include "Printer.h"
+#endif
+
+#ifdef USE_MINIINTERPRETER
+
+/* -----------------------------------------------------------------------------
+ any architecture (using miniinterpreter)
+ -------------------------------------------------------------------------- */
+
+/* Miniinterpreter version of StgRun: instead of a real tail-jump, each STG
+ * "function" returns its continuation, which we keep calling until NULL,
+ * then fetch the result from R1. */
+StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg STG_UNUSED)
+{
+ while (f) {
+ IF_DEBUG(interpreter,
+ debugBelch("Jumping to ");
+ printPtr((P_)f); fflush(stdout);
+ debugBelch("\n");
+ );
+ f = (StgFunPtr) (f)();
+ }
+ return (StgRegTable *)R1.p;
+}
+
+/* Returning NULL terminates the dispatch loop in StgRun above. */
+StgFunPtr StgReturn(void)
+{
+ return 0;
+}
+
+#else /* !USE_MINIINTERPRETER */
+
+#ifdef LEADING_UNDERSCORE
+#define STG_RETURN "_StgReturn"
+#else
+#define STG_RETURN "StgReturn"
+#endif
+
+/* -----------------------------------------------------------------------------
+ x86 architecture
+ -------------------------------------------------------------------------- */
+
+#ifdef i386_HOST_ARCH
+
+#ifdef darwin_TARGET_OS
+#define STG_GLOBAL ".globl "
+#else
+#define STG_GLOBAL ".global "
+#endif
+
+/* i386 StgRun: reserve RESERVED_C_STACK_BYTES plus room for four saved
+ * registers on the C stack, save the C callee-saves regs (ebx/esi/edi/ebp)
+ * there, load BaseReg into %ebx and jump into STG code. STG code jumps
+ * back to the StgReturn label defined inside this asm block. */
+StgRegTable *
+StgRun(StgFunPtr f, StgRegTable *basereg) {
+
+ unsigned char space[ RESERVED_C_STACK_BYTES + 4*sizeof(void *) ];
+ StgRegTable * r;
+
+ __asm__ volatile (
+ /*
+ * save callee-saves registers on behalf of the STG code.
+ */
+ "movl %%esp, %%eax\n\t"
+ "addl %4, %%eax\n\t"
+ "movl %%ebx,0(%%eax)\n\t"
+ "movl %%esi,4(%%eax)\n\t"
+ "movl %%edi,8(%%eax)\n\t"
+ "movl %%ebp,12(%%eax)\n\t"
+ /*
+ * Set BaseReg
+ */
+ "movl %3,%%ebx\n\t"
+ /*
+ * grab the function argument from the stack
+ */
+ "movl %2,%%eax\n\t"
+
+ /*
+ * Darwin note:
+ * The stack pointer has to be aligned to a multiple of 16 bytes at
+ * this point. This works out correctly with gcc 4.0.1, but it might
+ * break at any time in the future. TODO: Make this future-proof.
+ */
+
+ /*
+ * jump to it
+ */
+ "jmp *%%eax\n\t"
+
+ STG_GLOBAL STG_RETURN "\n"
+ STG_RETURN ":\n\t"
+
+ "movl %%esi, %%eax\n\t" /* Return value in R1 */
+
+ /*
+ * restore callee-saves registers. (Don't stomp on %%eax!)
+ */
+ "movl %%esp, %%edx\n\t"
+ "addl %4, %%edx\n\t"
+ "movl 0(%%edx),%%ebx\n\t" /* restore the registers saved above */
+ "movl 4(%%edx),%%esi\n\t"
+ "movl 8(%%edx),%%edi\n\t"
+ "movl 12(%%edx),%%ebp\n\t"
+
+ : "=&a" (r), "=m" (space)
+ : "m" (f), "m" (basereg), "i" (RESERVED_C_STACK_BYTES)
+ : "edx" /* stomps on %edx */
+ );
+
+ return r;
+}
+
+#endif
+
+/* ----------------------------------------------------------------------------
+ x86-64 is almost the same as plain x86.
+
+ I've done it using entirely inline assembler, because I couldn't
+ get gcc to generate the correct subtraction from %rsp by using
+ the local array variable trick. It didn't seem to reserve
+ enough space. Oh well, it's not much harder this way.
+
+ ------------------------------------------------------------------------- */
+
+#ifdef x86_64_HOST_ARCH
+
+extern StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg);
+
+/* x86-64 StgRun: defined entirely in inline asm (the function below only
+ * emits the code; it is never called). Saves the six callee-save GP regs
+ * above the reserved STG stack area, puts BaseReg (arg 2, %rsi) into %r13,
+ * and jumps to f (arg 1, %rdi). See the HACK comment below for why the
+ * frame size includes an extra 8 bytes. */
+static void GNUC3_ATTRIBUTE(used)
+StgRunIsImplementedInAssembler(void)
+{
+ __asm__ volatile (
+ /*
+ * save callee-saves registers on behalf of the STG code.
+ */
+ ".globl StgRun\n"
+ "StgRun:\n\t"
+ "subq %0, %%rsp\n\t"
+ "movq %%rsp, %%rax\n\t"
+ "addq %0-48, %%rax\n\t"
+ "movq %%rbx,0(%%rax)\n\t"
+ "movq %%rbp,8(%%rax)\n\t"
+ "movq %%r12,16(%%rax)\n\t"
+ "movq %%r13,24(%%rax)\n\t"
+ "movq %%r14,32(%%rax)\n\t"
+ "movq %%r15,40(%%rax)\n\t"
+ /*
+ * Set BaseReg
+ */
+ "movq %%rsi,%%r13\n\t"
+ /*
+ * grab the function argument from the stack, and jump to it.
+ */
+ "movq %%rdi,%%rax\n\t"
+ "jmp *%%rax\n\t"
+
+ ".global " STG_RETURN "\n"
+ STG_RETURN ":\n\t"
+
+ "movq %%rbx, %%rax\n\t" /* Return value in R1 */
+
+ /*
+ * restore callee-saves registers. (Don't stomp on %%rax!)
+ */
+ "movq %%rsp, %%rdx\n\t"
+ "addq %0-48, %%rdx\n\t"
+ "movq 0(%%rdx),%%rbx\n\t" /* restore the registers saved above */
+ "movq 8(%%rdx),%%rbp\n\t"
+ "movq 16(%%rdx),%%r12\n\t"
+ "movq 24(%%rdx),%%r13\n\t"
+ "movq 32(%%rdx),%%r14\n\t"
+ "movq 40(%%rdx),%%r15\n\t"
+ "addq %0, %%rsp\n\t"
+ "retq"
+
+ : : "i"(RESERVED_C_STACK_BYTES+48+8 /*stack frame size*/));
+ /*
+ HACK alert!
+
+ The x86_64 ABI specifies that on a procedure call, %rsp is
+ aligned on a 16-byte boundary + 8. That is, the first
+ argument on the stack after the return address will be
+ 16-byte aligned.
+
+ Which should be fine: RESERVED_C_STACK_BYTES+48 is a multiple
+ of 16 bytes.
+
+ BUT... when we do a C-call from STG land, gcc likes to put the
+ stack alignment adjustment in the prolog. eg. if we're calling
+ a function with arguments in regs, gcc will insert 'subq $8,%rsp'
+ in the prolog, to keep %rsp aligned (the return address is 8
+ bytes, remember). The mangler throws away the prolog, so we
+ lose the stack alignment.
+
+ The hack is to add this extra 8 bytes to our %rsp adjustment
+ here, so that throughout STG code, %rsp is 16-byte aligned,
+ ready for a C-call.
+
+ A quick way to see if this is wrong is to compile this code:
+
+ main = System.Exit.exitWith ExitSuccess
+
+ And run it with +RTS -sstderr. The stats code in the RTS, in
+ particular statsPrintf(), relies on the stack alignment because
+ it saves the %xmm regs on the stack, so it'll fall over if the
+ stack isn't aligned, and calling exitWith from Haskell invokes
+ shutdownHaskellAndExit using a C call.
+
+ Future gcc releases will almost certainly break this hack...
+ */
+}
+
+#endif /* x86-64 */
+
+/* -----------------------------------------------------------------------------
+ Sparc architecture
+
+ --
+ OLD COMMENT from GHC-3.02:
+
+ We want tailjumps to be calls, because `call xxx' is the only Sparc
+ branch that allows an arbitrary label as a target. (Gcc's ``goto
+ *target'' construct ends up loading the label into a register and
+ then jumping, at the cost of two extra instructions for the 32-bit
+ load.)
+
+ When entering the threaded world, we stash our return address in a
+ known location so that \tr{%i7} is available as an extra
+ callee-saves register. Of course, we have to restore this when
+ coming out of the threaded world.
+
+ I hate this god-forsaken architecture. Since the top of the
+ reserved stack space is used for globals and the bottom is reserved
+ for outgoing arguments, we have to stick our return address
+ somewhere in the middle. Currently, I'm allowing 100 extra
+ outgoing arguments beyond the first 6. --JSM
+
+ Updated info (GHC 4.06): we don't appear to use %i7 any more, so
+ I'm not sure whether we still need to save it. Incidentally, what
+ does the last paragraph above mean when it says "the top of the
+ stack is used for globals"? What globals? --SDM
+
+ Updated info (GHC 4.08.2): not saving %i7 any more (see below).
+ -------------------------------------------------------------------------- */
+
+#ifdef sparc_HOST_ARCH
+
+/* Sparc StgRun: reserves the STG C-stack area as a local array, calls f()
+ * directly (tailjumps are calls on Sparc — see the comment above), and
+ * places the StgReturn label after the call. The result comes back in R1. */
+StgRegTable *
+StgRun(StgFunPtr f, StgRegTable *basereg) {
+
+ unsigned char space[RESERVED_C_STACK_BYTES];
+#if 0
+ register void *i7 __asm__("%i7");
+ ((void **)(space))[100] = i7;
+#endif
+ f();
+ __asm__ volatile (
+ ".align 4\n"
+ ".global " STG_RETURN "\n"
+ STG_RETURN ":"
+ : : : "l0","l1","l2","l3","l4","l5","l6","l7");
+ /* we tell the C compiler that l0-l7 are clobbered on return to
+ * StgReturn, otherwise it tries to use these to save eg. the
+ * address of space[100] across the call. The correct thing
+ * to do would be to save all the callee-saves regs, but we
+ * can't be bothered to do that.
+ *
+ * The code that gcc generates for this little fragment is now
+ * terrible. We could do much better by coding it directly in
+ * assembler.
+ */
+#if 0
+ /* updated 4.08.2: we don't save %i7 in the middle of the reserved
+ * space any more, since gcc tries to save its address across the
+ * call to f(), this gets clobbered in STG land and we end up
+ * dereferencing a bogus pointer in StgReturn.
+ */
+ __asm__ volatile ("ld %1,%0"
+ : "=r" (i7) : "m" (((void **)(space))[100]));
+#endif
+ return (StgRegTable *)R1.i;
+}
+
+#endif
+
+/* -----------------------------------------------------------------------------
+ alpha architecture
+
+ "The stack pointer (SP) must at all times denote an address that has octaword
+ alignment. (This restriction has the side effect that the in-memory portion
+ of the argument list, if any, will start on an octaword boundary.) Note that
+ the stack grows toward lower addresses. During a procedure invocation, SP
+ can never be set to a value that is higher than the value of SP at entry to
+ that procedure invocation.
+
+ "The contents of the stack, located above the portion of the argument list
+ (if any) that is passed in memory, belong to the calling procedure. Because
+ they are part of the calling procedure, they should not be read or written
+ by the called procedure, except as specified by indirect arguments or
+ language-controlled up-level references.
+
+ "The SP value might be used by the hardware when raising exceptions and
+ asynchronous interrupts. It must be assumed that the contents of the stack
+ below the current SP value and within the stack for the current thread are
+ continually and unpredictably modified, as specified in the _Alpha
+ Architecture Reference Manual_, and as a result of asynchronous software
+ actions."
+
+ -- Compaq Computer Corporation, Houston. Tru64 UNIX Calling Standard for
+ Alpha Systems, 5.1 edition, August 2000, section 3.2.1. http://www.
+ tru64unix.compaq.com/docs/base_doc/DOCUMENTATION/V51_PDF/ARH9MBTE.PDF
+ -------------------------------------------------------------------------- */
+
+#ifdef alpha_HOST_ARCH
+
+/* Alpha StgRun: manually spill every callee-save integer and FP register
+ * (plus $ra/$gp, and $s6/$f8/$f9 under alpha_EXTRA_CAREFUL) into volatile
+ * locals, drop the stack by RESERVED_C_STACK_BYTES, jump to f via $27 (pv),
+ * and restore everything after STG code jumps back to StgReturn. */
+StgRegTable *
+StgRun(StgFunPtr f, StgRegTable *basereg)
+{
+ register long real_ra __asm__("$26"); volatile long save_ra;
+ register long real_gp __asm__("$29"); volatile long save_gp;
+
+ register long real_s0 __asm__("$9" ); volatile long save_s0;
+ register long real_s1 __asm__("$10"); volatile long save_s1;
+ register long real_s2 __asm__("$11"); volatile long save_s2;
+ register long real_s3 __asm__("$12"); volatile long save_s3;
+ register long real_s4 __asm__("$13"); volatile long save_s4;
+ register long real_s5 __asm__("$14"); volatile long save_s5;
+#ifdef alpha_EXTRA_CAREFUL
+ register long real_s6 __asm__("$15"); volatile long save_s6;
+#endif
+
+ register double real_f2 __asm__("$f2"); volatile double save_f2;
+ register double real_f3 __asm__("$f3"); volatile double save_f3;
+ register double real_f4 __asm__("$f4"); volatile double save_f4;
+ register double real_f5 __asm__("$f5"); volatile double save_f5;
+ register double real_f6 __asm__("$f6"); volatile double save_f6;
+ register double real_f7 __asm__("$f7"); volatile double save_f7;
+#ifdef alpha_EXTRA_CAREFUL
+ register double real_f8 __asm__("$f8"); volatile double save_f8;
+ register double real_f9 __asm__("$f9"); volatile double save_f9;
+#endif
+
+ register StgFunPtr real_pv __asm__("$27");
+
+ StgRegTable * ret;
+
+ save_ra = real_ra;
+ save_gp = real_gp;
+
+ save_s0 = real_s0;
+ save_s1 = real_s1;
+ save_s2 = real_s2;
+ save_s3 = real_s3;
+ save_s4 = real_s4;
+ save_s5 = real_s5;
+#ifdef alpha_EXTRA_CAREFUL
+ save_s6 = real_s6;
+#endif
+
+ save_f2 = real_f2;
+ save_f3 = real_f3;
+ save_f4 = real_f4;
+ save_f5 = real_f5;
+ save_f6 = real_f6;
+ save_f7 = real_f7;
+#ifdef alpha_EXTRA_CAREFUL
+ save_f8 = real_f8;
+ save_f9 = real_f9;
+#endif
+
+ /* Alpha calling convention: callee address goes in $27 (pv). */
+ real_pv = f;
+
+ __asm__ volatile( "lda $30,-%0($30)" "\n"
+ "\t" "jmp ($27)" "\n"
+ "\t" ".align 3" "\n"
+ ".globl " STG_RETURN "\n"
+ STG_RETURN ":" "\n"
+ "\t" "lda $30,%0($30)" "\n"
+ : : "K" (RESERVED_C_STACK_BYTES));
+
+ /* Return value comes back in $14 (s5); see the R1 mapping for Alpha.
+ * NOTE(review): implicit long -> pointer conversion here. */
+ ret = real_s5;
+
+ real_s0 = save_s0;
+ real_s1 = save_s1;
+ real_s2 = save_s2;
+ real_s3 = save_s3;
+ real_s4 = save_s4;
+ real_s5 = save_s5;
+#ifdef alpha_EXTRA_CAREFUL
+ real_s6 = save_s6;
+#endif
+
+ real_f2 = save_f2;
+ real_f3 = save_f3;
+ real_f4 = save_f4;
+ real_f5 = save_f5;
+ real_f6 = save_f6;
+ real_f7 = save_f7;
+#ifdef alpha_EXTRA_CAREFUL
+ real_f8 = save_f8;
+ real_f9 = save_f9;
+#endif
+
+ real_ra = save_ra;
+ real_gp = save_gp;
+
+ return ret;
+}
+
+#endif /* alpha_HOST_ARCH */
+
+/* -----------------------------------------------------------------------------
+ HP-PA architecture
+ -------------------------------------------------------------------------- */
+
+#ifdef hppa1_1_HOST_ARCH
+
+/* HP-PA StgRun: save callee-save integer regs r3-r18 and FP regs fr12-fr21
+ * into the reserved stack area above RESERVED_C_STACK_BYTES, call f(), and
+ * restore them after STG code returns to the StgReturn label; the result
+ * is copied out of r11 (R1 on this platform). */
+StgRegTable *
+StgRun(StgFunPtr f, StgRegTable *basereg)
+{
+ StgChar space[RESERVED_C_STACK_BYTES+16*sizeof(long)+10*sizeof(double)];
+ StgRegTable * ret;
+
+ __asm__ volatile ("ldo %0(%%r30),%%r19\n"
+ "\tstw %%r3, 0(0,%%r19)\n"
+ "\tstw %%r4, 4(0,%%r19)\n"
+ "\tstw %%r5, 8(0,%%r19)\n"
+ "\tstw %%r6,12(0,%%r19)\n"
+ "\tstw %%r7,16(0,%%r19)\n"
+ "\tstw %%r8,20(0,%%r19)\n"
+ "\tstw %%r9,24(0,%%r19)\n"
+ "\tstw %%r10,28(0,%%r19)\n"
+ "\tstw %%r11,32(0,%%r19)\n"
+ "\tstw %%r12,36(0,%%r19)\n"
+ "\tstw %%r13,40(0,%%r19)\n"
+ "\tstw %%r14,44(0,%%r19)\n"
+ "\tstw %%r15,48(0,%%r19)\n"
+ "\tstw %%r16,52(0,%%r19)\n"
+ "\tstw %%r17,56(0,%%r19)\n"
+ "\tstw %%r18,60(0,%%r19)\n"
+ "\tldo 80(%%r19),%%r19\n"
+ "\tfstds %%fr12,-16(0,%%r19)\n"
+ "\tfstds %%fr13, -8(0,%%r19)\n"
+ "\tfstds %%fr14, 0(0,%%r19)\n"
+ "\tfstds %%fr15, 8(0,%%r19)\n"
+ "\tldo 32(%%r19),%%r19\n"
+ "\tfstds %%fr16,-16(0,%%r19)\n"
+ "\tfstds %%fr17, -8(0,%%r19)\n"
+ "\tfstds %%fr18, 0(0,%%r19)\n"
+ "\tfstds %%fr19, 8(0,%%r19)\n"
+ "\tldo 32(%%r19),%%r19\n"
+ "\tfstds %%fr20,-16(0,%%r19)\n"
+ "\tfstds %%fr21, -8(0,%%r19)\n" : :
+ "n" (-(116 * sizeof(long) + 10 * sizeof(double))) : "%r19"
+ );
+
+ f();
+
+ __asm__ volatile (".align 4\n"
+ "\t.EXPORT " STG_RETURN ",CODE\n"
+ "\t.EXPORT " STG_RETURN ",ENTRY,PRIV_LEV=3\n"
+ STG_RETURN "\n"
+ /* "\tldo %0(%%r3),%%r19\n" */
+ "\tldo %1(%%r30),%%r19\n"
+ "\tcopy %%r11, %0\n" /* save R1 */
+ "\tldw 0(0,%%r19),%%r3\n"
+ "\tldw 4(0,%%r19),%%r4\n"
+ "\tldw 8(0,%%r19),%%r5\n"
+ "\tldw 12(0,%%r19),%%r6\n"
+ "\tldw 16(0,%%r19),%%r7\n"
+ "\tldw 20(0,%%r19),%%r8\n"
+ "\tldw 24(0,%%r19),%%r9\n"
+ "\tldw 28(0,%%r19),%%r10\n"
+ "\tldw 32(0,%%r19),%%r11\n"
+ "\tldw 36(0,%%r19),%%r12\n"
+ "\tldw 40(0,%%r19),%%r13\n"
+ "\tldw 44(0,%%r19),%%r14\n"
+ "\tldw 48(0,%%r19),%%r15\n"
+ "\tldw 52(0,%%r19),%%r16\n"
+ "\tldw 56(0,%%r19),%%r17\n"
+ "\tldw 60(0,%%r19),%%r18\n"
+ "\tldo 80(%%r19),%%r19\n"
+ "\tfldds -16(0,%%r19),%%fr12\n"
+ "\tfldds -8(0,%%r19),%%fr13\n"
+ "\tfldds 0(0,%%r19),%%fr14\n"
+ "\tfldds 8(0,%%r19),%%fr15\n"
+ "\tldo 32(%%r19),%%r19\n"
+ "\tfldds -16(0,%%r19),%%fr16\n"
+ "\tfldds -8(0,%%r19),%%fr17\n"
+ "\tfldds 0(0,%%r19),%%fr18\n"
+ "\tfldds 8(0,%%r19),%%fr19\n"
+ "\tldo 32(%%r19),%%r19\n"
+ "\tfldds -16(0,%%r19),%%fr20\n"
+ "\tfldds -8(0,%%r19),%%fr21\n"
+ : "=r" (ret)
+ : "n" (-(116 * sizeof(long) + 10 * sizeof(double)))
+ : "%r19"
+ );
+
+ return ret;
+}
+
+#endif /* hppa1_1_HOST_ARCH */
+
+/* -----------------------------------------------------------------------------
+ PowerPC architecture
+
+ Everything is in assembler, so we don't have to deal with GCC...
+
+ -------------------------------------------------------------------------- */
+
+#ifdef powerpc_HOST_ARCH
+
+extern StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg);
+
+#ifdef darwin_HOST_OS
+/* PowerPC (Darwin) StgRun: emitted entirely as asm; uses the Darwin
+ * saveFP/restFP helpers for the FP callee-saves and stmw/lmw for r13-r31.
+ * BaseReg lives in r27; the result returns in r14 (R1). */
+void StgRunIsImplementedInAssembler(void)
+{
+#if HAVE_SUBSECTIONS_VIA_SYMBOLS
+ // if the toolchain supports deadstripping, we have to
+ // prevent it here (it tends to get confused here).
+ __asm__ volatile (".no_dead_strip _StgRunIsImplementedInAssembler");
+#endif
+ __asm__ volatile (
+ "\n.globl _StgRun\n"
+ "_StgRun:\n"
+ "\tmflr r0\n"
+ "\tbl saveFP # f14\n"
+ "\tstmw r13,-220(r1)\n"
+ "\tstwu r1,-%0(r1)\n"
+ "\tmr r27,r4\n" // BaseReg == r27
+ "\tmtctr r3\n"
+ "\tmr r12,r3\n"
+ "\tbctr\n"
+ ".globl _StgReturn\n"
+ "_StgReturn:\n"
+ "\tmr r3,r14\n"
+ "\tla r1,%0(r1)\n"
+ "\tlmw r13,-220(r1)\n"
+ "\tb restFP # f14\n"
+ : : "i"(RESERVED_C_STACK_BYTES+224 /*stack frame size*/));
+}
+#else
+
+// This version is for PowerPC Linux.
+
+// Differences from the Darwin/Mac OS X version:
+// *) Different Assembler Syntax
+// *) Doesn't use Register Saving Helper Functions (although they exist somewhere)
+// *) We may not access positive stack offsets
+// (no "Red Zone" as in the Darwin ABI)
+// *) The Link Register is saved to a different offset in the caller's stack frame
+// (Linux: 4(r1), Darwin 8(r1))
+
+/* PowerPC (Linux) StgRun: like the Darwin version above but with ELF asm
+ * syntax and explicit stfd/lfd for the FP callee-saves (no saveFP/restFP
+ * helpers); the link register is saved at 4(r1) per the Linux ABI. */
+static void GNUC3_ATTRIBUTE(used)
+StgRunIsImplementedInAssembler(void)
+{
+ __asm__ volatile (
+ "\t.globl StgRun\n"
+ "\t.type StgRun,@function\n"
+ "StgRun:\n"
+ "\tmflr 0\n"
+ "\tstw 0,4(1)\n"
+ "\tmr 5,1\n"
+ "\tstwu 1,-%0(1)\n"
+ "\tstmw 13,-220(5)\n"
+ "\tstfd 14,-144(5)\n"
+ "\tstfd 15,-136(5)\n"
+ "\tstfd 16,-128(5)\n"
+ "\tstfd 17,-120(5)\n"
+ "\tstfd 18,-112(5)\n"
+ "\tstfd 19,-104(5)\n"
+ "\tstfd 20,-96(5)\n"
+ "\tstfd 21,-88(5)\n"
+ "\tstfd 22,-80(5)\n"
+ "\tstfd 23,-72(5)\n"
+ "\tstfd 24,-64(5)\n"
+ "\tstfd 25,-56(5)\n"
+ "\tstfd 26,-48(5)\n"
+ "\tstfd 27,-40(5)\n"
+ "\tstfd 28,-32(5)\n"
+ "\tstfd 29,-24(5)\n"
+ "\tstfd 30,-16(5)\n"
+ "\tstfd 31,-8(5)\n"
+ "\tmr 27,4\n" // BaseReg == r27
+ "\tmtctr 3\n"
+ "\tmr 12,3\n"
+ "\tbctr\n"
+ ".globl StgReturn\n"
+ "\t.type StgReturn,@function\n"
+ "StgReturn:\n"
+ "\tmr 3,14\n"
+ "\tla 5,%0(1)\n"
+ "\tlmw 13,-220(5)\n"
+ "\tlfd 14,-144(5)\n"
+ "\tlfd 15,-136(5)\n"
+ "\tlfd 16,-128(5)\n"
+ "\tlfd 17,-120(5)\n"
+ "\tlfd 18,-112(5)\n"
+ "\tlfd 19,-104(5)\n"
+ "\tlfd 20,-96(5)\n"
+ "\tlfd 21,-88(5)\n"
+ "\tlfd 22,-80(5)\n"
+ "\tlfd 23,-72(5)\n"
+ "\tlfd 24,-64(5)\n"
+ "\tlfd 25,-56(5)\n"
+ "\tlfd 26,-48(5)\n"
+ "\tlfd 27,-40(5)\n"
+ "\tlfd 28,-32(5)\n"
+ "\tlfd 29,-24(5)\n"
+ "\tlfd 30,-16(5)\n"
+ "\tlfd 31,-8(5)\n"
+ "\tmr 1,5\n"
+ "\tlwz 0,4(1)\n"
+ "\tmtlr 0\n"
+ "\tblr\n"
+ : : "i"(RESERVED_C_STACK_BYTES+224 /*stack frame size*/));
+}
+#endif
+
+#endif
+
+/* -----------------------------------------------------------------------------
+ PowerPC 64 architecture
+
+ Everything is in assembler, so we don't have to deal with GCC...
+
+ -------------------------------------------------------------------------- */
+
+#ifdef powerpc64_HOST_ARCH
+
+#ifdef linux_HOST_OS
+extern StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg);
+
+static void GNUC3_ATTRIBUTE(used)
+StgRunIsImplementedInAssembler(void)
+{
+ // r0 volatile
+ // r1 stack pointer
+ // r2 toc - needs to be saved
+ // r3-r10 argument passing, volatile
+ // r11, r12 very volatile (not saved across cross-module calls)
+ // r13 thread local state (never modified, don't need to save)
+ // r14-r31 callee-save
+ /* PPC64 ELF (v1) StgRun/StgReturn.  The .opd section holds the two
+  * function descriptors (entry address, TOC, environment); the real
+  * code lives at the dot-prefixed labels.  r5 keeps the caller's SP so
+  * TOC (r2), r14-r31 and f14-f31 can be saved relative to it inside the
+  * new frame. */
+ __asm__ volatile (
+ ".section \".opd\",\"aw\"\n"
+ ".align 3\n"
+ ".globl StgRun\n"
+ "StgRun:\n"
+ "\t.quad\t.StgRun,.TOC.@tocbase,0\n"
+ "\t.size StgRun,24\n"
+ ".globl StgReturn\n"
+ "StgReturn:\n"
+ "\t.quad\t.StgReturn,.TOC.@tocbase,0\n"
+ "\t.size StgReturn,24\n"
+ ".previous\n"
+ ".globl .StgRun\n"
+ ".type .StgRun,@function\n"
+ ".StgRun:\n"
+ "\tmflr 0\n"
+ "\tmr 5, 1\n"
+ "\tstd 0, 16(1)\n"             // LR save slot is 16(r1) in the 64-bit ABI
+ "\tstdu 1, -%0(1)\n"
+ "\tstd 2, -296(5)\n"           // save TOC pointer
+ "\tstd 14, -288(5)\n"
+ "\tstd 15, -280(5)\n"
+ "\tstd 16, -272(5)\n"
+ "\tstd 17, -264(5)\n"
+ "\tstd 18, -256(5)\n"
+ "\tstd 19, -248(5)\n"
+ "\tstd 20, -240(5)\n"
+ "\tstd 21, -232(5)\n"
+ "\tstd 22, -224(5)\n"
+ "\tstd 23, -216(5)\n"
+ "\tstd 24, -208(5)\n"
+ "\tstd 25, -200(5)\n"
+ "\tstd 26, -192(5)\n"
+ "\tstd 27, -184(5)\n"
+ "\tstd 28, -176(5)\n"
+ "\tstd 29, -168(5)\n"
+ "\tstd 30, -160(5)\n"
+ "\tstd 31, -152(5)\n"
+ "\tstfd 14, -144(5)\n"
+ "\tstfd 15, -136(5)\n"
+ "\tstfd 16, -128(5)\n"
+ "\tstfd 17, -120(5)\n"
+ "\tstfd 18, -112(5)\n"
+ "\tstfd 19, -104(5)\n"
+ "\tstfd 20, -96(5)\n"
+ "\tstfd 21, -88(5)\n"
+ "\tstfd 22, -80(5)\n"
+ "\tstfd 23, -72(5)\n"
+ "\tstfd 24, -64(5)\n"
+ "\tstfd 25, -56(5)\n"
+ "\tstfd 26, -48(5)\n"
+ "\tstfd 27, -40(5)\n"
+ "\tstfd 28, -32(5)\n"
+ "\tstfd 29, -24(5)\n"
+ "\tstfd 30, -16(5)\n"
+ "\tstfd 31, -8(5)\n"
+ "\tmr 27, 4\n" // BaseReg == r27
+ "\tld 2, 8(3)\n"               // r3 is a function descriptor: load its TOC...
+ "\tld 3, 0(3)\n"               // ...and its entry address
+ "\tmtctr 3\n"
+ "\tbctr\n"
+ ".globl .StgReturn\n"
+ ".type .StgReturn,@function\n"
+ ".StgReturn:\n"
+ "\tmr 3,14\n"                  // return value comes back in R1 == r14
+ "\tla 5, %0(1)\n" // load address == addi r5, r1, %0
+ "\tld 2, -296(5)\n"
+ "\tld 14, -288(5)\n"
+ "\tld 15, -280(5)\n"
+ "\tld 16, -272(5)\n"
+ "\tld 17, -264(5)\n"
+ "\tld 18, -256(5)\n"
+ "\tld 19, -248(5)\n"
+ "\tld 20, -240(5)\n"
+ "\tld 21, -232(5)\n"
+ "\tld 22, -224(5)\n"
+ "\tld 23, -216(5)\n"
+ "\tld 24, -208(5)\n"
+ "\tld 25, -200(5)\n"
+ "\tld 26, -192(5)\n"
+ "\tld 27, -184(5)\n"
+ "\tld 28, -176(5)\n"
+ "\tld 29, -168(5)\n"
+ "\tld 30, -160(5)\n"
+ "\tld 31, -152(5)\n"
+ "\tlfd 14, -144(5)\n"
+ "\tlfd 15, -136(5)\n"
+ "\tlfd 16, -128(5)\n"
+ "\tlfd 17, -120(5)\n"
+ "\tlfd 18, -112(5)\n"
+ "\tlfd 19, -104(5)\n"
+ "\tlfd 20, -96(5)\n"
+ "\tlfd 21, -88(5)\n"
+ "\tlfd 22, -80(5)\n"
+ "\tlfd 23, -72(5)\n"
+ "\tlfd 24, -64(5)\n"
+ "\tlfd 25, -56(5)\n"
+ "\tlfd 26, -48(5)\n"
+ "\tlfd 27, -40(5)\n"
+ "\tlfd 28, -32(5)\n"
+ "\tlfd 29, -24(5)\n"
+ "\tlfd 30, -16(5)\n"
+ "\tlfd 31, -8(5)\n"
+ "\tmr 1, 5\n"
+ "\tld 0, 16(1)\n"
+ "\tmtlr 0\n"
+ "\tblr\n"
+ : : "i"(RESERVED_C_STACK_BYTES+304 /*stack frame size*/));
+}
+#else // linux_HOST_OS
+#error Only linux support for power64 right now.
+#endif
+
+#endif
+
+/* -----------------------------------------------------------------------------
+ IA64 architecture
+
+ Again, in assembler - so we can fiddle with the register stack, and because
+ gcc doesn't handle asm-clobbered callee-saves correctly.
+
+ loc0 - loc15: preserved locals
+ loc16 - loc28: STG registers
+ loc29: saved ar.pfs
+ loc30: saved b0
+ loc31: saved gp (gcc 3.3 uses this slot)
+ -------------------------------------------------------------------------- */
+
+#ifdef ia64_HOST_ARCH
+
+/* the memory stack is rarely used, so 16K is excessive */
+#undef RESERVED_C_STACK_BYTES
+#define RESERVED_C_STACK_BYTES 1024
+
+#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
+/* gcc 3.3+: leave an extra slot for gp saves */
+#define LOCALS 32
+#else
+#define LOCALS 31
+#endif
+
+static void GNUC3_ATTRIBUTE(used)
+StgRunIsImplementedInAssembler(void)
+{
+ /* IA64 StgRun/StgReturn.  The register stack engine handles the
+  * general registers (alloc reserves %1 == LOCALS locals); only the
+  * callee-saved FP registers f16-f21 are spilled/filled manually,
+  * interleaved through two pointers (r16/r17) advancing in 32-byte
+  * steps.  ar.pfs and b0 are stashed in loc29/loc30 as per the header
+  * comment above. */
+ __asm__ volatile(
+ ".global StgRun\n"
+ "StgRun:\n"
+ "\talloc loc29 = ar.pfs, 0, %1, 8, 0\n" /* setup register frame */
+ "\tld8 r18 = [r32],8\n" /* get procedure address */
+ "\tadds sp = -%0, sp ;;\n" /* setup stack */
+ "\tld8 gp = [r32]\n" /* get procedure GP */
+ "\tadds r16 = %0-(6*16), sp\n"
+ "\tadds r17 = %0-(5*16), sp ;;\n"
+ "\tstf.spill [r16] = f16,32\n" /* spill callee-saved fp regs */
+ "\tstf.spill [r17] = f17,32\n"
+ "\tmov b6 = r18 ;;\n" /* set target address */
+ "\tstf.spill [r16] = f18,32\n"
+ "\tstf.spill [r17] = f19,32\n"
+ "\tmov loc30 = b0 ;;\n" /* save return address */
+ "\tstf.spill [r16] = f20,32\n"
+ "\tstf.spill [r17] = f21,32\n"
+ "\tbr.few b6 ;;\n" /* branch to function */
+ ".global StgReturn\n"
+ "StgReturn:\n"
+ "\tmov r8 = loc16\n" /* return value in r8 */
+ "\tadds r16 = %0-(6*16), sp\n"
+ "\tadds r17 = %0-(5*16), sp ;;\n"
+ "\tldf.fill f16 = [r16],32\n" /* start restoring fp regs */
+ "\tldf.fill f17 = [r17],32\n"
+ "\tmov ar.pfs = loc29 ;;\n" /* restore register frame */
+ "\tldf.fill f18 = [r16],32\n"
+ "\tldf.fill f19 = [r17],32\n"
+ "\tmov b0 = loc30 ;;\n" /* restore return address */
+ "\tldf.fill f20 = [r16],32\n"
+ "\tldf.fill f21 = [r17],32\n"
+ "\tadds sp = %0, sp\n" /* restore stack */
+ "\tbr.ret.sptk.many b0 ;;\n" /* return */
+ : : "i"(RESERVED_C_STACK_BYTES + 6*16), "i"(LOCALS));
+}
+
+#endif
+
+#endif /* !USE_MINIINTERPRETER */
diff --git a/rts/StgMiscClosures.cmm b/rts/StgMiscClosures.cmm
new file mode 100644
index 0000000000..70d08aeb0e
--- /dev/null
+++ b/rts/StgMiscClosures.cmm
@@ -0,0 +1,953 @@
+/* ----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Entry code for various built-in closure types.
+ *
+ * This file is written in a subset of C--, extended with various
+ * features specific to GHC. It is compiled by GHC directly. For the
+ * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
+ *
+ * --------------------------------------------------------------------------*/
+
+#include "Cmm.h"
+
+/* ----------------------------------------------------------------------------
+ Support for the bytecode interpreter.
+ ------------------------------------------------------------------------- */
+
+/* 9 bits of return code for constructors created by the interpreter. */
+
+/* Direct (non-vectored) return: R1 holds the constructor, the return
+   address is on top of the stack. */
+stg_interp_constr_entry
+{
+ /* R1 points at the constructor */
+ jump %ENTRY_CODE(Sp(0));
+}
+
+/* Vectored returns: one entry point per return vector 0..7. */
+stg_interp_constr1_entry { jump %RET_VEC(Sp(0),0); }
+stg_interp_constr2_entry { jump %RET_VEC(Sp(0),1); }
+stg_interp_constr3_entry { jump %RET_VEC(Sp(0),2); }
+stg_interp_constr4_entry { jump %RET_VEC(Sp(0),3); }
+stg_interp_constr5_entry { jump %RET_VEC(Sp(0),4); }
+stg_interp_constr6_entry { jump %RET_VEC(Sp(0),5); }
+stg_interp_constr7_entry { jump %RET_VEC(Sp(0),6); }
+stg_interp_constr8_entry { jump %RET_VEC(Sp(0),7); }
+
+/* Some info tables to be used when compiled code returns a value to
+ the interpreter, i.e. the interpreter pushes one of these onto the
+ stack before entering a value. What the code does is to
+ impedance-match the compiled return convention (in R1p/R1n/F1/D1 etc) to
+ the interpreter's convention (returned value is on top of stack),
+ and then cause the scheduler to enter the interpreter.
+
+ On entry, the stack (growing down) looks like this:
+
+ ptr to BCO holding return continuation
+ ptr to one of these info tables.
+
+ The info table code, both direct and vectored, must:
+ * push R1/F1/D1 on the stack, and its tag if necessary
+ * push the BCO (so it's now on the stack twice)
+ * Yield, ie, go to the scheduler.
+
+ Scheduler examines the t.o.s, discovers it is a BCO, and proceeds
+ directly to the bytecode interpreter. That pops the top element
+ (the BCO, containing the return continuation), and interprets it.
+ Net result: return continuation gets interpreted, with the
+ following stack:
+
+ ptr to this BCO
+ ptr to the info table just jumped thru
+ return value
+
+ which is just what we want -- the "standard" return layout for the
+ interpreter. Hurrah!
+
+ Don't ask me how unboxed tuple returns are supposed to work. We
+ haven't got a good story about that yet.
+*/
+
+/* Compiled code returns a lifted pointer in R1 to the interpreter:
+   push the value plus stg_enter_info and yield.  All eight return
+   vectors point back at this same code. */
+INFO_TABLE_RET( stg_ctoi_R1p,
+ 0/*size*/, 0/*bitmap*/, /* special layout! */
+ RET_BCO,
+ RET_LBL(stg_ctoi_R1p),
+ RET_LBL(stg_ctoi_R1p),
+ RET_LBL(stg_ctoi_R1p),
+ RET_LBL(stg_ctoi_R1p),
+ RET_LBL(stg_ctoi_R1p),
+ RET_LBL(stg_ctoi_R1p),
+ RET_LBL(stg_ctoi_R1p),
+ RET_LBL(stg_ctoi_R1p))
+{
+ Sp_adj(-2);
+ Sp(1) = R1;
+ Sp(0) = stg_enter_info;
+ jump stg_yield_to_interpreter;
+}
+
+#if MAX_VECTORED_RTN != 8
+#error MAX_VECTORED_RTN has changed: please modify stg_ctoi_R1p too.
+#endif
+
+/*
+ * When the returned value is a pointer, but unlifted, in R1 ...
+ */
+INFO_TABLE_RET( stg_ctoi_R1unpt,
+ 0/*size*/, 0/*bitmap*/, /* special layout! */
+ RET_BCO )
+{
+ /* Push the unlifted pointer and a frame that won't re-enter it,
+    then let the scheduler run the BCO underneath. */
+ Sp_adj(-2);
+ Sp(1) = R1;
+ Sp(0) = stg_gc_unpt_r1_info;
+ jump stg_yield_to_interpreter;
+}
+
+/*
+ * When the returned value is a non-pointer in R1 ...
+ */
+INFO_TABLE_RET( stg_ctoi_R1n,
+ 0/*size*/, 0/*bitmap*/, /* special layout! */
+ RET_BCO )
+{
+ /* Non-pointer in R1: push it with a non-pointer frame descriptor. */
+ Sp_adj(-2);
+ Sp(1) = R1;
+ Sp(0) = stg_gc_unbx_r1_info;
+ jump stg_yield_to_interpreter;
+}
+
+/*
+ * When the returned value is in F1
+ */
+INFO_TABLE_RET( stg_ctoi_F1,
+ 0/*size*/, 0/*bitmap*/, /* special layout! */
+ RET_BCO )
+{
+ /* Float return: store F1 into the word above the frame descriptor. */
+ Sp_adj(-2);
+ F_[Sp + WDS(1)] = F1;
+ Sp(0) = stg_gc_f1_info;
+ jump stg_yield_to_interpreter;
+}
+
+/*
+ * When the returned value is in D1
+ */
+INFO_TABLE_RET( stg_ctoi_D1,
+ 0/*size*/, 0/*bitmap*/, /* special layout! */
+ RET_BCO )
+{
+ /* Double return: make room for one word plus SIZEOF_DOUBLE.
+    NOTE(review): this relies on Sp_adj(-1) expanding to an assignment
+    whose right-hand side the "- SIZEOF_DOUBLE" extends — confirm
+    against the Sp_adj definition in Cmm.h. */
+ Sp_adj(-1) - SIZEOF_DOUBLE;
+ D_[Sp + WDS(1)] = D1;
+ Sp(0) = stg_gc_d1_info;
+ jump stg_yield_to_interpreter;
+}
+
+/*
+ * When the returned value is in L1
+ */
+INFO_TABLE_RET( stg_ctoi_L1,
+ 0/*size*/, 0/*bitmap*/, /* special layout! */
+ RET_BCO )
+{
+ /* 64-bit (long) return: one word plus 8 bytes for the payload.
+    NOTE(review): same Sp_adj-expansion trick as stg_ctoi_D1 above. */
+ Sp_adj(-1) - 8;
+ L_[Sp + WDS(1)] = L1;
+ Sp(0) = stg_gc_l1_info;
+ jump stg_yield_to_interpreter;
+}
+
+/*
+ * When the returned value is a void
+ */
+INFO_TABLE_RET( stg_ctoi_V,
+ 0/*size*/, 0/*bitmap*/, /* special layout! */
+ RET_BCO )
+{
+ /* Void return: nothing to push except the frame descriptor. */
+ Sp_adj(-1);
+ Sp(0) = stg_gc_void_info;
+ jump stg_yield_to_interpreter;
+}
+
+/*
+ * Dummy info table pushed on the top of the stack when the interpreter
+ * should apply the BCO on the stack to its arguments, also on the
+ * stack.
+ */
+INFO_TABLE_RET( stg_apply_interp,
+ 0/*size*/, 0/*bitmap*/, /* special layout! */
+ RET_BCO )
+{
+ /* Just in case we end up in here... (we shouldn't) */
+ jump stg_yield_to_interpreter;
+}
+
+/* ----------------------------------------------------------------------------
+ Entry code for a BCO
+ ------------------------------------------------------------------------- */
+
+INFO_TABLE_FUN( stg_BCO, 4, 0, BCO, "BCO", "BCO", ARG_BCO )
+{
+ /* entering a BCO means "apply it", same as a function */
+ /* Push the BCO under an stg_apply_interp frame and yield; the
+    scheduler hands the whole thing to the bytecode interpreter. */
+ Sp_adj(-2);
+ Sp(1) = R1;
+ Sp(0) = stg_apply_interp_info;
+ jump stg_yield_to_interpreter;
+}
+
+/* ----------------------------------------------------------------------------
+ Info tables for indirections.
+
+ SPECIALISED INDIRECTIONS: we have a specialised indirection for each
+ kind of return (direct, vectored 0-7), so that we can avoid entering
+ the object when we know what kind of return it will do. The update
+ code (Updates.hc) updates objects with the appropriate kind of
+ indirection. We only do this for young-gen indirections.
+ ------------------------------------------------------------------------- */
+
+/* Generic indirection: follow the indirectee and enter it. */
+INFO_TABLE(stg_IND,1,0,IND,"IND","IND")
+{
+ TICK_ENT_DYN_IND(); /* tick */
+ R1 = StgInd_indirectee(R1);
+ TICK_ENT_VIA_NODE();
+ jump %GET_ENTRY(R1);
+}
+
+/* Specialised indirections: same as stg_IND but jump straight to the
+   known return address/vector instead of entering the indirectee. */
+#define IND_SPEC(label,ret) \
+INFO_TABLE(label,1,0,IND,"IND","IND") \
+{ \
+ TICK_ENT_DYN_IND(); /* tick */ \
+ R1 = StgInd_indirectee(R1); \
+ TICK_ENT_VIA_NODE(); \
+ jump ret; \
+}
+
+IND_SPEC(stg_IND_direct, %ENTRY_CODE(Sp(0)))
+IND_SPEC(stg_IND_0, %RET_VEC(Sp(0),0))
+IND_SPEC(stg_IND_1, %RET_VEC(Sp(0),1))
+IND_SPEC(stg_IND_2, %RET_VEC(Sp(0),2))
+IND_SPEC(stg_IND_3, %RET_VEC(Sp(0),3))
+IND_SPEC(stg_IND_4, %RET_VEC(Sp(0),4))
+IND_SPEC(stg_IND_5, %RET_VEC(Sp(0),5))
+IND_SPEC(stg_IND_6, %RET_VEC(Sp(0),6))
+IND_SPEC(stg_IND_7, %RET_VEC(Sp(0),7))
+
+/* Static indirection (a CAF that has been evaluated). */
+INFO_TABLE(stg_IND_STATIC,1,0,IND_STATIC,"IND_STATIC","IND_STATIC")
+{
+ TICK_ENT_STATIC_IND(); /* tick */
+ R1 = StgInd_indirectee(R1);
+ TICK_ENT_VIA_NODE();
+ jump %GET_ENTRY(R1);
+}
+
+/* Permanent indirection: kept alive (not shorted out by the GC) so
+   that profiling/ticky can attribute entries; otherwise behaves like
+   stg_IND. */
+INFO_TABLE(stg_IND_PERM,1,0,IND_PERM,"IND_PERM","IND_PERM")
+{
+ /* Don't add INDs to granularity cost */
+
+ /* Don't: TICK_ENT_STATIC_IND(Node); for ticky-ticky; this ind is
+ here only to help profiling */
+
+#if defined(TICKY_TICKY) && !defined(PROFILING)
+ /* TICKY_TICKY && !PROFILING means PERM_IND *replaces* an IND, rather than
+ being extra */
+ TICK_ENT_PERM_IND();
+#endif
+
+ LDV_ENTER(R1);
+
+ /* Enter PAP cost centre */
+ ENTER_CCS_PAP_CL(R1);
+
+ /* For ticky-ticky, change the perm_ind to a normal ind on first
+ * entry, so the number of ent_perm_inds is the number of *thunks*
+ * entered again, not the number of subsequent entries.
+ *
+ * Since this screws up cost centres, we die if profiling and
+ * ticky_ticky are on at the same time. KSW 1999-01.
+ */
+#ifdef TICKY_TICKY
+# ifdef PROFILING
+# error Profiling and ticky-ticky do not mix at present!
+# endif /* PROFILING */
+ StgHeader_info(R1) = stg_IND_info;
+#endif /* TICKY_TICKY */
+
+ R1 = StgInd_indirectee(R1);
+
+#if defined(TICKY_TICKY) && !defined(PROFILING)
+ TICK_ENT_VIA_NODE();
+#endif
+
+ jump %GET_ENTRY(R1);
+}
+
+
+/* Old-generation indirection (left by the GC when an old-gen thunk is
+   updated); entry behaviour is identical to stg_IND. */
+INFO_TABLE(stg_IND_OLDGEN,1,0,IND_OLDGEN,"IND_OLDGEN","IND_OLDGEN")
+{
+ TICK_ENT_STATIC_IND(); /* tick */
+ R1 = StgInd_indirectee(R1);
+ TICK_ENT_VIA_NODE();
+ jump %GET_ENTRY(R1);
+}
+
+/* Old-generation flavour of stg_IND_PERM; see the comments there. */
+INFO_TABLE(stg_IND_OLDGEN_PERM,1,0,IND_OLDGEN_PERM,"IND_OLDGEN_PERM","IND_OLDGEN_PERM")
+{
+ /* Don't: TICK_ENT_STATIC_IND(Node); for ticky-ticky;
+ this ind is here only to help profiling */
+
+#if defined(TICKY_TICKY) && !defined(PROFILING)
+ /* TICKY_TICKY && !PROFILING means PERM_IND *replaces* an IND,
+ rather than being extra */
+ TICK_ENT_PERM_IND(R1); /* tick */
+#endif
+
+ LDV_ENTER(R1);
+
+ /* Enter PAP cost centre -- lexical scoping only */
+ ENTER_CCS_PAP_CL(R1);
+
+ /* see comment in IND_PERM */
+#ifdef TICKY_TICKY
+# ifdef PROFILING
+# error Profiling and ticky-ticky do not mix at present!
+# endif /* PROFILING */
+ StgHeader_info(R1) = stg_IND_OLDGEN_info;
+#endif /* TICKY_TICKY */
+
+ R1 = StgInd_indirectee(R1);
+
+ TICK_ENT_VIA_NODE();
+ jump %GET_ENTRY(R1);
+}
+
+/* ----------------------------------------------------------------------------
+ Black holes.
+
+ Entering a black hole normally causes a cyclic data dependency, but
+ in the concurrent world, black holes are synchronization points,
+ and they are turned into blocking queues when there are threads
+ waiting for the evaluation of the closure to finish.
+ ------------------------------------------------------------------------- */
+
+/* Note: a BLACKHOLE must be big enough to be
+ * overwritten with an indirection/evacuee/catch. Thus we claim it
+ * has 1 non-pointer word of payload.
+ */
+INFO_TABLE(stg_BLACKHOLE,0,1,BLACKHOLE,"BLACKHOLE","BLACKHOLE")
+{
+ /* Entering a black hole: another thread is evaluating this closure,
+    so put the current TSO on the global blackhole queue and block. */
+#if defined(GRAN)
+ /* Before overwriting TSO_LINK */
+ STGCALL3(GranSimBlock,CurrentTSO,CurrentProc,(StgClosure *)R1 /*Node*/);
+#endif
+
+ TICK_ENT_BH();
+
+#ifdef THREADED_RTS
+ // foreign "C" debugBelch("BLACKHOLE entry\n");
+#endif
+
+ /* Actually this is not necessary because R1 is about to be destroyed. */
+ LDV_ENTER(R1);
+
+#if defined(THREADED_RTS)
+ foreign "C" ACQUIRE_LOCK(sched_mutex "ptr");
+ // released in stg_block_blackhole_finally
+#endif
+
+ /* Put ourselves on the blackhole queue */
+ StgTSO_link(CurrentTSO) = W_[blackhole_queue];
+ W_[blackhole_queue] = CurrentTSO;
+
+ /* jot down why and on what closure we are blocked */
+ StgTSO_why_blocked(CurrentTSO) = BlockedOnBlackHole::I16;
+ StgTSO_block_info(CurrentTSO) = R1;
+
+ jump stg_block_blackhole;
+}
+
+#if defined(PAR) || defined(GRAN)
+
+/* Revertible black hole (PAR/GRAN only): block the current TSO on the
+   closure's own blocking queue rather than the global one. */
+INFO_TABLE(stg_RBH,1,1,RBH,"RBH","RBH")
+{
+# if defined(GRAN)
+ /* mainly statistics gathering for GranSim simulation */
+ STGCALL3(GranSimBlock,CurrentTSO,CurrentProc,(StgClosure *)R1 /*Node*/);
+# endif
+
+ /* exactly the same as a BLACKHOLE_BQ_entry -- HWL */
+ /* Put ourselves on the blocking queue for this black hole */
+ TSO_link(CurrentTSO) = StgBlockingQueue_blocking_queue(R1);
+ StgBlockingQueue_blocking_queue(R1) = CurrentTSO;
+ /* jot down why and on what closure we are blocked */
+ TSO_why_blocked(CurrentTSO) = BlockedOnBlackHole::I16;
+ TSO_block_info(CurrentTSO) = R1;
+
+ /* PAR: dumping of event now done in blockThread -- HWL */
+
+ /* stg_gen_block is too heavyweight, use a specialised one */
+ jump stg_block_1;
+}
+
+/* Closures that save the words overwritten when a thunk is turned into
+   an RBH; they are never entered.  Note: the stray ';' that followed
+   the INFO_TABLE headers of _1 and _2 (a C-- syntax error, latent only
+   because PAR/GRAN is never compiled) has been removed so all three
+   match the form of stg_RBH_Save_0. */
+INFO_TABLE(stg_RBH_Save_0,0,2,CONSTR,"RBH_Save_0","RBH_Save_0")
+{ foreign "C" barf("RBH_Save_0 object entered!"); }
+
+INFO_TABLE(stg_RBH_Save_1,1,1,CONSTR,"RBH_Save_1","RBH_Save_1")
+{ foreign "C" barf("RBH_Save_1 object entered!"); }
+
+INFO_TABLE(stg_RBH_Save_2,2,0,CONSTR,"RBH_Save_2","RBH_Save_2")
+{ foreign "C" barf("RBH_Save_2 object entered!"); }
+
+#endif /* defined(PAR) || defined(GRAN) */
+
+/* identical to BLACKHOLEs except for the infotag */
+/* identical to BLACKHOLEs except for the infotag */
+INFO_TABLE(stg_CAF_BLACKHOLE,0,1,CAF_BLACKHOLE,"CAF_BLACKHOLE","CAF_BLACKHOLE")
+{
+ /* Same blocking protocol as stg_BLACKHOLE above. */
+#if defined(GRAN)
+ /* mainly statistics gathering for GranSim simulation */
+ STGCALL3(GranSimBlock,CurrentTSO,CurrentProc,(StgClosure *)R1 /*Node*/);
+#endif
+
+ TICK_ENT_BH();
+ LDV_ENTER(R1);
+
+#if defined(THREADED_RTS)
+ // foreign "C" debugBelch("BLACKHOLE entry\n");
+#endif
+
+#if defined(THREADED_RTS)
+ foreign "C" ACQUIRE_LOCK(sched_mutex "ptr");
+ // released in stg_block_blackhole_finally
+#endif
+
+ /* Put ourselves on the blackhole queue */
+ StgTSO_link(CurrentTSO) = W_[blackhole_queue];
+ W_[blackhole_queue] = CurrentTSO;
+
+ /* jot down why and on what closure we are blocked */
+ StgTSO_why_blocked(CurrentTSO) = BlockedOnBlackHole::I16;
+ StgTSO_block_info(CurrentTSO) = R1;
+
+ jump stg_block_blackhole;
+}
+
+#ifdef EAGER_BLACKHOLING
+/* "Single-entry" black holes (eager blackholing); entering one is a bug. */
+INFO_TABLE(stg_SE_BLACKHOLE,0,1,SE_BLACKHOLE,"SE_BLACKHOLE","SE_BLACKHOLE")
+{ foreign "C" barf("SE_BLACKHOLE object entered!"); }
+
+INFO_TABLE(stg_SE_CAF_BLACKHOLE,0,1,SE_CAF_BLACKHOLE,"SE_CAF_BLACKHOLE","SE_CAF_BLACKHOLE")
+{ foreign "C" barf("SE_CAF_BLACKHOLE object entered!"); }
+
+/* ----------------------------------------------------------------------------
+ Whiteholes are used for the "locked" state of a closure (see lockClosure())
+
+ The closure type is BLACKHOLE, just because we need a valid closure type
+ for sanity checking.
+ ------------------------------------------------------------------------- */
+
+/* Placeholder for a locked closure; must never be entered. */
+INFO_TABLE(stg_WHITEHOLE, 0,0, BLACKHOLE, "WHITEHOLE", "WHITEHOLE")
+{ foreign "C" barf("WHITEHOLE object entered!"); }
+
+/* ----------------------------------------------------------------------------
+ Some static info tables for things that don't get entered, and
+ therefore don't need entry code (i.e. boxed but unpointed objects)
+ NON_ENTERABLE_ENTRY_CODE now defined at the beginning of the file
+ ------------------------------------------------------------------------- */
+
+/* Unpointed objects: entering any of these is a bug, so they barf. */
+INFO_TABLE(stg_TSO, 0,0,TSO, "TSO", "TSO")
+{ foreign "C" barf("TSO object entered!"); }
+
+/* ----------------------------------------------------------------------------
+ Evacuees are left behind by the garbage collector. Any attempt to enter
+ one is a real bug.
+ ------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_EVACUATED,1,0,EVACUATED,"EVACUATED","EVACUATED")
+{ foreign "C" barf("EVACUATED object entered!"); }
+
+/* ----------------------------------------------------------------------------
+ Weak pointers
+
+ Live weak pointers have a special closure type. Dead ones are just
+ nullary constructors (although they live on the heap - we overwrite
+ live weak pointers with dead ones).
+ ------------------------------------------------------------------------- */
+
+/* Weak pointers are unpointed (never entered). */
+INFO_TABLE(stg_WEAK,0,4,WEAK,"WEAK","WEAK")
+{ foreign "C" barf("WEAK object entered!"); }
+
+/*
+ * It's important when turning an existing WEAK into a DEAD_WEAK
+ * (which is what finalizeWeak# does) that we don't lose the link
+ * field and break the linked list of weak pointers. Hence, we give
+ * DEAD_WEAK 4 non-pointer fields, the same as WEAK.
+ */
+INFO_TABLE_CONSTR(stg_DEAD_WEAK,0,4,0,CONSTR,"DEAD_WEAK","DEAD_WEAK")
+{ foreign "C" barf("DEAD_WEAK object entered!"); }
+
+/* ----------------------------------------------------------------------------
+ NO_FINALIZER
+
+ This is a static nullary constructor (like []) that we use to mark an empty
+ finalizer in a weak pointer object.
+ ------------------------------------------------------------------------- */
+
+/* Static nullary constructor marking "no finalizer" in a weak pointer. */
+INFO_TABLE_CONSTR(stg_NO_FINALIZER,0,0,0,CONSTR_NOCAF_STATIC,"NO_FINALIZER","NO_FINALIZER")
+{ foreign "C" barf("NO_FINALIZER object entered!"); }
+
+CLOSURE(stg_NO_FINALIZER_closure,stg_NO_FINALIZER);
+
+/* ----------------------------------------------------------------------------
+ Stable Names are unlifted too.
+ ------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_STABLE_NAME,0,1,STABLE_NAME,"STABLE_NAME","STABLE_NAME")
+{ foreign "C" barf("STABLE_NAME object entered!"); }
+
+/* ----------------------------------------------------------------------------
+ MVars
+
+ There are two kinds of these: full and empty. We need an info table
+ and entry code for each type.
+ ------------------------------------------------------------------------- */
+
+/* MVars (full and empty share the layout) — unpointed, never entered. */
+INFO_TABLE(stg_FULL_MVAR,3,0,MVAR,"MVAR","MVAR")
+{ foreign "C" barf("FULL_MVAR object entered!"); }
+
+INFO_TABLE(stg_EMPTY_MVAR,3,0,MVAR,"MVAR","MVAR")
+{ foreign "C" barf("EMPTY_MVAR object entered!"); }
+
+/* -----------------------------------------------------------------------------
+ STM
+ -------------------------------------------------------------------------- */
+
+/* STM objects are likewise unpointed; see STM.c for their use. */
+INFO_TABLE(stg_TVAR, 0, 0, TVAR, "TVAR", "TVAR")
+{ foreign "C" barf("TVAR object entered!"); }
+
+INFO_TABLE(stg_TVAR_WAIT_QUEUE, 0, 0, TVAR_WAIT_QUEUE, "TVAR_WAIT_QUEUE", "TVAR_WAIT_QUEUE")
+{ foreign "C" barf("TVAR_WAIT_QUEUE object entered!"); }
+
+INFO_TABLE(stg_TREC_CHUNK, 0, 0, TREC_CHUNK, "TREC_CHUNK", "TREC_CHUNK")
+{ foreign "C" barf("TREC_CHUNK object entered!"); }
+
+INFO_TABLE(stg_TREC_HEADER, 0, 0, TREC_HEADER, "TREC_HEADER", "TREC_HEADER")
+{ foreign "C" barf("TREC_HEADER object entered!"); }
+
+INFO_TABLE_CONSTR(stg_END_STM_WAIT_QUEUE,0,0,0,CONSTR_NOCAF_STATIC,"END_STM_WAIT_QUEUE","END_STM_WAIT_QUEUE")
+{ foreign "C" barf("END_STM_WAIT_QUEUE object entered!"); }
+
+INFO_TABLE_CONSTR(stg_END_STM_CHUNK_LIST,0,0,0,CONSTR_NOCAF_STATIC,"END_STM_CHUNK_LIST","END_STM_CHUNK_LIST")
+{ foreign "C" barf("END_STM_CHUNK_LIST object entered!"); }
+
+INFO_TABLE_CONSTR(stg_NO_TREC,0,0,0,CONSTR_NOCAF_STATIC,"NO_TREC","NO_TREC")
+{ foreign "C" barf("NO_TREC object entered!"); }
+
+/* Static end-of-list / sentinel closures for the tables above. */
+CLOSURE(stg_END_STM_WAIT_QUEUE_closure,stg_END_STM_WAIT_QUEUE);
+
+CLOSURE(stg_END_STM_CHUNK_LIST_closure,stg_END_STM_CHUNK_LIST);
+
+CLOSURE(stg_NO_TREC_closure,stg_NO_TREC);
+
+/* ----------------------------------------------------------------------------
+ END_TSO_QUEUE
+
+ This is a static nullary constructor (like []) that we use to mark the
+ end of a linked TSO queue.
+ ------------------------------------------------------------------------- */
+
+/* Static sentinel marking the end of a TSO queue. */
+INFO_TABLE_CONSTR(stg_END_TSO_QUEUE,0,0,0,CONSTR_NOCAF_STATIC,"END_TSO_QUEUE","END_TSO_QUEUE")
+{ foreign "C" barf("END_TSO_QUEUE object entered!"); }
+
+CLOSURE(stg_END_TSO_QUEUE_closure,stg_END_TSO_QUEUE);
+
+/* ----------------------------------------------------------------------------
+ Exception lists
+ ------------------------------------------------------------------------- */
+
+/* Sentinel and cons cell for the blocked-exceptions list. */
+INFO_TABLE_CONSTR(stg_END_EXCEPTION_LIST,0,0,0,CONSTR_NOCAF_STATIC,"END_EXCEPTION_LIST","END_EXCEPTION_LIST")
+{ foreign "C" barf("END_EXCEPTION_LIST object entered!"); }
+
+CLOSURE(stg_END_EXCEPTION_LIST_closure,stg_END_EXCEPTION_LIST);
+
+INFO_TABLE(stg_EXCEPTION_CONS,1,1,CONSTR,"EXCEPTION_CONS","EXCEPTION_CONS")
+{ foreign "C" barf("EXCEPTION_CONS object entered!"); }
+
+/* ----------------------------------------------------------------------------
+ Arrays
+
+ These come in two basic flavours: arrays of data (StgArrWords) and arrays of
+ pointers (StgArrPtrs). They all have a similar layout:
+
+ ___________________________
+ | Info | No. of | data....
+ | Ptr | Words |
+ ---------------------------
+
+ These are *unpointed* objects: i.e. they cannot be entered.
+
+ ------------------------------------------------------------------------- */
+
+/* Arrays are unpointed; entering any of them is a bug. */
+INFO_TABLE(stg_ARR_WORDS, 0, 0, ARR_WORDS, "ARR_WORDS", "ARR_WORDS")
+{ foreign "C" barf("ARR_WORDS object entered!"); }
+
+INFO_TABLE(stg_MUT_ARR_PTRS_CLEAN, 0, 0, MUT_ARR_PTRS_CLEAN, "MUT_ARR_PTRS_CLEAN", "MUT_ARR_PTRS_CLEAN")
+{ foreign "C" barf("MUT_ARR_PTRS_CLEAN object entered!"); }
+
+INFO_TABLE(stg_MUT_ARR_PTRS_DIRTY, 0, 0, MUT_ARR_PTRS_DIRTY, "MUT_ARR_PTRS_DIRTY", "MUT_ARR_PTRS_DIRTY")
+{ foreign "C" barf("MUT_ARR_PTRS_DIRTY object entered!"); }
+
+INFO_TABLE(stg_MUT_ARR_PTRS_FROZEN, 0, 0, MUT_ARR_PTRS_FROZEN, "MUT_ARR_PTRS_FROZEN", "MUT_ARR_PTRS_FROZEN")
+{ foreign "C" barf("MUT_ARR_PTRS_FROZEN object entered!"); }
+
+INFO_TABLE(stg_MUT_ARR_PTRS_FROZEN0, 0, 0, MUT_ARR_PTRS_FROZEN0, "MUT_ARR_PTRS_FROZEN0", "MUT_ARR_PTRS_FROZEN0")
+{ foreign "C" barf("MUT_ARR_PTRS_FROZEN0 object entered!"); }
+
+/* ----------------------------------------------------------------------------
+ Mutable Variables
+ ------------------------------------------------------------------------- */
+
+INFO_TABLE(stg_MUT_VAR_CLEAN, 1, 0, MUT_VAR_CLEAN, "MUT_VAR_CLEAN", "MUT_VAR_CLEAN")
+{ foreign "C" barf("MUT_VAR_CLEAN object entered!"); }
+INFO_TABLE(stg_MUT_VAR_DIRTY, 1, 0, MUT_VAR_DIRTY, "MUT_VAR_DIRTY", "MUT_VAR_DIRTY")
+{ foreign "C" barf("MUT_VAR_DIRTY object entered!"); }
+
+/* ----------------------------------------------------------------------------
+ Dummy return closure
+
+ Entering this closure will just return to the address on the top of the
+ stack. Useful for getting a thread in a canonical form where we can
+ just enter the top stack word to start the thread. (see deleteThread)
+ ------------------------------------------------------------------------- */
+
+/* Entering this simply returns to the address on top of the stack. */
+INFO_TABLE( stg_dummy_ret, 0, 0, CONSTR_NOCAF_STATIC, "DUMMY_RET", "DUMMY_RET")
+{
+ jump %ENTRY_CODE(Sp(0));
+}
+CLOSURE(stg_dummy_ret_closure,stg_dummy_ret);
+
+/* ----------------------------------------------------------------------------
+ CHARLIKE and INTLIKE closures.
+
+ These are static representations of Chars and small Ints, so that
+ we can remove dynamic Chars and Ints during garbage collection and
+ replace them with references to the static objects.
+ ------------------------------------------------------------------------- */
+
+#if defined(ENABLE_WIN32_DLL_SUPPORT)
+/*
+ * When sticking the RTS in a DLL, we delay populating the
+ * Charlike and Intlike tables until load-time, which is only
+ * when we've got the real addresses to the C# and I# closures.
+ *
+ */
+static INFO_TBL_CONST StgInfoTable czh_static_info;
+static INFO_TBL_CONST StgInfoTable izh_static_info;
+#define Char_hash_static_info czh_static_info
+#define Int_hash_static_info izh_static_info
+#else
+/* Non-DLL case: point straight at the GHC.Base C#/I# static info tables. */
+#define Char_hash_static_info GHCziBase_Czh_static
+#define Int_hash_static_info GHCziBase_Izh_static
+#endif
+
+
+/* One static C#/I# closure with payload n. */
+#define CHARLIKE_HDR(n) CLOSURE(Char_hash_static_info, n)
+#define INTLIKE_HDR(n) CLOSURE(Int_hash_static_info, n)
+
+/* put these in the *data* section, since the garbage collector relies
+ * on the fact that static closures live in the data section.
+ */
+
+/* end the name with _closure, to convince the mangler this is a closure */
+
+section "data" {
+ stg_CHARLIKE_closure:
+ /* 256 static C# closures, one per character code 0..255; the GC
+    replaces dynamic Chars in this range with references to these. */
+ CHARLIKE_HDR(0)
+ CHARLIKE_HDR(1)
+ CHARLIKE_HDR(2)
+ CHARLIKE_HDR(3)
+ CHARLIKE_HDR(4)
+ CHARLIKE_HDR(5)
+ CHARLIKE_HDR(6)
+ CHARLIKE_HDR(7)
+ CHARLIKE_HDR(8)
+ CHARLIKE_HDR(9)
+ CHARLIKE_HDR(10)
+ CHARLIKE_HDR(11)
+ CHARLIKE_HDR(12)
+ CHARLIKE_HDR(13)
+ CHARLIKE_HDR(14)
+ CHARLIKE_HDR(15)
+ CHARLIKE_HDR(16)
+ CHARLIKE_HDR(17)
+ CHARLIKE_HDR(18)
+ CHARLIKE_HDR(19)
+ CHARLIKE_HDR(20)
+ CHARLIKE_HDR(21)
+ CHARLIKE_HDR(22)
+ CHARLIKE_HDR(23)
+ CHARLIKE_HDR(24)
+ CHARLIKE_HDR(25)
+ CHARLIKE_HDR(26)
+ CHARLIKE_HDR(27)
+ CHARLIKE_HDR(28)
+ CHARLIKE_HDR(29)
+ CHARLIKE_HDR(30)
+ CHARLIKE_HDR(31)
+ CHARLIKE_HDR(32)
+ CHARLIKE_HDR(33)
+ CHARLIKE_HDR(34)
+ CHARLIKE_HDR(35)
+ CHARLIKE_HDR(36)
+ CHARLIKE_HDR(37)
+ CHARLIKE_HDR(38)
+ CHARLIKE_HDR(39)
+ CHARLIKE_HDR(40)
+ CHARLIKE_HDR(41)
+ CHARLIKE_HDR(42)
+ CHARLIKE_HDR(43)
+ CHARLIKE_HDR(44)
+ CHARLIKE_HDR(45)
+ CHARLIKE_HDR(46)
+ CHARLIKE_HDR(47)
+ CHARLIKE_HDR(48)
+ CHARLIKE_HDR(49)
+ CHARLIKE_HDR(50)
+ CHARLIKE_HDR(51)
+ CHARLIKE_HDR(52)
+ CHARLIKE_HDR(53)
+ CHARLIKE_HDR(54)
+ CHARLIKE_HDR(55)
+ CHARLIKE_HDR(56)
+ CHARLIKE_HDR(57)
+ CHARLIKE_HDR(58)
+ CHARLIKE_HDR(59)
+ CHARLIKE_HDR(60)
+ CHARLIKE_HDR(61)
+ CHARLIKE_HDR(62)
+ CHARLIKE_HDR(63)
+ CHARLIKE_HDR(64)
+ CHARLIKE_HDR(65)
+ CHARLIKE_HDR(66)
+ CHARLIKE_HDR(67)
+ CHARLIKE_HDR(68)
+ CHARLIKE_HDR(69)
+ CHARLIKE_HDR(70)
+ CHARLIKE_HDR(71)
+ CHARLIKE_HDR(72)
+ CHARLIKE_HDR(73)
+ CHARLIKE_HDR(74)
+ CHARLIKE_HDR(75)
+ CHARLIKE_HDR(76)
+ CHARLIKE_HDR(77)
+ CHARLIKE_HDR(78)
+ CHARLIKE_HDR(79)
+ CHARLIKE_HDR(80)
+ CHARLIKE_HDR(81)
+ CHARLIKE_HDR(82)
+ CHARLIKE_HDR(83)
+ CHARLIKE_HDR(84)
+ CHARLIKE_HDR(85)
+ CHARLIKE_HDR(86)
+ CHARLIKE_HDR(87)
+ CHARLIKE_HDR(88)
+ CHARLIKE_HDR(89)
+ CHARLIKE_HDR(90)
+ CHARLIKE_HDR(91)
+ CHARLIKE_HDR(92)
+ CHARLIKE_HDR(93)
+ CHARLIKE_HDR(94)
+ CHARLIKE_HDR(95)
+ CHARLIKE_HDR(96)
+ CHARLIKE_HDR(97)
+ CHARLIKE_HDR(98)
+ CHARLIKE_HDR(99)
+ CHARLIKE_HDR(100)
+ CHARLIKE_HDR(101)
+ CHARLIKE_HDR(102)
+ CHARLIKE_HDR(103)
+ CHARLIKE_HDR(104)
+ CHARLIKE_HDR(105)
+ CHARLIKE_HDR(106)
+ CHARLIKE_HDR(107)
+ CHARLIKE_HDR(108)
+ CHARLIKE_HDR(109)
+ CHARLIKE_HDR(110)
+ CHARLIKE_HDR(111)
+ CHARLIKE_HDR(112)
+ CHARLIKE_HDR(113)
+ CHARLIKE_HDR(114)
+ CHARLIKE_HDR(115)
+ CHARLIKE_HDR(116)
+ CHARLIKE_HDR(117)
+ CHARLIKE_HDR(118)
+ CHARLIKE_HDR(119)
+ CHARLIKE_HDR(120)
+ CHARLIKE_HDR(121)
+ CHARLIKE_HDR(122)
+ CHARLIKE_HDR(123)
+ CHARLIKE_HDR(124)
+ CHARLIKE_HDR(125)
+ CHARLIKE_HDR(126)
+ CHARLIKE_HDR(127)
+ CHARLIKE_HDR(128)
+ CHARLIKE_HDR(129)
+ CHARLIKE_HDR(130)
+ CHARLIKE_HDR(131)
+ CHARLIKE_HDR(132)
+ CHARLIKE_HDR(133)
+ CHARLIKE_HDR(134)
+ CHARLIKE_HDR(135)
+ CHARLIKE_HDR(136)
+ CHARLIKE_HDR(137)
+ CHARLIKE_HDR(138)
+ CHARLIKE_HDR(139)
+ CHARLIKE_HDR(140)
+ CHARLIKE_HDR(141)
+ CHARLIKE_HDR(142)
+ CHARLIKE_HDR(143)
+ CHARLIKE_HDR(144)
+ CHARLIKE_HDR(145)
+ CHARLIKE_HDR(146)
+ CHARLIKE_HDR(147)
+ CHARLIKE_HDR(148)
+ CHARLIKE_HDR(149)
+ CHARLIKE_HDR(150)
+ CHARLIKE_HDR(151)
+ CHARLIKE_HDR(152)
+ CHARLIKE_HDR(153)
+ CHARLIKE_HDR(154)
+ CHARLIKE_HDR(155)
+ CHARLIKE_HDR(156)
+ CHARLIKE_HDR(157)
+ CHARLIKE_HDR(158)
+ CHARLIKE_HDR(159)
+ CHARLIKE_HDR(160)
+ CHARLIKE_HDR(161)
+ CHARLIKE_HDR(162)
+ CHARLIKE_HDR(163)
+ CHARLIKE_HDR(164)
+ CHARLIKE_HDR(165)
+ CHARLIKE_HDR(166)
+ CHARLIKE_HDR(167)
+ CHARLIKE_HDR(168)
+ CHARLIKE_HDR(169)
+ CHARLIKE_HDR(170)
+ CHARLIKE_HDR(171)
+ CHARLIKE_HDR(172)
+ CHARLIKE_HDR(173)
+ CHARLIKE_HDR(174)
+ CHARLIKE_HDR(175)
+ CHARLIKE_HDR(176)
+ CHARLIKE_HDR(177)
+ CHARLIKE_HDR(178)
+ CHARLIKE_HDR(179)
+ CHARLIKE_HDR(180)
+ CHARLIKE_HDR(181)
+ CHARLIKE_HDR(182)
+ CHARLIKE_HDR(183)
+ CHARLIKE_HDR(184)
+ CHARLIKE_HDR(185)
+ CHARLIKE_HDR(186)
+ CHARLIKE_HDR(187)
+ CHARLIKE_HDR(188)
+ CHARLIKE_HDR(189)
+ CHARLIKE_HDR(190)
+ CHARLIKE_HDR(191)
+ CHARLIKE_HDR(192)
+ CHARLIKE_HDR(193)
+ CHARLIKE_HDR(194)
+ CHARLIKE_HDR(195)
+ CHARLIKE_HDR(196)
+ CHARLIKE_HDR(197)
+ CHARLIKE_HDR(198)
+ CHARLIKE_HDR(199)
+ CHARLIKE_HDR(200)
+ CHARLIKE_HDR(201)
+ CHARLIKE_HDR(202)
+ CHARLIKE_HDR(203)
+ CHARLIKE_HDR(204)
+ CHARLIKE_HDR(205)
+ CHARLIKE_HDR(206)
+ CHARLIKE_HDR(207)
+ CHARLIKE_HDR(208)
+ CHARLIKE_HDR(209)
+ CHARLIKE_HDR(210)
+ CHARLIKE_HDR(211)
+ CHARLIKE_HDR(212)
+ CHARLIKE_HDR(213)
+ CHARLIKE_HDR(214)
+ CHARLIKE_HDR(215)
+ CHARLIKE_HDR(216)
+ CHARLIKE_HDR(217)
+ CHARLIKE_HDR(218)
+ CHARLIKE_HDR(219)
+ CHARLIKE_HDR(220)
+ CHARLIKE_HDR(221)
+ CHARLIKE_HDR(222)
+ CHARLIKE_HDR(223)
+ CHARLIKE_HDR(224)
+ CHARLIKE_HDR(225)
+ CHARLIKE_HDR(226)
+ CHARLIKE_HDR(227)
+ CHARLIKE_HDR(228)
+ CHARLIKE_HDR(229)
+ CHARLIKE_HDR(230)
+ CHARLIKE_HDR(231)
+ CHARLIKE_HDR(232)
+ CHARLIKE_HDR(233)
+ CHARLIKE_HDR(234)
+ CHARLIKE_HDR(235)
+ CHARLIKE_HDR(236)
+ CHARLIKE_HDR(237)
+ CHARLIKE_HDR(238)
+ CHARLIKE_HDR(239)
+ CHARLIKE_HDR(240)
+ CHARLIKE_HDR(241)
+ CHARLIKE_HDR(242)
+ CHARLIKE_HDR(243)
+ CHARLIKE_HDR(244)
+ CHARLIKE_HDR(245)
+ CHARLIKE_HDR(246)
+ CHARLIKE_HDR(247)
+ CHARLIKE_HDR(248)
+ CHARLIKE_HDR(249)
+ CHARLIKE_HDR(250)
+ CHARLIKE_HDR(251)
+ CHARLIKE_HDR(252)
+ CHARLIKE_HDR(253)
+ CHARLIKE_HDR(254)
+ CHARLIKE_HDR(255)
+}
+
+section "data" {
+ stg_INTLIKE_closure:
+ /* Static I# closures for the small ints MIN_INTLIKE..MAX_INTLIKE. */
+ INTLIKE_HDR(-16) /* MIN_INTLIKE == -16 */
+ INTLIKE_HDR(-15)
+ INTLIKE_HDR(-14)
+ INTLIKE_HDR(-13)
+ INTLIKE_HDR(-12)
+ INTLIKE_HDR(-11)
+ INTLIKE_HDR(-10)
+ INTLIKE_HDR(-9)
+ INTLIKE_HDR(-8)
+ INTLIKE_HDR(-7)
+ INTLIKE_HDR(-6)
+ INTLIKE_HDR(-5)
+ INTLIKE_HDR(-4)
+ INTLIKE_HDR(-3)
+ INTLIKE_HDR(-2)
+ INTLIKE_HDR(-1)
+ INTLIKE_HDR(0)
+ INTLIKE_HDR(1)
+ INTLIKE_HDR(2)
+ INTLIKE_HDR(3)
+ INTLIKE_HDR(4)
+ INTLIKE_HDR(5)
+ INTLIKE_HDR(6)
+ INTLIKE_HDR(7)
+ INTLIKE_HDR(8)
+ INTLIKE_HDR(9)
+ INTLIKE_HDR(10)
+ INTLIKE_HDR(11)
+ INTLIKE_HDR(12)
+ INTLIKE_HDR(13)
+ INTLIKE_HDR(14)
+ INTLIKE_HDR(15)
+ INTLIKE_HDR(16) /* MAX_INTLIKE == 16 */
+}
diff --git a/rts/StgPrimFloat.c b/rts/StgPrimFloat.c
new file mode 100644
index 0000000000..5bd6aebb1c
--- /dev/null
+++ b/rts/StgPrimFloat.c
@@ -0,0 +1,491 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2000
+ *
+ * Miscellaneous support for floating-point primitives
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+
+#include <math.h>
+
+/*
+ * Encoding and decoding Doubles. Code based on the HBC code
+ * (lib/fltcode.c).
+ */
+
+#ifdef _SHORT_LIMB
+#define SIZEOF_LIMB_T SIZEOF_UNSIGNED_INT
+#else
+#ifdef _LONG_LONG_LIMB
+#define SIZEOF_LIMB_T SIZEOF_UNSIGNED_LONG_LONG
+#else
+#define SIZEOF_LIMB_T SIZEOF_UNSIGNED_LONG
+#endif
+#endif
+
+#if SIZEOF_LIMB_T == 4
+#define GMP_BASE 4294967296.0
+#elif SIZEOF_LIMB_T == 8
+#define GMP_BASE 18446744073709551616.0
+#else
+#error Cannot cope with SIZEOF_LIMB_T -- please add definition of GMP_BASE
+#endif
+
+#define DNBIGIT ((SIZEOF_DOUBLE+SIZEOF_LIMB_T-1)/SIZEOF_LIMB_T)
+#define FNBIGIT ((SIZEOF_FLOAT +SIZEOF_LIMB_T-1)/SIZEOF_LIMB_T)
+
+#if IEEE_FLOATING_POINT
+#define MY_DMINEXP ((DBL_MIN_EXP) - (DBL_MANT_DIG) - 1)
+/* DMINEXP is defined in values.h on Linux (for example) */
+#define DHIGHBIT 0x00100000
+#define DMSBIT 0x80000000
+
+#define MY_FMINEXP ((FLT_MIN_EXP) - (FLT_MANT_DIG) - 1)
+#define FHIGHBIT 0x00800000
+#define FMSBIT 0x80000000
+#endif
+
+#ifdef WORDS_BIGENDIAN
+#define L 1
+#define H 0
+#else
+#define L 0
+#define H 1
+#endif
+
+#define __abs(a) (( (a) >= 0 ) ? (a) : (-(a)))
+
+StgDouble
+__encodeDouble (I_ size, StgByteArray ba, I_ e) /* result = s * 2^e */
+{
+ StgDouble r;
+ const mp_limb_t *const arr = (const mp_limb_t *)ba;
+ I_ i;
+
+ /* Convert MP_INT to a double; knows a lot about internal rep! */
+ for(r = 0.0, i = __abs(size)-1; i >= 0; i--)
+ r = (r * GMP_BASE) + arr[i];
+
+ /* Now raise to the exponent */
+ if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */
+ r = ldexp(r, e);
+
+ /* sign is encoded in the size */
+ if (size < 0)
+ r = -r;
+
+ return r;
+}
+
+/* Special version for small Integers */
+StgDouble
+__int_encodeDouble (I_ j, I_ e)
+{
+ StgDouble r;
+
+ r = (StgDouble)__abs(j);
+
+ /* Now raise to the exponent */
+ if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */
+ r = ldexp(r, e);
+
+ /* sign is encoded in the size */
+ if (j < 0)
+ r = -r;
+
+ return r;
+}
+
+StgFloat
+__encodeFloat (I_ size, StgByteArray ba, I_ e) /* result = s * 2^e */
+{
+ StgFloat r;
+ const mp_limb_t *arr = (const mp_limb_t *)ba;
+ I_ i;
+
+ /* Convert MP_INT to a float; knows a lot about internal rep! */
+ for(r = 0.0, i = __abs(size)-1; i >= 0; i--)
+ r = (r * GMP_BASE) + arr[i];
+
+ /* Now raise to the exponent */
+ if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */
+ r = ldexp(r, e);
+
+ /* sign is encoded in the size */
+ if (size < 0)
+ r = -r;
+
+ return r;
+}
+
+/* Special version for small Integers */
+StgFloat
+__int_encodeFloat (I_ j, I_ e)
+{
+ StgFloat r;
+
+ r = (StgFloat)__abs(j);
+
+ /* Now raise to the exponent */
+ if ( r != 0.0 ) /* Lennart suggests this avoids a bug in MIPS's ldexp */
+ r = ldexp(r, e);
+
+ /* sign is encoded in the size */
+ if (j < 0)
+ r = -r;
+
+ return r;
+}
+
+/* This only supports IEEE floating point */
+
+void
+__decodeDouble (MP_INT *man, I_ *exp, StgDouble dbl)
+{
+ /* Do some bit fiddling on IEEE */
+ unsigned int low, high; /* assuming 32 bit ints */
+ int sign, iexp;
+ union { double d; unsigned int i[2]; } u; /* assuming 32 bit ints, 64 bit double */
+
+ ASSERT(sizeof(unsigned int ) == 4 );
+ ASSERT(sizeof(dbl ) == SIZEOF_DOUBLE);
+ ASSERT(sizeof(man->_mp_d[0]) == SIZEOF_LIMB_T);
+ ASSERT(DNBIGIT*SIZEOF_LIMB_T >= SIZEOF_DOUBLE);
+
+ u.d = dbl; /* grab chunks of the double */
+ low = u.i[L];
+ high = u.i[H];
+
+ /* we know the MP_INT* passed in has size zero, so we realloc
+ no matter what.
+ */
+ man->_mp_alloc = DNBIGIT;
+
+ if (low == 0 && (high & ~DMSBIT) == 0) {
+ man->_mp_size = 0;
+ *exp = 0L;
+ } else {
+ man->_mp_size = DNBIGIT;
+ iexp = ((high >> 20) & 0x7ff) + MY_DMINEXP;
+ sign = high;
+
+ high &= DHIGHBIT-1;
+ if (iexp != MY_DMINEXP) /* don't add hidden bit to denorms */
+ high |= DHIGHBIT;
+ else {
+ iexp++;
+ /* A denorm, normalize the mantissa */
+ while (! (high & DHIGHBIT)) {
+ high <<= 1;
+ if (low & DMSBIT)
+ high++;
+ low <<= 1;
+ iexp--;
+ }
+ }
+ *exp = (I_) iexp;
+#if DNBIGIT == 2
+ man->_mp_d[0] = (mp_limb_t)low;
+ man->_mp_d[1] = (mp_limb_t)high;
+#else
+#if DNBIGIT == 1
+ man->_mp_d[0] = ((mp_limb_t)high) << 32 | (mp_limb_t)low;
+#else
+#error Cannot cope with DNBIGIT
+#endif
+#endif
+ if (sign < 0)
+ man->_mp_size = -man->_mp_size;
+ }
+}
+
+void
+__decodeFloat (MP_INT *man, I_ *exp, StgFloat flt)
+{
+ /* Do some bit fiddling on IEEE */
+ int high, sign; /* assuming 32 bit ints */
+ union { float f; int i; } u; /* assuming 32 bit float and int */
+
+ ASSERT(sizeof(int ) == 4 );
+ ASSERT(sizeof(flt ) == SIZEOF_FLOAT );
+ ASSERT(sizeof(man->_mp_d[0]) == SIZEOF_LIMB_T);
+ ASSERT(FNBIGIT*SIZEOF_LIMB_T >= SIZEOF_FLOAT );
+
+ u.f = flt; /* grab the float */
+ high = u.i;
+
+ /* we know the MP_INT* passed in has size zero, so we realloc
+ no matter what.
+ */
+ man->_mp_alloc = FNBIGIT;
+
+ if ((high & ~FMSBIT) == 0) {
+ man->_mp_size = 0;
+ *exp = 0;
+ } else {
+ man->_mp_size = FNBIGIT;
+ *exp = ((high >> 23) & 0xff) + MY_FMINEXP;
+ sign = high;
+
+ high &= FHIGHBIT-1;
+ if (*exp != MY_FMINEXP) /* don't add hidden bit to denorms */
+ high |= FHIGHBIT;
+ else {
+ (*exp)++;
+ /* A denorm, normalize the mantissa */
+ while (! (high & FHIGHBIT)) {
+ high <<= 1;
+ (*exp)--;
+ }
+ }
+#if FNBIGIT == 1
+ man->_mp_d[0] = (mp_limb_t)high;
+#else
+#error Cannot cope with FNBIGIT
+#endif
+ if (sign < 0)
+ man->_mp_size = -man->_mp_size;
+ }
+}
+
+/* Convenient union types for checking the layout of IEEE 754 types -
+ based on defs in GNU libc <ieee754.h>
+*/
+
+union stg_ieee754_flt
+{
+ float f;
+ struct {
+
+#if WORDS_BIGENDIAN
+ unsigned int negative:1;
+ unsigned int exponent:8;
+ unsigned int mantissa:23;
+#else
+ unsigned int mantissa:23;
+ unsigned int exponent:8;
+ unsigned int negative:1;
+#endif
+ } ieee;
+ struct {
+
+#if WORDS_BIGENDIAN
+ unsigned int negative:1;
+ unsigned int exponent:8;
+ unsigned int quiet_nan:1;
+ unsigned int mantissa:22;
+#else
+ unsigned int mantissa:22;
+ unsigned int quiet_nan:1;
+ unsigned int exponent:8;
+ unsigned int negative:1;
+#endif
+ } ieee_nan;
+};
+
+/*
+
+ To recap, here's the representation of a double precision
+ IEEE floating point number:
+
+ sign 63 sign bit (0==positive, 1==negative)
+ exponent 62-52 exponent (biased by 1023)
+ fraction 51-0 fraction (bits to right of binary point)
+*/
+
+union stg_ieee754_dbl
+{
+ double d;
+ struct {
+
+#if WORDS_BIGENDIAN
+ unsigned int negative:1;
+ unsigned int exponent:11;
+ unsigned int mantissa0:20;
+ unsigned int mantissa1:32;
+#else
+ unsigned int mantissa1:32;
+ unsigned int mantissa0:20;
+ unsigned int exponent:11;
+ unsigned int negative:1;
+#endif
+ } ieee;
+ /* This format makes it easier to see if a NaN is a signalling NaN. */
+ struct {
+
+#if WORDS_BIGENDIAN
+ unsigned int negative:1;
+ unsigned int exponent:11;
+ unsigned int quiet_nan:1;
+ unsigned int mantissa0:19;
+ unsigned int mantissa1:32;
+#else
+ unsigned int mantissa1:32;
+ unsigned int mantissa0:19;
+ unsigned int quiet_nan:1;
+ unsigned int exponent:11;
+ unsigned int negative:1;
+#endif
+ } ieee_nan;
+};
+
+/*
+ * Predicates for testing for extreme IEEE fp values. Used
+ * by the bytecode evaluator and the Prelude.
+ *
+ */
+
+/* In case you don't support IEEE, you'll just get dummy defs.. */
+#ifdef IEEE_FLOATING_POINT
+
+StgInt
+isDoubleNaN(StgDouble d)
+{
+ union stg_ieee754_dbl u;
+
+ u.d = d;
+
+ return (
+ u.ieee.exponent == 2047 /* 2^11 - 1 */ && /* Is the exponent all ones? */
+ (u.ieee.mantissa0 != 0 || u.ieee.mantissa1 != 0)
+ /* and the mantissa non-zero? */
+ );
+}
+
+StgInt
+isDoubleInfinite(StgDouble d)
+{
+ union stg_ieee754_dbl u;
+
+ u.d = d;
+
+ /* Inf iff exponent is all ones, mantissa all zeros */
+ return (
+ u.ieee.exponent == 2047 /* 2^11 - 1 */ &&
+ u.ieee.mantissa0 == 0 &&
+ u.ieee.mantissa1 == 0
+ );
+}
+
+StgInt
+isDoubleDenormalized(StgDouble d)
+{
+ union stg_ieee754_dbl u;
+
+ u.d = d;
+
+ /* A (single/double/quad) precision floating point number
+ is denormalised iff:
+ - exponent is zero
+ - mantissa is non-zero.
+ - (don't care about setting of sign bit.)
+
+ */
+ return (
+ u.ieee.exponent == 0 &&
+ (u.ieee.mantissa0 != 0 ||
+ u.ieee.mantissa1 != 0)
+ );
+
+}
+
+StgInt
+isDoubleNegativeZero(StgDouble d)
+{
+ union stg_ieee754_dbl u;
+
+ u.d = d;
+ /* sign (bit 63) set (only) => negative zero */
+
+ return (
+ u.ieee.negative == 1 &&
+ u.ieee.exponent == 0 &&
+ u.ieee.mantissa0 == 0 &&
+ u.ieee.mantissa1 == 0);
+}
+
+/* Same tests, this time for StgFloats. */
+
+/*
+ To recap, here's the representation of a single precision
+ IEEE floating point number:
+
+ sign 31 sign bit (0 == positive, 1 == negative)
+ exponent 30-23 exponent (biased by 127)
+ fraction 22-0 fraction (bits to right of binary point)
+*/
+
+
+StgInt
+isFloatNaN(StgFloat f)
+{
+ union stg_ieee754_flt u;
+ u.f = f;
+
+ /* Floating point NaN iff exponent is all ones, mantissa is
+ non-zero (but see below.) */
+ return (
+ u.ieee.exponent == 255 /* 2^8 - 1 */ &&
+ u.ieee.mantissa != 0);
+}
+
+StgInt
+isFloatInfinite(StgFloat f)
+{
+ union stg_ieee754_flt u;
+ u.f = f;
+
+ /* A float is Inf iff exponent is max (all ones),
+ and mantissa is min(all zeros.) */
+ return (
+ u.ieee.exponent == 255 /* 2^8 - 1 */ &&
+ u.ieee.mantissa == 0);
+}
+
+StgInt
+isFloatDenormalized(StgFloat f)
+{
+ union stg_ieee754_flt u;
+ u.f = f;
+
+ /* A (single/double/quad) precision floating point number
+ is denormalised iff:
+ - exponent is zero
+ - mantissa is non-zero.
+ - (don't care about setting of sign bit.)
+
+ */
+ return (
+ u.ieee.exponent == 0 &&
+ u.ieee.mantissa != 0);
+}
+
+StgInt
+isFloatNegativeZero(StgFloat f)
+{
+ union stg_ieee754_flt u;
+ u.f = f;
+
+ /* sign (bit 31) set (only) => negative zero */
+ return (
+ u.ieee.negative &&
+ u.ieee.exponent == 0 &&
+ u.ieee.mantissa == 0);
+}
+
+#else /* ! IEEE_FLOATING_POINT */
+
+/* Dummy definitions of predicates - they all return false */
+StgInt isDoubleNaN(d) StgDouble d; { return 0; }
+StgInt isDoubleInfinite(d) StgDouble d; { return 0; }
+StgInt isDoubleDenormalized(d) StgDouble d; { return 0; }
+StgInt isDoubleNegativeZero(d) StgDouble d; { return 0; }
+StgInt isFloatNaN(f) StgFloat f; { return 0; }
+StgInt isFloatInfinite(f) StgFloat f; { return 0; }
+StgInt isFloatDenormalized(f) StgFloat f; { return 0; }
+StgInt isFloatNegativeZero(f) StgFloat f; { return 0; }
+
+#endif /* ! IEEE_FLOATING_POINT */
diff --git a/rts/StgRun.h b/rts/StgRun.h
new file mode 100644
index 0000000000..da376b4971
--- /dev/null
+++ b/rts/StgRun.h
@@ -0,0 +1,16 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Tiny assembler 'layer' between the C and STG worlds.
+ *
+ ---------------------------------------------------------------------------- */
+
+#ifndef STGRUN_H
+#define STGRUN_H
+
+extern StgRegTable * StgRun(StgFunPtr f, StgRegTable *basereg);
+
+RTS_FUN(StgReturn);
+
+#endif /* STGRUN_H */
diff --git a/rts/StgStartup.cmm b/rts/StgStartup.cmm
new file mode 100644
index 0000000000..2f2a759c81
--- /dev/null
+++ b/rts/StgStartup.cmm
@@ -0,0 +1,218 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Code for starting, stopping and restarting threads.
+ *
+ * This file is written in a subset of C--, extended with various
+ * features specific to GHC. It is compiled by GHC directly. For the
+ * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Cmm.h"
+
+/*
+ * This module contains the two entry points and the final exit point
+ * to/from the Haskell world. We can enter either by:
+ *
+ * a) returning to the address on the top of the stack, or
+ * b) entering the closure on the top of the stack
+ *
+ * the function stg_stop_thread_entry is the final exit for a
+ * thread: it is the last return address on the stack. It returns
+ * to the scheduler marking the thread as finished.
+ */
+
+#define CHECK_SENSIBLE_REGS() \
+ ASSERT(Hp != 0); \
+ ASSERT(Sp != 0); \
+ ASSERT(SpLim != 0); \
+ ASSERT(HpLim != 0); \
+ ASSERT(SpLim - WDS(RESERVED_STACK_WORDS) <= Sp); \
+ ASSERT(HpLim >= Hp);
+
+/* -----------------------------------------------------------------------------
+ Returning from the STG world.
+
+ This is a polymorphic return address, meaning that any old constructor
+ can be returned, we don't care (actually, it's probably going to be
+ an IOok constructor, which will indirect through the vector table
+ slot 0).
+ -------------------------------------------------------------------------- */
+
+#if defined(PROFILING)
+#define STOP_THREAD_BITMAP 3
+#define STOP_THREAD_WORDS 2
+#else
+#define STOP_THREAD_BITMAP 0
+#define STOP_THREAD_WORDS 0
+#endif
+
+/* A polymorphic return address, where all the vector slots point to the
+ direct entry point. */
+INFO_TABLE_RET( stg_stop_thread, STOP_THREAD_WORDS, STOP_THREAD_BITMAP,
+ STOP_FRAME,
+ RET_LBL(stg_stop_thread),
+ RET_LBL(stg_stop_thread),
+ RET_LBL(stg_stop_thread),
+ RET_LBL(stg_stop_thread),
+ RET_LBL(stg_stop_thread),
+ RET_LBL(stg_stop_thread),
+ RET_LBL(stg_stop_thread),
+ RET_LBL(stg_stop_thread) )
+{
+ /*
+ The final exit.
+
+ The top-top-level closures (e.g., "main") are of type "IO a".
+ When entered, they perform an IO action and return an 'a' in R1.
+
+ We save R1 on top of the stack where the scheduler can find it,
+ tidy up the registers and return to the scheduler.
+
+ We Leave the stack looking like this:
+
+ +----------------+
+ | -------------------> return value
+ +----------------+
+ | stg_enter_info |
+ +----------------+
+
+ The stg_enter_info is just a dummy info table so that the
+ garbage collector can understand the stack (there must always
+ be an info table on top of the stack).
+ */
+
+ Sp = Sp + SIZEOF_StgStopFrame - WDS(2);
+ Sp(1) = R1;
+ Sp(0) = stg_enter_info;
+
+ StgTSO_what_next(CurrentTSO) = ThreadComplete::I16;
+
+ SAVE_THREAD_STATE();
+
+ /* The return code goes in BaseReg->rRet, and BaseReg is returned in R1 */
+ StgRegTable_rRet(BaseReg) = ThreadFinished;
+ R1 = BaseReg;
+
+ jump StgReturn;
+}
+
+/* -----------------------------------------------------------------------------
+ Start a thread from the scheduler by returning to the address on
+ the top of the stack. This is used for all entries to STG code
+ from C land.
+
+ On the way back, we (usually) pass through stg_returnToSched which saves
+ the thread's state away nicely.
+ -------------------------------------------------------------------------- */
+
+stg_returnToStackTop
+{
+ LOAD_THREAD_STATE();
+ CHECK_SENSIBLE_REGS();
+ jump %ENTRY_CODE(Sp(0));
+}
+
+stg_returnToSched
+{
+ SAVE_THREAD_STATE();
+ foreign "C" threadPaused(MyCapability() "ptr", CurrentTSO);
+ jump StgReturn;
+}
+
+// A variant of stg_returnToSched that doesn't call threadPaused() on the
+// current thread. This is used for switching from compiled execution to the
+// interpreter, where calling threadPaused() on every switch would be too
+// expensive.
+stg_returnToSchedNotPaused
+{
+ SAVE_THREAD_STATE();
+ jump StgReturn;
+}
+
+// A variant of stg_returnToSched, but instead of returning directly to the
+// scheduler, we jump to the code fragment pointed to by R2. This lets us
+// perform some final actions after making the thread safe, such as unlocking
+// the MVar on which we are about to block in SMP mode.
+stg_returnToSchedButFirst
+{
+ SAVE_THREAD_STATE();
+ foreign "C" threadPaused(MyCapability() "ptr", CurrentTSO);
+ jump R2;
+}
+
+/* -----------------------------------------------------------------------------
+ Strict IO application - performing an IO action and entering its result.
+
+ rts_evalIO() lets you perform Haskell IO actions from outside of
+ Haskell-land, returning back to you their result. Want this result
+ to be evaluated to WHNF by that time, so that we can easily get at
+ the int/char/whatever using the various get{Ty} functions provided
+ by the RTS API.
+
+ forceIO takes care of this, performing the IO action and entering the
+ results that comes back.
+ ------------------------------------------------------------------------- */
+
+INFO_TABLE_RET( stg_forceIO, 0/*size*/, 0/*bitmap*/, RET_SMALL)
+
+#ifdef REG_R1
+{
+ Sp_adj(1);
+ ENTER();
+}
+#else
+{
+ R1 = Sp(0);
+ Sp_adj(2);
+ ENTER();
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ Non-strict IO application.
+
+ This stack frame works like stg_forceIO_info except that it
+ doesn't evaluate the return value. We need the layer because the
+ return convention for an IO action differs depending on whether R1
+ is a register or not.
+ ------------------------------------------------------------------------- */
+
+INFO_TABLE_RET( stg_noforceIO, 0/*size*/, 0/*bitmap*/, RET_SMALL )
+
+#ifdef REG_R1
+{
+ Sp_adj(1);
+ jump %ENTRY_CODE(Sp(0));
+}
+#else
+{
+ R1 = Sp(0);
+ Sp_adj(2);
+ jump %ENTRY_CODE(Sp(0));
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ Special STG entry points for module registration.
+ -------------------------------------------------------------------------- */
+
+stg_init_finish
+{
+ jump StgReturn;
+}
+
+/* On entry to stg_init:
+ * init_stack[0] = &stg_init_ret;
+ * init_stack[1] = __stginit_Something;
+ */
+stg_init
+{
+ W_ next;
+ Sp = W_[BaseReg + OFFSET_StgRegTable_rSp];
+ next = W_[Sp];
+ Sp_adj(1);
+ jump next;
+}
diff --git a/rts/StgStdThunks.cmm b/rts/StgStdThunks.cmm
new file mode 100644
index 0000000000..342a6eb164
--- /dev/null
+++ b/rts/StgStdThunks.cmm
@@ -0,0 +1,274 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The University of Glasgow, 1998-2004
+ *
+ * Canned "Standard Form" Thunks
+ *
+ * This file is written in a subset of C--, extended with various
+ * features specific to GHC. It is compiled by GHC directly. For the
+ * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Cmm.h"
+
+/* -----------------------------------------------------------------------------
+ The code for a thunk that simply extracts a field from a
+ single-constructor datatype depends only on the offset of the field
+ to be selected.
+
+ Here we define some canned "selector" thunks that do just that; any
+ selector thunk appearing in a program will refer to one of these
+ instead of being compiled independently.
+
+ The garbage collector spots selector thunks and reduces them if
+ possible, in order to avoid space leaks resulting from lazy pattern
+ matching.
+ -------------------------------------------------------------------------- */
+
+#define WITHUPD_FRAME_SIZE (SIZEOF_StgUpdateFrame + SIZEOF_StgHeader)
+#define NOUPD_FRAME_SIZE (SIZEOF_StgHeader)
+
+#ifdef PROFILING
+#define SAVE_CCCS(fs) StgHeader_ccs(Sp-fs) = W_[CCCS]
+#define GET_SAVED_CCCS W_[CCCS] = StgHeader_ccs(Sp)
+#define RET_BITMAP 3
+#define RET_FRAMESIZE 2
+#else
+#define SAVE_CCCS(fs) /* empty */
+#define GET_SAVED_CCCS /* empty */
+#define RET_BITMAP 0
+#define RET_FRAMESIZE 0
+#endif
+
+#define SELECTOR_CODE_UPD(offset) \
+ INFO_TABLE_RET(stg_sel_ret_##offset##_upd, RET_FRAMESIZE, RET_BITMAP, RET_SMALL) \
+ { \
+ R1 = StgClosure_payload(R1,offset); \
+ GET_SAVED_CCCS; \
+ Sp = Sp + SIZEOF_StgHeader; \
+ ENTER(); \
+ } \
+ \
+ INFO_TABLE_SELECTOR(stg_sel_##offset##_upd, offset, THUNK_SELECTOR, "stg_sel_upd", "stg_sel_upd") \
+ { \
+ TICK_ENT_DYN_THK(); \
+ STK_CHK_NP(WITHUPD_FRAME_SIZE); \
+ UPD_BH_UPDATABLE(); \
+ LDV_ENTER(R1); \
+ PUSH_UPD_FRAME(Sp - SIZEOF_StgUpdateFrame, R1); \
+ ENTER_CCS_THUNK(R1); \
+ SAVE_CCCS(WITHUPD_FRAME_SIZE); \
+ W_[Sp-WITHUPD_FRAME_SIZE] = stg_sel_ret_##offset##_upd_info; \
+ R1 = StgThunk_payload(R1,0); \
+ Sp = Sp - WITHUPD_FRAME_SIZE; \
+ jump %GET_ENTRY(R1); \
+ }
+ /* NOTE: no need to ENTER() here, we know the closure cannot evaluate to a function,
+ because we're going to do a field selection on the result. */
+
+SELECTOR_CODE_UPD(0)
+SELECTOR_CODE_UPD(1)
+SELECTOR_CODE_UPD(2)
+SELECTOR_CODE_UPD(3)
+SELECTOR_CODE_UPD(4)
+SELECTOR_CODE_UPD(5)
+SELECTOR_CODE_UPD(6)
+SELECTOR_CODE_UPD(7)
+SELECTOR_CODE_UPD(8)
+SELECTOR_CODE_UPD(9)
+SELECTOR_CODE_UPD(10)
+SELECTOR_CODE_UPD(11)
+SELECTOR_CODE_UPD(12)
+SELECTOR_CODE_UPD(13)
+SELECTOR_CODE_UPD(14)
+SELECTOR_CODE_UPD(15)
+
+#define SELECTOR_CODE_NOUPD(offset) \
+ INFO_TABLE_RET(stg_sel_ret_##offset##_noupd, RET_FRAMESIZE, RET_BITMAP, RET_SMALL) \
+ { \
+ R1 = StgClosure_payload(R1,offset); \
+ GET_SAVED_CCCS; \
+ Sp = Sp + SIZEOF_StgHeader; \
+ jump %GET_ENTRY(R1); \
+ } \
+ \
+ INFO_TABLE_SELECTOR(stg_sel_##offset##_noupd, offset, THUNK_SELECTOR, "stg_sel_noupd", "stg_sel_noupd")\
+ { \
+ TICK_ENT_DYN_THK(); \
+ STK_CHK_NP(NOUPD_FRAME_SIZE); \
+ UPD_BH_SINGLE_ENTRY(); \
+ LDV_ENTER(R1); \
+ TICK_UPDF_OMITTED(); \
+ ENTER_CCS_THUNK(R1); \
+ SAVE_CCCS(NOUPD_FRAME_SIZE); \
+ W_[Sp-NOUPD_FRAME_SIZE] = stg_sel_ret_##offset##_noupd_info; \
+ R1 = StgThunk_payload(R1,0); \
+ Sp = Sp - NOUPD_FRAME_SIZE; \
+ jump %GET_ENTRY(R1); \
+ }
+
+SELECTOR_CODE_NOUPD(0)
+SELECTOR_CODE_NOUPD(1)
+SELECTOR_CODE_NOUPD(2)
+SELECTOR_CODE_NOUPD(3)
+SELECTOR_CODE_NOUPD(4)
+SELECTOR_CODE_NOUPD(5)
+SELECTOR_CODE_NOUPD(6)
+SELECTOR_CODE_NOUPD(7)
+SELECTOR_CODE_NOUPD(8)
+SELECTOR_CODE_NOUPD(9)
+SELECTOR_CODE_NOUPD(10)
+SELECTOR_CODE_NOUPD(11)
+SELECTOR_CODE_NOUPD(12)
+SELECTOR_CODE_NOUPD(13)
+SELECTOR_CODE_NOUPD(14)
+SELECTOR_CODE_NOUPD(15)
+
+/* -----------------------------------------------------------------------------
+ Apply thunks
+
+ An apply thunk is a thunk of the form
+
+ let z = [x1...xn] \u x1...xn
+ in ...
+
+ We pre-compile some of these because the code is always the same.
+
+ These have to be independent of the update frame size, so the code
+ works when profiling etc.
+ -------------------------------------------------------------------------- */
+
+/* stg_ap_1_upd_info is a bit redundant, but there appears to be a bug
+ * in the compiler that means stg_ap_1 is generated occasionally (ToDo)
+ */
+
+INFO_TABLE(stg_ap_1_upd,1,1,THUNK_1_0,"stg_ap_1_upd_info","stg_ap_1_upd_info")
+{
+ TICK_ENT_DYN_THK();
+ STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(1));
+ UPD_BH_UPDATABLE();
+ LDV_ENTER(R1);
+ ENTER_CCS_THUNK(R1);
+ PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
+ R1 = StgThunk_payload(R1,0);
+ Sp = Sp - SIZEOF_StgUpdateFrame;
+ jump stg_ap_0_fast;
+}
+
+INFO_TABLE(stg_ap_2_upd,2,0,THUNK_2_0,"stg_ap_2_upd_info","stg_ap_2_upd_info")
+{
+ TICK_ENT_DYN_THK();
+ STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(2));
+ UPD_BH_UPDATABLE();
+ LDV_ENTER(R1);
+ ENTER_CCS_THUNK(R1);
+ PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,1);
+ R1 = StgThunk_payload(R1,0);
+ Sp = Sp - SIZEOF_StgUpdateFrame - WDS(1);
+ Sp_adj(-1); // for stg_ap_*_ret
+ TICK_UNKNOWN_CALL();
+ TICK_SLOW_CALL_p();
+ jump RET_LBL(stg_ap_p);
+}
+
+INFO_TABLE(stg_ap_3_upd,3,0,THUNK,"stg_ap_3_upd_info","stg_ap_3_upd_info")
+{
+ TICK_ENT_DYN_THK();
+ STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(3));
+ UPD_BH_UPDATABLE();
+ LDV_ENTER(R1);
+ ENTER_CCS_THUNK(R1);
+ PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,2);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(2)] = StgThunk_payload(R1,1);
+ R1 = StgThunk_payload(R1,0);
+ Sp = Sp - SIZEOF_StgUpdateFrame - WDS(2);
+ Sp_adj(-1); // for stg_ap_*_ret
+ TICK_UNKNOWN_CALL();
+ TICK_SLOW_CALL_pp();
+ jump RET_LBL(stg_ap_pp);
+}
+
+INFO_TABLE(stg_ap_4_upd,4,0,THUNK,"stg_ap_4_upd_info","stg_ap_4_upd_info")
+{
+ TICK_ENT_DYN_THK();
+ STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(4));
+ UPD_BH_UPDATABLE();
+ LDV_ENTER(R1);
+ ENTER_CCS_THUNK(R1);
+ PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,3);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(2)] = StgThunk_payload(R1,2);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(3)] = StgThunk_payload(R1,1);
+ R1 = StgThunk_payload(R1,0);
+ Sp = Sp - SIZEOF_StgUpdateFrame - WDS(3);
+ Sp_adj(-1); // for stg_ap_*_ret
+ TICK_UNKNOWN_CALL();
+ TICK_SLOW_CALL_ppp();
+ jump RET_LBL(stg_ap_ppp);
+}
+
+INFO_TABLE(stg_ap_5_upd,5,0,THUNK,"stg_ap_5_upd_info","stg_ap_5_upd_info")
+{
+ TICK_ENT_DYN_THK();
+ STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(5));
+ UPD_BH_UPDATABLE();
+ LDV_ENTER(R1);
+ ENTER_CCS_THUNK(R1);
+ PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,4);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(2)] = StgThunk_payload(R1,3);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(3)] = StgThunk_payload(R1,2);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(4)] = StgThunk_payload(R1,1);
+ R1 = StgThunk_payload(R1,0);
+ Sp = Sp - SIZEOF_StgUpdateFrame - WDS(4);
+ Sp_adj(-1); // for stg_ap_*_ret
+ TICK_UNKNOWN_CALL();
+ TICK_SLOW_CALL_pppp();
+ jump RET_LBL(stg_ap_pppp);
+}
+
+INFO_TABLE(stg_ap_6_upd,6,0,THUNK,"stg_ap_6_upd_info","stg_ap_6_upd_info")
+{
+ TICK_ENT_DYN_THK();
+ STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(6));
+ UPD_BH_UPDATABLE();
+ LDV_ENTER(R1);
+ ENTER_CCS_THUNK(R1);
+ PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,5);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(2)] = StgThunk_payload(R1,4);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(3)] = StgThunk_payload(R1,3);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(4)] = StgThunk_payload(R1,2);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(5)] = StgThunk_payload(R1,1);
+ R1 = StgThunk_payload(R1,0);
+ Sp = Sp - SIZEOF_StgUpdateFrame - WDS(5);
+ Sp_adj(-1); // for stg_ap_*_ret
+ TICK_UNKNOWN_CALL();
+ TICK_SLOW_CALL_ppppp();
+ jump RET_LBL(stg_ap_ppppp);
+}
+
+INFO_TABLE(stg_ap_7_upd,7,0,THUNK,"stg_ap_7_upd_info","stg_ap_7_upd_info")
+{
+ TICK_ENT_DYN_THK();
+ STK_CHK_NP(SIZEOF_StgUpdateFrame+WDS(7));
+ UPD_BH_UPDATABLE();
+ LDV_ENTER(R1);
+ ENTER_CCS_THUNK(R1);
+ PUSH_UPD_FRAME(Sp-SIZEOF_StgUpdateFrame,R1);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(1)] = StgThunk_payload(R1,6);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(2)] = StgThunk_payload(R1,5);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(3)] = StgThunk_payload(R1,4);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(4)] = StgThunk_payload(R1,3);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(5)] = StgThunk_payload(R1,2);
+ W_[Sp-SIZEOF_StgUpdateFrame-WDS(6)] = StgThunk_payload(R1,1);
+ R1 = StgThunk_payload(R1,0);
+ Sp = Sp - SIZEOF_StgUpdateFrame - WDS(6);
+ Sp_adj(-1); // for stg_ap_*_ret
+ TICK_UNKNOWN_CALL();
+ TICK_SLOW_CALL_pppppp();
+ jump RET_LBL(stg_ap_pppppp);
+}
diff --git a/rts/Storage.c b/rts/Storage.c
new file mode 100644
index 0000000000..974be45f10
--- /dev/null
+++ b/rts/Storage.c
@@ -0,0 +1,1137 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Storage manager front end
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "Stats.h"
+#include "Hooks.h"
+#include "BlockAlloc.h"
+#include "MBlock.h"
+#include "Weak.h"
+#include "Sanity.h"
+#include "Arena.h"
+#include "OSThreads.h"
+#include "Capability.h"
+#include "Storage.h"
+#include "Schedule.h"
+#include "RetainerProfile.h" // for counting memory blocks (memInventory)
+
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * All these globals require sm_mutex to access in THREADED_RTS mode.
+ */
+StgClosure *caf_list = NULL; /* CAFs retained for GHCi (see newCAF) */
+StgClosure *revertible_caf_list = NULL; /* CAFs GHCi may revert (see newDynCAF) */
+rtsBool keepCAFs;
+
+bdescr *small_alloc_list; /* allocate()d small objects */
+bdescr *pinned_object_block; /* allocate pinned objects into this block */
+nat alloc_blocks; /* number of allocate()d blocks since GC */
+nat alloc_blocks_lim; /* approximate limit on alloc_blocks */
+
+StgPtr alloc_Hp = NULL; /* next free byte in small_alloc_list */
+StgPtr alloc_HpLim = NULL; /* end of block at small_alloc_list */
+
+generation *generations = NULL; /* all the generations */
+generation *g0 = NULL; /* generation 0, for convenience */
+generation *oldest_gen = NULL; /* oldest generation, for convenience */
+step *g0s0 = NULL; /* generation 0, step 0, for convenience */
+
+ullong total_allocated = 0; /* total memory allocated during run */
+
+nat n_nurseries = 0; /* == RtsFlags.ParFlags.nNodes, convenience */
+step *nurseries = NULL; /* array of nurseries, >1 only if THREADED_RTS */
+
+#ifdef THREADED_RTS
+/*
+ * Storage manager mutex: protects all the above state from
+ * simultaneous access by two STG threads.
+ */
+Mutex sm_mutex;
+/*
+ * This mutex is used by atomicModifyMutVar# only
+ */
+Mutex atomic_modify_mutvar_mutex;
+#endif
+
+
+/*
+ * Forward references
+ */
+/* Custom allocators registered with GMP in initStorage(). */
+static void *stgAllocForGMP (size_t size_in_bytes);
+static void *stgReallocForGMP (void *ptr, size_t old_size, size_t new_size);
+static void stgDeallocForGMP (void *ptr, size_t size);
+
+/*
+ * Reset every field of a step structure to its empty/initial state and
+ * link it to generation g (step number s). Called from initStorage()
+ * for every generation step and for each nursery.
+ */
+static void
+initStep (step *stp, int g, int s)
+{
+ stp->no = s;
+ stp->blocks = NULL;
+ stp->n_blocks = 0;
+ stp->old_blocks = NULL;
+ stp->n_old_blocks = 0;
+ stp->gen = &generations[g];
+ stp->gen_no = g;
+ stp->hp = NULL;
+ stp->hpLim = NULL;
+ stp->hp_bd = NULL;
+ stp->scavd_hp = NULL;
+ stp->scavd_hpLim = NULL;
+ stp->scan = NULL;
+ stp->scan_bd = NULL;
+ stp->large_objects = NULL;
+ stp->n_large_blocks = 0;
+ stp->new_large_objects = NULL;
+ stp->scavenged_large_objects = NULL;
+ stp->n_scavenged_large_blocks = 0;
+ stp->is_compacted = 0;
+ stp->bitmap = NULL;
+}
+
+/*
+ * Initialise the storage manager: validate the GC flags, set up the
+ * generation/step structures and the per-capability nurseries, prime
+ * the allocate() state, and register our allocators with GMP.
+ * Idempotent: a second call returns immediately. Must run before any
+ * Haskell code executes.
+ */
+void
+initStorage( void )
+{
+ nat g, s;
+ generation *gen;
+
+ if (generations != NULL) {
+ // multi-init protection
+ return;
+ }
+
+ /* Sanity check to make sure the LOOKS_LIKE_ macros appear to be
+ * doing something reasonable.
+ */
+ ASSERT(LOOKS_LIKE_INFO_PTR(&stg_BLACKHOLE_info));
+ ASSERT(LOOKS_LIKE_CLOSURE_PTR(&stg_dummy_ret_closure));
+ ASSERT(!HEAP_ALLOCED(&stg_dummy_ret_closure));
+
+ /* -H cannot exceed -M: clamp the suggestion down to the maximum */
+ if (RtsFlags.GcFlags.maxHeapSize != 0 &&
+ RtsFlags.GcFlags.heapSizeSuggestion >
+ RtsFlags.GcFlags.maxHeapSize) {
+ RtsFlags.GcFlags.maxHeapSize = RtsFlags.GcFlags.heapSizeSuggestion;
+ }
+
+ if (RtsFlags.GcFlags.maxHeapSize != 0 &&
+ RtsFlags.GcFlags.minAllocAreaSize >
+ RtsFlags.GcFlags.maxHeapSize) {
+ errorBelch("maximum heap size (-M) is smaller than minimum alloc area size (-A)");
+ exit(1);
+ }
+
+ initBlockAllocator();
+
+#if defined(THREADED_RTS)
+ initMutex(&sm_mutex);
+ initMutex(&atomic_modify_mutvar_mutex);
+#endif
+
+ ACQUIRE_SM_LOCK;
+
+ /* allocate generation info array */
+ generations = (generation *)stgMallocBytes(RtsFlags.GcFlags.generations
+ * sizeof(struct generation_),
+ "initStorage: gens");
+
+ /* Initialise all generations */
+ for(g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ gen = &generations[g];
+ gen->no = g;
+ gen->mut_list = allocBlock();
+ gen->collections = 0;
+ gen->failed_promotions = 0;
+ gen->max_blocks = 0;
+ }
+
+ /* A couple of convenience pointers */
+ g0 = &generations[0];
+ oldest_gen = &generations[RtsFlags.GcFlags.generations-1];
+
+ /* Allocate step structures in each generation */
+ if (RtsFlags.GcFlags.generations > 1) {
+ /* Only for multiple-generations */
+
+ /* Oldest generation: one step */
+ oldest_gen->n_steps = 1;
+ oldest_gen->steps =
+ stgMallocBytes(1 * sizeof(struct step_), "initStorage: last step");
+
+ /* set up all except the oldest generation with 2 steps */
+ for(g = 0; g < RtsFlags.GcFlags.generations-1; g++) {
+ generations[g].n_steps = RtsFlags.GcFlags.steps;
+ generations[g].steps =
+ stgMallocBytes (RtsFlags.GcFlags.steps * sizeof(struct step_),
+ "initStorage: steps");
+ }
+
+ } else {
+ /* single generation, i.e. a two-space collector */
+ g0->n_steps = 1;
+ g0->steps = stgMallocBytes (sizeof(struct step_), "initStorage: steps");
+ }
+
+#ifdef THREADED_RTS
+ /* one nursery per capability, so each OS thread can allocate
+ * without taking sm_mutex */
+ n_nurseries = n_capabilities;
+ nurseries = stgMallocBytes (n_nurseries * sizeof(struct step_),
+ "initStorage: nurseries");
+#else
+ n_nurseries = 1;
+ nurseries = g0->steps; // just share nurseries[0] with g0s0
+#endif
+
+ /* Initialise all steps */
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ initStep(&generations[g].steps[s], g, s);
+ }
+ }
+
+#ifdef THREADED_RTS
+ for (s = 0; s < n_nurseries; s++) {
+ initStep(&nurseries[s], 0, s);
+ }
+#endif
+
+ /* Set up the destination pointers in each younger gen. step */
+ for (g = 0; g < RtsFlags.GcFlags.generations-1; g++) {
+ for (s = 0; s < generations[g].n_steps-1; s++) {
+ generations[g].steps[s].to = &generations[g].steps[s+1];
+ }
+ // last step of gen g promotes into the first step of gen g+1
+ generations[g].steps[s].to = &generations[g+1].steps[0];
+ }
+ oldest_gen->steps[0].to = &oldest_gen->steps[0];
+
+#ifdef THREADED_RTS
+ for (s = 0; s < n_nurseries; s++) {
+ nurseries[s].to = generations[0].steps[0].to;
+ }
+#endif
+
+ /* The oldest generation has one step. */
+ if (RtsFlags.GcFlags.compact) {
+ if (RtsFlags.GcFlags.generations == 1) {
+ errorBelch("WARNING: compaction is incompatible with -G1; disabled");
+ } else {
+ oldest_gen->steps[0].is_compacted = 1;
+ }
+ }
+
+#ifdef THREADED_RTS
+ if (RtsFlags.GcFlags.generations == 1) {
+ errorBelch("-G1 is incompatible with -threaded");
+ stg_exit(EXIT_FAILURE);
+ }
+#endif
+
+ /* generation 0 is special: that's the nursery */
+ generations[0].max_blocks = 0;
+
+ /* G0S0: the allocation area. Policy: keep the allocation area
+ * small to begin with, even if we have a large suggested heap
+ * size. Reason: we're going to do a major collection first, and we
+ * don't want it to be a big one. This vague idea is borne out by
+ * rigorous experimental evidence.
+ */
+ g0s0 = &generations[0].steps[0];
+
+ allocNurseries();
+
+ weak_ptr_list = NULL;
+ caf_list = NULL;
+ revertible_caf_list = NULL;
+
+ /* initialise the allocate() interface */
+ small_alloc_list = NULL;
+ alloc_blocks = 0;
+ alloc_blocks_lim = RtsFlags.GcFlags.minAllocAreaSize;
+
+ /* Tell GNU multi-precision pkg about our custom alloc functions */
+ mp_set_memory_functions(stgAllocForGMP, stgReallocForGMP, stgDeallocForGMP);
+
+ IF_DEBUG(gc, statDescribeGens());
+
+ RELEASE_SM_LOCK;
+}
+
+/* Shut down the storage manager's statistics: flush the final
+ * allocation figure into the stats subsystem. Does not free memory
+ * (see freeStorage() for that). */
+void
+exitStorage (void)
+{
+ stat_exit(calcAllocated());
+}
+
+/* Return all megablocks to the operating system. Only safe once the
+ * heap will never be used again. */
+void
+freeStorage (void)
+{
+ freeAllMBlocks();
+}
+
+/* -----------------------------------------------------------------------------
+ CAF management.
+
+ The entry code for every CAF does the following:
+
+ - builds a CAF_BLACKHOLE in the heap
+ - pushes an update frame pointing to the CAF_BLACKHOLE
+ - invokes UPD_CAF(), which:
+ - calls newCaf, below
+ - updates the CAF with a static indirection to the CAF_BLACKHOLE
+
+ Why do we build a BLACKHOLE in the heap rather than just updating
+ the thunk directly? It's so that we only need one kind of update
+ frame - otherwise we'd need a static version of the update frame too.
+
+ newCaf() does the following:
+
+ - it puts the CAF on the oldest generation's mut-once list.
+ This is so that we can treat the CAF as a root when collecting
+ younger generations.
+
+ For GHCI, we have additional requirements when dealing with CAFs:
+
+ - we must *retain* all dynamically-loaded CAFs ever entered,
+ just in case we need them again.
+ - we must be able to *revert* CAFs that have been evaluated, to
+ their pre-evaluated form.
+
+ To do this, we use an additional CAF list. When newCaf() is
+ called on a dynamically-loaded CAF, we add it to the CAF list
+ instead of the old-generation mutable list, and save away its
+ old info pointer (in caf->saved_info) for later reversion.
+
+ To revert all the CAFs, we traverse the CAF list and reset the
+ info pointer to caf->saved_info, then throw away the CAF list.
+ (see GC.c:revertCAFs()).
+
+ -- SDM 29/1/01
+
+ -------------------------------------------------------------------------- */
+
+/*
+ * Called (via the CAF entry code) the first time a CAF is entered.
+ * With keepCAFs set, the CAF is chained onto caf_list with its original
+ * info pointer saved for possible reversion; otherwise it is recorded
+ * on the oldest generation's mutable list so the GC treats it as a root
+ * for younger generations. Takes/releases sm_mutex internally.
+ */
+void
+newCAF(StgClosure* caf)
+{
+ ACQUIRE_SM_LOCK;
+
+ if(keepCAFs)
+ {
+ // HACK:
+ // If we are in GHCi _and_ we are using dynamic libraries,
+ // then we can't redirect newCAF calls to newDynCAF (see below),
+ // so we make newCAF behave almost like newDynCAF.
+ // The dynamic libraries might be used by both the interpreted
+ // program and GHCi itself, so they must not be reverted.
+ // This also means that in GHCi with dynamic libraries, CAFs are not
+ // garbage collected. If this turns out to be a problem, we could
+ // do another hack here and do an address range test on caf to figure
+ // out whether it is from a dynamic library.
+ ((StgIndStatic *)caf)->saved_info = (StgInfoTable *)caf->header.info;
+ ((StgIndStatic *)caf)->static_link = caf_list;
+ caf_list = caf;
+ }
+ else
+ {
+ /* Put this CAF on the mutable list for the old generation.
+ * This is a HACK - the IND_STATIC closure doesn't really have
+ * a mut_link field, but we pretend it has - in fact we re-use
+ * the STATIC_LINK field for the time being, because when we
+ * come to do a major GC we won't need the mut_link field
+ * any more and can use it as a STATIC_LINK.
+ */
+ ((StgIndStatic *)caf)->saved_info = NULL;
+ recordMutableGen(caf, oldest_gen);
+ }
+
+ RELEASE_SM_LOCK;
+
+#ifdef PAR
+ /* If we are PAR or DIST then we never forget a CAF */
+ { globalAddr *newGA;
+ //debugBelch("<##> Globalising CAF %08x %s",caf,info_type(caf));
+ newGA=makeGlobal(caf,rtsTrue); /*given full weight*/
+ ASSERT(newGA);
+ }
+#endif /* PAR */
+}
+
+// An alternate version of newCaf which is used for dynamically loaded
+// object code in GHCi. In this case we want to retain *all* CAFs in
+// the object code, because they might be demanded at any time from an
+// expression evaluated on the command line.
+// Also, GHCi might want to revert CAFs, so we add these to the
+// revertible_caf_list.
+//
+// The linker hackily arranges that references to newCaf from dynamic
+// code end up pointing to newDynCAF.
+/* Retain a dynamically-loaded CAF on the revertible list, saving its
+ * original info pointer so GHCi can later revert it to unevaluated
+ * form. See the comment block above for how the linker redirects
+ * newCAF calls here. Takes/releases sm_mutex internally. */
+void
+newDynCAF(StgClosure *caf)
+{
+ ACQUIRE_SM_LOCK;
+
+ ((StgIndStatic *)caf)->saved_info = (StgInfoTable *)caf->header.info;
+ ((StgIndStatic *)caf)->static_link = revertible_caf_list;
+ revertible_caf_list = caf;
+
+ RELEASE_SM_LOCK;
+}
+
+/* -----------------------------------------------------------------------------
+ Nursery management.
+ -------------------------------------------------------------------------- */
+
+/*
+ * Prepend 'blocks' freshly-allocated single blocks to the (doubly
+ * linked) nursery list 'tail' for step 'stp', and return the new head.
+ * Caller must hold sm_mutex (calls allocBlock()).
+ */
+static bdescr *
+allocNursery (step *stp, bdescr *tail, nat blocks)
+{
+ bdescr *bd;
+ nat i;
+
+ if (blocks == 0) {
+ // Nothing to add. Return early: falling through would
+ // dereference tail (possibly NULL) and wrongly clear the
+ // back-pointer of a list we haven't extended.
+ return tail;
+ }
+
+ // Allocate a nursery: we allocate fresh blocks one at a time and
+ // cons them on to the front of the list, not forgetting to update
+ // the back pointer on the tail of the list to point to the new block.
+ for (i=0; i < blocks; i++) {
+ // @LDV profiling
+ /*
+ processNursery() in LdvProfile.c assumes that every block group in
+ the nursery contains only a single block. So, if a block group is
+ given multiple blocks, change processNursery() accordingly.
+ */
+ bd = allocBlock();
+ bd->link = tail;
+ // double-link the nursery: we might need to insert blocks
+ if (tail != NULL) {
+ tail->u.back = bd;
+ }
+ bd->step = stp;
+ bd->gen_no = 0;
+ bd->flags = 0;
+ bd->free = bd->start;
+ tail = bd;
+ }
+ tail->u.back = NULL; // tail is now the head of the list
+ return tail;
+}
+
+/*
+ * Point each capability's nursery registers (rNursery,
+ * rCurrentNursery, rCurrentAlloc) at its nursery; in the
+ * non-threaded RTS there is only MainCapability and nurseries[0].
+ */
+static void
+assignNurseriesToCapabilities (void)
+{
+#ifdef THREADED_RTS
+ nat i;
+
+ for (i = 0; i < n_nurseries; i++) {
+ capabilities[i].r.rNursery = &nurseries[i];
+ capabilities[i].r.rCurrentNursery = nurseries[i].blocks;
+ capabilities[i].r.rCurrentAlloc = NULL;
+ }
+#else /* THREADED_RTS */
+ MainCapability.r.rNursery = &nurseries[0];
+ MainCapability.r.rCurrentNursery = nurseries[0].blocks;
+ MainCapability.r.rCurrentAlloc = NULL;
+#endif
+}
+
+/*
+ * Allocate every nursery at the -A size (minAllocAreaSize blocks) and
+ * hand them out to the capabilities. Called from initStorage().
+ */
+void
+allocNurseries( void )
+{
+ nat i;
+
+ for (i = 0; i < n_nurseries; i++) {
+ nurseries[i].blocks =
+ allocNursery(&nurseries[i], NULL,
+ RtsFlags.GcFlags.minAllocAreaSize);
+ nurseries[i].n_blocks = RtsFlags.GcFlags.minAllocAreaSize;
+ nurseries[i].old_blocks = NULL;
+ nurseries[i].n_old_blocks = 0;
+ /* hp, hpLim, hp_bd, to_space etc. aren't used in the nursery */
+ }
+ assignNurseriesToCapabilities();
+}
+
+/*
+ * Mark every nursery block empty again (free = start) after a GC and
+ * re-attach the nurseries to the capabilities. No blocks are freed or
+ * allocated here. With sanity checking on, scribbles 0xaa over the
+ * reclaimed space to catch stale references.
+ */
+void
+resetNurseries( void )
+{
+ nat i;
+ bdescr *bd;
+ step *stp;
+
+ for (i = 0; i < n_nurseries; i++) {
+ stp = &nurseries[i];
+ for (bd = stp->blocks; bd; bd = bd->link) {
+ bd->free = bd->start;
+ ASSERT(bd->gen_no == 0);
+ ASSERT(bd->step == stp);
+ IF_DEBUG(sanity,memset(bd->start, 0xaa, BLOCK_SIZE));
+ }
+ }
+ assignNurseriesToCapabilities();
+}
+
+/* Total number of blocks currently held by all nurseries. */
+lnat
+countNurseryBlocks (void)
+{
+ lnat total = 0;
+ nat idx;
+
+ for (idx = 0; idx < n_nurseries; idx++) {
+ total += nurseries[idx].n_blocks;
+ }
+ return total;
+}
+
+/*
+ * Grow or shrink one nursery to exactly 'blocks' blocks. Growing
+ * prepends fresh blocks; shrinking frees groups from the front of the
+ * list, which may overshoot when a large (multi-block) group is freed,
+ * in which case the difference is re-allocated. Caller must hold
+ * sm_mutex.
+ */
+static void
+resizeNursery ( step *stp, nat blocks )
+{
+ bdescr *bd;
+ nat nursery_blocks;
+
+ nursery_blocks = stp->n_blocks;
+ if (nursery_blocks == blocks) return;
+
+ if (nursery_blocks < blocks) {
+ IF_DEBUG(gc, debugBelch("Increasing size of nursery to %d blocks\n",
+ blocks));
+ stp->blocks = allocNursery(stp, stp->blocks, blocks-nursery_blocks);
+ }
+ else {
+ bdescr *next_bd;
+
+ IF_DEBUG(gc, debugBelch("Decreasing size of nursery to %d blocks\n",
+ blocks));
+
+ bd = stp->blocks;
+ while (nursery_blocks > blocks) {
+ next_bd = bd->link;
+ next_bd->u.back = NULL; // new head has no predecessor
+ nursery_blocks -= bd->blocks; // might be a large block
+ freeGroup(bd);
+ bd = next_bd;
+ }
+ stp->blocks = bd;
+ // might have gone just under, by freeing a large block, so make
+ // up the difference.
+ if (nursery_blocks < blocks) {
+ stp->blocks = allocNursery(stp, stp->blocks, blocks-nursery_blocks);
+ }
+ }
+
+ stp->n_blocks = blocks;
+ ASSERT(countBlocks(stp->blocks) == stp->n_blocks);
+}
+
+//
+// Resize each of the nurseries to the specified size.
+//
+//
+// Resize each of the nurseries to the specified size.
+//
+void
+resizeNurseriesFixed (nat blocks)
+{
+ nat i;
+ for (i = 0; i < n_nurseries; i++) {
+ resizeNursery(&nurseries[i], blocks);
+ }
+}
+
+//
+// Resize the nurseries to the total specified size.
+//
+//
+// Resize the nurseries to the total specified size.
+//
+void
+resizeNurseries (nat blocks)
+{
+ // If there are multiple nurseries, then we just divide the number
+ // of available blocks between them. (Integer division: any
+ // remainder blocks are simply not handed out.)
+ resizeNurseriesFixed(blocks / n_nurseries);
+}
+
+/* -----------------------------------------------------------------------------
+ The allocate() interface
+
+ allocate(n) always succeeds, and returns a chunk of memory n words
+ long. n can be larger than the size of a block if necessary, in
+ which case a contiguous block group will be allocated.
+ -------------------------------------------------------------------------- */
+
+/*
+ * Allocate n words from the storage manager's private allocation area
+ * (small_alloc_list), or as a large object in g0s0 when n crosses
+ * LARGE_OBJECT_THRESHOLD. Always succeeds. Takes/releases sm_mutex,
+ * so it must not be called with that lock already held; prefer
+ * allocateLocal() from STG-land.
+ */
+StgPtr
+allocate( nat n )
+{
+ bdescr *bd;
+ StgPtr p;
+
+ ACQUIRE_SM_LOCK;
+
+ TICK_ALLOC_HEAP_NOCTR(n);
+ CCS_ALLOC(CCCS,n);
+
+ /* big allocation (>=LARGE_OBJECT_THRESHOLD) */
+ /* ToDo: allocate directly into generation 1 */
+ if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
+ nat req_blocks = (lnat)BLOCK_ROUND_UP(n*sizeof(W_)) / BLOCK_SIZE;
+ bd = allocGroup(req_blocks);
+ dbl_link_onto(bd, &g0s0->large_objects);
+ g0s0->n_large_blocks += req_blocks;
+ bd->gen_no = 0;
+ bd->step = g0s0;
+ bd->flags = BF_LARGE;
+ bd->free = bd->start + n;
+ alloc_blocks += req_blocks;
+ RELEASE_SM_LOCK;
+ return bd->start;
+
+ /* small allocation (<LARGE_OBJECT_THRESHOLD) */
+ } else if (small_alloc_list == NULL || alloc_Hp + n > alloc_HpLim) {
+ // current block exhausted (or none yet): record how far we got
+ // in the old block, then start a fresh one.
+ if (small_alloc_list) {
+ small_alloc_list->free = alloc_Hp;
+ }
+ bd = allocBlock();
+ bd->link = small_alloc_list;
+ small_alloc_list = bd;
+ bd->gen_no = 0;
+ bd->step = g0s0;
+ bd->flags = 0;
+ alloc_Hp = bd->start;
+ alloc_HpLim = bd->start + BLOCK_SIZE_W;
+ alloc_blocks++;
+ }
+
+ p = alloc_Hp;
+ alloc_Hp += n;
+ RELEASE_SM_LOCK;
+ return p;
+}
+
+/*
+ * Words allocated via allocate() since the last GC: blocks handed out
+ * minus the unused tail of the current block, and minus the unused
+ * tail of the pinned-object block, which is also accounted in
+ * alloc_blocks. (Despite the name, the result is in words, matching
+ * its use in calcAllocated().)
+ */
+lnat
+allocated_bytes( void )
+{
+ lnat allocated;
+
+ allocated = alloc_blocks * BLOCK_SIZE_W - (alloc_HpLim - alloc_Hp);
+ if (pinned_object_block != NULL) {
+ allocated -= (pinned_object_block->start + BLOCK_SIZE_W) -
+ pinned_object_block->free;
+ }
+
+ return allocated;
+}
+
+/*
+ * Flush the cached alloc_Hp back into the current small-allocation
+ * block's free pointer, so the block descriptors are consistent
+ * (e.g. before a GC walks them).
+ */
+void
+tidyAllocateLists (void)
+{
+ if (small_alloc_list != NULL) {
+ ASSERT(alloc_Hp >= small_alloc_list->start &&
+ alloc_Hp <= small_alloc_list->start + BLOCK_SIZE);
+ small_alloc_list->free = alloc_Hp;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ allocateLocal()
+
+ This allocates memory in the current thread - it is intended for
+ use primarily from STG-land where we have a Capability. It is
+ better than allocate() because it doesn't require taking the
+ sm_mutex lock in the common case.
+
+ Memory is allocated directly from the nursery if possible (but not
+ from the current nursery block, so as not to interfere with
+ Hp/HpLim).
+ -------------------------------------------------------------------------- */
+
+/*
+ * Allocate n words from the given capability's nursery. Only takes
+ * sm_mutex when a fresh block (or a large object) is needed, so the
+ * common path is lock-free. Never returns space in the capability's
+ * current Hp/HpLim block (rCurrentNursery), so it can't interfere
+ * with in-progress STG allocation.
+ */
+StgPtr
+allocateLocal (Capability *cap, nat n)
+{
+ bdescr *bd;
+ StgPtr p;
+
+ TICK_ALLOC_HEAP_NOCTR(n);
+ CCS_ALLOC(CCCS,n);
+
+ /* big allocation (>=LARGE_OBJECT_THRESHOLD) */
+ /* ToDo: allocate directly into generation 1 */
+ if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
+ nat req_blocks = (lnat)BLOCK_ROUND_UP(n*sizeof(W_)) / BLOCK_SIZE;
+ ACQUIRE_SM_LOCK;
+ bd = allocGroup(req_blocks);
+ dbl_link_onto(bd, &g0s0->large_objects);
+ g0s0->n_large_blocks += req_blocks;
+ bd->gen_no = 0;
+ bd->step = g0s0;
+ bd->flags = BF_LARGE;
+ bd->free = bd->start + n;
+ alloc_blocks += req_blocks;
+ RELEASE_SM_LOCK;
+ return bd->start;
+
+ /* small allocation (<LARGE_OBJECT_THRESHOLD) */
+ } else {
+
+ bd = cap->r.rCurrentAlloc;
+ if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
+
+ // The CurrentAlloc block is full, we need to find another
+ // one. First, we try taking the next block from the
+ // nursery:
+ bd = cap->r.rCurrentNursery->link;
+
+ if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) {
+ // The nursery is empty, or the next block is already
+ // full: allocate a fresh block (we can't fail here).
+ ACQUIRE_SM_LOCK;
+ bd = allocBlock();
+ cap->r.rNursery->n_blocks++;
+ RELEASE_SM_LOCK;
+ bd->gen_no = 0;
+ bd->step = cap->r.rNursery;
+ bd->flags = 0;
+ } else {
+ // we have a block in the nursery: take it and put
+ // it at the *front* of the nursery list, and use it
+ // to allocate() from.
+ cap->r.rCurrentNursery->link = bd->link;
+ if (bd->link != NULL) {
+ bd->link->u.back = cap->r.rCurrentNursery;
+ }
+ }
+ dbl_link_onto(bd, &cap->r.rNursery->blocks);
+ cap->r.rCurrentAlloc = bd;
+ IF_DEBUG(sanity, checkNurserySanity(cap->r.rNursery));
+ }
+ }
+ p = bd->free;
+ bd->free += n;
+ return p;
+}
+
+/* ---------------------------------------------------------------------------
+ Allocate a fixed/pinned object.
+
+ We allocate small pinned objects into a single block, allocating a
+ new block when the current one overflows. The block is chained
+ onto the large_object_list of generation 0 step 0.
+
+ NOTE: The GC can't in general handle pinned objects. This
+ interface is only safe to use for ByteArrays, which have no
+ pointers and don't require scavenging. It works because the
+ block's descriptor has the BF_LARGE flag set, so the block is
+ treated as a large object and chained onto various lists, rather
+ than the individual objects being copied. However, when it comes
+ to scavenge the block, the GC will only scavenge the first object.
+ The reason is that the GC can't linearly scan a block of pinned
+ objects at the moment (doing so would require using the
+ mostly-copying techniques). But since we're restricting ourselves
+ to pinned ByteArrays, not scavenging is ok.
+
+ This function is called by newPinnedByteArray# which immediately
+ fills the allocated memory with a MutableByteArray#.
+ ------------------------------------------------------------------------- */
+
+/*
+ * Allocate n words of pinned (non-moving) memory. Small requests are
+ * bump-allocated from pinned_object_block; requests at or above
+ * LARGE_OBJECT_THRESHOLD go through allocate(), which pins large
+ * objects anyway. Takes/releases sm_mutex. Returns 8-byte-aligned
+ * memory (see the rounding below).
+ */
+StgPtr
+allocatePinned( nat n )
+{
+ StgPtr p;
+ bdescr *bd = pinned_object_block;
+
+ // If the request is for a large object, then allocate()
+ // will give us a pinned object anyway.
+ if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) {
+ return allocate(n);
+ }
+
+ ACQUIRE_SM_LOCK;
+
+ TICK_ALLOC_HEAP_NOCTR(n);
+ CCS_ALLOC(CCCS,n);
+
+ // we always return 8-byte aligned memory. bd->free must be
+ // 8-byte aligned to begin with, so we just round up n to
+ // the nearest multiple of 8 bytes.
+ if (sizeof(StgWord) == 4) {
+ n = (n+1) & ~1; // round word count up to even on 32-bit
+ }
+
+ // If we don't have a block of pinned objects yet, or the current
+ // one isn't large enough to hold the new object, allocate a new one.
+ if (bd == NULL || (bd->free + n) > (bd->start + BLOCK_SIZE_W)) {
+ pinned_object_block = bd = allocBlock();
+ dbl_link_onto(bd, &g0s0->large_objects);
+ bd->gen_no = 0;
+ bd->step = g0s0;
+ bd->flags = BF_PINNED | BF_LARGE;
+ bd->free = bd->start;
+ alloc_blocks++;
+ }
+
+ p = bd->free;
+ bd->free += n;
+ RELEASE_SM_LOCK;
+ return p;
+}
+
+/* -----------------------------------------------------------------------------
+ This is the write barrier for MUT_VARs, a.k.a. IORefs. A
+ MUT_VAR_CLEAN object is not on the mutable list; a MUT_VAR_DIRTY
+ is. When written to, a MUT_VAR_CLEAN turns into a MUT_VAR_DIRTY
+ and is put on the mutable list.
+ -------------------------------------------------------------------------- */
+
+/*
+ * Write barrier for MUT_VARs: on the first write to a clean MUT_VAR,
+ * flip its info pointer to MUT_VAR_DIRTY and, if it lives in an old
+ * generation, record it on the capability's mutable list so younger
+ * generations can be collected without scanning the whole heap.
+ */
+void
+dirty_MUT_VAR(StgRegTable *reg, StgClosure *p)
+{
+ Capability *cap = regTableToCapability(reg);
+ bdescr *bd;
+ if (p->header.info == &stg_MUT_VAR_CLEAN_info) {
+ p->header.info = &stg_MUT_VAR_DIRTY_info;
+ bd = Bdescr((StgPtr)p);
+ if (bd->gen_no > 0) recordMutableCap(p,cap,bd->gen_no);
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ Allocation functions for GMP.
+
+ These all use the allocate() interface - we can't have any garbage
+ collection going on during a gmp operation, so we use allocate()
+ which always succeeds. The gmp operations which might need to
+ allocate will ask the storage manager (via doYouWantToGC()) whether
+ a garbage collection is required, in case we get into a loop doing
+ only allocate() style allocation.
+ -------------------------------------------------------------------------- */
+
+/*
+ * GMP allocation hook: wrap the request in an ARR_WORDS heap object
+ * and return a pointer to its payload, so GMP limb data is ordinary
+ * GC-managed heap.
+ */
+static void *
+stgAllocForGMP (size_t size_in_bytes)
+{
+ StgArrWords* arr;
+ nat data_size_in_words, total_size_in_words;
+
+ /* round up to a whole number of words */
+ /* NOTE(review): (size + sizeof(W_) - 1) / sizeof(W_) would be the
+ * exact ceiling; the "+ 1" here over-allocates by up to one extra
+ * word. It never under-allocates, so it is safe, but confirm
+ * whether the slop is intentional. */
+ data_size_in_words = (size_in_bytes + sizeof(W_) + 1) / sizeof(W_);
+ total_size_in_words = sizeofW(StgArrWords) + data_size_in_words;
+
+ /* allocate and fill it in. */
+#if defined(THREADED_RTS)
+ arr = (StgArrWords *)allocateLocal(myTask()->cap, total_size_in_words);
+#else
+ arr = (StgArrWords *)allocateLocal(&MainCapability, total_size_in_words);
+#endif
+ SET_ARR_HDR(arr, &stg_ARR_WORDS_info, CCCS, data_size_in_words);
+
+ /* and return a ptr to the goods inside the array */
+ return arr->payload;
+}
+
+/*
+ * GMP reallocation hook: allocate a fresh heap array and copy the data
+ * across. GMP's reallocate function may be called to shrink as well as
+ * grow a block, so copy only min(old_size, new_size) bytes - copying
+ * old_size bytes unconditionally would overrun a smaller new buffer.
+ * The old block is simply abandoned to the garbage collector.
+ */
+static void *
+stgReallocForGMP (void *ptr, size_t old_size, size_t new_size)
+{
+ void *new_stuff_ptr = stgAllocForGMP(new_size);
+ size_t n_copy = old_size < new_size ? old_size : new_size;
+
+ memcpy(new_stuff_ptr, ptr, n_copy);
+
+ return new_stuff_ptr;
+}
+
+/* GMP deallocation hook: intentionally a no-op, because GMP blocks are
+ * ordinary heap objects (see stgAllocForGMP) and the GC reclaims them. */
+static void
+stgDeallocForGMP (void *ptr STG_UNUSED,
+ size_t size STG_UNUSED)
+{
+ /* easy for us: the garbage collector does the dealloc'n */
+}
+
+/* -----------------------------------------------------------------------------
+ * Stats and stuff
+ * -------------------------------------------------------------------------- */
+
+/* -----------------------------------------------------------------------------
+ * calcAllocated()
+ *
+ * Approximate how much we've allocated: number of blocks in the
+ * nursery + blocks allocated via allocate() - unused nursery blocks.
+ * This leaves a little slop at the end of each block, and doesn't
+ * take into account large objects (ToDo).
+ * -------------------------------------------------------------------------- */
+
+/*
+ * Approximate words allocated since the last GC: allocate() usage plus
+ * the whole nursery, minus nursery blocks not yet reached and the
+ * unused tail of each capability's current block. Also accumulates
+ * into total_allocated. Result is in words, despite feeding
+ * allocated_bytes() in.
+ */
+lnat
+calcAllocated( void )
+{
+ nat allocated;
+ bdescr *bd;
+
+ allocated = allocated_bytes();
+ allocated += countNurseryBlocks() * BLOCK_SIZE_W;
+
+ {
+#ifdef THREADED_RTS
+ nat i;
+ for (i = 0; i < n_nurseries; i++) {
+ Capability *cap;
+ // blocks after the current one were never allocated into
+ for ( bd = capabilities[i].r.rCurrentNursery->link;
+ bd != NULL; bd = bd->link ) {
+ allocated -= BLOCK_SIZE_W;
+ }
+ cap = &capabilities[i];
+ // subtract the unused tail of the current block
+ if (cap->r.rCurrentNursery->free <
+ cap->r.rCurrentNursery->start + BLOCK_SIZE_W) {
+ allocated -= (cap->r.rCurrentNursery->start + BLOCK_SIZE_W)
+ - cap->r.rCurrentNursery->free;
+ }
+ }
+#else
+ bdescr *current_nursery = MainCapability.r.rCurrentNursery;
+
+ for ( bd = current_nursery->link; bd != NULL; bd = bd->link ) {
+ allocated -= BLOCK_SIZE_W;
+ }
+ if (current_nursery->free < current_nursery->start + BLOCK_SIZE_W) {
+ allocated -= (current_nursery->start + BLOCK_SIZE_W)
+ - current_nursery->free;
+ }
+#endif
+ }
+
+ total_allocated += allocated;
+ return allocated;
+}
+
+/* Approximate the amount of live data in the heap. To be called just
+ * after garbage collection (see GarbageCollect()).
+ */
+/* Approximate the amount of live data in the heap (in words). To be
+ * called just after garbage collection (see GarbageCollect()), when
+ * everything outside g0s0 is live data. Ignores per-block slop.
+ */
+extern lnat
+calcLive(void)
+{
+ nat g, s;
+ lnat live = 0;
+ step *stp;
+
+ if (RtsFlags.GcFlags.generations == 1) {
+ // two-space collector: count the to-space directly
+ live = (g0s0->n_blocks - 1) * BLOCK_SIZE_W +
+ ((lnat)g0s0->hp_bd->free - (lnat)g0s0->hp_bd->start) / sizeof(W_);
+ return live;
+ }
+
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ /* approximate amount of live data (doesn't take into account slop
+ * at end of each block).
+ */
+ if (g == 0 && s == 0) {
+ continue; // g0s0 is the (dead) allocation area
+ }
+ stp = &generations[g].steps[s];
+ live += (stp->n_large_blocks + stp->n_blocks - 1) * BLOCK_SIZE_W;
+ if (stp->hp_bd != NULL) {
+ live += ((lnat)stp->hp_bd->free - (lnat)stp->hp_bd->start)
+ / sizeof(W_);
+ }
+ if (stp->scavd_hp != NULL) {
+ live -= (P_)(BLOCK_ROUND_UP(stp->scavd_hp)) - stp->scavd_hp;
+ }
+ }
+ }
+ return live;
+}
+
+/* Approximate the number of blocks that will be needed at the next
+ * garbage collection.
+ *
+ * Assume: all data currently live will remain live. Steps that will
+ * be collected next time will therefore need twice as many blocks
+ * since all the data will be copied.
+ */
+/* Approximate the number of blocks that will be needed at the next
+ * garbage collection.
+ *
+ * Assume: all data currently live will remain live. Steps that will
+ * be collected next time (i.e. whose generation is over its block
+ * budget and which are not compacted) will therefore need twice as
+ * many blocks, since all the data will be copied.
+ */
+extern lnat
+calcNeeded(void)
+{
+ lnat needed = 0;
+ nat g, s;
+ step *stp;
+
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ if (g == 0 && s == 0) { continue; }
+ stp = &generations[g].steps[s];
+ if (generations[g].steps[0].n_blocks +
+ generations[g].steps[0].n_large_blocks
+ > generations[g].max_blocks
+ && stp->is_compacted == 0) {
+ needed += 2 * stp->n_blocks;
+ } else {
+ needed += stp->n_blocks;
+ }
+ }
+ }
+ return needed;
+}
+
+/* -----------------------------------------------------------------------------
+ Debugging
+
+ memInventory() checks for memory leaks by counting up all the
+ blocks we know about and comparing that to the number of blocks
+ allegedly floating around in the system.
+ -------------------------------------------------------------------------- */
+
+#ifdef DEBUG
+
+/* Count all blocks owned by one step: ordinary blocks, from-space
+ * blocks left over during GC, and large objects. Used only by
+ * memInventory() below (DEBUG builds).
+ */
+static lnat
+stepBlocks (step *stp)
+{
+ lnat total_blocks;
+ bdescr *bd;
+
+ total_blocks = stp->n_blocks;
+ total_blocks += stp->n_old_blocks;
+ for (bd = stp->large_objects; bd; bd = bd->link) {
+ total_blocks += bd->blocks;
+ /* hack for megablock groups: they have an extra block or two in
+ the second and subsequent megablocks where the block
+ descriptors would normally go.
+ */
+ if (bd->blocks > BLOCKS_PER_MBLOCK) {
+ total_blocks -= (MBLOCK_SIZE / BLOCK_SIZE - BLOCKS_PER_MBLOCK)
+ * (bd->blocks/(MBLOCK_SIZE/BLOCK_SIZE));
+ }
+ }
+ return total_blocks;
+}
+
+/*
+ * Leak check (DEBUG builds): total up every block the RTS knows about
+ * (generations, mutable lists, nurseries, allocate() blocks, retainer
+ * profiler, arenas, free list) and assert the sum equals everything
+ * the megablock allocator has handed out.
+ */
+void
+memInventory(void)
+{
+ nat g, s, i;
+ step *stp;
+ bdescr *bd;
+ lnat total_blocks = 0, free_blocks = 0;
+
+ /* count the blocks we current have */
+
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (i = 0; i < n_capabilities; i++) {
+ for (bd = capabilities[i].mut_lists[g]; bd != NULL; bd = bd->link) {
+ total_blocks += bd->blocks;
+ }
+ }
+ for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) {
+ total_blocks += bd->blocks;
+ }
+ for (s = 0; s < generations[g].n_steps; s++) {
+ if (g==0 && s==0) continue;
+ stp = &generations[g].steps[s];
+ total_blocks += stepBlocks(stp);
+ }
+ }
+
+ for (i = 0; i < n_nurseries; i++) {
+ total_blocks += stepBlocks(&nurseries[i]);
+ }
+#ifdef THREADED_RTS
+ // We put pinned object blocks in g0s0, so better count blocks there too.
+ total_blocks += stepBlocks(g0s0);
+#endif
+
+ /* any blocks held by allocate() */
+ for (bd = small_alloc_list; bd; bd = bd->link) {
+ total_blocks += bd->blocks;
+ }
+
+#ifdef PROFILING
+ if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER) {
+ total_blocks += retainerStackBlocks();
+ }
+#endif
+
+ // count the blocks allocated by the arena allocator
+ total_blocks += arenaBlocks();
+
+ /* count the blocks on the free list */
+ free_blocks = countFreeList();
+
+ /* NOTE(review): %ld assumes lnat is long-sized; verify this holds on
+ * all supported platforms (e.g. 64-bit Windows). */
+ if (total_blocks + free_blocks != mblocks_allocated *
+ BLOCKS_PER_MBLOCK) {
+ debugBelch("Blocks: %ld live + %ld free = %ld total (%ld around)\n",
+ total_blocks, free_blocks, total_blocks + free_blocks,
+ mblocks_allocated * BLOCKS_PER_MBLOCK);
+ }
+
+ ASSERT(total_blocks + free_blocks == mblocks_allocated * BLOCKS_PER_MBLOCK);
+}
+
+
+/* Walk a block-descriptor chain and return the total block count. */
+nat
+countBlocks(bdescr *bd)
+{
+ nat total = 0;
+
+ while (bd != NULL) {
+ total += bd->blocks;
+ bd = bd->link;
+ }
+ return total;
+}
+
+/* Full heap sanity check. */
+/* Full heap sanity check (DEBUG builds): verify block counts match the
+ * cached n_blocks/n_large_blocks, and walk every heap block, large
+ * object list, mutable list, nursery, and the free list. g0s0 is
+ * skipped in the generational case because the nurseries are checked
+ * separately. */
+void
+checkSanity( void )
+{
+ nat g, s;
+
+ if (RtsFlags.GcFlags.generations == 1) {
+ checkHeap(g0s0->blocks);
+ checkChain(g0s0->large_objects);
+ } else {
+
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ for (s = 0; s < generations[g].n_steps; s++) {
+ if (g == 0 && s == 0) { continue; }
+ ASSERT(countBlocks(generations[g].steps[s].blocks)
+ == generations[g].steps[s].n_blocks);
+ ASSERT(countBlocks(generations[g].steps[s].large_objects)
+ == generations[g].steps[s].n_large_blocks);
+ checkHeap(generations[g].steps[s].blocks);
+ checkChain(generations[g].steps[s].large_objects);
+ if (g > 0) {
+ checkMutableList(generations[g].mut_list, g);
+ }
+ }
+ }
+
+ for (s = 0; s < n_nurseries; s++) {
+ ASSERT(countBlocks(nurseries[s].blocks)
+ == nurseries[s].n_blocks);
+ ASSERT(countBlocks(nurseries[s].large_objects)
+ == nurseries[s].n_large_blocks);
+ }
+
+ checkFreeListSanity();
+ }
+}
+
+/* Nursery sanity check */
+/* Nursery sanity check (DEBUG builds): verify the doubly-linked block
+ * list is well-formed (each back pointer matches the previous block)
+ * and the block count matches n_blocks. */
+void
+checkNurserySanity( step *stp )
+{
+ bdescr *bd, *prev;
+ nat blocks = 0;
+
+ prev = NULL;
+ for (bd = stp->blocks; bd != NULL; bd = bd->link) {
+ ASSERT(bd->u.back == prev);
+ prev = bd;
+ blocks += bd->blocks;
+ }
+ ASSERT(blocks == stp->n_blocks);
+}
+
+// handy function for use in gdb, because Bdescr() is inlined.
+// (Declared extern here so the symbol survives into the binary.)
+extern bdescr *_bdescr( StgPtr p );
+
+bdescr *
+_bdescr( StgPtr p )
+{
+ return Bdescr(p);
+}
+
+#endif
diff --git a/rts/Task.c b/rts/Task.c
new file mode 100644
index 0000000000..7366480094
--- /dev/null
+++ b/rts/Task.c
@@ -0,0 +1,315 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2001-2005
+ *
+ * The task manager subsystem. Tasks execute STG code, with this
+ * module providing the API which the Scheduler uses to control their
+ * creation and destruction.
+ *
+ * -------------------------------------------------------------------------*/
+
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "OSThreads.h"
+#include "Task.h"
+#include "Capability.h"
+#include "Stats.h"
+#include "RtsFlags.h"
+#include "Schedule.h"
+#include "Hash.h"
+
+#if HAVE_SIGNAL_H
+#include <signal.h>
+#endif
+
+// Task lists and global counters.
+// Locks required: sched_mutex.
+Task *all_tasks = NULL; // every Task ever created, linked via all_link
+static Task *task_free_list = NULL; // singly-linked
+static nat taskCount; // number of Task structures allocated so far
+#define DEFAULT_MAX_WORKERS 64
+static nat maxWorkers; // we won't create more workers than this
+static nat tasksRunning; // Tasks currently executing (not stopped or on the free list)
+static nat workerCount; // workers created; checked against maxWorkers in startWorkerTask
+
+/* -----------------------------------------------------------------------------
+ * Remembering the current thread's Task
+ * -------------------------------------------------------------------------- */
+
+// A thread-local-storage key that we can use to get access to the
+// current thread's Task structure.
+#if defined(THREADED_RTS)
+ThreadLocalKey currentTaskKey;
+#else
+Task *my_task;
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Rest of the Task API
+ * -------------------------------------------------------------------------- */
+
+// Initialise the task manager's counters and, in the threaded RTS, the
+// thread-local key used by myTask()/setMyTask().  Idempotent: only the
+// first call has any effect.
+void
+initTaskManager (void)
+{
+ static int initialized = 0;
+
+ if (!initialized) {
+ taskCount = 0;
+ workerCount = 0;
+ tasksRunning = 0;
+ maxWorkers = DEFAULT_MAX_WORKERS;
+ initialized = 1;
+#if defined(THREADED_RTS)
+ newThreadLocalKey(&currentTaskKey);
+#endif
+ }
+}
+
+
+// Shut down the task manager.  Currently this only emits a debug
+// trace; outstanding Tasks are not waited for or freed here.
+void
+stopTaskManager (void)
+{
+ IF_DEBUG(scheduler, sched_belch("stopping task manager, %d tasks still running", tasksRunning));
+}
+
+
+// Allocate and initialise a fresh Task structure, link it onto the
+// all_tasks list, and bump taskCount.  Used for both bound tasks
+// (via newBoundTask) and workers (via startWorkerTask).
+// Requires: sched_mutex (protects all_tasks and the counters).
+static Task*
+newTask (void)
+{
+#if defined(THREADED_RTS)
+ Ticks currentElapsedTime, currentUserTime;
+#endif
+ Task *task;
+
+ task = stgMallocBytes(sizeof(Task), "newTask");
+
+ task->cap = NULL;
+ task->stopped = rtsFalse;
+ task->suspended_tso = NULL;
+ task->tso = NULL;
+ task->stat = NoStatus;
+ task->ret = NULL;
+
+#if defined(THREADED_RTS)
+ initCondition(&task->cond);
+ initMutex(&task->lock);
+ task->wakeup = rtsFalse;
+#endif
+
+#if defined(THREADED_RTS)
+ // start the mutator-time stopwatch for this Task
+ currentUserTime = getThreadCPUTime();
+ currentElapsedTime = getProcessElapsedTime();
+ task->mut_time = 0.0;
+ task->mut_etime = 0.0;
+ task->gc_time = 0.0;
+ task->gc_etime = 0.0;
+ task->muttimestart = currentUserTime;
+ task->elapsedtimestart = currentElapsedTime;
+#endif
+
+ task->prev = NULL;
+ task->next = NULL;
+ task->return_link = NULL;
+
+ task->all_link = all_tasks;
+ all_tasks = task;
+
+ taskCount++;
+ // Fix: do NOT bump workerCount here.  newTask() is also called for
+ // bound tasks, and startWorkerTask() already increments workerCount
+ // once per worker, so incrementing it here counted bound tasks as
+ // workers and counted each real worker twice against maxWorkers.
+
+ return task;
+}
+
+// Create (or recycle from task_free_list) a Task for the current OS
+// thread, which is entering the RTS as a bound thread, and make it the
+// thread's current Task via taskEnter().
+// Requires: sched_mutex.
+Task *
+newBoundTask (void)
+{
+ Task *task;
+
+ ASSERT_LOCK_HELD(&sched_mutex);
+ if (task_free_list == NULL) {
+ task = newTask();
+ } else {
+ // reuse a stopped Task; its accumulated stats carry over
+ task = task_free_list;
+ task_free_list = task->next;
+ task->next = NULL;
+ task->prev = NULL;
+ task->stopped = rtsFalse;
+ }
+#if defined(THREADED_RTS)
+ task->id = osThreadId(); // the Task may previously have belonged to another thread
+#endif
+ ASSERT(task->cap == NULL);
+
+ tasksRunning++;
+
+ taskEnter(task);
+
+ IF_DEBUG(scheduler,sched_belch("new task (taskCount: %d)", taskCount););
+ return task;
+}
+
+// The current bound task is leaving the RTS: mark it stopped, restore
+// the previous Task for this OS thread (the prev_stack saved by
+// taskEnter), and return the Task to the free list for reuse.
+void
+boundTaskExiting (Task *task)
+{
+ task->stopped = rtsTrue;
+ task->cap = NULL;
+
+#if defined(THREADED_RTS)
+ ASSERT(osThreadId() == task->id);
+#endif
+ ASSERT(myTask() == task);
+ setMyTask(task->prev_stack); // pop back to the enclosing Task, if any
+
+ tasksRunning--;
+
+ // sadly, we need a lock around the free task list. Todo: eliminate.
+ ACQUIRE_LOCK(&sched_mutex);
+ task->next = task_free_list;
+ task_free_list = task;
+ RELEASE_LOCK(&sched_mutex);
+
+ IF_DEBUG(scheduler,sched_belch("task exiting"));
+}
+
+// Printable identifier for a Task in debug messages: the OS thread id
+// in the threaded RTS, the Task pointer itself otherwise.
+#ifdef THREADED_RTS
+#define TASK_ID(t) (t)->id
+#else
+#define TASK_ID(t) (t)
+#endif
+
+// Mark a Task stopped and put it on the free list without its
+// cooperation; used by forkProcess().  No-op if already stopped.
+// Requires: sched_mutex (which also covers task_free_list here).
+void
+discardTask (Task *task)
+{
+ ASSERT_LOCK_HELD(&sched_mutex);
+ if (!task->stopped) {
+ IF_DEBUG(scheduler,sched_belch("discarding task %p", TASK_ID(task)));
+ task->cap = NULL;
+ task->tso = NULL;
+ task->stopped = rtsTrue;
+ tasksRunning--;
+ task->next = task_free_list;
+ task_free_list = task;
+ }
+}
+
+// Record that a Task has stopped running Haskell code: in the threaded
+// RTS, fold the mutator CPU/elapsed time since the Task started into
+// its stats, then mark it stopped and decrement tasksRunning.
+void
+taskStop (Task *task)
+{
+#if defined(THREADED_RTS)
+ OSThreadId id;
+ Ticks currentElapsedTime, currentUserTime, elapsedGCTime;
+
+ id = osThreadId();
+ ASSERT(task->id == id);
+ ASSERT(myTask() == task);
+
+ currentUserTime = getThreadCPUTime();
+ currentElapsedTime = getProcessElapsedTime();
+
+ // XXX this is wrong; we want elapsed GC time since the
+ // Task started.
+ elapsedGCTime = stat_getElapsedGCTime();
+
+ task->mut_time = 
+ currentUserTime - task->muttimestart - task->gc_time;
+ task->mut_etime = 
+ currentElapsedTime - task->elapsedtimestart - elapsedGCTime;
+
+ // clamp: the subtractions above can go negative (see XXX above)
+ if (task->mut_time < 0.0) { task->mut_time = 0.0; }
+ if (task->mut_etime < 0.0) { task->mut_etime = 0.0; }
+#endif
+
+ task->stopped = rtsTrue;
+ tasksRunning--;
+}
+
+// Called in the child after fork(): OS threads do not survive fork, so
+// the Task bookkeeping must be reset.  Known-incomplete (see #warning):
+// only taskCount is reset; the task lists still reference dead threads.
+void
+resetTaskManagerAfterFork (void)
+{
+#warning TODO!
+ taskCount = 0;
+}
+
+#if defined(THREADED_RTS)
+
+// Create a new worker OS thread attached to a fresh Task.  The
+// supplied Capability is handed directly to the worker: cap->lock must
+// be held and cap must have no running_task, which the new Task
+// becomes.  task->lock is held across thread creation so the worker
+// cannot observe a half-initialised Task.  taskStart is the worker's
+// entry point.
+// Requires: sched_mutex.
+void
+startWorkerTask (Capability *cap, 
+ void OSThreadProcAttr (*taskStart)(Task *task))
+{
+ int r;
+ OSThreadId tid;
+ Task *task;
+
+ if (workerCount >= maxWorkers) {
+ barf("too many workers; runaway worker creation?");
+ }
+ workerCount++;
+
+ // A worker always gets a fresh Task structure.
+ task = newTask();
+
+ tasksRunning++;
+
+ // The lock here is to synchronise with taskStart(), to make sure
+ // that we have finished setting up the Task structure before the
+ // worker thread reads it.
+ ACQUIRE_LOCK(&task->lock);
+
+ task->cap = cap;
+
+ // Give the capability directly to the worker; we can't let anyone
+ // else get in, because the new worker Task has nowhere to go to
+ // sleep so that it could be woken up again.
+ ASSERT_LOCK_HELD(&cap->lock);
+ cap->running_task = task;
+
+ r = createOSThread(&tid, (OSThreadProc *)taskStart, task);
+ if (r != 0) {
+ // Fix: the message previously said "startTask", which is not this
+ // function's name and made the failure hard to trace.
+ barf("startWorkerTask: Can't create new task");
+ }
+
+ IF_DEBUG(scheduler,sched_belch("new worker task (taskCount: %d)", taskCount););
+
+ task->id = tid;
+
+ // ok, finished with the Task struct.
+ RELEASE_LOCK(&task->lock);
+}
+
+#endif /* THREADED_RTS */
+
+#ifdef DEBUG
+
+// Debug-only: printable identity of a Task -- the OS thread id in the
+// threaded RTS, the Task's own address otherwise.
+static void *taskId(Task *task)
+{
+#ifdef THREADED_RTS
+ return (void *)task->id;
+#else
+ return (void *)task;
+#endif
+}
+
+void printAllTasks(void);
+
+// Debug dump of every Task on all_tasks: whether it is alive or
+// stopped, which Capability it is on, and whether it is bound to a
+// Haskell thread or is a worker.
+void
+printAllTasks(void)
+{
+ Task *task;
+ for (task = all_tasks; task != NULL; task = task->all_link) {
+ debugBelch("task %p is %s, ", taskId(task), task->stopped ? "stopped" : "alive");
+ if (!task->stopped) {
+ if (task->cap) {
+ debugBelch("on capability %d, ", task->cap->no);
+ }
+ if (task->tso) {
+ debugBelch("bound to thread %d", task->tso->id);
+ } else {
+ debugBelch("worker");
+ }
+ }
+ debugBelch("\n");
+ }
+}
+
+#endif
+
diff --git a/rts/Task.h b/rts/Task.h
new file mode 100644
index 0000000000..ca71d2809a
--- /dev/null
+++ b/rts/Task.h
@@ -0,0 +1,271 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2001-2005
+ *
+ * Tasks
+ *
+ * -------------------------------------------------------------------------*/
+
+#ifndef TASK_H
+#define TASK_H
+
+#include "GetTime.h"
+
+/*
+ Definition of a Task
+ --------------------
+
+ A task is an OSThread that runs Haskell code. Every OSThread
+ created by the RTS for the purposes of running Haskell code is a
+ Task, and OS threads that enter the Haskell RTS for the purposes of
+ making a call-in are also Tasks.
+
+ The relationship between the number of tasks and capabilities, and
+ the runtime build (-threaded, -smp etc.) is summarised by the
+ following table:
+
+ build Tasks Capabilities
+ ---------------------------------
+ normal 1 1
+ -threaded N N
+
+ The non-threaded build has a single Task and a single global
+ Capability.
+
+ The THREADED_RTS build allows multiple tasks and multiple Capabilities.
+ Multiple Tasks may all be running Haskell code simultaneously. A task
+ relinquishes its Capability when it is asked to evaluate an external
+ (C) call.
+
+ In general, there may be multiple Tasks for an OS thread. This
+ happens if one Task makes a foreign call from Haskell, and
+ subsequently calls back in to create a new bound thread.
+
+ A particular Task structure can belong to more than one OS thread
+ over its lifetime. This is to avoid creating an unbounded number
+ of Task structures. The stats just accumulate.
+
+ Ownership of Task
+ -----------------
+
+ The OS thread named in the Task structure has exclusive access to
+ the structure, as long as it is the running_task of its Capability.
+ That is, if (task->cap->running_task == task), then task->id owns
+ the Task. Otherwise the Task is owned by the owner of the parent
+ data structure on which it is sleeping; for example, if the task is
+ sleeping on spare_workers field of a Capability, then the owner of the
+ Capability has access to the Task.
+
+ When a task is migrated from sleeping on one Capability to another,
+ its task->cap field must be modified. When the task wakes up, it
+ will read the new value of task->cap to find out which Capability
+ it belongs to. Hence some synchronisation is required on
+ task->cap, and this is why we have task->lock.
+
+ If the Task is not currently owned by task->id, then the thread is
+ either
+
+ (a) waiting on the condition task->cond. The Task is either
+ (1) a bound Task, the TSO will be on a queue somewhere
+ (2) a worker task, on the spare_workers queue of task->cap.
+
+ (b) making a foreign call. The Task will be on the
+ suspended_ccalling_tasks list.
+
+ We re-establish ownership in each case by respectively
+
+ (a) the task is currently blocked in yieldCapability().
+ This call will return when we have ownership of the Task and
+ a Capability. The Capability we get might not be the same
+ as the one we had when we called yieldCapability().
+
+ (b) we must call resumeThread(task), which will safely establish
+ ownership of the Task and a Capability.
+*/
+
+typedef struct Task_ {
+#if defined(THREADED_RTS)
+ OSThreadId id; // The OS Thread ID of this task
+#endif
+
+ // This points to the Capability that the Task "belongs" to. If
+ // the Task owns a Capability, then task->cap points to it. If
+ // the task does not own a Capability, then either (a) if the task
+ // is a worker, then task->cap points to the Capability it belongs
+ // to, or (b) it is returning from a foreign call, then task->cap
+ // points to the Capability with the returning_worker queue that
+ // this Task is on.
+ //
+ // When a task goes to sleep, it may be migrated to a different
+ // Capability. Hence, we always check task->cap on wakeup. To
+ // synchronise between the migrater and the migratee, task->lock
+ // must be held when modifying task->cap.
+ struct Capability_ *cap;
+
+ rtsBool stopped; // this task has stopped or exited Haskell
+ StgTSO * suspended_tso; // the TSO is stashed here when we
+ // make a foreign call (NULL otherwise);
+
+ // The following 3 fields are used by bound threads:
+ StgTSO * tso; // the bound TSO (or NULL)
+ SchedulerStatus stat; // return status
+ StgClosure ** ret; // return value
+
+#if defined(THREADED_RTS)
+ Condition cond; // used for sleeping & waking up this task
+ Mutex lock; // lock for the condition variable
+
+ // this flag tells the task whether it should wait on task->cond
+ // or just continue immediately. It's a workaround for the fact
+ // that signalling a condition variable doesn't do anything if the
+ // thread is already running, but we want it to be sticky.
+ rtsBool wakeup;
+#endif
+
+ // Stats that we collect about this task
+ // ToDo: we probably want to put this in a separate TaskStats
+ // structure, so we can share it between multiple Tasks. We don't
+ // really want separate stats for each call in a nested chain of
+ // foreign->haskell->foreign->haskell calls, but we'll get a
+ // separate Task for each of the haskell calls.
+ Ticks elapsedtimestart; // process elapsed time when the Task was created
+ Ticks muttimestart; // thread CPU time when the Task was created
+ Ticks mut_time; // mutator CPU time (filled in by taskStop)
+ Ticks mut_etime; // mutator elapsed time (filled in by taskStop)
+ Ticks gc_time;
+ Ticks gc_etime;
+
+ // Links tasks onto various lists. (ToDo: do we need double
+ // linking now?)
+ struct Task_ *prev;
+ struct Task_ *next;
+
+ // Links tasks on the returning_tasks queue of a Capability.
+ struct Task_ *return_link;
+
+ // Links tasks on the all_tasks list
+ struct Task_ *all_link;
+
+ // When a Haskell thread makes a foreign call that re-enters
+ // Haskell, we end up with another Task associated with the
+ // current thread. We have to remember the whole stack of Tasks
+ // associated with the current thread so that we can correctly
+ // save & restore the thread-local current task pointer.
+ struct Task_ *prev_stack;
+} Task;
+
+// A Task is "bound" iff it has a Haskell thread (TSO) attached;
+// otherwise it is a worker.
+INLINE_HEADER rtsBool
+isBoundTask (Task *task) 
+{
+ return (task->tso != NULL);
+}
+
+
+// Linked list of all tasks.
+//
+extern Task *all_tasks;
+
+// Start and stop the task manager.
+// Requires: sched_mutex.
+//
+void initTaskManager (void);
+void stopTaskManager (void);
+
+// Create a new Task for a bound thread
+// Requires: sched_mutex.
+//
+Task *newBoundTask (void);
+
+// The current task is a bound task that is exiting.
+// Requires: sched_mutex.
+//
+void boundTaskExiting (Task *task);
+
+// This must be called when a new Task is associated with the current
+// thread. It sets up the thread-local current task pointer so that
+// myTask() can work.
+INLINE_HEADER void taskEnter (Task *task);
+
+// Notify the task manager that a task has stopped. This is used
+// mainly for stats-gathering purposes.
+// Requires: sched_mutex.
+//
+void taskStop (Task *task);
+
+// Put the task back on the free list, mark it stopped. Used by
+// forkProcess().
+//
+void discardTask (Task *task);
+
+// Get the Task associated with the current OS thread (or NULL if none).
+//
+INLINE_HEADER Task *myTask (void);
+
+// After a fork, the tasks are not carried into the child process, so
+// we must tell the task manager.
+// Requires: sched_mutex.
+//
+void resetTaskManagerAfterFork (void);
+
+#if defined(THREADED_RTS)
+
+// Workers are attached to the supplied Capability. This Capability
+// should not currently have a running_task, because the new task
+// will become the running_task for that Capability.
+// Requires: sched_mutex.
+//
+void startWorkerTask (struct Capability_ *cap,
+ void OSThreadProcAttr (*taskStart)(Task *task));
+
+#endif /* THREADED_RTS */
+
+// -----------------------------------------------------------------------------
+// INLINE functions... private from here on down:
+
+// A thread-local-storage key that we can use to get access to the
+// current thread's Task structure.
+#if defined(THREADED_RTS)
+extern ThreadLocalKey currentTaskKey;
+#else
+extern Task *my_task;
+#endif
+
+//
+// myTask() uses thread-local storage to find the Task associated with
+// the current OS thread. If the current OS thread has multiple
+// Tasks, because it has re-entered the RTS, then the task->prev_stack
+// field is used to store the previous Task.
+//
+// NOTE(review): presumably returns NULL before the first taskEnter()
+// on this thread -- verify getThreadLocalVar's initial value.
+//
+INLINE_HEADER Task *
+myTask (void)
+{
+#if defined(THREADED_RTS)
+ return getThreadLocalVar(&currentTaskKey);
+#else
+ return my_task;
+#endif
+}
+
+// Set the current OS thread's Task pointer (thread-local storage in
+// the threaded RTS, a plain global otherwise).
+INLINE_HEADER void
+setMyTask (Task *task)
+{
+#if defined(THREADED_RTS)
+ setThreadLocalVar(&currentTaskKey,task);
+#else
+ my_task = task;
+#endif
+}
+
+// This must be called when a new Task is associated with the current
+// thread. It sets up the thread-local current task pointer so that
+// myTask() can work.  The saved prev_stack is restored again by
+// boundTaskExiting().
+INLINE_HEADER void
+taskEnter (Task *task)
+{
+ // save the current value, just in case this Task has been created
+ // as a result of re-entering the RTS (defaults to NULL):
+ task->prev_stack = myTask();
+ setMyTask(task);
+}
+
+#endif /* TASK_H */
diff --git a/rts/ThreadLabels.c b/rts/ThreadLabels.c
new file mode 100644
index 0000000000..9b9f1723ff
--- /dev/null
+++ b/rts/ThreadLabels.c
@@ -0,0 +1,50 @@
+/* -----------------------------------------------------------------------------
+ * ThreadLabels.c
+ *
+ * (c) The GHC Team 2002-2003
+ *
+ * Table of thread labels.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#include "ThreadLabels.h"
+#include "RtsUtils.h"
+
+#include <stdlib.h>
+
+#if defined(DEBUG)
+/* to the end */
+static HashTable * threadLabels = NULL;
+
+// Allocate the thread-label hash table on first use; subsequent calls
+// are no-ops.
+void
+initThreadLabelTable(void)
+{
+ if (threadLabels == NULL) {
+ threadLabels = allocHashTable();
+ }
+}
+
+// Set (or replace) the label for thread id `key`.  Any existing label
+// is removed and freed first.  The table takes ownership of `data`,
+// which is later released with stgFree (see removeThreadLabel), so it
+// should come from the RTS allocator.
+void
+updateThreadLabel(StgWord key, void *data)
+{
+ removeThreadLabel(key);
+ insertHashTable(threadLabels,key,data);
+}
+
+// Return the label for thread id `key`, or NULL if none.
+// NOTE(review): assumes initThreadLabelTable() has already run;
+// threadLabels is NULL before that -- confirm callers guarantee this.
+void *
+lookupThreadLabel(StgWord key)
+{
+ return lookupHashTable(threadLabels,key);
+}
+
+// Remove and free the label for thread id `key`, if one is present.
+void
+removeThreadLabel(StgWord key)
+{
+ void * old = NULL;
+ if ((old = lookupHashTable(threadLabels,key))) {
+ removeHashTable(threadLabels,key,old);
+ stgFree(old); // labels are owned by the table
+ } 
+}
+#endif /* DEBUG */
diff --git a/rts/ThreadLabels.h b/rts/ThreadLabels.h
new file mode 100644
index 0000000000..97d3d0d241
--- /dev/null
+++ b/rts/ThreadLabels.h
@@ -0,0 +1,27 @@
+/* -----------------------------------------------------------------------------
+ * ThreadLabels.h
+ *
+ * (c) The GHC Team 2002-2003
+ *
+ * Table of thread labels.
+ *
+ * ---------------------------------------------------------------------------*/
+#ifndef __THREADLABELS_H__
+#define __THREADLABELS_H__
+
+#include "Rts.h"
+#include "Hash.h"
+
+void
+initThreadLabelTable(void);
+
+void
+updateThreadLabel(StgWord key, void *data);
+
+void *
+lookupThreadLabel(StgWord key);
+
+void
+removeThreadLabel(StgWord key);
+
+#endif /* __THREADLABELS_H__ */
diff --git a/rts/Ticker.h b/rts/Ticker.h
new file mode 100644
index 0000000000..f9555768b5
--- /dev/null
+++ b/rts/Ticker.h
@@ -0,0 +1,15 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2005
+ *
+ * Ticker interface (implementation is OS-specific)
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef TICKER_H
+#define TICKER_H
+
+extern int startTicker( nat ms, TickProc handle_tick );
+extern int stopTicker ( void );
+
+#endif /* TICKER_H */
diff --git a/rts/Ticky.c b/rts/Ticky.c
new file mode 100644
index 0000000000..294e12bdda
--- /dev/null
+++ b/rts/Ticky.c
@@ -0,0 +1,628 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The AQUA project, Glasgow University, 1992-1997
+ * (c) The GHC Team, 1998-1999
+ *
+ * Ticky-ticky profiling
+ *-------------------------------------------------------------------------- */
+
+#if defined(TICKY_TICKY)
+
+#define TICKY_C /* define those variables */
+#include "PosixSource.h"
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "Ticky.h"
+
+/* -----------------------------------------------------------------------------
+ Print out all the counters
+ -------------------------------------------------------------------------- */
+
+static void printRegisteredCounterInfo (FILE *); /* fwd decl */
+
+// INTAVG(a,b): ratio of two integer counters as a double, 0.0 when the
+// denominator is zero.  PC(a): fraction -> percentage.
+// Fix: arguments are now fully parenthesised; previously `b == 0` and
+// `100.0 * a` expanded unparenthesised, which mis-groups for compound
+// arguments such as INTAVG(x, y + z).
+#define INTAVG(a,b) (((b) == 0) ? 0.0 : ((double) (a) / (double) (b)))
+#define PC(a) (100.0 * (a))
+
+#define AVG(thing) \
+ StgDouble avg##thing = INTAVG(tot##thing,ctr##thing)
+
+void
+PrintTickyInfo(void)
+{
+ unsigned long i;
+ unsigned long tot_allocs = /* total number of things allocated */
+ ALLOC_FUN_ctr + ALLOC_SE_THK_ctr + ALLOC_UP_THK_ctr + ALLOC_CON_ctr + ALLOC_TUP_ctr +
+ + ALLOC_TSO_ctr + ALLOC_BH_ctr + ALLOC_PAP_ctr + ALLOC_PRIM_ctr
+#ifdef PAR
+ + ALLOC_FMBQ_ctr + ALLOC_FME_ctr + ALLOC_BF_ctr
+#endif
+ ;
+
+ unsigned long tot_adm_wds = /* total number of admin words allocated */
+ ALLOC_FUN_adm + ALLOC_THK_adm + ALLOC_CON_adm + ALLOC_TUP_adm
+ + ALLOC_TSO_adm + ALLOC_BH_adm + ALLOC_PAP_adm + ALLOC_PRIM_adm
+#ifdef PAR
+ + ALLOC_FMBQ_adm + ALLOC_FME_adm + ALLOC_BF_adm
+#endif
+ ;
+
+ unsigned long tot_gds_wds = /* total number of words of ``good stuff'' allocated */
+ ALLOC_FUN_gds + ALLOC_THK_gds + ALLOC_CON_gds + ALLOC_TUP_gds
+ + ALLOC_TSO_gds + ALLOC_BH_gds + ALLOC_PAP_gds + ALLOC_PRIM_gds
+#ifdef PAR
+ + ALLOC_FMBQ_gds + ALLOC_FME_gds + ALLOC_BF_gds
+#endif
+ ;
+
+ unsigned long tot_slp_wds = /* total number of ``slop'' words allocated */
+ ALLOC_FUN_slp + ALLOC_THK_slp + ALLOC_CON_slp + ALLOC_TUP_slp
+ + ALLOC_TSO_slp + ALLOC_BH_slp + ALLOC_PAP_slp + ALLOC_PRIM_slp
+#ifdef PAR
+ + ALLOC_FMBQ_slp + ALLOC_FME_slp + ALLOC_BF_slp
+#endif
+ ;
+
+ unsigned long tot_wds = /* total words */
+ tot_adm_wds + tot_gds_wds + tot_slp_wds;
+
+ unsigned long tot_thk_enters = ENT_STATIC_THK_ctr + ENT_DYN_THK_ctr;
+ unsigned long tot_con_enters = ENT_STATIC_CON_ctr + ENT_DYN_CON_ctr;
+ unsigned long tot_fun_direct_enters = ENT_STATIC_FUN_DIRECT_ctr + ENT_DYN_FUN_DIRECT_ctr;
+ unsigned long tot_ind_enters = ENT_STATIC_IND_ctr + ENT_DYN_IND_ctr;
+
+ // This is the number of times we entered a function via some kind
+ // of slow call. It amounts to all the slow applications, not
+ // counting those that were to too few arguments.
+ unsigned long tot_fun_slow_enters =
+ SLOW_CALL_ctr -
+ SLOW_CALL_FUN_TOO_FEW_ctr -
+ SLOW_CALL_PAP_TOO_FEW_ctr;
+
+ unsigned long tot_known_calls =
+ KNOWN_CALL_ctr + KNOWN_CALL_TOO_FEW_ARGS_ctr +
+ + KNOWN_CALL_EXTRA_ARGS_ctr;
+ unsigned long tot_tail_calls =
+ UNKNOWN_CALL_ctr + tot_known_calls;
+
+ unsigned long tot_enters =
+ tot_con_enters + tot_fun_direct_enters +
+ tot_ind_enters + ENT_PERM_IND_ctr + ENT_PAP_ctr + tot_thk_enters;
+ unsigned long jump_direct_enters =
+ tot_enters - ENT_VIA_NODE_ctr;
+
+ unsigned long tot_returns =
+ RET_NEW_ctr + RET_OLD_ctr + RET_UNBOXED_TUP_ctr;
+
+ unsigned long tot_returns_of_new = RET_NEW_ctr;
+
+ unsigned long con_updates = UPD_CON_IN_NEW_ctr + UPD_CON_IN_PLACE_ctr;
+ unsigned long pap_updates = UPD_PAP_IN_NEW_ctr + UPD_PAP_IN_PLACE_ctr;
+
+ unsigned long tot_updates = UPD_SQUEEZED_ctr + pap_updates + con_updates;
+
+ unsigned long tot_new_updates = UPD_NEW_IND_ctr + UPD_NEW_PERM_IND_ctr;
+ unsigned long tot_old_updates = UPD_OLD_IND_ctr + UPD_OLD_PERM_IND_ctr;
+ unsigned long tot_gengc_updates = tot_new_updates + tot_old_updates;
+
+ FILE *tf = RtsFlags.TickyFlags.tickyFile;
+
+ fprintf(tf,"\n\nALLOCATIONS: %ld (%ld words total: %ld admin, %ld goods, %ld slop)\n",
+ tot_allocs, tot_wds, tot_adm_wds, tot_gds_wds, tot_slp_wds);
+ fprintf(tf,"\t\t\t\ttotal words:\t 2 3 4 5 6+\n");
+
+#define ALLOC_HISTO_MAGIC(categ) \
+ (PC(INTAVG(ALLOC_##categ##_hst[0], ALLOC_##categ##_ctr))), \
+ (PC(INTAVG(ALLOC_##categ##_hst[1], ALLOC_##categ##_ctr))), \
+ (PC(INTAVG(ALLOC_##categ##_hst[2], ALLOC_##categ##_ctr))), \
+ (PC(INTAVG(ALLOC_##categ##_hst[3], ALLOC_##categ##_ctr))), \
+ (PC(INTAVG(ALLOC_##categ##_hst[4], ALLOC_##categ##_ctr)))
+
+ fprintf(tf,"%7ld (%5.1f%%) function values",
+ ALLOC_FUN_ctr,
+ PC(INTAVG(ALLOC_FUN_ctr, tot_allocs)));
+ if (ALLOC_FUN_ctr != 0)
+ fprintf(tf,"\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(FUN));
+
+ fprintf(tf,"\n%7ld (%5.1f%%) thunks",
+ ALLOC_SE_THK_ctr + ALLOC_UP_THK_ctr,
+ PC(INTAVG(ALLOC_SE_THK_ctr + ALLOC_UP_THK_ctr, tot_allocs)));
+
+#define ALLOC_THK_ctr (ALLOC_UP_THK_ctr + ALLOC_SE_THK_ctr)
+ /* hack to make ALLOC_HISTO_MAGIC still work for THK */
+ if ((ALLOC_SE_THK_ctr + ALLOC_UP_THK_ctr) != 0)
+ fprintf(tf,"\t\t\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(THK));
+#undef ALLOC_THK_ctr
+
+ fprintf(tf,"\n%7ld (%5.1f%%) data values",
+ ALLOC_CON_ctr,
+ PC(INTAVG(ALLOC_CON_ctr, tot_allocs)));
+ if (ALLOC_CON_ctr != 0)
+ fprintf(tf,"\t\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(CON));
+
+ fprintf(tf,"\n%7ld (%5.1f%%) big tuples",
+ ALLOC_TUP_ctr,
+ PC(INTAVG(ALLOC_TUP_ctr, tot_allocs)));
+ if (ALLOC_TUP_ctr != 0)
+ fprintf(tf,"\t\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(TUP));
+
+ fprintf(tf,"\n%7ld (%5.1f%%) black holes",
+ ALLOC_BH_ctr,
+ PC(INTAVG(ALLOC_BH_ctr, tot_allocs)));
+ if (ALLOC_BH_ctr != 0)
+ fprintf(tf,"\t\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(BH));
+
+ fprintf(tf,"\n%7ld (%5.1f%%) prim things",
+ ALLOC_PRIM_ctr,
+ PC(INTAVG(ALLOC_PRIM_ctr, tot_allocs)));
+ if (ALLOC_PRIM_ctr != 0)
+ fprintf(tf,"\t\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(PRIM));
+
+ fprintf(tf,"\n%7ld (%5.1f%%) partial applications",
+ ALLOC_PAP_ctr,
+ PC(INTAVG(ALLOC_PAP_ctr, tot_allocs)));
+ if (ALLOC_PAP_ctr != 0)
+ fprintf(tf,"\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(PAP));
+
+ fprintf(tf,"\n%7ld (%5.1f%%) thread state objects",
+ ALLOC_TSO_ctr,
+ PC(INTAVG(ALLOC_TSO_ctr, tot_allocs)));
+ if (ALLOC_TSO_ctr != 0)
+ fprintf(tf,"\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(TSO));
+#ifdef PAR
+ fprintf(tf,"\n%7ld (%5.1f%%) thread state objects",
+ ALLOC_FMBQ_ctr,
+ PC(INTAVG(ALLOC_FMBQ_ctr, tot_allocs)));
+ if (ALLOC_FMBQ_ctr != 0)
+ fprintf(tf,"\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(FMBQ));
+ fprintf(tf,"\n%7ld (%5.1f%%) thread state objects",
+ ALLOC_FME_ctr,
+ PC(INTAVG(ALLOC_FME_ctr, tot_allocs)));
+ if (ALLOC_FME_ctr != 0)
+ fprintf(tf,"\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(FME));
+ fprintf(tf,"\n%7ld (%5.1f%%) thread state objects",
+ ALLOC_BF_ctr,
+ PC(INTAVG(ALLOC_BF_ctr, tot_allocs)));
+ if (ALLOC_BF_ctr != 0)
+ fprintf(tf,"\t\t%5.1f %5.1f %5.1f %5.1f %5.1f", ALLOC_HISTO_MAGIC(BF));
+#endif
+ fprintf(tf,"\n");
+
+ fprintf(tf,"\nTotal storage-manager allocations: %ld (%ld words)\n\t[%ld words lost to speculative heap-checks]\n", ALLOC_HEAP_ctr, ALLOC_HEAP_tot, ALLOC_HEAP_tot - tot_wds);
+
+ fprintf(tf,"\nSTACK USAGE:\n"); /* NB: some bits are direction sensitive */
+
+ fprintf(tf,"\nENTERS: %ld of which %ld (%.1f%%) direct to the entry code\n\t\t [the rest indirected via Node's info ptr]\n",
+ tot_enters,
+ jump_direct_enters,
+ PC(INTAVG(jump_direct_enters,tot_enters)));
+ fprintf(tf,"%7ld (%5.1f%%) thunks\n",
+ tot_thk_enters,
+ PC(INTAVG(tot_thk_enters,tot_enters)));
+ fprintf(tf,"%7ld (%5.1f%%) data values\n",
+ tot_con_enters,
+ PC(INTAVG(tot_con_enters,tot_enters)));
+ fprintf(tf,"%7ld (%5.1f%%) normal indirections\n",
+ tot_ind_enters,
+ PC(INTAVG(tot_ind_enters,tot_enters)));
+ fprintf(tf,"%7ld (%5.1f%%) permanent indirections\n",
+ ENT_PERM_IND_ctr,
+ PC(INTAVG(ENT_PERM_IND_ctr,tot_enters)));
+
+ fprintf(tf,"\nFUNCTION ENTRIES: %ld\n", tot_fun_direct_enters);
+
+ fprintf(tf, "\nTAIL CALLS: %ld, of which %ld (%.lf%%) were to known functions\n",
+ tot_tail_calls, tot_known_calls,
+ PC(INTAVG(tot_known_calls,tot_tail_calls)));
+
+ fprintf(tf, "\nSLOW APPLICATIONS: %ld evaluated, %ld unevaluated\n",
+ SLOW_CALL_ctr, SLOW_CALL_UNEVALD_ctr);
+ fprintf(tf, "\n");
+ fprintf(tf, " Too few args Correct args Too many args\n");
+ fprintf(tf, " FUN %8ld %8ld %8ld\n",
+ SLOW_CALL_FUN_TOO_FEW_ctr, SLOW_CALL_FUN_CORRECT_ctr, SLOW_CALL_FUN_TOO_MANY_ctr);
+ fprintf(tf, " PAP %8ld %8ld %8ld\n",
+ SLOW_CALL_PAP_TOO_FEW_ctr, SLOW_CALL_PAP_CORRECT_ctr, SLOW_CALL_PAP_TOO_MANY_ctr);
+ fprintf(tf, "\n");
+
+ fprintf(tf,"\nRETURNS: %ld\n", tot_returns);
+ fprintf(tf,"%7ld (%5.1f%%) from entering a new constructor\n\t\t [the rest from entering an existing constructor]\n",
+ tot_returns_of_new,
+ PC(INTAVG(tot_returns_of_new,tot_returns)));
+ fprintf(tf,"%7ld (%5.1f%%) vectored [the rest unvectored]\n",
+ VEC_RETURN_ctr,
+ PC(INTAVG(VEC_RETURN_ctr,tot_returns)));
+
+ fprintf(tf, "\nRET_NEW: %7ld: ", RET_NEW_ctr);
+ for (i = 0; i < 9; i++) { fprintf(tf, "%5.1f%%",
+ PC(INTAVG(RET_NEW_hst[i],RET_NEW_ctr))); }
+ fprintf(tf, "\n");
+ fprintf(tf, "RET_OLD: %7ld: ", RET_OLD_ctr);
+ for (i = 0; i < 9; i++) { fprintf(tf, "%5.1f%%",
+ PC(INTAVG(RET_OLD_hst[i],RET_OLD_ctr))); }
+ fprintf(tf, "\n");
+ fprintf(tf, "RET_UNBOXED_TUP: %7ld: ", RET_UNBOXED_TUP_ctr);
+ for (i = 0; i < 9; i++) { fprintf(tf, "%5.1f%%",
+ PC(INTAVG(RET_UNBOXED_TUP_hst[i],
+ RET_UNBOXED_TUP_ctr))); }
+ fprintf(tf, "\n");
+ fprintf(tf, "\nRET_VEC_RETURN : %7ld: ", VEC_RETURN_ctr);
+ for (i = 0; i < 9; i++) { fprintf(tf, "%5.1f%%",
+ PC(INTAVG(RET_VEC_RETURN_hst[i],VEC_RETURN_ctr))); }
+ fprintf(tf, "\n");
+
+ fprintf(tf,"\nUPDATE FRAMES: %ld (%ld omitted from thunks)",
+ UPDF_PUSHED_ctr,
+ UPDF_OMITTED_ctr);
+
+ fprintf(tf,"\nCATCH FRAMES: %ld", CATCHF_PUSHED_ctr);
+
+ if (UPDF_RCC_PUSHED_ctr != 0)
+ fprintf(tf,"%7ld restore cost centre frames (%ld omitted)\n",
+ UPDF_RCC_PUSHED_ctr,
+ UPDF_RCC_OMITTED_ctr);
+
+ fprintf(tf,"\nUPDATES: %ld\n", tot_updates);
+ fprintf(tf,"%7ld (%5.1f%%) data values\n\t\t [%ld in place, %ld allocated new space]\n",
+ con_updates,
+ PC(INTAVG(con_updates,tot_updates)),
+ UPD_CON_IN_PLACE_ctr, UPD_CON_IN_NEW_ctr);
+ fprintf(tf,"%7ld (%5.1f%%) partial applications\n\t\t [%ld in place, %ld allocated new space]\n",
+ pap_updates,
+ PC(INTAVG(pap_updates,tot_updates)),
+ UPD_PAP_IN_PLACE_ctr, UPD_PAP_IN_NEW_ctr);
+ fprintf(tf,"%7ld (%5.1f%%) updates by squeezing\n",
+ UPD_SQUEEZED_ctr,
+ PC(INTAVG(UPD_SQUEEZED_ctr, tot_updates)));
+
+ fprintf(tf, "\nUPD_CON_IN_NEW: %7ld: ", UPD_CON_IN_NEW_ctr);
+ for (i = 0; i < 9; i++) { fprintf(tf, "%7ld", UPD_CON_IN_NEW_hst[i]); }
+ fprintf(tf, "\n");
+ fprintf(tf, "UPD_CON_IN_PLACE: %7ld: ", UPD_CON_IN_PLACE_ctr);
+ for (i = 0; i < 9; i++) { fprintf(tf, "%7ld", UPD_CON_IN_PLACE_hst[i]); }
+ fprintf(tf, "\n");
+ fprintf(tf, "UPD_PAP_IN_NEW: %7ld: ", UPD_PAP_IN_NEW_ctr);
+ for (i = 0; i < 9; i++) { fprintf(tf, "%7ld", UPD_PAP_IN_NEW_hst[i]); }
+ fprintf(tf, "\n");
+
+ if (tot_gengc_updates != 0) {
+ fprintf(tf,"\nNEW GEN UPDATES: %9ld (%5.1f%%)\n",
+ tot_new_updates,
+ PC(INTAVG(tot_new_updates,tot_gengc_updates)));
+ fprintf(tf,"OLD GEN UPDATES: %9ld (%5.1f%%)\n",
+ tot_old_updates,
+ PC(INTAVG(tot_old_updates,tot_gengc_updates)));
+ }
+
+ fprintf(tf,"\nTotal bytes copied during GC: %ld\n",
+ GC_WORDS_COPIED_ctr * sizeof(W_));
+
+ printRegisteredCounterInfo(tf);
+
+ fprintf(tf,"\n**************************************************\n");
+
+ /* here, we print out all the raw numbers; these are really
+ more useful when we want to snag them for subsequent
+ rdb-etc processing. WDP 95/11
+ */
+
+#define PR_CTR(ctr) \
+ do { fprintf(tf,"%7ld " #ctr "\n", ctr); } while(0)
+/* COND_PR_CTR takes a boolean; if false then msg is the printname rather than ctr */
+#define COND_PR_CTR(ctr,b,msg) \
+ if (b) { fprintf(tf,"%7ld " #ctr "\n", ctr); } else { fprintf(tf,"%7ld " msg "\n", ctr); }
+#define PR_HST(hst,i) \
+ do { fprintf(tf,"%7ld " #hst "_" #i "\n", hst[i]); } while(0)
+
+ PR_CTR(ALLOC_HEAP_ctr);
+ PR_CTR(ALLOC_HEAP_tot);
+
+ PR_CTR(ALLOC_FUN_ctr);
+ PR_CTR(ALLOC_FUN_adm);
+ PR_CTR(ALLOC_FUN_gds);
+ PR_CTR(ALLOC_FUN_slp);
+ PR_HST(ALLOC_FUN_hst,0);
+ PR_HST(ALLOC_FUN_hst,1);
+ PR_HST(ALLOC_FUN_hst,2);
+ PR_HST(ALLOC_FUN_hst,3);
+ PR_HST(ALLOC_FUN_hst,4);
+ PR_CTR(ALLOC_UP_THK_ctr);
+ PR_CTR(ALLOC_SE_THK_ctr);
+ PR_CTR(ALLOC_THK_adm);
+ PR_CTR(ALLOC_THK_gds);
+ PR_CTR(ALLOC_THK_slp);
+ PR_HST(ALLOC_THK_hst,0);
+ PR_HST(ALLOC_THK_hst,1);
+ PR_HST(ALLOC_THK_hst,2);
+ PR_HST(ALLOC_THK_hst,3);
+ PR_HST(ALLOC_THK_hst,4);
+ PR_CTR(ALLOC_CON_ctr);
+ PR_CTR(ALLOC_CON_adm);
+ PR_CTR(ALLOC_CON_gds);
+ PR_CTR(ALLOC_CON_slp);
+ PR_HST(ALLOC_CON_hst,0);
+ PR_HST(ALLOC_CON_hst,1);
+ PR_HST(ALLOC_CON_hst,2);
+ PR_HST(ALLOC_CON_hst,3);
+ PR_HST(ALLOC_CON_hst,4);
+ PR_CTR(ALLOC_TUP_ctr);
+ PR_CTR(ALLOC_TUP_adm);
+ PR_CTR(ALLOC_TUP_gds);
+ PR_CTR(ALLOC_TUP_slp);
+ PR_HST(ALLOC_TUP_hst,0);
+ PR_HST(ALLOC_TUP_hst,1);
+ PR_HST(ALLOC_TUP_hst,2);
+ PR_HST(ALLOC_TUP_hst,3);
+ PR_HST(ALLOC_TUP_hst,4);
+ PR_CTR(ALLOC_BH_ctr);
+ PR_CTR(ALLOC_BH_adm);
+ PR_CTR(ALLOC_BH_gds);
+ PR_CTR(ALLOC_BH_slp);
+ PR_HST(ALLOC_BH_hst,0);
+ PR_HST(ALLOC_BH_hst,1);
+ PR_HST(ALLOC_BH_hst,2);
+ PR_HST(ALLOC_BH_hst,3);
+ PR_HST(ALLOC_BH_hst,4);
+ PR_CTR(ALLOC_PRIM_ctr);
+ PR_CTR(ALLOC_PRIM_adm);
+ PR_CTR(ALLOC_PRIM_gds);
+ PR_CTR(ALLOC_PRIM_slp);
+ PR_HST(ALLOC_PRIM_hst,0);
+ PR_HST(ALLOC_PRIM_hst,1);
+ PR_HST(ALLOC_PRIM_hst,2);
+ PR_HST(ALLOC_PRIM_hst,3);
+ PR_HST(ALLOC_PRIM_hst,4);
+ PR_CTR(ALLOC_PAP_ctr);
+ PR_CTR(ALLOC_PAP_adm);
+ PR_CTR(ALLOC_PAP_gds);
+ PR_CTR(ALLOC_PAP_slp);
+ PR_HST(ALLOC_PAP_hst,0);
+ PR_HST(ALLOC_PAP_hst,1);
+ PR_HST(ALLOC_PAP_hst,2);
+ PR_HST(ALLOC_PAP_hst,3);
+ PR_HST(ALLOC_PAP_hst,4);
+
+ PR_CTR(ALLOC_TSO_ctr);
+ PR_CTR(ALLOC_TSO_adm);
+ PR_CTR(ALLOC_TSO_gds);
+ PR_CTR(ALLOC_TSO_slp);
+ PR_HST(ALLOC_TSO_hst,0);
+ PR_HST(ALLOC_TSO_hst,1);
+ PR_HST(ALLOC_TSO_hst,2);
+ PR_HST(ALLOC_TSO_hst,3);
+ PR_HST(ALLOC_TSO_hst,4);
+
+#ifdef PAR
+ PR_CTR(ALLOC_FMBQ_ctr);
+ PR_CTR(ALLOC_FMBQ_adm);
+ PR_CTR(ALLOC_FMBQ_gds);
+ PR_CTR(ALLOC_FMBQ_slp);
+ PR_HST(ALLOC_FMBQ_hst,0);
+ PR_HST(ALLOC_FMBQ_hst,1);
+ PR_HST(ALLOC_FMBQ_hst,2);
+ PR_HST(ALLOC_FMBQ_hst,3);
+ PR_HST(ALLOC_FMBQ_hst,4);
+ PR_CTR(ALLOC_FME_ctr);
+ PR_CTR(ALLOC_FME_adm);
+ PR_CTR(ALLOC_FME_gds);
+ PR_CTR(ALLOC_FME_slp);
+ PR_HST(ALLOC_FME_hst,0);
+ PR_HST(ALLOC_FME_hst,1);
+ PR_HST(ALLOC_FME_hst,2);
+ PR_HST(ALLOC_FME_hst,3);
+ PR_HST(ALLOC_FME_hst,4);
+ PR_CTR(ALLOC_BF_ctr);
+ PR_CTR(ALLOC_BF_adm);
+ PR_CTR(ALLOC_BF_gds);
+ PR_CTR(ALLOC_BF_slp);
+ PR_HST(ALLOC_BF_hst,0);
+ PR_HST(ALLOC_BF_hst,1);
+ PR_HST(ALLOC_BF_hst,2);
+ PR_HST(ALLOC_BF_hst,3);
+ PR_HST(ALLOC_BF_hst,4);
+#endif
+
+ PR_CTR(ENT_VIA_NODE_ctr);
+ PR_CTR(ENT_STATIC_CON_ctr);
+ PR_CTR(ENT_DYN_CON_ctr);
+ PR_CTR(ENT_STATIC_FUN_DIRECT_ctr);
+ PR_CTR(ENT_DYN_FUN_DIRECT_ctr);
+ PR_CTR(ENT_STATIC_IND_ctr);
+ PR_CTR(ENT_DYN_IND_ctr);
+
+/* The counters ENT_PERM_IND and UPD_{NEW,OLD}_PERM_IND are not dumped
+ * at the end of execution unless update squeezing is turned off (+RTS
+ * -Z =RtsFlags.GcFlags.squeezeUpdFrames), as they will be wrong
+ * otherwise. Why? Because for each update frame squeezed out, we
+ * count an UPD_NEW_PERM_IND *at GC time* (i.e., too early). And
+ * further, when we enter the closure that has been updated, we count
+ * the ENT_PERM_IND, but we then enter the PERM_IND that was built for
+ * the next update frame below, and so on down the chain until we
+ * finally reach the value. Thus we count many new ENT_PERM_INDs too
+ * early.
+ *
+ * This of course refers to the -ticky version that uses PERM_INDs to
+ * determine the number of closures entered 0/1/>1. KSW 1999-04. */
+ COND_PR_CTR(ENT_PERM_IND_ctr,RtsFlags.GcFlags.squeezeUpdFrames == rtsFalse,"E!NT_PERM_IND_ctr requires +RTS -Z");
+
+ PR_CTR(ENT_AP_ctr);
+ PR_CTR(ENT_PAP_ctr);
+ PR_CTR(ENT_AP_STACK_ctr);
+ PR_CTR(ENT_BH_ctr);
+ PR_CTR(ENT_STATIC_THK_ctr);
+ PR_CTR(ENT_DYN_THK_ctr);
+
+ PR_CTR(SLOW_CALL_v_ctr);
+ PR_CTR(SLOW_CALL_f_ctr);
+ PR_CTR(SLOW_CALL_d_ctr);
+ PR_CTR(SLOW_CALL_l_ctr);
+ PR_CTR(SLOW_CALL_n_ctr);
+ PR_CTR(SLOW_CALL_p_ctr);
+ PR_CTR(SLOW_CALL_pv_ctr);
+ PR_CTR(SLOW_CALL_pp_ctr);
+ PR_CTR(SLOW_CALL_ppv_ctr);
+ PR_CTR(SLOW_CALL_ppp_ctr);
+ PR_CTR(SLOW_CALL_pppv_ctr);
+ PR_CTR(SLOW_CALL_pppp_ctr);
+ PR_CTR(SLOW_CALL_ppppp_ctr);
+ PR_CTR(SLOW_CALL_pppppp_ctr);
+ PR_CTR(SLOW_CALL_OTHER_ctr);
+
+ PR_CTR(UNKNOWN_CALL_ctr);
+ PR_CTR(KNOWN_CALL_ctr);
+ PR_CTR(KNOWN_CALL_TOO_FEW_ARGS_ctr);
+ PR_CTR(KNOWN_CALL_EXTRA_ARGS_ctr);
+ PR_CTR(MULTI_CHUNK_SLOW_CALL_ctr);
+ PR_CTR(MULTI_CHUNK_SLOW_CALL_CHUNKS_ctr);
+ PR_CTR(SLOW_CALL_ctr);
+ PR_CTR(SLOW_CALL_FUN_TOO_FEW_ctr);
+ PR_CTR(SLOW_CALL_FUN_CORRECT_ctr);
+ PR_CTR(SLOW_CALL_FUN_TOO_MANY_ctr);
+ PR_CTR(SLOW_CALL_PAP_TOO_FEW_ctr);
+ PR_CTR(SLOW_CALL_PAP_CORRECT_ctr);
+ PR_CTR(SLOW_CALL_PAP_TOO_MANY_ctr);
+ PR_CTR(SLOW_CALL_UNEVALD_ctr);
+ PR_HST(SLOW_CALL_hst,0);
+ PR_HST(SLOW_CALL_hst,1);
+ PR_HST(SLOW_CALL_hst,2);
+ PR_HST(SLOW_CALL_hst,3);
+ PR_HST(SLOW_CALL_hst,4);
+ PR_HST(SLOW_CALL_hst,5);
+ PR_HST(SLOW_CALL_hst,6);
+ PR_HST(SLOW_CALL_hst,7);
+
+ PR_CTR(RET_NEW_ctr);
+ PR_CTR(RET_OLD_ctr);
+ PR_CTR(RET_UNBOXED_TUP_ctr);
+ PR_CTR(VEC_RETURN_ctr);
+
+ PR_HST(RET_NEW_hst,0);
+ PR_HST(RET_NEW_hst,1);
+ PR_HST(RET_NEW_hst,2);
+ PR_HST(RET_NEW_hst,3);
+ PR_HST(RET_NEW_hst,4);
+ PR_HST(RET_NEW_hst,5);
+ PR_HST(RET_NEW_hst,6);
+ PR_HST(RET_NEW_hst,7);
+ PR_HST(RET_NEW_hst,8);
+ PR_HST(RET_OLD_hst,0);
+ PR_HST(RET_OLD_hst,1);
+ PR_HST(RET_OLD_hst,2);
+ PR_HST(RET_OLD_hst,3);
+ PR_HST(RET_OLD_hst,4);
+ PR_HST(RET_OLD_hst,5);
+ PR_HST(RET_OLD_hst,6);
+ PR_HST(RET_OLD_hst,7);
+ PR_HST(RET_OLD_hst,8);
+ PR_HST(RET_UNBOXED_TUP_hst,0);
+ PR_HST(RET_UNBOXED_TUP_hst,1);
+ PR_HST(RET_UNBOXED_TUP_hst,2);
+ PR_HST(RET_UNBOXED_TUP_hst,3);
+ PR_HST(RET_UNBOXED_TUP_hst,4);
+ PR_HST(RET_UNBOXED_TUP_hst,5);
+ PR_HST(RET_UNBOXED_TUP_hst,6);
+ PR_HST(RET_UNBOXED_TUP_hst,7);
+ PR_HST(RET_UNBOXED_TUP_hst,8);
+ PR_HST(RET_VEC_RETURN_hst,0);
+ PR_HST(RET_VEC_RETURN_hst,1);
+ PR_HST(RET_VEC_RETURN_hst,2);
+ PR_HST(RET_VEC_RETURN_hst,3);
+ PR_HST(RET_VEC_RETURN_hst,4);
+ PR_HST(RET_VEC_RETURN_hst,5);
+ PR_HST(RET_VEC_RETURN_hst,6);
+ PR_HST(RET_VEC_RETURN_hst,7);
+ PR_HST(RET_VEC_RETURN_hst,8);
+
+ PR_CTR(UPDF_OMITTED_ctr);
+ PR_CTR(UPDF_PUSHED_ctr);
+ PR_CTR(CATCHF_PUSHED_ctr);
+
+ PR_CTR(UPDF_RCC_PUSHED_ctr);
+ PR_CTR(UPDF_RCC_OMITTED_ctr);
+
+ PR_CTR(UPD_SQUEEZED_ctr);
+ PR_CTR(UPD_CON_IN_NEW_ctr);
+ PR_CTR(UPD_CON_IN_PLACE_ctr);
+ PR_CTR(UPD_PAP_IN_NEW_ctr);
+ PR_CTR(UPD_PAP_IN_PLACE_ctr);
+
+ PR_CTR(UPD_BH_UPDATABLE_ctr);
+ PR_CTR(UPD_BH_SINGLE_ENTRY_ctr);
+ PR_CTR(UPD_CAF_BH_UPDATABLE_ctr);
+ PR_CTR(UPD_CAF_BH_SINGLE_ENTRY_ctr);
+
+ PR_HST(UPD_CON_IN_NEW_hst,0);
+ PR_HST(UPD_CON_IN_NEW_hst,1);
+ PR_HST(UPD_CON_IN_NEW_hst,2);
+ PR_HST(UPD_CON_IN_NEW_hst,3);
+ PR_HST(UPD_CON_IN_NEW_hst,4);
+ PR_HST(UPD_CON_IN_NEW_hst,5);
+ PR_HST(UPD_CON_IN_NEW_hst,6);
+ PR_HST(UPD_CON_IN_NEW_hst,7);
+ PR_HST(UPD_CON_IN_NEW_hst,8);
+ PR_HST(UPD_PAP_IN_NEW_hst,0);
+ PR_HST(UPD_PAP_IN_NEW_hst,1);
+ PR_HST(UPD_PAP_IN_NEW_hst,2);
+ PR_HST(UPD_PAP_IN_NEW_hst,3);
+ PR_HST(UPD_PAP_IN_NEW_hst,4);
+ PR_HST(UPD_PAP_IN_NEW_hst,5);
+ PR_HST(UPD_PAP_IN_NEW_hst,6);
+ PR_HST(UPD_PAP_IN_NEW_hst,7);
+ PR_HST(UPD_PAP_IN_NEW_hst,8);
+
+ PR_CTR(UPD_NEW_IND_ctr);
+ /* see comment on ENT_PERM_IND_ctr */
+ COND_PR_CTR(UPD_NEW_PERM_IND_ctr,RtsFlags.GcFlags.squeezeUpdFrames == rtsFalse,"U!PD_NEW_PERM_IND_ctr requires +RTS -Z");
+ PR_CTR(UPD_OLD_IND_ctr);
+ /* see comment on ENT_PERM_IND_ctr */
+ COND_PR_CTR(UPD_OLD_PERM_IND_ctr,RtsFlags.GcFlags.squeezeUpdFrames == rtsFalse,"U!PD_OLD_PERM_IND_ctr requires +RTS -Z");
+
+ PR_CTR(GC_SEL_ABANDONED_ctr);
+ PR_CTR(GC_SEL_MINOR_ctr);
+ PR_CTR(GC_SEL_MAJOR_ctr);
+ PR_CTR(GC_FAILED_PROMOTION_ctr);
+ PR_CTR(GC_WORDS_COPIED_ctr);
+}
+
+/* Data structure used in ``registering'' one of these counters. */
+
+StgEntCounter *ticky_entry_ctrs = NULL; /* root of list of them */
+
+/* To print out all the registered-counter info: */
+
+static void
+printRegisteredCounterInfo (FILE *tf)
+{
+ StgEntCounter *p;
+
+ if ( ticky_entry_ctrs != NULL ) {
+ fprintf(tf,"\n**************************************************\n\n");
+ }
+ fprintf(tf, "%11s%11s %6s%6s %-11s%-30s\n",
+ "Entries", "Allocs", "Arity", "Stack", "Kinds", "Function");
+ fprintf(tf, "--------------------------------------------------------------------------------\n");
+ /* Function name at the end so it doesn't mess up the tabulation */
+
+ for (p = ticky_entry_ctrs; p != NULL; p = p->link) {
+ fprintf(tf, "%11ld%11ld %6u%6u %-11s%-30s",
+ p->entry_count,
+ p->allocs,
+ p->arity,
+ p->stk_args,
+ p->arg_kinds,
+ p->str);
+
+ fprintf(tf, "\n");
+
+ }
+}
+
+/* Catch-all top-level counter struct. Allocations from CAFs will go
+ * here.
+ */
+StgEntCounter top_ct
+ = { 0, 0, 0,
+ "TOP", "",
+ 0, 0, NULL };
+
+#endif /* TICKY_TICKY */
+
diff --git a/rts/Ticky.h b/rts/Ticky.h
new file mode 100644
index 0000000000..21765e4bbb
--- /dev/null
+++ b/rts/Ticky.h
@@ -0,0 +1,9 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1999
+ *
+ * Header for Ticky.c
+ *
+ * ---------------------------------------------------------------------------*/
+
+extern void PrintTickyInfo(void);
diff --git a/rts/Timer.c b/rts/Timer.c
new file mode 100644
index 0000000000..0bfea2d6fd
--- /dev/null
+++ b/rts/Timer.c
@@ -0,0 +1,102 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1995-2005
+ *
+ * Interval timer service for profiling and pre-emptive scheduling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+/*
+ * The interval timer is used for profiling and for context switching in the
+ * threaded build.
+ *
+ * This file defines the platform-independent view of interval timing, relying
+ * on platform-specific services to install and run the timers.
+ *
+ */
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "Proftimer.h"
+#include "Schedule.h"
+#include "Timer.h"
+#include "Ticker.h"
+#include "Capability.h"
+
+/* ticks left before next pre-emptive context switch */
+static int ticks_to_ctxt_switch = 0;
+
+#if defined(THREADED_RTS)
+/* idle ticks left before we perform a GC */
+static int ticks_to_gc = 0;
+#endif
+
+/*
+ * Function: handle_tick()
+ *
+ * At each occurrence of a tick, the OS timer will invoke
+ * handle_tick().
+ */
+static
+void
+handle_tick(int unused STG_UNUSED)
+{
+#ifdef PROFILING
+ handleProfTick();
+#endif
+ if (RtsFlags.ConcFlags.ctxtSwitchTicks > 0) {
+ ticks_to_ctxt_switch--;
+ if (ticks_to_ctxt_switch <= 0) {
+ ticks_to_ctxt_switch = RtsFlags.ConcFlags.ctxtSwitchTicks;
+ context_switch = 1; /* schedule a context switch */
+ }
+ }
+
+#if defined(THREADED_RTS)
+ /*
+ * If we've been inactive for idleGCDelayTicks (set by +RTS
+ * -I), tell the scheduler to wake up and do a GC, to check
+ * for threads that are deadlocked.
+ */
+ switch (recent_activity) {
+ case ACTIVITY_YES:
+ recent_activity = ACTIVITY_MAYBE_NO;
+ ticks_to_gc = RtsFlags.GcFlags.idleGCDelayTicks;
+ break;
+ case ACTIVITY_MAYBE_NO:
+ if (ticks_to_gc == 0) break; /* 0 ==> no idle GC */
+ ticks_to_gc--;
+ if (ticks_to_gc == 0) {
+ ticks_to_gc = RtsFlags.GcFlags.idleGCDelayTicks;
+ recent_activity = ACTIVITY_INACTIVE;
+ blackholes_need_checking = rtsTrue;
+ /* hack: re-use the blackholes_need_checking flag */
+
+ /* ToDo: this doesn't work. Can't invoke
+ * pthread_cond_signal from a signal handler.
+ * Furthermore, we can't prod a capability that we
+ * might be holding. What can we do?
+ */
+ prodOneCapability();
+ }
+ break;
+ default:
+ break;
+ }
+#endif
+}
+
+int
+startTimer(nat ms)
+{
+#ifdef PROFILING
+ initProfTimer();
+#endif
+
+ return startTicker(ms, handle_tick);
+}
+
+int
+stopTimer()
+{
+ return stopTicker();
+}
diff --git a/rts/Timer.h b/rts/Timer.h
new file mode 100644
index 0000000000..ae26653462
--- /dev/null
+++ b/rts/Timer.h
@@ -0,0 +1,24 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1995-2005
+ *
+ * Interval timer service for profiling and pre-emptive scheduling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef TIMER_H
+#define TIMER_H
+
+# define TICK_MILLISECS (1000/TICK_FREQUENCY) /* ms per tick */
+
+/* Context switch timing constants. Context switches happen after a
+ * whole number of ticks, the default being every tick.
+ */
+#define CS_MIN_MILLISECS TICK_MILLISECS /* milliseconds per slice */
+
+typedef void (*TickProc)(int);
+
+extern int startTimer(nat ms);
+extern int stopTimer(void);
+
+#endif /* TIMER_H */
diff --git a/rts/Updates.cmm b/rts/Updates.cmm
new file mode 100644
index 0000000000..1d2fc5fe0f
--- /dev/null
+++ b/rts/Updates.cmm
@@ -0,0 +1,153 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Code to perform updates.
+ *
+ * This file is written in a subset of C--, extended with various
+ * features specific to GHC. It is compiled by GHC directly. For the
+ * syntax of .cmm files, see the parser in ghc/compiler/cmm/CmmParse.y.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Cmm.h"
+#include "Updates.h"
+#include "StgLdvProf.h"
+
+/*
+ The update frame return address must be *polymorphic*, that means
+ we have to cope with both vectored and non-vectored returns. This
+ is done by putting the return vector right before the info table, and
+ having a standard direct return address after the info table (pointed
+ to by the return address itself, as usual).
+
+ Each entry in the vector table points to a specialised entry code fragment
+ that knows how to return after doing the update. It would be possible to
+ use a single generic piece of code that simply entered the return value
+ to return, but it's quicker this way. The direct return code of course
+ just does another direct return when it's finished.
+*/
+
+/* on entry to the update code
+ (1) R1 points to the closure being returned
+ (2) Sp points to the update frame
+*/
+
+/* The update fragment has been tuned so as to generate good
+ code with gcc, which accounts for some of the strangeness in the
+ way it is written.
+
+ In particular, the JMP_(ret) bit is passed down and pinned on the
+ end of each branch (there end up being two major branches in the
+ code), since we don't mind duplicating this jump.
+*/
+
+#define UPD_FRAME_ENTRY_TEMPLATE(label,ind_info,ret) \
+ label \
+ { \
+ W_ updatee; \
+ \
+ updatee = StgUpdateFrame_updatee(Sp); \
+ \
+ /* remove the update frame from the stack */ \
+ Sp = Sp + SIZEOF_StgUpdateFrame; \
+ \
+ /* ToDo: it might be a PAP, so we should check... */ \
+ TICK_UPD_CON_IN_NEW(sizeW_fromITBL(%GET_STD_INFO(updatee))); \
+ \
+ UPD_SPEC_IND(updatee, ind_info, R1, jump (ret)); \
+ }
+
+UPD_FRAME_ENTRY_TEMPLATE(stg_upd_frame_0_ret,stg_IND_0_info,%RET_VEC(Sp(0),0))
+UPD_FRAME_ENTRY_TEMPLATE(stg_upd_frame_1_ret,stg_IND_1_info,%RET_VEC(Sp(0),1))
+UPD_FRAME_ENTRY_TEMPLATE(stg_upd_frame_2_ret,stg_IND_2_info,%RET_VEC(Sp(0),2))
+UPD_FRAME_ENTRY_TEMPLATE(stg_upd_frame_3_ret,stg_IND_3_info,%RET_VEC(Sp(0),3))
+UPD_FRAME_ENTRY_TEMPLATE(stg_upd_frame_4_ret,stg_IND_4_info,%RET_VEC(Sp(0),4))
+UPD_FRAME_ENTRY_TEMPLATE(stg_upd_frame_5_ret,stg_IND_5_info,%RET_VEC(Sp(0),5))
+UPD_FRAME_ENTRY_TEMPLATE(stg_upd_frame_6_ret,stg_IND_6_info,%RET_VEC(Sp(0),6))
+UPD_FRAME_ENTRY_TEMPLATE(stg_upd_frame_7_ret,stg_IND_7_info,%RET_VEC(Sp(0),7))
+
+#if MAX_VECTORED_RTN > 8
+#error MAX_VECTORED_RTN has changed: please modify stg_upd_frame too.
+#endif
+
+/*
+ Make sure this table is big enough to handle the maximum vectored
+ return size!
+ */
+
+#if defined(PROFILING)
+#define UPD_FRAME_BITMAP 3
+#define UPD_FRAME_WORDS 3
+#else
+#define UPD_FRAME_BITMAP 0
+#define UPD_FRAME_WORDS 1
+#endif
+
+/* this bitmap indicates that the first word of an update frame is a
+ * non-pointer - this is the update frame link. (for profiling,
+ * there's a cost-centre-stack in there too).
+ */
+
+INFO_TABLE_RET( stg_upd_frame,
+ UPD_FRAME_WORDS, UPD_FRAME_BITMAP, UPDATE_FRAME,
+ stg_upd_frame_0_ret,
+ stg_upd_frame_1_ret,
+ stg_upd_frame_2_ret,
+ stg_upd_frame_3_ret,
+ stg_upd_frame_4_ret,
+ stg_upd_frame_5_ret,
+ stg_upd_frame_6_ret,
+ stg_upd_frame_7_ret
+ )
+UPD_FRAME_ENTRY_TEMPLATE(,stg_IND_direct_info,%ENTRY_CODE(Sp(0)))
+
+
+INFO_TABLE_RET( stg_marked_upd_frame,
+ UPD_FRAME_WORDS, UPD_FRAME_BITMAP, UPDATE_FRAME,
+ stg_upd_frame_0_ret,
+ stg_upd_frame_1_ret,
+ stg_upd_frame_2_ret,
+ stg_upd_frame_3_ret,
+ stg_upd_frame_4_ret,
+ stg_upd_frame_5_ret,
+ stg_upd_frame_6_ret,
+ stg_upd_frame_7_ret
+ )
+UPD_FRAME_ENTRY_TEMPLATE(,stg_IND_direct_info,%ENTRY_CODE(Sp(0)))
+
+/*-----------------------------------------------------------------------------
+ Seq frames
+
+ We don't have a primitive seq# operator: it is just a 'case'
+ expression whose scrutinee has either a polymorphic or function type
+ (constructor types can be handled by normal 'case' expressions).
+
+ To handle a polymorphic/function typed seq, we push a SEQ frame on
+ the stack. This is a polymorphic activation record that just pops
+ itself and returns (in a non-vectored way) when entered. The
+ purpose of the SEQ frame is to avoid having to make a polymorphic return
+ point for each polymorphic case expression.
+
+ Another way of looking at it: the SEQ frame turns a vectored return
+ into a direct one.
+ -------------------------------------------------------------------------- */
+
+#if MAX_VECTORED_RTN > 8
+#error MAX_VECTORED_RTN has changed: please modify stg_seq_frame too.
+#endif
+
+INFO_TABLE_RET( stg_seq_frame, 0/* words */, 0/* bitmap */, RET_SMALL,
+ RET_LBL(stg_seq_frame), /* 0 */
+ RET_LBL(stg_seq_frame), /* 1 */
+ RET_LBL(stg_seq_frame), /* 2 */
+ RET_LBL(stg_seq_frame), /* 3 */
+ RET_LBL(stg_seq_frame), /* 4 */
+ RET_LBL(stg_seq_frame), /* 5 */
+ RET_LBL(stg_seq_frame), /* 6 */
+ RET_LBL(stg_seq_frame) /* 7 */
+ )
+{
+ Sp_adj(1);
+ jump %ENTRY_CODE(Sp(0));
+}
diff --git a/rts/Updates.h b/rts/Updates.h
new file mode 100644
index 0000000000..5872157c81
--- /dev/null
+++ b/rts/Updates.h
@@ -0,0 +1,361 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2004
+ *
+ * Performing updates.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef UPDATES_H
+#define UPDATES_H
+
+/* -----------------------------------------------------------------------------
+ Updates
+
+ We have two layers of update macros. The top layer, UPD_IND() and
+ friends perform all the work of an update. In detail:
+
+ - if the closure being updated is a blocking queue, then all the
+ threads waiting on the blocking queue are updated.
+
+ - then the lower level updateWithIndirection() macro is invoked
+ to actually replace the closure with an indirection (see below).
+
+ -------------------------------------------------------------------------- */
+
+#ifdef TICKY_TICKY
+# define UPD_IND(updclosure, heapptr) \
+ UPD_PERM_IND(updclosure,heapptr)
+# define UPD_SPEC_IND(updclosure, ind_info, heapptr, and_then) \
+ UPD_PERM_IND(updclosure,heapptr); and_then
+#else
+# define SEMI ;
+# define UPD_IND(updclosure, heapptr) \
+ UPD_REAL_IND(updclosure,INFO_PTR(stg_IND_info),heapptr,SEMI)
+# define UPD_SPEC_IND(updclosure, ind_info, heapptr, and_then) \
+ UPD_REAL_IND(updclosure,ind_info,heapptr,and_then)
+#endif
+
+/* These macros have to work in both C and C--, so here's the
+ * impedence matching:
+ */
+#ifdef CMINUSMINUS
+#define BLOCK_BEGIN
+#define BLOCK_END
+#define DECLARE_IPTR(info) W_ info
+#define FCALL foreign "C"
+#define INFO_PTR(info) info
+#define ARG_PTR "ptr"
+#else
+#define BLOCK_BEGIN {
+#define BLOCK_END }
+#define DECLARE_IPTR(info) const StgInfoTable *(info)
+#define FCALL /* nothing */
+#define INFO_PTR(info) &info
+#define StgBlockingQueue_blocking_queue(closure) \
+ (((StgBlockingQueue *)closure)->blocking_queue)
+#define ARG_PTR /* nothing */
+#endif
+
+/* UPD_IND actually does a PERM_IND if TICKY_TICKY is on;
+ if you *really* need an IND use UPD_REAL_IND
+ */
+#define UPD_REAL_IND(updclosure, ind_info, heapptr, and_then) \
+ BLOCK_BEGIN \
+ DECLARE_IPTR(info); \
+ info = GET_INFO(updclosure); \
+ updateWithIndirection(ind_info, \
+ updclosure, \
+ heapptr, \
+ and_then); \
+ BLOCK_END
+
+#if defined(PROFILING) || defined(TICKY_TICKY)
+#define UPD_PERM_IND(updclosure, heapptr) \
+ BLOCK_BEGIN \
+ updateWithPermIndirection(updclosure, \
+ heapptr); \
+ BLOCK_END
+#endif
+
+#if defined(RTS_SUPPORTS_THREADS)
+
+# ifdef TICKY_TICKY
+# define UPD_IND_NOLOCK(updclosure, heapptr) \
+ BLOCK_BEGIN \
+ updateWithPermIndirection(updclosure, \
+ heapptr); \
+ BLOCK_END
+# else
+# define UPD_IND_NOLOCK(updclosure, heapptr) \
+ BLOCK_BEGIN \
+ updateWithIndirection(INFO_PTR(stg_IND_info), \
+ updclosure, \
+ heapptr,); \
+ BLOCK_END
+# endif
+
+#else
+#define UPD_IND_NOLOCK(updclosure,heapptr) UPD_IND(updclosure,heapptr)
+#endif
+
+/* -----------------------------------------------------------------------------
+ Awaken any threads waiting on a blocking queue (BLACKHOLE_BQ).
+ -------------------------------------------------------------------------- */
+
+#if defined(PAR)
+
+/*
+ In a parallel setup several types of closures might have a blocking queue:
+ BLACKHOLE_BQ ... same as in the default concurrent setup; it will be
+ reawakened via calling UPD_IND on that closure after
+ having finished the computation of the graph
+ FETCH_ME_BQ ... a global indirection (FETCH_ME) may be entered by a
+ local TSO, turning it into a FETCH_ME_BQ; it will be
+ reawakened via calling processResume
+ RBH ... a revertible black hole may be entered by another
+ local TSO, putting it onto its blocking queue; since
+ RBHs only exist while the corresponding closure is in
+ transit, they will be reawakened via calling
+ convertToFetchMe (upon processing an ACK message)
+
+ In a parallel setup a blocking queue may contain 3 types of closures:
+ TSO ... as in the default concurrent setup
+ BLOCKED_FETCH ... indicating that a TSO on another PE is waiting for
+ the result of the current computation
+ CONSTR ... an RBHSave closure (which contains data ripped out of
+ the closure to make room for a blocking queue; since
+                   it only contains data we use the existing type of
+ a CONSTR closure); this closure is the end of a
+ blocking queue for an RBH closure; it only exists in
+ this kind of blocking queue and must be at the end
+ of the queue
+*/
+extern void awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node);
+#define DO_AWAKEN_BQ(bqe, node) STGCALL2(awakenBlockedQueue, bqe, node);
+
+#define AWAKEN_BQ(info,closure) \
+ if (info == &stg_BLACKHOLE_BQ_info || \
+ info == &stg_FETCH_ME_BQ_info || \
+ get_itbl(closure)->type == RBH) { \
+ DO_AWAKEN_BQ(((StgBlockingQueue *)closure)->blocking_queue, closure); \
+ }
+
+#elif defined(GRAN)
+
+extern void awakenBlockedQueue(StgBlockingQueueElement *q, StgClosure *node);
+#define DO_AWAKEN_BQ(bq, node) STGCALL2(awakenBlockedQueue, bq, node);
+
+/* In GranSim we don't have FETCH_ME or FETCH_ME_BQ closures, so they are
+ not checked. The rest of the code is the same as for GUM.
+*/
+#define AWAKEN_BQ(info,closure) \
+ if (info == &stg_BLACKHOLE_BQ_info || \
+ get_itbl(closure)->type == RBH) { \
+ DO_AWAKEN_BQ(((StgBlockingQueue *)closure)->blocking_queue, closure); \
+ }
+
+#endif /* GRAN || PAR */
+
+
+/* -----------------------------------------------------------------------------
+ Updates: lower-level macros which update a closure with an
+ indirection to another closure.
+
+ There are several variants of this code.
+
+ PROFILING:
+ -------------------------------------------------------------------------- */
+
+/* LDV profiling:
+ * We call LDV_recordDead_FILL_SLOP_DYNAMIC(p1) regardless of the generation in
+ * which p1 resides.
+ *
+ * Note:
+ * After all, we do *NOT* need to call LDV_RECORD_CREATE() for both IND and
+ * IND_OLDGEN closures because they are inherently used. But, it corrupts
+ * the invariants that every closure keeps its creation time in the profiling
+ * field. So, we call LDV_RECORD_CREATE().
+ */
+
+/* In the DEBUG case, we also zero out the slop of the old closure,
+ * so that the sanity checker can tell where the next closure is.
+ *
+ * Two important invariants: we should never try to update a closure
+ * to point to itself, and the closure being updated should not
+ * already have been updated (the mutable list will get messed up
+ * otherwise).
+ *
+ * NB. We do *not* do this in THREADED_RTS mode, because when we have the
+ * possibility of multiple threads entering the same closure, zeroing
+ * the slop in one of the threads would have a disastrous effect on
+ * the other (seen in the wild!).
+ */
+#ifdef CMINUSMINUS
+
+#define FILL_SLOP(p) \
+ W_ inf; \
+ W_ sz; \
+ W_ i; \
+ inf = %GET_STD_INFO(p); \
+ if (%INFO_TYPE(inf) != HALF_W_(THUNK_SELECTOR) \
+ && %INFO_TYPE(inf) != HALF_W_(BLACKHOLE) \
+ && %INFO_TYPE(inf) != HALF_W_(CAF_BLACKHOLE)) { \
+ if (%INFO_TYPE(inf) == HALF_W_(AP_STACK)) { \
+ sz = StgAP_STACK_size(p) + BYTES_TO_WDS(SIZEOF_StgAP_STACK_NoThunkHdr); \
+ } else { \
+ if (%INFO_TYPE(inf) == HALF_W_(AP)) { \
+ sz = TO_W_(StgAP_n_args(p)) + BYTES_TO_WDS(SIZEOF_StgAP_NoThunkHdr); \
+ } else { \
+ sz = TO_W_(%INFO_PTRS(inf)) + TO_W_(%INFO_NPTRS(inf)); \
+ } \
+ } \
+ i = 0; \
+ for: \
+ if (i < sz) { \
+ StgThunk_payload(p,i) = 0; \
+ i = i + 1; \
+ goto for; \
+ } \
+ }
+
+#else /* !CMINUSMINUS */
+
+INLINE_HEADER void
+FILL_SLOP(StgClosure *p)
+{
+ StgInfoTable *inf = get_itbl(p);
+ nat i, sz;
+
+ switch (inf->type) {
+ case BLACKHOLE:
+ case CAF_BLACKHOLE:
+ case THUNK_SELECTOR:
+ return;
+ case AP:
+ sz = ((StgAP *)p)->n_args + sizeofW(StgAP) - sizeofW(StgThunkHeader);
+ break;
+ case AP_STACK:
+ sz = ((StgAP_STACK *)p)->size + sizeofW(StgAP_STACK) - sizeofW(StgThunkHeader);
+ break;
+ default:
+ sz = inf->layout.payload.ptrs + inf->layout.payload.nptrs;
+ break;
+ }
+ for (i = 0; i < sz; i++) {
+ ((StgThunk *)p)->payload[i] = 0;
+ }
+}
+
+#endif /* CMINUSMINUS */
+
+#if !defined(DEBUG) || defined(THREADED_RTS)
+#define DEBUG_FILL_SLOP(p) /* do nothing */
+#else
+#define DEBUG_FILL_SLOP(p) FILL_SLOP(p)
+#endif
+
+/* We have two versions of this macro (sadly), one for use in C-- code,
+ * and the other for C.
+ *
+ * The and_then argument is a performance hack so that we can paste in
+ * the continuation code directly. It helps shave a couple of
+ * instructions off the common case in the update code, which is
+ * worthwhile (the update code is often part of the inner loop).
+ * (except that gcc now appears to common up this code again and
+ * invert the optimisation. Grrrr --SDM).
+ */
+#ifdef CMINUSMINUS
+#define generation(n) (W_[generations] + n*SIZEOF_generation)
+#define updateWithIndirection(ind_info, p1, p2, and_then) \
+ W_ bd; \
+ \
+ DEBUG_FILL_SLOP(p1); \
+ LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(p1); \
+ StgInd_indirectee(p1) = p2; \
+ foreign "C" wb() []; \
+ bd = Bdescr(p1); \
+ if (bdescr_gen_no(bd) != 0 :: CInt) { \
+ foreign "C" recordMutableCap(p1 "ptr", \
+ MyCapability() "ptr", \
+ bdescr_gen_no(bd)) [R1]; \
+ SET_INFO(p1, stg_IND_OLDGEN_info); \
+ LDV_RECORD_CREATE(p1); \
+ TICK_UPD_OLD_IND(); \
+ and_then; \
+ } else { \
+ SET_INFO(p1, ind_info); \
+ LDV_RECORD_CREATE(p1); \
+ TICK_UPD_NEW_IND(); \
+ and_then; \
+ }
+#else
+#define updateWithIndirection(ind_info, p1, p2, and_then) \
+ { \
+ bdescr *bd; \
+ \
+ /* cas(p1, 0, &stg_WHITEHOLE_info); */ \
+ ASSERT( (P_)p1 != (P_)p2 && !closure_IND(p1) ); \
+ DEBUG_FILL_SLOP(p1); \
+ LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(p1); \
+ ((StgInd *)p1)->indirectee = p2; \
+ wb(); \
+ bd = Bdescr((P_)p1); \
+ if (bd->gen_no != 0) { \
+ recordMutableGenLock(p1, &generations[bd->gen_no]); \
+ SET_INFO(p1, &stg_IND_OLDGEN_info); \
+ TICK_UPD_OLD_IND(); \
+ and_then; \
+ } else { \
+ SET_INFO(p1, ind_info); \
+ LDV_RECORD_CREATE(p1); \
+ TICK_UPD_NEW_IND(); \
+ and_then; \
+ } \
+ }
+#endif
+
+/* The permanent indirection version isn't performance critical. We
+ * therefore use an inline C function instead of the C-- macro.
+ */
+#ifndef CMINUSMINUS
+INLINE_HEADER void
+updateWithPermIndirection(StgClosure *p1,
+ StgClosure *p2)
+{
+ bdescr *bd;
+
+ ASSERT( p1 != p2 && !closure_IND(p1) );
+
+ /*
+ * @LDV profiling
+ * Destroy the old closure.
+ * Nb: LDV_* stuff cannot mix with ticky-ticky
+ */
+ LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(p1);
+
+ bd = Bdescr((P_)p1);
+ if (bd->gen_no != 0) {
+ recordMutableGenLock(p1, &generations[bd->gen_no]);
+ ((StgInd *)p1)->indirectee = p2;
+ SET_INFO(p1, &stg_IND_OLDGEN_PERM_info);
+ /*
+ * @LDV profiling
+ * We have just created a new closure.
+ */
+ LDV_RECORD_CREATE(p1);
+ TICK_UPD_OLD_PERM_IND();
+ } else {
+ ((StgInd *)p1)->indirectee = p2;
+ SET_INFO(p1, &stg_IND_PERM_info);
+ /*
+ * @LDV profiling
+ * We have just created a new closure.
+ */
+ LDV_RECORD_CREATE(p1);
+ TICK_UPD_NEW_PERM_IND(p1);
+ }
+}
+#endif
+
+#endif /* UPDATES_H */
diff --git a/rts/VisCallbacks.c b/rts/VisCallbacks.c
new file mode 100644
index 0000000000..8e3c6ceb6c
--- /dev/null
+++ b/rts/VisCallbacks.c
@@ -0,0 +1,75 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2000
+ *
+ * RTS GTK Front Panel (callbacks)
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifdef RTS_GTK_FRONTPANEL
+
+#include "Rts.h"
+
+#include <gtk/gtk.h>
+
+#include "VisCallbacks.h"
+#include "VisWindow.h"
+#include "VisSupport.h"
+#include "FrontPanel.h"
+
+void
+on_cont_radio_clicked (GtkButton *button,
+ gpointer user_data)
+{
+ update_mode = Continuous;
+}
+
+
+void
+on_stop_before_radio_clicked (GtkButton *button,
+ gpointer user_data)
+{
+ update_mode = BeforeGC;
+}
+
+
+void
+on_stop_after_radio_clicked (GtkButton *button,
+ gpointer user_data)
+{
+ update_mode = AfterGC;
+}
+
+
+void
+on_stop_both_radio_clicked (GtkButton *button,
+ gpointer user_data)
+{
+ update_mode = BeforeAfterGC;
+}
+
+
+void
+on_stop_but_clicked (GtkButton *button,
+ gpointer user_data)
+{
+ stop_now = TRUE;
+}
+
+
+void
+on_continue_but_clicked (GtkButton *button,
+ gpointer user_data)
+{
+ continue_now = TRUE;
+}
+
+
+void
+on_quit_but_clicked (GtkButton *button,
+ gpointer user_data)
+{
+ quit = TRUE;
+}
+
+#endif /* RTS_GTK_FRONTPANEL */
diff --git a/rts/VisCallbacks.h b/rts/VisCallbacks.h
new file mode 100644
index 0000000000..d242010fad
--- /dev/null
+++ b/rts/VisCallbacks.h
@@ -0,0 +1,30 @@
+#include <gtk/gtk.h>
+
+
+void
+on_cont_radio_clicked (GtkButton *button,
+ gpointer user_data);
+
+void
+on_stop_before_radio_clicked (GtkButton *button,
+ gpointer user_data);
+
+void
+on_stop_after_radio_clicked (GtkButton *button,
+ gpointer user_data);
+
+void
+on_stop_both_radio_clicked (GtkButton *button,
+ gpointer user_data);
+
+void
+on_stop_but_clicked (GtkButton *button,
+ gpointer user_data);
+
+void
+on_continue_but_clicked (GtkButton *button,
+ gpointer user_data);
+
+void
+on_quit_but_clicked (GtkButton *button,
+ gpointer user_data);
diff --git a/rts/VisSupport.c b/rts/VisSupport.c
new file mode 100644
index 0000000000..a85c5f43a4
--- /dev/null
+++ b/rts/VisSupport.c
@@ -0,0 +1,144 @@
+/*
+ * DO NOT EDIT THIS FILE - it is generated by Glade.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdio.h>
+
+#include <gtk/gtk.h>
+
+#include "VisSupport.h"
+
+GtkWidget*
+lookup_widget (GtkWidget *widget,
+ const gchar *widget_name)
+{
+ GtkWidget *parent, *found_widget;
+
+ for (;;)
+ {
+ if (GTK_IS_MENU (widget))
+ parent = gtk_menu_get_attach_widget (GTK_MENU (widget));
+ else
+ parent = widget->parent;
+ if (!parent)
+ parent = (GtkWidget*) g_object_get_data (G_OBJECT (widget), "GladeParentKey");
+ if (parent == NULL)
+ break;
+ widget = parent;
+ }
+
+ found_widget = (GtkWidget*) g_object_get_data (G_OBJECT (widget),
+ widget_name);
+ if (!found_widget)
+ g_warning ("Widget not found: %s", widget_name);
+ return found_widget;
+}
+
+static GList *pixmaps_directories = NULL;
+
+/* Use this function to set the directory containing installed pixmaps. */
+void
+add_pixmap_directory (const gchar *directory)
+{
+ pixmaps_directories = g_list_prepend (pixmaps_directories,
+ g_strdup (directory));
+}
+
+/* This is an internally used function to find pixmap files. */
+static gchar*
+find_pixmap_file (const gchar *filename)
+{
+ GList *elem;
+
+ /* We step through each of the pixmaps directory to find it. */
+ elem = pixmaps_directories;
+ while (elem)
+ {
+ gchar *pathname = g_strdup_printf ("%s%s%s", (gchar*)elem->data,
+ G_DIR_SEPARATOR_S, filename);
+ if (g_file_test (pathname, G_FILE_TEST_EXISTS))
+ return pathname;
+ g_free (pathname);
+ elem = elem->next;
+ }
+ return NULL;
+}
+
+/* This is an internally used function to create pixmaps. */
+GtkWidget*
+create_pixmap (GtkWidget *widget,
+ const gchar *filename)
+{
+ gchar *pathname = NULL;
+ GtkWidget *pixmap;
+
+ if (!filename || !filename[0])
+ return gtk_image_new ();
+
+ pathname = find_pixmap_file (filename);
+
+ if (!pathname)
+ {
+ g_warning ("Couldn't find pixmap file: %s", filename);
+ return gtk_image_new ();
+ }
+
+ pixmap = gtk_image_new_from_file (pathname);
+ g_free (pathname);
+ return pixmap;
+}
+
+/* This is an internally used function to create pixmaps. */
+GdkPixbuf*
+create_pixbuf (const gchar *filename)
+{
+ gchar *pathname = NULL;
+ GdkPixbuf *pixbuf;
+ GError *error = NULL;
+
+ if (!filename || !filename[0])
+ return NULL;
+
+ pathname = find_pixmap_file (filename);
+
+ if (!pathname)
+ {
+ g_warning ("Couldn't find pixmap file: %s", filename);
+ return NULL;
+ }
+
+ pixbuf = gdk_pixbuf_new_from_file (pathname, &error);
+ if (!pixbuf)
+ {
+ fprintf (stderr, "Failed to load pixbuf file: %s: %s\n",
+ pathname, error->message);
+ g_error_free (error);
+ }
+ g_free (pathname);
+ return pixbuf;
+}
+
+/* This is used to set ATK action descriptions. */
+void
+glade_set_atk_action_description (AtkAction *action,
+ const gchar *action_name,
+ const gchar *description)
+{
+ gint n_actions, i;
+
+ n_actions = atk_action_get_n_actions (action);
+ for (i = 0; i < n_actions; i++)
+ {
+ if (!strcmp (atk_action_get_name (action, i), action_name))
+ atk_action_set_description (action, i, description);
+ }
+}
+
diff --git a/rts/VisSupport.h b/rts/VisSupport.h
new file mode 100644
index 0000000000..2dea079c2a
--- /dev/null
+++ b/rts/VisSupport.h
@@ -0,0 +1,44 @@
+/*
+ * DO NOT EDIT THIS FILE - it is generated by Glade.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <gtk/gtk.h>
+
+/*
+ * Public Functions.
+ */
+
+/*
+ * This function returns a widget in a component created by Glade.
+ * Call it with the toplevel widget in the component (i.e. a window/dialog),
+ * or alternatively any widget in the component, and the name of the widget
+ * you want returned.
+ */
+GtkWidget* lookup_widget (GtkWidget *widget,
+ const gchar *widget_name);
+
+
+/* Use this function to set the directory containing installed pixmaps. */
+void add_pixmap_directory (const gchar *directory);
+
+
+/*
+ * Private Functions.
+ */
+
+/* This is used to create the pixmaps used in the interface. */
+GtkWidget* create_pixmap (GtkWidget *widget,
+ const gchar *filename);
+
+/* This is used to create the pixbufs used in the interface. */
+GdkPixbuf* create_pixbuf (const gchar *filename);
+
+/* This is used to set ATK action descriptions. */
+void glade_set_atk_action_description (AtkAction *action,
+ const gchar *action_name,
+ const gchar *description);
+
diff --git a/rts/VisWindow.c b/rts/VisWindow.c
new file mode 100644
index 0000000000..188b88976e
--- /dev/null
+++ b/rts/VisWindow.c
@@ -0,0 +1,747 @@
+/*
+ * DO NOT EDIT THIS FILE - it is generated by Glade.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdio.h>
+
+#include <gdk/gdkkeysyms.h>
+#include <gtk/gtk.h>
+
+#include "VisCallbacks.h"
+#include "VisWindow.h"
+#include "VisSupport.h"
+
+/* Register WIDGET on COMPONENT under NAME (for lookup_widget()),
+ * taking a reference so the widget stays retrievable; the reference is
+ * released automatically when COMPONENT drops the data entry. */
+#define GLADE_HOOKUP_OBJECT(component,widget,name) \
+  g_object_set_data_full (G_OBJECT (component), name, \
+    gtk_widget_ref (widget), (GDestroyNotify) gtk_widget_unref)
+
+/* As above but without taking a reference -- used below only for the
+ * top-level window registering itself, where holding a self-reference
+ * would be circular. */
+#define GLADE_HOOKUP_OBJECT_NO_REF(component,widget,name) \
+  g_object_set_data (G_OBJECT (component), name, widget)
+
+/*
+ * Build the complete GHC front-panel window hierarchy and connect its
+ * signal handlers (handlers live in VisCallbacks.h).
+ *
+ * Returns the top-level GtkWindow.  Every named child widget is also
+ * registered on that window via the GLADE_HOOKUP_OBJECT macros so it
+ * can later be fetched with lookup_widget().
+ *
+ * NOTE(review): Glade-generated code -- the construction order mirrors
+ * the .glade project file; hand edits here would normally be lost on
+ * regeneration.
+ */
+GtkWidget*
+create_GHC_Front_Panel (void)
+{
+  GtkWidget *GHC_Front_Panel;
+  GtkWidget *vbox1;
+  GtkWidget *hbox1;
+  GtkWidget *vbox4;
+  GtkWidget *frame3;
+  GtkWidget *hbox3;
+  GtkWidget *label40;
+  GtkWidget *map_ruler;
+  GtkWidget *memmap;
+  GtkWidget *label1;
+  GtkWidget *frame8;
+  GtkWidget *vbox14;
+  GtkWidget *table4;
+  GtkWidget *gen_ruler;
+  GtkWidget *gen_hbox;
+  GtkWidget *generations;
+  GtkWidget *label39;
+  GtkWidget *label41;
+  GtkWidget *frame7;
+  GtkWidget *table3;
+  GtkWidget *res_hruler;
+  GtkWidget *res_vruler;
+  GtkWidget *res_drawingarea;
+  GtkWidget *label37;
+  GtkWidget *label38;
+  GtkWidget *label42;
+  GtkWidget *vbox5;
+  GtkWidget *frame5;
+  GtkWidget *vbox6;
+  GtkWidget *table1;
+  GtkWidget *label12;
+  GtkWidget *label13;
+  GtkWidget *label14;
+  GtkWidget *label15;
+  GtkWidget *label16;
+  GtkWidget *label17;
+  GtkWidget *label18;
+  GtkWidget *label19;
+  GtkWidget *live_label;
+  GtkWidget *allocated_label;
+  GtkWidget *footprint_label;
+  GtkWidget *alloc_rate_label;
+  GtkWidget *label43;
+  GtkWidget *frame9;
+  GtkWidget *table5;
+  GtkWidget *label20;
+  GtkWidget *label21;
+  GtkWidget *label22;
+  GtkWidget *label24;
+  GtkWidget *label26;
+  GtkWidget *label25;
+  GtkWidget *label27;
+  GtkWidget *running_label;
+  GtkWidget *blockread_label;
+  GtkWidget *blockwrite_label;
+  GtkWidget *blockmvar_label;
+  GtkWidget *blockthrowto_label;
+  GtkWidget *blockbh_label;
+  GtkWidget *sleeping_label;
+  GtkWidget *hseparator1;
+  GtkWidget *hseparator2;
+  GtkWidget *label35;
+  GtkWidget *total_label;
+  GtkWidget *label44;
+  GtkWidget *frame6;
+  GtkWidget *vbox7;
+  GtkWidget *vbox9;
+  GtkWidget *cont_radio;
+  GSList *cont_radio_group = NULL;
+  GtkWidget *stop_before_radio;
+  GtkWidget *stop_after_radio;
+  GtkWidget *stop_both_radio;
+  GtkWidget *vbox8;
+  GtkWidget *stop_but;
+  GtkWidget *continue_but;
+  GtkWidget *label45;
+  GtkWidget *quit_but;
+  GtkWidget *statusbar;
+
+  /* Top-level window. */
+  GHC_Front_Panel = gtk_window_new (GTK_WINDOW_TOPLEVEL);
+  gtk_widget_set_name (GHC_Front_Panel, "GHC_Front_Panel");
+  gtk_window_set_title (GTK_WINDOW (GHC_Front_Panel), "GHC Front Panel");
+  gtk_window_set_default_size (GTK_WINDOW (GHC_Front_Panel), 450, 600);
+
+  vbox1 = gtk_vbox_new (FALSE, 0);
+  gtk_widget_set_name (vbox1, "vbox1");
+  gtk_widget_show (vbox1);
+  gtk_container_add (GTK_CONTAINER (GHC_Front_Panel), vbox1);
+
+  hbox1 = gtk_hbox_new (FALSE, 10);
+  gtk_widget_set_name (hbox1, "hbox1");
+  gtk_widget_show (hbox1);
+  gtk_box_pack_start (GTK_BOX (vbox1), hbox1, TRUE, TRUE, 0);
+  gtk_container_set_border_width (GTK_CONTAINER (hbox1), 10);
+
+  vbox4 = gtk_vbox_new (FALSE, 10);
+  gtk_widget_set_name (vbox4, "vbox4");
+  gtk_widget_show (vbox4);
+  gtk_box_pack_start (GTK_BOX (hbox1), vbox4, TRUE, TRUE, 0);
+
+  /* "Memory Map" frame: a Mb ruler beside the memmap drawing area. */
+  frame3 = gtk_frame_new (NULL);
+  gtk_widget_set_name (frame3, "frame3");
+  gtk_widget_show (frame3);
+  gtk_box_pack_start (GTK_BOX (vbox4), frame3, TRUE, TRUE, 0);
+
+  hbox3 = gtk_hbox_new (FALSE, 0);
+  gtk_widget_set_name (hbox3, "hbox3");
+  gtk_widget_show (hbox3);
+  gtk_container_add (GTK_CONTAINER (frame3), hbox3);
+
+  label40 = gtk_label_new ("Mb");
+  gtk_widget_set_name (label40, "label40");
+  gtk_widget_show (label40);
+  gtk_box_pack_start (GTK_BOX (hbox3), label40, FALSE, FALSE, 0);
+  gtk_label_set_justify (GTK_LABEL (label40), GTK_JUSTIFY_CENTER);
+
+  map_ruler = gtk_vruler_new ();
+  gtk_widget_set_name (map_ruler, "map_ruler");
+  gtk_widget_show (map_ruler);
+  gtk_box_pack_start (GTK_BOX (hbox3), map_ruler, FALSE, FALSE, 0);
+  gtk_ruler_set_range (GTK_RULER (map_ruler), 0, 10, 1.40845, 10);
+
+  memmap = gtk_drawing_area_new ();
+  gtk_widget_set_name (memmap, "memmap");
+  gtk_widget_show (memmap);
+  gtk_box_pack_start (GTK_BOX (hbox3), memmap, TRUE, TRUE, 0);
+
+  label1 = gtk_label_new ("Memory Map");
+  gtk_widget_set_name (label1, "label1");
+  gtk_widget_show (label1);
+  gtk_frame_set_label_widget (GTK_FRAME (frame3), label1);
+
+  /* "Generations" frame: ruler + drawing area laid out in a table. */
+  frame8 = gtk_frame_new (NULL);
+  gtk_widget_set_name (frame8, "frame8");
+  gtk_widget_show (frame8);
+  gtk_box_pack_start (GTK_BOX (vbox4), frame8, TRUE, TRUE, 0);
+
+  vbox14 = gtk_vbox_new (FALSE, 0);
+  gtk_widget_set_name (vbox14, "vbox14");
+  gtk_widget_show (vbox14);
+  gtk_container_add (GTK_CONTAINER (frame8), vbox14);
+
+  table4 = gtk_table_new (2, 3, FALSE);
+  gtk_widget_set_name (table4, "table4");
+  gtk_widget_show (table4);
+  gtk_box_pack_start (GTK_BOX (vbox14), table4, TRUE, TRUE, 0);
+
+  gen_ruler = gtk_vruler_new ();
+  gtk_widget_set_name (gen_ruler, "gen_ruler");
+  gtk_widget_show (gen_ruler);
+  gtk_table_attach (GTK_TABLE (table4), gen_ruler, 1, 2, 0, 1,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (GTK_EXPAND | GTK_FILL), 0, 0);
+  gtk_ruler_set_range (GTK_RULER (gen_ruler), 0, 10, 1.69935, 10);
+
+  gen_hbox = gtk_hbox_new (FALSE, 0);
+  gtk_widget_set_name (gen_hbox, "gen_hbox");
+  gtk_widget_show (gen_hbox);
+  gtk_table_attach (GTK_TABLE (table4), gen_hbox, 2, 3, 1, 2,
+                    (GtkAttachOptions) (GTK_EXPAND | GTK_FILL),
+                    (GtkAttachOptions) (GTK_FILL), 0, 0);
+
+  generations = gtk_drawing_area_new ();
+  gtk_widget_set_name (generations, "generations");
+  gtk_widget_show (generations);
+  gtk_table_attach (GTK_TABLE (table4), generations, 2, 3, 0, 1,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (GTK_FILL), 0, 0);
+
+  label39 = gtk_label_new ("Mb");
+  gtk_widget_set_name (label39, "label39");
+  gtk_widget_show (label39);
+  gtk_table_attach (GTK_TABLE (table4), label39, 0, 1, 0, 1,
+                    (GtkAttachOptions) (0),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label39), GTK_JUSTIFY_CENTER);
+
+  label41 = gtk_label_new ("Generations");
+  gtk_widget_set_name (label41, "label41");
+  gtk_widget_show (label41);
+  gtk_frame_set_label_widget (GTK_FRAME (frame8), label41);
+
+  /* "Residency" frame: time/size rulers around a drawing area. */
+  frame7 = gtk_frame_new (NULL);
+  gtk_widget_set_name (frame7, "frame7");
+  gtk_widget_show (frame7);
+  gtk_box_pack_start (GTK_BOX (vbox4), frame7, TRUE, TRUE, 0);
+
+  table3 = gtk_table_new (3, 3, FALSE);
+  gtk_widget_set_name (table3, "table3");
+  gtk_widget_show (table3);
+  gtk_container_add (GTK_CONTAINER (frame7), table3);
+  gtk_container_set_border_width (GTK_CONTAINER (table3), 2);
+
+  res_hruler = gtk_hruler_new ();
+  gtk_widget_set_name (res_hruler, "res_hruler");
+  gtk_widget_show (res_hruler);
+  gtk_table_attach (GTK_TABLE (table3), res_hruler, 2, 3, 1, 2,
+                    (GtkAttachOptions) (GTK_EXPAND | GTK_FILL),
+                    (GtkAttachOptions) (GTK_FILL), 0, 0);
+  gtk_ruler_set_range (GTK_RULER (res_hruler), 0, 10, 8.35443, 10);
+
+  res_vruler = gtk_vruler_new ();
+  gtk_widget_set_name (res_vruler, "res_vruler");
+  gtk_widget_show (res_vruler);
+  gtk_table_attach (GTK_TABLE (table3), res_vruler, 1, 2, 2, 3,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (GTK_EXPAND | GTK_FILL), 0, 0);
+  gtk_ruler_set_range (GTK_RULER (res_vruler), 0, 10, 9.69925, 10);
+
+  res_drawingarea = gtk_drawing_area_new ();
+  gtk_widget_set_name (res_drawingarea, "res_drawingarea");
+  gtk_widget_show (res_drawingarea);
+  gtk_table_attach (GTK_TABLE (table3), res_drawingarea, 2, 3, 2, 3,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (GTK_FILL), 0, 0);
+
+  label37 = gtk_label_new ("Secs");
+  gtk_widget_set_name (label37, "label37");
+  gtk_widget_show (label37);
+  gtk_table_attach (GTK_TABLE (table3), label37, 2, 3, 0, 1,
+                    (GtkAttachOptions) (0),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label37), GTK_JUSTIFY_CENTER);
+
+  label38 = gtk_label_new ("Mb");
+  gtk_widget_set_name (label38, "label38");
+  gtk_widget_show (label38);
+  gtk_table_attach (GTK_TABLE (table3), label38, 0, 1, 2, 3,
+                    (GtkAttachOptions) (0),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label38), GTK_JUSTIFY_CENTER);
+
+  label42 = gtk_label_new ("Residency");
+  gtk_widget_set_name (label42, "label42");
+  gtk_widget_show (label42);
+  gtk_frame_set_label_widget (GTK_FRAME (frame7), label42);
+
+  /* Right-hand column of the window. */
+  vbox5 = gtk_vbox_new (FALSE, 10);
+  gtk_widget_set_name (vbox5, "vbox5");
+  gtk_widget_show (vbox5);
+  gtk_box_pack_end (GTK_BOX (hbox1), vbox5, FALSE, FALSE, 0);
+
+  /* "Stats" frame: live/allocated/footprint/allocation-rate readouts. */
+  frame5 = gtk_frame_new (NULL);
+  gtk_widget_set_name (frame5, "frame5");
+  gtk_widget_show (frame5);
+  gtk_box_pack_start (GTK_BOX (vbox5), frame5, FALSE, TRUE, 0);
+
+  vbox6 = gtk_vbox_new (FALSE, 0);
+  gtk_widget_set_name (vbox6, "vbox6");
+  gtk_widget_show (vbox6);
+  gtk_container_add (GTK_CONTAINER (frame5), vbox6);
+  gtk_container_set_border_width (GTK_CONTAINER (vbox6), 5);
+
+  table1 = gtk_table_new (4, 3, FALSE);
+  gtk_widget_set_name (table1, "table1");
+  gtk_widget_show (table1);
+  gtk_box_pack_start (GTK_BOX (vbox6), table1, TRUE, TRUE, 0);
+  gtk_table_set_col_spacings (GTK_TABLE (table1), 7);
+
+  label12 = gtk_label_new ("Allocated");
+  gtk_widget_set_name (label12, "label12");
+  gtk_widget_show (label12);
+  gtk_table_attach (GTK_TABLE (table1), label12, 0, 1, 1, 2,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label12), GTK_JUSTIFY_RIGHT);
+  gtk_misc_set_alignment (GTK_MISC (label12), 1, 0.5);
+
+  label13 = gtk_label_new ("Live");
+  gtk_widget_set_name (label13, "label13");
+  gtk_widget_show (label13);
+  gtk_table_attach (GTK_TABLE (table1), label13, 0, 1, 0, 1,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label13), GTK_JUSTIFY_RIGHT);
+  gtk_misc_set_alignment (GTK_MISC (label13), 1, 0.5);
+
+  label14 = gtk_label_new ("Allocation Rate");
+  gtk_widget_set_name (label14, "label14");
+  gtk_widget_show (label14);
+  gtk_table_attach (GTK_TABLE (table1), label14, 0, 1, 3, 4,
+                    (GtkAttachOptions) (0),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label14), GTK_JUSTIFY_RIGHT);
+  gtk_misc_set_alignment (GTK_MISC (label14), 1, 0.5);
+
+  label15 = gtk_label_new ("\t\tFootprint");
+  gtk_widget_set_name (label15, "label15");
+  gtk_widget_show (label15);
+  gtk_table_attach (GTK_TABLE (table1), label15, 0, 1, 2, 3,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label15), GTK_JUSTIFY_RIGHT);
+  gtk_misc_set_alignment (GTK_MISC (label15), 1, 0.5);
+
+  label16 = gtk_label_new ("M/sec");
+  gtk_widget_set_name (label16, "label16");
+  gtk_widget_show (label16);
+  gtk_table_attach (GTK_TABLE (table1), label16, 2, 3, 3, 4,
+                    (GtkAttachOptions) (0),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label16), GTK_JUSTIFY_CENTER);
+
+  label17 = gtk_label_new ("M");
+  gtk_widget_set_name (label17, "label17");
+  gtk_widget_show (label17);
+  gtk_table_attach (GTK_TABLE (table1), label17, 2, 3, 2, 3,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_misc_set_alignment (GTK_MISC (label17), 7.45058e-09, 0.5);
+
+  label18 = gtk_label_new ("M");
+  gtk_widget_set_name (label18, "label18");
+  gtk_widget_show (label18);
+  gtk_table_attach (GTK_TABLE (table1), label18, 2, 3, 1, 2,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label18), GTK_JUSTIFY_CENTER);
+  gtk_misc_set_alignment (GTK_MISC (label18), 7.45058e-09, 0.5);
+
+  label19 = gtk_label_new ("M");
+  gtk_widget_set_name (label19, "label19");
+  gtk_widget_show (label19);
+  gtk_table_attach (GTK_TABLE (table1), label19, 2, 3, 0, 1,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label19), GTK_JUSTIFY_CENTER);
+  gtk_misc_set_alignment (GTK_MISC (label19), 7.45058e-09, 0.5);
+
+  /* Value labels start empty; the RTS fills them in at run time. */
+  live_label = gtk_label_new ("");
+  gtk_widget_set_name (live_label, "live_label");
+  gtk_widget_show (live_label);
+  gtk_table_attach (GTK_TABLE (table1), live_label, 1, 2, 0, 1,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (live_label), GTK_JUSTIFY_CENTER);
+  gtk_misc_set_alignment (GTK_MISC (live_label), 1, 0.5);
+
+  allocated_label = gtk_label_new ("");
+  gtk_widget_set_name (allocated_label, "allocated_label");
+  gtk_widget_show (allocated_label);
+  gtk_table_attach (GTK_TABLE (table1), allocated_label, 1, 2, 1, 2,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (allocated_label), GTK_JUSTIFY_CENTER);
+  gtk_misc_set_alignment (GTK_MISC (allocated_label), 1, 0.5);
+
+  footprint_label = gtk_label_new ("");
+  gtk_widget_set_name (footprint_label, "footprint_label");
+  gtk_widget_show (footprint_label);
+  gtk_table_attach (GTK_TABLE (table1), footprint_label, 1, 2, 2, 3,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (footprint_label), GTK_JUSTIFY_CENTER);
+  gtk_misc_set_alignment (GTK_MISC (footprint_label), 1, 0.5);
+
+  alloc_rate_label = gtk_label_new ("");
+  gtk_widget_set_name (alloc_rate_label, "alloc_rate_label");
+  gtk_widget_show (alloc_rate_label);
+  gtk_table_attach (GTK_TABLE (table1), alloc_rate_label, 1, 2, 3, 4,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (alloc_rate_label), GTK_JUSTIFY_CENTER);
+  gtk_misc_set_alignment (GTK_MISC (alloc_rate_label), 1, 0.5);
+
+  label43 = gtk_label_new ("Stats");
+  gtk_widget_set_name (label43, "label43");
+  gtk_widget_show (label43);
+  gtk_frame_set_label_widget (GTK_FRAME (frame5), label43);
+
+  /* "Threads" frame: per-state thread counts plus a total row. */
+  frame9 = gtk_frame_new (NULL);
+  gtk_widget_set_name (frame9, "frame9");
+  gtk_widget_show (frame9);
+  gtk_box_pack_start (GTK_BOX (vbox5), frame9, FALSE, TRUE, 0);
+
+  table5 = gtk_table_new (9, 2, FALSE);
+  gtk_widget_set_name (table5, "table5");
+  gtk_widget_show (table5);
+  gtk_container_add (GTK_CONTAINER (frame9), table5);
+  gtk_container_set_border_width (GTK_CONTAINER (table5), 6);
+  gtk_table_set_col_spacings (GTK_TABLE (table5), 10);
+
+  label20 = gtk_label_new ("Running");
+  gtk_widget_set_name (label20, "label20");
+  gtk_widget_show (label20);
+  gtk_table_attach (GTK_TABLE (table5), label20, 0, 1, 0, 1,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label20), GTK_JUSTIFY_CENTER);
+  gtk_misc_set_alignment (GTK_MISC (label20), 1, 0.5);
+
+  label21 = gtk_label_new ("Blocked on I/O (Read)");
+  gtk_widget_set_name (label21, "label21");
+  gtk_widget_show (label21);
+  gtk_table_attach (GTK_TABLE (table5), label21, 0, 1, 1, 2,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label21), GTK_JUSTIFY_CENTER);
+  gtk_misc_set_alignment (GTK_MISC (label21), 1, 0.5);
+
+  label22 = gtk_label_new ("Blocked on MVar");
+  gtk_widget_set_name (label22, "label22");
+  gtk_widget_show (label22);
+  gtk_table_attach (GTK_TABLE (table5), label22, 0, 1, 3, 4,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label22), GTK_JUSTIFY_CENTER);
+  gtk_misc_set_alignment (GTK_MISC (label22), 1, 0.5);
+
+  label24 = gtk_label_new ("Blocked on throwTo");
+  gtk_widget_set_name (label24, "label24");
+  gtk_widget_show (label24);
+  gtk_table_attach (GTK_TABLE (table5), label24, 0, 1, 4, 5,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label24), GTK_JUSTIFY_CENTER);
+  gtk_misc_set_alignment (GTK_MISC (label24), 1, 0.5);
+
+  label26 = gtk_label_new ("Blocked on Black Hole");
+  gtk_widget_set_name (label26, "label26");
+  gtk_widget_show (label26);
+  gtk_table_attach (GTK_TABLE (table5), label26, 0, 1, 5, 6,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label26), GTK_JUSTIFY_CENTER);
+  gtk_misc_set_alignment (GTK_MISC (label26), 1, 0.5);
+
+  label25 = gtk_label_new ("Sleeping");
+  gtk_widget_set_name (label25, "label25");
+  gtk_widget_show (label25);
+  gtk_table_attach (GTK_TABLE (table5), label25, 0, 1, 6, 7,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label25), GTK_JUSTIFY_CENTER);
+  gtk_misc_set_alignment (GTK_MISC (label25), 1, 0.5);
+
+  label27 = gtk_label_new ("Blocked on I/O (Write)");
+  gtk_widget_set_name (label27, "label27");
+  gtk_widget_show (label27);
+  gtk_table_attach (GTK_TABLE (table5), label27, 0, 1, 2, 3,
+                    (GtkAttachOptions) (0),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label27), GTK_JUSTIFY_CENTER);
+  gtk_misc_set_alignment (GTK_MISC (label27), 1, 0.5);
+
+  /* Count labels carry Glade's placeholder text ("label28" etc.);
+   * the RTS overwrites them with real counts at run time. */
+  running_label = gtk_label_new ("label28");
+  gtk_widget_set_name (running_label, "running_label");
+  gtk_widget_show (running_label);
+  gtk_table_attach (GTK_TABLE (table5), running_label, 1, 2, 0, 1,
+                    (GtkAttachOptions) (0),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (running_label), GTK_JUSTIFY_CENTER);
+
+  blockread_label = gtk_label_new ("label29");
+  gtk_widget_set_name (blockread_label, "blockread_label");
+  gtk_widget_show (blockread_label);
+  gtk_table_attach (GTK_TABLE (table5), blockread_label, 1, 2, 1, 2,
+                    (GtkAttachOptions) (0),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (blockread_label), GTK_JUSTIFY_CENTER);
+
+  blockwrite_label = gtk_label_new ("label30");
+  gtk_widget_set_name (blockwrite_label, "blockwrite_label");
+  gtk_widget_show (blockwrite_label);
+  gtk_table_attach (GTK_TABLE (table5), blockwrite_label, 1, 2, 2, 3,
+                    (GtkAttachOptions) (0),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (blockwrite_label), GTK_JUSTIFY_CENTER);
+
+  blockmvar_label = gtk_label_new ("label31");
+  gtk_widget_set_name (blockmvar_label, "blockmvar_label");
+  gtk_widget_show (blockmvar_label);
+  gtk_table_attach (GTK_TABLE (table5), blockmvar_label, 1, 2, 3, 4,
+                    (GtkAttachOptions) (0),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (blockmvar_label), GTK_JUSTIFY_CENTER);
+
+  blockthrowto_label = gtk_label_new ("label32");
+  gtk_widget_set_name (blockthrowto_label, "blockthrowto_label");
+  gtk_widget_show (blockthrowto_label);
+  gtk_table_attach (GTK_TABLE (table5), blockthrowto_label, 1, 2, 4, 5,
+                    (GtkAttachOptions) (0),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (blockthrowto_label), GTK_JUSTIFY_CENTER);
+
+  blockbh_label = gtk_label_new ("label33");
+  gtk_widget_set_name (blockbh_label, "blockbh_label");
+  gtk_widget_show (blockbh_label);
+  gtk_table_attach (GTK_TABLE (table5), blockbh_label, 1, 2, 5, 6,
+                    (GtkAttachOptions) (0),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (blockbh_label), GTK_JUSTIFY_CENTER);
+
+  sleeping_label = gtk_label_new ("label34");
+  gtk_widget_set_name (sleeping_label, "sleeping_label");
+  gtk_widget_show (sleeping_label);
+  gtk_table_attach (GTK_TABLE (table5), sleeping_label, 1, 2, 6, 7,
+                    (GtkAttachOptions) (0),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (sleeping_label), GTK_JUSTIFY_CENTER);
+
+  hseparator1 = gtk_hseparator_new ();
+  gtk_widget_set_name (hseparator1, "hseparator1");
+  gtk_widget_show (hseparator1);
+  gtk_table_attach (GTK_TABLE (table5), hseparator1, 0, 1, 7, 8,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (GTK_EXPAND | GTK_FILL), 0, 0);
+
+  hseparator2 = gtk_hseparator_new ();
+  gtk_widget_set_name (hseparator2, "hseparator2");
+  gtk_widget_show (hseparator2);
+  gtk_table_attach (GTK_TABLE (table5), hseparator2, 1, 2, 7, 8,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (GTK_FILL), 0, 0);
+
+  label35 = gtk_label_new ("Total");
+  gtk_widget_set_name (label35, "label35");
+  gtk_widget_show (label35);
+  gtk_table_attach (GTK_TABLE (table5), label35, 0, 1, 8, 9,
+                    (GtkAttachOptions) (GTK_FILL),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (label35), GTK_JUSTIFY_CENTER);
+  gtk_misc_set_alignment (GTK_MISC (label35), 1, 0.5);
+
+  total_label = gtk_label_new ("label36");
+  gtk_widget_set_name (total_label, "total_label");
+  gtk_widget_show (total_label);
+  gtk_table_attach (GTK_TABLE (table5), total_label, 1, 2, 8, 9,
+                    (GtkAttachOptions) (0),
+                    (GtkAttachOptions) (0), 0, 0);
+  gtk_label_set_justify (GTK_LABEL (total_label), GTK_JUSTIFY_CENTER);
+
+  label44 = gtk_label_new ("Threads");
+  gtk_widget_set_name (label44, "label44");
+  gtk_widget_show (label44);
+  gtk_frame_set_label_widget (GTK_FRAME (frame9), label44);
+
+  /* "Updates" frame: GC stop-mode radio group and stop/continue buttons. */
+  frame6 = gtk_frame_new (NULL);
+  gtk_widget_set_name (frame6, "frame6");
+  gtk_widget_show (frame6);
+  gtk_box_pack_start (GTK_BOX (vbox5), frame6, FALSE, FALSE, 0);
+
+  vbox7 = gtk_vbox_new (FALSE, 10);
+  gtk_widget_set_name (vbox7, "vbox7");
+  gtk_widget_show (vbox7);
+  gtk_container_add (GTK_CONTAINER (frame6), vbox7);
+  gtk_container_set_border_width (GTK_CONTAINER (vbox7), 5);
+
+  vbox9 = gtk_vbox_new (FALSE, 0);
+  gtk_widget_set_name (vbox9, "vbox9");
+  gtk_widget_show (vbox9);
+  gtk_box_pack_start (GTK_BOX (vbox7), vbox9, TRUE, TRUE, 0);
+
+  /* All four radios share cont_radio_group; "Continuous" starts active. */
+  cont_radio = gtk_radio_button_new_with_mnemonic (NULL, "Continuous");
+  gtk_widget_set_name (cont_radio, "cont_radio");
+  gtk_widget_show (cont_radio);
+  gtk_box_pack_start (GTK_BOX (vbox9), cont_radio, FALSE, FALSE, 0);
+  gtk_radio_button_set_group (GTK_RADIO_BUTTON (cont_radio), cont_radio_group);
+  cont_radio_group = gtk_radio_button_get_group (GTK_RADIO_BUTTON (cont_radio));
+  gtk_toggle_button_set_active (GTK_TOGGLE_BUTTON (cont_radio), TRUE);
+
+  stop_before_radio = gtk_radio_button_new_with_mnemonic (NULL, "Stop before GC");
+  gtk_widget_set_name (stop_before_radio, "stop_before_radio");
+  gtk_widget_show (stop_before_radio);
+  gtk_box_pack_start (GTK_BOX (vbox9), stop_before_radio, FALSE, FALSE, 0);
+  gtk_radio_button_set_group (GTK_RADIO_BUTTON (stop_before_radio), cont_radio_group);
+  cont_radio_group = gtk_radio_button_get_group (GTK_RADIO_BUTTON (stop_before_radio));
+
+  stop_after_radio = gtk_radio_button_new_with_mnemonic (NULL, "Stop after GC");
+  gtk_widget_set_name (stop_after_radio, "stop_after_radio");
+  gtk_widget_show (stop_after_radio);
+  gtk_box_pack_start (GTK_BOX (vbox9), stop_after_radio, FALSE, FALSE, 0);
+  gtk_radio_button_set_group (GTK_RADIO_BUTTON (stop_after_radio), cont_radio_group);
+  cont_radio_group = gtk_radio_button_get_group (GTK_RADIO_BUTTON (stop_after_radio));
+
+  stop_both_radio = gtk_radio_button_new_with_mnemonic (NULL, "Stop before & after GC");
+  gtk_widget_set_name (stop_both_radio, "stop_both_radio");
+  gtk_widget_show (stop_both_radio);
+  gtk_box_pack_start (GTK_BOX (vbox9), stop_both_radio, FALSE, FALSE, 0);
+  gtk_radio_button_set_group (GTK_RADIO_BUTTON (stop_both_radio), cont_radio_group);
+  cont_radio_group = gtk_radio_button_get_group (GTK_RADIO_BUTTON (stop_both_radio));
+
+  vbox8 = gtk_vbox_new (FALSE, 0);
+  gtk_widget_set_name (vbox8, "vbox8");
+  gtk_widget_show (vbox8);
+  gtk_box_pack_start (GTK_BOX (vbox7), vbox8, FALSE, FALSE, 0);
+
+  stop_but = gtk_button_new_with_mnemonic ("Stop");
+  gtk_widget_set_name (stop_but, "stop_but");
+  gtk_widget_show (stop_but);
+  gtk_box_pack_start (GTK_BOX (vbox8), stop_but, FALSE, FALSE, 0);
+
+  continue_but = gtk_button_new_with_mnemonic ("Continue");
+  gtk_widget_set_name (continue_but, "continue_but");
+  gtk_widget_show (continue_but);
+  gtk_box_pack_start (GTK_BOX (vbox8), continue_but, FALSE, FALSE, 0);
+
+  label45 = gtk_label_new ("Updates");
+  gtk_widget_set_name (label45, "label45");
+  gtk_widget_show (label45);
+  gtk_frame_set_label_widget (GTK_FRAME (frame6), label45);
+
+  quit_but = gtk_button_new_with_mnemonic ("Quit");
+  gtk_widget_set_name (quit_but, "quit_but");
+  gtk_widget_show (quit_but);
+  gtk_box_pack_end (GTK_BOX (vbox5), quit_but, FALSE, FALSE, 0);
+
+  statusbar = gtk_statusbar_new ();
+  gtk_widget_set_name (statusbar, "statusbar");
+  gtk_widget_show (statusbar);
+  gtk_box_pack_start (GTK_BOX (vbox1), statusbar, FALSE, FALSE, 0);
+
+  /* Wire the radio group and buttons up to the VisCallbacks handlers. */
+  g_signal_connect ((gpointer) cont_radio, "clicked",
+                    G_CALLBACK (on_cont_radio_clicked),
+                    NULL);
+  g_signal_connect ((gpointer) stop_before_radio, "clicked",
+                    G_CALLBACK (on_stop_before_radio_clicked),
+                    NULL);
+  g_signal_connect ((gpointer) stop_after_radio, "clicked",
+                    G_CALLBACK (on_stop_after_radio_clicked),
+                    NULL);
+  g_signal_connect ((gpointer) stop_both_radio, "clicked",
+                    G_CALLBACK (on_stop_both_radio_clicked),
+                    NULL);
+  g_signal_connect ((gpointer) stop_but, "clicked",
+                    G_CALLBACK (on_stop_but_clicked),
+                    NULL);
+  g_signal_connect ((gpointer) continue_but, "clicked",
+                    G_CALLBACK (on_continue_but_clicked),
+                    NULL);
+  g_signal_connect ((gpointer) quit_but, "clicked",
+                    G_CALLBACK (on_quit_but_clicked),
+                    NULL);
+
+  /* Store pointers to all widgets, for use by lookup_widget(). */
+  GLADE_HOOKUP_OBJECT_NO_REF (GHC_Front_Panel, GHC_Front_Panel, "GHC_Front_Panel");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, vbox1, "vbox1");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, hbox1, "hbox1");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, vbox4, "vbox4");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, frame3, "frame3");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, hbox3, "hbox3");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label40, "label40");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, map_ruler, "map_ruler");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, memmap, "memmap");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label1, "label1");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, frame8, "frame8");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, vbox14, "vbox14");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, table4, "table4");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, gen_ruler, "gen_ruler");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, gen_hbox, "gen_hbox");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, generations, "generations");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label39, "label39");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label41, "label41");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, frame7, "frame7");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, table3, "table3");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, res_hruler, "res_hruler");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, res_vruler, "res_vruler");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, res_drawingarea, "res_drawingarea");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label37, "label37");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label38, "label38");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label42, "label42");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, vbox5, "vbox5");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, frame5, "frame5");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, vbox6, "vbox6");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, table1, "table1");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label12, "label12");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label13, "label13");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label14, "label14");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label15, "label15");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label16, "label16");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label17, "label17");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label18, "label18");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label19, "label19");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, live_label, "live_label");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, allocated_label, "allocated_label");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, footprint_label, "footprint_label");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, alloc_rate_label, "alloc_rate_label");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label43, "label43");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, frame9, "frame9");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, table5, "table5");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label20, "label20");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label21, "label21");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label22, "label22");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label24, "label24");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label26, "label26");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label25, "label25");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label27, "label27");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, running_label, "running_label");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, blockread_label, "blockread_label");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, blockwrite_label, "blockwrite_label");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, blockmvar_label, "blockmvar_label");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, blockthrowto_label, "blockthrowto_label");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, blockbh_label, "blockbh_label");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, sleeping_label, "sleeping_label");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, hseparator1, "hseparator1");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, hseparator2, "hseparator2");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label35, "label35");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, total_label, "total_label");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label44, "label44");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, frame6, "frame6");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, vbox7, "vbox7");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, vbox9, "vbox9");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, cont_radio, "cont_radio");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, stop_before_radio, "stop_before_radio");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, stop_after_radio, "stop_after_radio");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, stop_both_radio, "stop_both_radio");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, vbox8, "vbox8");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, stop_but, "stop_but");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, continue_but, "continue_but");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, label45, "label45");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, quit_but, "quit_but");
+  GLADE_HOOKUP_OBJECT (GHC_Front_Panel, statusbar, "statusbar");
+
+  return GHC_Front_Panel;
+}
+
diff --git a/rts/VisWindow.h b/rts/VisWindow.h
new file mode 100644
index 0000000000..c646c40c02
--- /dev/null
+++ b/rts/VisWindow.h
@@ -0,0 +1,5 @@
+/*
+ * DO NOT EDIT THIS FILE - it is generated by Glade.
+ */
+
+GtkWidget* create_GHC_Front_Panel (void);
diff --git a/rts/Weak.c b/rts/Weak.c
new file mode 100644
index 0000000000..f010395221
--- /dev/null
+++ b/rts/Weak.c
@@ -0,0 +1,97 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-1999
+ *
+ * Weak pointers / finalizers
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "PosixSource.h"
+#define COMPILING_RTS_MAIN
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "SchedAPI.h"
+#include "RtsFlags.h"
+#include "Weak.h"
+#include "Storage.h"
+#include "Schedule.h"
+#include "Prelude.h"
+#include "RtsAPI.h"
+
+StgWeak *weak_ptr_list;
+
+/*
+ * scheduleFinalizers() is called on the list of weak pointers found
+ * to be dead after a garbage collection. It overwrites each object
+ * with DEAD_WEAK, and creates a new thread to run the pending finalizers.
+ *
+ * This function is called just after GC. The weak pointers on the
+ * argument list are those whose keys were found to be not reachable,
+ * however the value and finalizer fields have by now been marked live.
+ * The weak pointer object itself may not be alive - i.e. we may be
+ * looking at either an object in from-space or one in to-space. It
+ * doesn't really matter either way.
+ *
+ * Pre-condition: sched_mutex _not_ held.
+ */
+
+void
+scheduleFinalizers(Capability *cap, StgWeak *list)
+{
+ StgWeak *w;
+ StgTSO *t;
+ StgMutArrPtrs *arr;
+ nat n;
+
+ // count number of finalizers, and kill all the weak pointers first...
+ n = 0;
+ for (w = list; w; w = w->link) {
+
+ // Better not be a DEAD_WEAK at this stage; the garbage
+ // collector removes DEAD_WEAKs from the weak pointer list.
+ ASSERT(w->header.info != &stg_DEAD_WEAK_info);
+
+ if (w->finalizer != &stg_NO_FINALIZER_closure) {
+ n++;
+ }
+
+#ifdef PROFILING
+ // A weak pointer is inherently used, so we do not need to call
+ // LDV_recordDead().
+ //
+ // Furthermore, when PROFILING is turned on, dead weak
+ // pointers are exactly as large as weak pointers, so there is
+ // no need to fill the slop, either. See stg_DEAD_WEAK_info
+ // in StgMiscClosures.hc.
+#endif
+ SET_HDR(w, &stg_DEAD_WEAK_info, w->header.prof.ccs);
+ }
+
+ // No finalizers to run?
+ if (n == 0) return;
+
+ IF_DEBUG(weak,debugBelch("weak: batching %d finalizers\n", n));
+
+ arr = (StgMutArrPtrs *)allocateLocal(cap, sizeofW(StgMutArrPtrs) + n);
+ TICK_ALLOC_PRIM(sizeofW(StgMutArrPtrs), n, 0);
+ SET_HDR(arr, &stg_MUT_ARR_PTRS_FROZEN_info, CCS_SYSTEM);
+ arr->ptrs = n;
+
+ n = 0;
+ for (w = list; w; w = w->link) {
+ if (w->finalizer != &stg_NO_FINALIZER_closure) {
+ arr->payload[n] = w->finalizer;
+ n++;
+ }
+ }
+
+ t = createIOThread(cap,
+ RtsFlags.GcFlags.initialStkSize,
+ rts_apply(cap,
+ rts_apply(cap,
+ (StgClosure *)runFinalizerBatch_closure,
+ rts_mkInt(cap,n)),
+ (StgClosure *)arr)
+ );
+ scheduleThread(cap,t);
+}
diff --git a/rts/Weak.h b/rts/Weak.h
new file mode 100644
index 0000000000..ba8c1ca913
--- /dev/null
+++ b/rts/Weak.h
@@ -0,0 +1,17 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Weak pointers / finalizers
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef WEAK_H
+#define WEAK_H
+
+#include "Capability.h"
+
+void scheduleFinalizers(Capability *cap, StgWeak *w);
+void markWeakList(void);
+
+#endif
diff --git a/rts/dotnet/Invoke.c b/rts/dotnet/Invoke.c
new file mode 100644
index 0000000000..585dcacaad
--- /dev/null
+++ b/rts/dotnet/Invoke.c
@@ -0,0 +1,1081 @@
+/*
+ * C callable bridge to the .NET object model
+ *
+ * Managed C++ is used to access the .NET object model via
+ * System.Reflection. Here we provide C callable functions
+ * to that functionality, which we then export via a COM
+ * component.
+ *
+ * Note: the _only_ reason why we're going via COM and not simply
+ * exposing the required via some DLL entry points, is that COM
+ * gives us location independence (i.e., the RTS doesn't need
+ * be told where this interop layer resides in order to hoik
+ * it in, the CLSID suffices (provided the component has been
+ * registered, of course.)) It is a bit tiresome to have to play
+ * by the .NET COM Interop's rules as regards argument arrays,
+ * so we may want to revisit this issue at some point.
+ *
+ * [ But why not simply use MC++ and provide C-callable entry
+ * points to the relevant functionality, and avoid COM interop
+ * altogether? Because we have to be able to (statically)
+ * link with gcc-compiled code, and linking MC++ and gcc-compiled
+ * object files doesn't work.]
+ *
+ * Note: you need something newer than gcc-2.95 to compile this
+ * code (I'm using gcc-3.2, which comes with mingw-2).
+ */
+#define _WIN32_DCOM
+#define COBJMACROS
+#include <stdio.h>
+#include <stdlib.h>
+#include <wtypes.h>
+#ifndef _MSC_VER
+#include <oaidl.h>
+#include <objbase.h>
+#include <oleauto.h>
+# if defined(COBJMACROS) && !defined(_MSC_VER)
+#define IErrorInfo_QueryInterface(T,r,O) (T)->lpVtbl->QueryInterface(T,r,O)
+#define IErrorInfo_AddRef(T) (T)->lpVtbl->AddRef(T)
+#define IErrorInfo_Release(T) (T)->lpVtbl->Release(T)
+#define IErrorInfo_GetSource(T,pbstr) (T)->lpVtbl->GetSource(T,pbstr)
+#define IErrorInfo_GetDescription(T,pbstr) (T)->lpVtbl->GetDescription(T,pbstr)
+
+#define ISupportErrorInfo_QueryInterface(T,r,O) (T)->lpVtbl->QueryInterface(T,r,O)
+#define ISupportErrorInfo_AddRef(T) (T)->lpVtbl->AddRef(T)
+#define ISupportErrorInfo_Release(T) (T)->lpVtbl->Release(T)
+#define ISupportErrorInfo_InterfaceSupportsErrorInfo(T,iid) (T)->lpVtbl->InterfaceSupportsErrorInfo(T,iid)
+# endif
+#endif
+#include "DNInvoke.h"
+#define WANT_UUID_DECLS
+#include "InvokerClient.h"
+#include "Dotnet.h"
+
+/* Local prototypes */
+static void genError( IUnknown* pUnk,
+ HRESULT hr,
+ char* loc,
+ char** pErrMsg);
+static int startBridge(char**);
+static int fromVariant
+ ( DotnetType resTy,
+ VARIANT* pVar,
+ void* res,
+ char** pErrMsg);
+static VARIANT* toVariant ( DotnetArg* p );
+
+/* Pointer to .NET COM component instance; instantiated on demand. */
+static InvokeBridge* pBridge = NULL;
+
+/* convert a char* to a BSTR, copied from the HDirect comlib/ sources */
+static
+HRESULT
+stringToBSTR( /*[in,ptr]*/const char* pstrz
+ , /*[out]*/ BSTR* pbstr
+ )
+{
+ int i;
+
+ if (!pbstr) {
+ return E_FAIL;
+ } else {
+ *pbstr = NULL;
+ }
+ if (!pstrz) {
+ return S_OK;
+ }
+
+ i = MultiByteToWideChar(CP_ACP, 0, pstrz, -1, NULL, 0);
+ if ( i < 0 ) {
+ return E_FAIL;
+ }
+ *pbstr = SysAllocStringLen(NULL,i-1);
+ if (*pbstr != NULL) {
+ MultiByteToWideChar(CP_ACP, 0, pstrz, -1, *pbstr, i-1);
+ // (*pbstr)[i]=0;
+ return S_OK;
+ } else {
+ return E_FAIL;
+ }
+}
+
+static
+char*
+bstrToString( BSTR bstr )
+{
+ int i,len;
+ char *res;
+ int blen;
+
+ if (!bstr) {
+ return NULL;
+ }
+
+ blen = SysStringLen(bstr);
+
+ /* pass in NULL for the multi-byte arg in order to compute length first */
+ len = WideCharToMultiByte(CP_ACP, 0, bstr, blen,
+ NULL, 0, NULL, NULL);
+ if (len == 0) return NULL;
+
+ /* Allocate string of required length. */
+ res = (char*)malloc(sizeof(char) * (len + 1));
+ if (!res) return NULL;
+
+ i = WideCharToMultiByte(CP_ACP, 0, bstr, blen,
+ res, (len+1), NULL, NULL);
+
+ /* Poor error handling to map this to NULL. */
+ if ( i == 0 ) return NULL;
+
+ /* Terminate and return */
+ res[i] = '\0';
+ return res;
+}
+
+static
+void
+freeArgs ( SAFEARRAY* psa )
+{
+ /* The argument SAFEARRAYs contain dynamically allocated
+ * VARIANTs. Release the VARIANT contents and its memory here.
+ */
+ long lb,ub;
+ int i;
+ HRESULT hr;
+ VARIANT *pv = NULL;
+
+ hr = SafeArrayGetLBound(psa, 1, &lb);
+ if (FAILED(hr)) {
+ fprintf(stderr, "freeArgs: failed fetching lower bound\n");
+ SafeArrayDestroy(psa);
+ return;
+ }
+ hr = SafeArrayGetUBound(psa, 1, &ub);
+ if (FAILED(hr)) {
+ fprintf(stderr, "freeArgs: failed fetching upper bound\n");
+ SafeArrayDestroy(psa);
+ return;
+ }
+ for ( i = 0; i < (ub - lb); i++ ) {
+ hr = SafeArrayGetElement(psa,(long*)&i,(void*)pv);
+ if (FAILED(hr)) {
+ fprintf(stderr, "freeArgs: unable to fetch element %d\n", i);
+ SafeArrayDestroy(psa);
+ return;
+ }
+ VariantClear(pv);
+ free(pv);
+ }
+ SafeArrayDestroy(psa);
+}
+
+static
+SAFEARRAY*
+marshalArgs ( DotnetArg* args,
+ unsigned int n_args )
+{
+ SAFEARRAY *psa;
+ SAFEARRAYBOUND rgsabound[1];
+ int i;
+ long idxArr[1];
+ HRESULT hr;
+ VARIANT* var;
+
+ rgsabound[0].lLbound = 0;
+ rgsabound[0].cElements = n_args;
+ psa = SafeArrayCreate(VT_VARIANT, 1, rgsabound);
+
+ for(i=0;i < n_args; i++) {
+ idxArr[0] = i;
+ var = toVariant(&args[i]);
+ hr = SafeArrayPutElement(psa, idxArr, (void*)var);
+ }
+ return psa;
+}
+
+/*
+ * ***** Accessing the .NET object model *****
+ *
+ * General remarks:
+ *
+ * - the functions report error conditions via their return value; a char*.
+ * If NULL, the call was successful. If not, the returned string
+ * contains the (dynamically allocated) error message.
+ *
+ * This unorthodox calling convention is used to simplify the task
+ * of interfacing to these funs from GHC-generated code.
+ */
+
+/*
+ * Function: DN_invokeStatic()
+ *
+ * Given assembly and fully-qualified name of a static .NET method,
+ * invoke it using the supplied arguments.
+ *
+ * Returns NULL on success, pointer to error message if an error.
+ *
+ */
+char*
+DN_invokeStatic ( char *assemName,
+ char *methName,
+ DotnetArg *args,
+ int n_args,
+ DotnetType resultTy,
+ void *res)
+{
+ SAFEARRAY* psa;
+ VARIANT result;
+ HRESULT hr;
+ BSTR b_assemName;
+ BSTR b_methName;
+ char* errMsg = NULL;
+
+ if (!pBridge && !startBridge(&errMsg)) {
+ return errMsg;
+ }
+
+ /* Package up arguments */
+ psa = marshalArgs(args, n_args);
+ VariantInit(&result);
+
+ hr = stringToBSTR(assemName, &b_assemName);
+ hr = stringToBSTR(methName, &b_methName);
+
+ hr = InvokeBridge_InvokeStaticMethod(pBridge,
+ b_assemName,
+ b_methName,
+ psa,
+ &result);
+ SysFreeString(b_assemName);
+ SysFreeString(b_methName);
+ if (FAILED(hr)) {
+ genError((IUnknown*)pBridge, hr, "DInvoke.invokeStatic", &errMsg);
+ return errMsg;
+ }
+
+ fromVariant(resultTy, &result, res, &errMsg);
+ freeArgs(psa);
+
+ return errMsg;
+}
+
+/*
+ * Function: DN_invokeMethod()
+ *
+ * Given method name and arguments, invoke .NET method on an object.
+ * The object ref / this-pointer is passed in as the last argument.
+ *
+ * Returns NULL on success, pointer to error message if an error.
+ *
+ */
+char*
+DN_invokeMethod ( char *clsAndMethName,
+ DotnetArg *args,
+ int n_args,
+ DotnetType resultTy,
+ void *res)
+{
+ SAFEARRAY* psa;
+ VARIANT result;
+ HRESULT hr;
+ char* methName;
+ BSTR b_methName;
+ char* errMsg = NULL;
+ VARIANT *thisPtr;
+
+ if (!pBridge && !startBridge(&errMsg)) {
+ return errMsg;
+ }
+
+ if (n_args <= 0) {
+ genError(NULL, 0x0, "Invoke.invokeMethod - missing this pointer", &errMsg);
+ return errMsg;
+ }
+
+ /* The this-pointer is last */
+ thisPtr = toVariant(&args[n_args-1]);
+
+ /* Package up arguments */
+ psa = marshalArgs(args, n_args-1);
+ VariantInit(&result);
+
+ /* If the user has qualified method with class, ignore the class bit. */
+ if ( (methName = strrchr(clsAndMethName, '.')) == NULL) {
+ methName = clsAndMethName;
+ } else {
+ /* Skip past '.' */
+ methName++;
+ }
+
+ hr = stringToBSTR(methName, &b_methName);
+ hr = InvokeBridge_InvokeMethod(pBridge,
+ *thisPtr,
+ b_methName,
+ psa,
+ &result);
+ SysFreeString(b_methName);
+ if (FAILED(hr)) {
+ genError((IUnknown*)pBridge, hr, "Invoke.invokeMethod", &errMsg);
+ return errMsg;
+ }
+
+ fromVariant(resultTy, &result, res, &errMsg);
+ freeArgs(psa);
+
+ return errMsg;
+}
+
+/*
+ * Function: DN_getField()
+ *
+ * Given a field name and an object pointer, read a field value.
+ * The object ref / this-pointer is passed in as the last argument.
+ *
+ * Returns NULL on success, pointer to error message if an error.
+ *
+ */
+char*
+DN_getField ( char *clsAndMethName,
+ DotnetArg *args,
+ int n_args,
+ DotnetType resultTy,
+ void *res)
+{
+ VARIANT result;
+ HRESULT hr;
+ char* methName;
+ BSTR b_methName;
+ char* errMsg = NULL;
+ VARIANT *thisPtr;
+
+ if (!pBridge && !startBridge(&errMsg)) {
+ return errMsg;
+ }
+
+ if (n_args <= 0) {
+ genError(NULL, 0x0, "Invoke.getField - missing this pointer", &errMsg);
+ return errMsg;
+ }
+
+ /* The this-pointer is last */
+ thisPtr = toVariant(&args[n_args-1]);
+ VariantInit(&result);
+
+ /* If the user has qualified method with class, ignore the class bit. */
+ if ( (methName = strrchr(clsAndMethName, '.')) == NULL) {
+ methName = clsAndMethName;
+ } else {
+ /* Skip past '.' */
+ methName++;
+ }
+
+ hr = stringToBSTR(methName, &b_methName);
+ hr = InvokeBridge_GetField(pBridge,
+ *thisPtr,
+ b_methName,
+ &result);
+ SysFreeString(b_methName);
+ if (FAILED(hr)) {
+ genError((IUnknown*)pBridge, hr, "Invoke.getField", &errMsg);
+ return errMsg;
+ }
+
+ fromVariant(resultTy, &result, res, &errMsg);
+ return errMsg;
+}
+
+/*
+ * Function: DN_setField()
+ *
+ * Given field name, a value and an object reference, set the field value of
+ * an object.
+ * The object ref / this-pointer is passed in as the last argument.
+ *
+ * Returns NULL on success, pointer to error message if an error.
+ *
+ */
+char*
+DN_setField ( char *clsAndMethName,
+ DotnetArg *args,
+ int n_args,
+ /* next two args are ignored */
+ DotnetType resultTy,
+ void *res)
+{
+ HRESULT hr;
+ char* methName;
+ BSTR b_methName;
+ char* errMsg = NULL;
+ VARIANT *thisPtr;
+ VARIANT *pVal;
+
+ if (!pBridge && !startBridge(&errMsg)) {
+ return errMsg;
+ }
+
+ if (n_args != 2) {
+ genError(NULL, 0x0, "Invoke.setField - missing this pointer", &errMsg);
+ return errMsg;
+ }
+
+ /* The this-pointer is last */
+ thisPtr = toVariant(&args[1]);
+
+ /* Package up arguments */
+ pVal = toVariant(&args[0]);
+
+ /* If the user has qualified method with class, ignore the class bit. */
+ if ( (methName = strrchr(clsAndMethName, '.')) == NULL) {
+ methName = clsAndMethName;
+ } else {
+ /* Skip past '.' */
+ methName++;
+ }
+
+ hr = stringToBSTR(methName, &b_methName);
+ hr = InvokeBridge_SetField(pBridge,
+ *thisPtr,
+ b_methName,
+ *pVal);
+ SysFreeString(b_methName);
+ VariantClear(pVal);
+ free(pVal);
+ free(thisPtr);
+
+ if (FAILED(hr)) {
+ genError((IUnknown*)pBridge, hr, "Invoke.setField", &errMsg);
+ return errMsg;
+ }
+ return errMsg;
+}
+
+
+/*
+ * Function: DN_createObject()
+ *
+ * Given assembly and fully-qualified name of a type,
+ * invoke its (possibly parameterised) constructor.
+ *
+ * Returns NULL on success, pointer to error message if an error.
+ *
+ */
+char*
+DN_createObject ( char *assemName,
+ char *methName,
+ DotnetArg *args,
+ int n_args,
+ DotnetType resultTy,
+ void *res)
+{
+ SAFEARRAY* psa;
+ VARIANT result;
+ HRESULT hr;
+ BSTR b_assemName;
+ BSTR b_methName;
+ char* errMsg = NULL;
+
+ if (!pBridge && !startBridge(&errMsg)) {
+ return errMsg;
+ }
+
+ /* Package up arguments */
+ psa = marshalArgs(args, n_args);
+ VariantInit(&result);
+
+ hr = stringToBSTR(assemName, &b_assemName);
+ hr = stringToBSTR(methName, &b_methName);
+
+ hr = InvokeBridge_CreateObject(pBridge,
+ b_assemName,
+ b_methName,
+ psa,
+ &result);
+ SysFreeString(b_assemName);
+ SysFreeString(b_methName);
+ if (FAILED(hr)) {
+ genError((IUnknown*)pBridge, hr, "DN_createObject", &errMsg);
+ return errMsg;
+ }
+
+ fromVariant(resultTy, &result, res, &errMsg);
+ freeArgs(psa);
+
+ return errMsg;
+}
+
+/*
+ * Function: DN_getStatic()
+ *
+ * Given assembly and fully-qualified field name, fetch value of static
+ * field.
+ *
+ * Returns NULL on success, pointer to error message if an error.
+ *
+ */
+char*
+DN_getStatic ( char *assemName,
+ char *fieldClsName,
+ /* the next two args are ignored */
+ DotnetArg *args,
+ int n_args,
+ DotnetType resultTy,
+ void *res)
+{
+ VARIANT result;
+ HRESULT hr;
+ BSTR b_assemName;
+ BSTR b_clsName;
+ BSTR b_fieldName;
+ char* errMsg = NULL;
+ char* fieldName;
+ char* clsName = fieldName;
+
+ if (!pBridge && !startBridge(&errMsg)) {
+ return errMsg;
+ }
+
+ fieldName = (char*)malloc(sizeof(char) * (strlen(fieldClsName) + 1));
+ strcpy(fieldName, fieldClsName);
+ clsName = fieldName;
+
+ if (( fieldName = strrchr(fieldName, '.')) == NULL ) {
+ genError((IUnknown*)pBridge, 0x0, "Invoke.getStatic - malformed field spec", &errMsg);
+ return errMsg;
+ }
+ *fieldName = '\0';
+ fieldName++;
+
+ VariantInit(&result);
+
+ hr = stringToBSTR(assemName, &b_assemName);
+ hr = stringToBSTR(fieldName, &b_fieldName);
+ hr = stringToBSTR(clsName, &b_clsName);
+ /* ToDo: honour assembly spec */
+ hr = InvokeBridge_GetStaticField(pBridge,
+ b_clsName,
+ b_fieldName,
+ &result);
+ SysFreeString(b_assemName);
+ SysFreeString(b_clsName);
+ SysFreeString(b_fieldName);
+ if (FAILED(hr)) {
+ genError((IUnknown*)pBridge, hr, "Invoke.getStatic", &errMsg);
+ return errMsg;
+ }
+ fromVariant(resultTy, &result, res, &errMsg);
+
+ return errMsg;
+}
+
+/*
+ * Function: DN_setStatic()
+ *
+ * Given assembly and fully-qualified field name, set value of static
+ * field.
+ *
+ * Returns NULL on success, pointer to error message if an error.
+ *
+ */
+char*
+DN_setStatic ( char *assemName,
+ char *fieldClsName,
+ DotnetArg *args,
+ int n_args,
+ /* the next two args are ignored */
+ DotnetType resultTy,
+ void *res)
+{
+ VARIANT result;
+ VARIANT *pVal;
+ HRESULT hr;
+ BSTR b_assemName;
+ BSTR b_clsName;
+ BSTR b_fieldName;
+ char* errMsg = NULL;
+ char* fieldName;
+ char* clsName = fieldName;
+
+ if (!pBridge && !startBridge(&errMsg)) {
+ return errMsg;
+ }
+
+ fieldName = (char*)malloc(sizeof(char) * (strlen(fieldClsName) + 1));
+ strcpy(fieldName, fieldClsName);
+ clsName = fieldName;
+
+ if (( fieldName = strrchr(fieldName, '.')) == NULL ) {
+ genError((IUnknown*)pBridge, 0x0, "Invoke.setStatic - malformed field spec", &errMsg);
+ return errMsg;
+ }
+ *fieldName = '\0';
+ fieldName++;
+
+ pVal = toVariant(&args[0]);
+ VariantInit(&result);
+
+ hr = stringToBSTR(assemName, &b_assemName);
+ hr = stringToBSTR(fieldName, &b_fieldName);
+ hr = stringToBSTR(clsName, &b_clsName);
+ /* ToDo: honour assembly spec */
+ hr = InvokeBridge_SetStaticField(pBridge,
+ b_clsName,
+ b_fieldName,
+ *pVal);
+ SysFreeString(b_assemName);
+ SysFreeString(b_clsName);
+ SysFreeString(b_fieldName);
+ VariantClear(pVal);
+ free(pVal);
+ if (FAILED(hr)) {
+ genError((IUnknown*)pBridge, hr, "Invoke.setStatic", &errMsg);
+ return errMsg;
+ }
+ fromVariant(resultTy, &result, res, &errMsg);
+
+ return errMsg;
+}
+
+
+
+
+/*
+ * Function: startBridge(pErrMsg)
+ *
+ * Instantiates an InvokeBridge component, which is then
+ * used to interact with the .NET world.
+ *
+ * If the component isn't available locally, zero is returned.
+ * Otherwise, 1.
+ */
+static
+int
+startBridge(char** pErrMsg)
+{
+ HRESULT hr;
+ IUnknown *pUnk;
+
+ hr = CoInitializeEx(NULL, COINIT_APARTMENTTHREADED);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.createBridge.CoInitializeEx", pErrMsg);
+ return FALSE;
+ }
+
+ hr = CoCreateInstance( &CLSID_InvokeBridge,
+ NULL,
+ CLSCTX_INPROC_SERVER,
+ &IID_IUnknown,
+ (void**)&pUnk);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.createBridge.CoCreateInstance", pErrMsg);
+ return 0;
+ }
+
+ hr = IUnknown_QueryInterface(pUnk, &IID_InvokeBridge, (void**)&pBridge);
+ IUnknown_Release(pUnk);
+ if (FAILED(hr)) {
+ genError(pUnk, hr, "DInvoke.createBridge.QueryInterface.InvokeBridge", pErrMsg);
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Function: stopBridge()
+ *
+ * Releases the InvokeBridge object and closes the COM library.
+ *
+ */
+void
+stopDotnetBridge()
+{
+ if (pBridge) {
+ InvokeBridge_Release(pBridge);
+ pBridge = NULL;
+ CoUninitialize();
+ }
+ /* Match up the call to CoInitializeEx() in startBridge(). */
+}
+
+/*
+ * Function: genError()
+ *
+ * Construct a string describing an error condition given
+ * an HRESULT and a location.
+ *
+ * If an interface pointer is passed in via the first arg,
+ * attempts are made to get at richer error information through
+ * the IErrorInfo interface. (Note: we don't currently look for
+ * the _Exception interface for even more detailed info.)
+ *
+ */
+#define LOCATION_HDR "Location: "
+#define HRESULT_HDR "HRESULT: "
+#define SOURCE_HDR "Source: "
+#define DESCR_HDR "Description: "
+#define NEWLINE_EXTRA 3
+
+static
+void
+genError(IUnknown* pUnk,
+ HRESULT err,
+ char* loc,
+ char** pErrMsg)
+{
+ HRESULT hr;
+ HRESULT invoke_hr = err;
+ char* invoke_src = NULL;
+ char* invoke_descr = NULL;
+ char* buf;
+ int bufLen;
+
+ /* If an interface pointer has been supplied, look for
+ * IErrorInfo in order to get more detailed information
+ * on the failure.
+ *
+ * The CLR's .NET COM Interop implementation does provide
+ * IErrorInfo, so we're not really clutching at straws here..
+ *
+ * Note: CLR also reflects .NET exceptions via the _Exception*
+ * interface..
+ *
+ */
+ if (pUnk) {
+ ISupportErrorInfo *pSupp;
+ IErrorInfo *pErrInfo;
+ BSTR src = NULL;
+ BSTR descr = NULL;
+
+ hr = IUnknown_QueryInterface(pUnk,
+ &IID_ISupportErrorInfo,
+ (void**)&pSupp);
+ if ( SUCCEEDED(hr) ) {
+ hr = ISupportErrorInfo_InterfaceSupportsErrorInfo(pSupp,
+ &IID_InvokeBridge);
+ if ( SUCCEEDED(hr) ) {
+ hr = GetErrorInfo(0,&pErrInfo);
+ if ( SUCCEEDED(hr) ) {
+ IErrorInfo_GetSource(pErrInfo,&src);
+ IErrorInfo_GetDescription(pErrInfo,&descr);
+ invoke_src = bstrToString(src);
+ invoke_descr = bstrToString(descr);
+
+ IErrorInfo_Release(pErrInfo);
+ if (src) { SysFreeString(src); src = NULL; }
+ if (descr) { SysFreeString(descr); descr = NULL; }
+ }
+ ISupportErrorInfo_Release(pSupp);
+ }
+ }
+ }
+ /* Putting it all together.. */
+ bufLen = sizeof(LOCATION_HDR) + strlen(loc) + NEWLINE_EXTRA +
+ sizeof(HRESULT_HDR) + 16 + NEWLINE_EXTRA +
+ sizeof(SOURCE_HDR) + (invoke_src ? strlen(invoke_src) : 16) + NEWLINE_EXTRA +
+ sizeof(DESCR_HDR) + (invoke_descr ? strlen(invoke_descr) : 16) + NEWLINE_EXTRA;
+ buf = (char*) malloc(sizeof(char) * (bufLen + 1));
+ if (!buf) {
+ fprintf(stderr, "Unable to allocate %d for error message", (bufLen + 1));
+ *pErrMsg = NULL;
+ return;
+ }
+
+ _snprintf(buf, bufLen, "%s%s\n%s0x%08x\n%s%s\n%s%s",
+ LOCATION_HDR, loc,
+ HRESULT_HDR, invoke_hr,
+ SOURCE_HDR, invoke_src,
+ DESCR_HDR, invoke_descr);
+
+ /* Done with these chaps */
+ if (invoke_src) free(invoke_src);
+ if (invoke_descr) free(invoke_descr);
+
+ if (pErrMsg) *pErrMsg = buf;
+ fprintf(stderr, "**InvokeBridge Error:\n%s", buf); fflush(stderr);
+}
+
+/* Converting to/from VARIANTs */
+
+/*
+ * Function: fromVariant()
+ *
+ * Unmarshal the contents of a VARIANT, converting its embedded value
+ * into the desired DotnetType (if possible.)
+ *
+ * Returns 1 if successful, 0 otherwise. If the conversion fails,
+ * *pErrMsg holds the error message string.
+ */
+static
+int
+fromVariant (DotnetType resTy,
+ VARIANT* pVar,
+ void* res,
+ char** pErrMsg)
+{
+ VARIANT vNew;
+ HRESULT hr;
+
+ VariantInit(&vNew);
+ switch(resTy) {
+ case Dotnet_Byte:
+ case Dotnet_Char:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_UI1);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_UI1}", pErrMsg);
+ return FALSE;
+ }
+ *((unsigned char*)res) = vNew.bVal;
+ return 1;
+ case Dotnet_Boolean:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_BOOL);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_BOOL}", pErrMsg);
+ return 0;
+ }
+ *((unsigned char*)res) = vNew.bVal;
+ return 1;
+ case Dotnet_Int:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_INT);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_INT}", pErrMsg);
+ return 0;
+ }
+ *((int*)res) = vNew.intVal;
+ return 1;
+ case Dotnet_Int8:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_I1);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_I1}", pErrMsg);
+ return 0;
+ }
+ *((signed char*)res) = vNew.bVal;
+ return 1;
+ case Dotnet_Int16:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_I2);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_I2}", pErrMsg);
+ return 0;
+ }
+ *((signed short*)res) = vNew.iVal;
+ return 1;
+ case Dotnet_Int32:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_I4);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_I4}", pErrMsg);
+ return 0;
+ }
+ *((signed int*)res) = vNew.lVal;
+ return 1;
+ case Dotnet_Int64:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_I8);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_I8}", pErrMsg);
+ return 0;
+ }
+#ifdef _MSC_VER
+ *((__int64*)res) = vNew.llVal;
+#else
+ *((long long*)res) = vNew.lVal;
+#endif
+ return 1;
+ case Dotnet_Float:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_R4);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_R4}", pErrMsg);
+ return 0;
+ }
+ *((float*)res) = vNew.fltVal;
+ return 1;
+ case Dotnet_Double:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_R8);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_R4}", pErrMsg);
+ return 0;
+ }
+ *((double*)res) = vNew.dblVal;
+ return 1;
+ case Dotnet_Word8:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_UI1);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_UI1}", pErrMsg);
+ return 0;
+ }
+ *((unsigned char*)res) = vNew.bVal;
+ return 1;
+ case Dotnet_Word16:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_UI2);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_UI2}", pErrMsg);
+ return 0;
+ }
+ *((unsigned short*)res) = vNew.uiVal;
+ return 1;
+ case Dotnet_Word32:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_UI4);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_UI4}", pErrMsg);
+ return 0;
+ }
+ *((unsigned int*)res) = vNew.ulVal;
+ return 1;
+ case Dotnet_Word64:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_UI8);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_UI8}", pErrMsg);
+ return 0;
+ }
+#ifdef _MSC_VER
+ *((unsigned __int64*)res) = vNew.ullVal;
+#else
+ *((unsigned long long*)res) = vNew.lVal;
+#endif
+ return 1;
+ case Dotnet_Ptr:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_BYREF);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_BYREF}", pErrMsg);
+ return 0;
+ }
+ *((void**)res) = vNew.byref;
+ return 1;
+ case Dotnet_Unit:
+ return 0;
+ case Dotnet_Object:
+ if ( pVar->vt == VT_BSTR ) {
+ /* Special handling for strings. If the user has asked for
+ * the string in object form, give him/her that.
+ */
+ VARIANT res;
+
+ VariantInit(&res);
+ hr = InvokeBridge_NewString(pBridge,
+ pVar->bstrVal,
+ &res);
+ if (SUCCEEDED(hr)) {
+ pVar = &res;
+ }
+ }
+ hr = VariantChangeType (&vNew, pVar, 0, VT_UNKNOWN);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_UNKNOWN}", pErrMsg);
+ return 0;
+ }
+ *((IUnknown**)res) = vNew.punkVal;
+ return 1;
+ case Dotnet_String:
+ hr = VariantChangeType (&vNew, pVar, 0, VT_BSTR);
+ if (FAILED(hr)) {
+ genError(NULL, hr, "DInvoke.fromVariant{VT_BSTR}", pErrMsg);
+ return 0;
+ }
+ /* Storage is allocated by malloc(), caller is resp for freeing. */
+ *((char**)res) = bstrToString(vNew.bstrVal);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Function: toVariant()
+ *
+ * Convert a DotnetArg into a VARIANT. The VARIANT
+ * is dynamically allocated.
+ *
+ * The result is the pointer to the filled-in VARIANT structure;
+ * NULL if allocation failed.
+ *
+ */
+static
+VARIANT*
+toVariant ( DotnetArg* p )
+{
+ VARIANT* v = (VARIANT*)malloc(sizeof(VARIANT));
+ if (!v) return NULL;
+ VariantInit(v);
+ switch (p->arg_type) {
+ case Dotnet_Byte:
+ v->vt = VT_UI1;
+ v->bVal = p->arg.arg_byte;
+ break;
+ case Dotnet_Char:
+ v->vt = VT_UI1;
+ v->bVal = p->arg.arg_char;
+ break;
+ case Dotnet_Boolean:
+ v->vt = VT_BOOL;
+ v->boolVal = p->arg.arg_bool;
+ break;
+ case Dotnet_Int:
+ v->vt = VT_INT;
+ v->intVal = p->arg.arg_int;
+ break;
+ case Dotnet_Int8:
+ v->vt = VT_I1;
+ v->bVal = p->arg.arg_int8;
+ break;
+ case Dotnet_Int16:
+ v->vt = VT_I2;
+ v->iVal = p->arg.arg_int16;
+ break;
+ case Dotnet_Int32:
+ v->vt = VT_I4;
+ v->lVal = p->arg.arg_int32;
+ break;
+ case Dotnet_Int64:
+ v->vt = VT_I8;
+#ifdef _MSC_VER
+ v->llVal = p->arg.arg_int64;
+#else
+ (long long*)(v->lVal) = p->arg.arg_int64;
+#endif
+ break;
+ case Dotnet_Float:
+ v->vt = VT_R4;
+ v->fltVal = p->arg.arg_float;
+ break;
+ case Dotnet_Double:
+ v->vt = VT_R8;
+ v->dblVal = p->arg.arg_double;
+ break;
+ case Dotnet_Word8:
+ v->vt = VT_UI1;
+ v->bVal = p->arg.arg_word8;
+ break;
+ case Dotnet_Word16:
+ v->vt = VT_UI2;
+ v->uiVal = p->arg.arg_word16;
+ break;
+ case Dotnet_Word32:
+ v->vt = VT_UI4;
+ v->ulVal = p->arg.arg_word32;
+ break;
+ case Dotnet_Word64:
+ v->vt = VT_UI8;
+#ifdef _MSC_VER
+ v->ullVal = p->arg.arg_word64;
+#else
+ (unsigned long long*)(v->lVal) = p->arg.arg_word64;
+#endif
+ break;
+ case Dotnet_Ptr:
+ v->vt = VT_BYREF;
+ v->byref = p->arg.arg_ptr;
+ break;
+ case Dotnet_Unit:
+ v->vt = VT_EMPTY;
+ break;
+ case Dotnet_Object:
+ v->vt = VT_UNKNOWN;
+ v->punkVal = (IUnknown*)p->arg.arg_obj;
+ break;
+ case Dotnet_String: {
+ BSTR b;
+ HRESULT hr;
+ v->vt = VT_BSTR;
+ hr = stringToBSTR((const char*)p->arg.arg_str,&b);
+ v->bstrVal = b;
+ break; }
+ }
+ return v;
+}
diff --git a/rts/dotnet/Invoker.cpp b/rts/dotnet/Invoker.cpp
new file mode 100644
index 0000000000..d8ad87212d
--- /dev/null
+++ b/rts/dotnet/Invoker.cpp
@@ -0,0 +1,338 @@
+//
+// (c) 2002-2003, sof.
+//
+// Dynamic invocation helper classes. The details of how
+// to access the .NET object model via the Reflection API
+// is taken care of by Invoker.{h,cpp}
+//
+#include "Invoker.h"
+
+namespace DynInvoke {
+
+static TypeName* ParseType(String* str) {
+ int curPos = 0;
+ int endPos;
+
+ // Console::WriteLine("x{0}y", str);
+ TypeName* typeName = new TypeName();
+
+ if ( str->get_Chars(0) == '[' ) {
+ endPos = str->IndexOf(']');
+ curPos = endPos + 1;
+ typeName->m_assembly = str->Substring(1,endPos-1);
+ typeName->m_length = endPos+1;
+ }
+ String* delimStr = " ,()";
+ Char delims __gc [] = delimStr->ToCharArray();
+
+ endPos = str->IndexOfAny(delims,curPos);
+ // Console::WriteLine("{0} {1} x{2}x", __box(endPos), __box(curPos), str);
+ if ( endPos == -1 ) {
+ typeName->m_class = str->Substring(curPos);
+ } else {
+ typeName->m_class = str->Substring(curPos,endPos-curPos);
+ }
+
+ // typeName->m_class = str->Substring(curPos,endPos-curPos);
+ typeName->m_length += endPos-curPos;
+
+ return typeName;
+}
+
+// Method: GetType(String* typeName);
+//
+// Purpose: Assembly-savvy version of Type::GetType()
+//
+Type* InvokeBridge::GetType(String* typeName) {
+
+ try {
+ Type* t = Type::GetType(typeName);
+ if (t) return t;
+ } catch (Exception*) {
+ ;
+ }
+
+ for (int i=0;i < InvokeBridge::m_assemblies->Count; i++) {
+ try {
+ String* stuff = String::Format("{0},{1}",typeName,InvokeBridge::m_assemblies->get_Item(i)->ToString());
+ // Console::WriteLine(stuff);
+ Type* t = Type::GetType(stuff);
+ if (t) {
+ return t;
+ }
+ } catch (Exception*) {
+ continue;
+ }
+ }
+ return 0;
+}
+
+//
+// Method: CreateInstance(String* typeName, Object* [])
+//
+// Purpose: Assembly-savvy invocation of Activator::CreateInstance
+Object* InvokeBridge::CreateInstance(TypeName* typeName,
+ Object* args[]) {
+
+ Object* instance = 0;
+ Type* t = InvokeBridge::GetType(typeName->toStdString());
+
+ // Console::WriteLine("x{0} y{1}", typeName->toStdString(), t);
+ if (!t) {
+ try {
+ Assembly* localA = Assembly::LoadFrom(typeName->m_assembly);
+ t = localA->GetType(typeName->m_class);
+ } catch (Exception* e) {
+ ;
+ }
+ }
+
+ if (!t) {
+ try {
+ AppDomain* currentDomain = AppDomain::CurrentDomain;
+
+ // Assembly* stuff[] = currentDomain->GetAssemblies();
+ // for (int i=0;i < stuff.Length; i++) {
+ // Console::WriteLine("x{0} y{1}", stuff[i]->ToString(), stuff[i]->FullName);
+ // }
+ // Console::WriteLine("x{0} y{1}", typeName->toStdString(), t);
+ Assembly* localA = Assembly::LoadWithPartialName("HugsAssembly");
+ t = localA->GetType(typeName->m_class);
+ // Console::WriteLine("x{0} y{1}", typeName->toStdString(), t);
+ } catch (Exception*) {
+ ;
+ }
+ }
+
+  if (!t) return 0; /* type not found in any assembly: fail explicitly */
+  try {
+    Object* o = Activator::CreateInstance(t,(Object* [])args);
+    return o;
+  } catch (Exception* e) {
+    Console::WriteLine("Failure: {0}", e);
+    return 0;
+  }
+  return 0; /* not reached; ensures every control path returns a value */
+}
+
+//
+// Method: CreateObject(String* objSpec, Object* args[])
+//
+// Purpose: Given a fully qualified name of a class/type, try
+// to create an instance of it.
+//
+Object* InvokeBridge::CreateObject(String* assemName,
+ String* objSpec,
+ Object* args[]) {
+
+ Object* instance = 0;
+
+ // Unravel the name of the class/type.
+ TypeName* typeName = ParseType(objSpec);
+
+ if (assemName != 0 && assemName->Length > 0) {
+ typeName->m_assembly = assemName;
+ }
+
+ // Try creating the instance..
+ try {
+ instance = InvokeBridge::CreateInstance(typeName,(Object* [])args);
+ } catch (Exception* e) {
+ Console::WriteLine("Unable to create instance \"{0}\" {1}", objSpec, e);
+ throw(e);
+ }
+ if (!instance) {
+ Console::WriteLine("Unable to create instance \"{0}\"", objSpec);
+ }
+ return instance;
+}
+
+//
+// Method: InvokeMethod
+//
+// Purpose: Given a pointer to an already created object, look up
+// one of its method. If found, invoke the method passing it
+// 'args' as arguments.
+//
+Object*
+InvokeBridge::InvokeMethod(Object* obj,
+ String* methName,
+ Object* args[]) {
+ // Get the methods from the type
+ MethodInfo* methods __gc[] = obj->GetType()->GetMethods();
+ MethodInfo* mInfo;
+
+ if (!methods) {
+ Console::WriteLine("InvokeMethod: No matching types found");
+ return 0;
+ }
+
+ System::Reflection::BindingFlags flgs
+ = (System::Reflection::BindingFlags) // why do I need to cast?
+ (System::Reflection::BindingFlags::Public |
+ System::Reflection::BindingFlags::NonPublic |
+ System::Reflection::BindingFlags::Instance |
+ System::Reflection::BindingFlags::Static |
+ System::Reflection::BindingFlags::InvokeMethod);
+
+ /* Caller is assumed to catch any exceptions raised. */
+ return obj->GetType()->InvokeMember(methName,
+ flgs,
+ 0,
+ obj,
+ (Object __gc* [])args);
+}
+
+//
+// Method: InvokeStaticMethod
+//
+// Purpose: Invoke a static method, given the fully qualified name
+// of the method (and its arguments). If found, invoke the
+// method passing it 'args' as arguments.
+//
+Object* InvokeBridge::InvokeStaticMethod(String* assemName,
+ String* typeAndMethName,
+ Object* args[]) {
+
+ // Get the methods from the type
+ MethodInfo* methods __gc[];
+ MethodInfo* mInfo;
+
+ int lastDot = typeAndMethName->LastIndexOf('.');
+ String* className = typeAndMethName->Substring(0,lastDot);
+ String* methName = typeAndMethName->Substring(lastDot+1);
+
+ // Unravel the name of the class/type.
+ TypeName* typeName = ParseType(className);
+ Type* t;
+
+ if (assemName != 0 && assemName->Length > 0) {
+ typeName->m_assembly = assemName;
+ }
+
+ try {
+ t = InvokeBridge::GetType(typeName->toStdString());
+
+ if (!t) {
+ try {
+ Assembly* localA = Assembly::LoadFrom(typeName->m_assembly);
+ t = localA->GetType(typeName->m_class);
+ // Console::WriteLine("InvokeStaticMethod: Type {0} found", t);
+ } catch (Exception* e) {
+ ;
+ }
+ }
+
+ if (t) {
+ methods = t->GetMethods();
+ } else {
+ Console::WriteLine("InvokeStaticMethod: Type {0} not found", className);
+ return 0;
+ }
+ } catch (Exception *e) {
+ Console::WriteLine("InvokeStaticMethod: Type {0} not found", className);
+ throw(e);
+ }
+
+ System::Reflection::BindingFlags flgs
+ = (System::Reflection::BindingFlags) // why do I need to cast?
+ (System::Reflection::BindingFlags::DeclaredOnly |
+ System::Reflection::BindingFlags::Public |
+ System::Reflection::BindingFlags::NonPublic |
+ System::Reflection::BindingFlags::Static |
+ System::Reflection::BindingFlags::InvokeMethod);
+
+ return t->InvokeMember(methName,
+ flgs,
+ 0,
+ 0,
+ (Object __gc* [])args);
+}
+
+//
+// Method: GetField
+//
+// Purpose: Fetch the (boxed) value of named field of a given object.
+//
+Object* InvokeBridge::GetField(Object* obj, System::String* fieldName) {
+
+ FieldInfo* fInfo = obj->GetType()->GetField(fieldName);
+ return fInfo->GetValue(obj);
+}
+
+//
+// Method: GetStaticField
+//
+// Purpose: Fetch the (boxed) value of named static field.
+//
+Object* InvokeBridge::GetStaticField(System::String* clsName,
+ System::String* fieldName) {
+
+ Type* ty = InvokeBridge::GetType(clsName);
+ System::Reflection::BindingFlags static_field_flgs
+ = (System::Reflection::BindingFlags)
+ (System::Reflection::BindingFlags::Public |
+ System::Reflection::BindingFlags::NonPublic |
+ System::Reflection::BindingFlags::FlattenHierarchy |
+ System::Reflection::BindingFlags::Static);
+
+ FieldInfo* fInfo = ty->GetField(fieldName, static_field_flgs);
+ return fInfo->GetValue(0); // according to doc, ok to pass any val here.
+}
+
+//
+// Method: SetField
+//
+// Purpose: Replace the (boxed) value of named field of a given object.
+//
+void InvokeBridge::SetField(Object* obj, System::String* fieldName, Object* val) {
+
+ FieldInfo* fInfo = obj->GetType()->GetField(fieldName);
+ fInfo->SetValue(obj,val);
+ return;
+}
+
+//
+// Method: SetStaticField
+//
+// Purpose: Replace the (boxed) value of named static field.
+//
+void InvokeBridge::SetStaticField(System::String* clsName,
+ System::String* fieldName,
+ Object* val) {
+
+ Type* ty = InvokeBridge::GetType(clsName);
+ System::Reflection::BindingFlags static_field_flgs
+ = (System::Reflection::BindingFlags)
+ (System::Reflection::BindingFlags::Public |
+ System::Reflection::BindingFlags::NonPublic |
+ System::Reflection::BindingFlags::FlattenHierarchy |
+ System::Reflection::BindingFlags::Static);
+
+ FieldInfo* fInfo = ty->GetField(fieldName,static_field_flgs);
+ fInfo->SetValue(0,val);
+ return;
+}
+
+Object* InvokeBridge::NewString(System::String* s)
+{
+ System::String* c = System::String::Copy(s);
+ return dynamic_cast<Object*>(c);
+}
+
+Array* InvokeBridge::NewArgArray(int sz)
+{
+ return Array::CreateInstance(__typeof(Object), sz);
+}
+
+void InvokeBridge::SetArg(Object* arr[], Object* val, int idx)
+{
+ arr->SetValue(val,idx);
+}
+
+Object* InvokeBridge::GetArg(Object* arr[], int idx)
+{
+ return arr->GetValue(idx);
+}
+
+} /* namespace */
diff --git a/rts/dotnet/Invoker.h b/rts/dotnet/Invoker.h
new file mode 100644
index 0000000000..d649a4c716
--- /dev/null
+++ b/rts/dotnet/Invoker.h
@@ -0,0 +1,197 @@
+//
+// (c) 2003, sof.
+//
+// Dynamic invocation helper classes. The details of how
+// to access the .NET object model via the Reflection API
+// is taken care of by Invoker.{h,cpp}
+//
+#pragma once
+#using <mscorlib.dll>
+
+using namespace System;
+using namespace System::Reflection;
+using namespace System::Text;
+using namespace System::Runtime::InteropServices;
+
+[assembly:AssemblyKeyFileAttribute(S"invoker.snk")];
+
+namespace DynInvoke {
+
+//
+// Class: TypeName
+//
+// Purpose: pairing up an assembly name and the type/class name.
+//
+[ComVisible(false)]
+public __gc class TypeName {
+
+public:
+ System::String* m_assembly;
+ System::String* m_class;
+ int m_length;
+
+ TypeName() {
+ m_assembly = String::Empty;
+ m_class = String::Empty;
+ m_length = 0;
+ }
+
+ void Print() {
+ if (m_assembly && m_assembly != String::Empty ) {
+ Console::Write("[");
+ Console::Write(m_assembly);
+ Console::Write("]");
+ }
+ Console::WriteLine(m_class);
+ }
+
+ int Length() { return m_length; }
+
+ System::String* toStdString() {
+ System::String* res = new System::String(m_class->ToCharArray());
+
+ if (m_assembly && m_assembly != String::Empty ){
+ res = String::Concat(res, S",");
+ res = String::Concat(res, m_assembly);
+ }
+ return res;
+ }
+};
+
+//
+// Class: InvokeBridge
+//
+// Purpose: Collection of (static) methods for dynamically creating
+// objects and accessing methods/fields on them.
+//
+[ClassInterface(ClassInterfaceType::AutoDual),
+GuidAttribute("39D497D9-60E0-3525-B7F2-7BC096D3A2A3"),
+ComVisible(true)
+]
+public __gc class InvokeBridge {
+public:
+ InvokeBridge() {
+ Assembly* corAss = Assembly::Load("mscorlib.dll");
+ System::String* dir = System::IO::Path::GetDirectoryName(corAss->Location);
+
+ m_assemblies = new System::Collections::ArrayList();
+
+ System::String* fs[] = System::IO::Directory::GetFiles(dir, "*.dll");
+ for (int i=0;i < fs->Length; i++) {
+ try {
+ Assembly* tAss = Assembly::LoadFrom(fs[i]);
+ m_assemblies->Add(tAss->FullName);
+ } catch (Exception* e) {
+ continue;
+ }
+ }
+ }
+
+ //
+ // Method: CreateObject(String* assemName, String* objSpec, Object* args[])
+ //
+ // Purpose: Given a fully qualified name of a class/type, try
+ // to create an instance of it.
+ //
+ Object* CreateObject(System::String* assemName,
+ System::String* objSpec,
+ Object* args[]);
+
+ //
+ // Method: InvokeMethod
+ //
+ // Purpose: Given a pointer to an already created object, look up
+ // one of its method. If found, invoke the method passing it
+ // 'args' as arguments.
+ //
+ // Comments: the format of the method-spec is "methodName(type1,..,typeN)" [N>=0]
+ //
+ Object* InvokeMethod(Object* obj,
+ System::String* methSpec,
+ Object* args[]);
+
+ //
+ // Method: InvokeStaticMethod
+ //
+ // Purpose: Invoke a static method, given the fully qualified name
+ // of the method (and its arguments). If found, invoke the
+ // method passing it 'args' as arguments.
+ //
+ // Comments: the format of the method-spec is
+ // "T1.T2.<..>.Tn.methodName(type1,..,typeN)" [N>=0]
+ //
+ Object* InvokeStaticMethod(System::String* assemName,
+ System::String* methSpec,
+ Object* args[]);
+
+ //
+ // Method: GetField
+ //
+ // Purpose: Fetch the (boxed) value of named field of a given object.
+ //
+ Object* GetField(Object* obj, System::String* fieldSpec);
+
+ //
+ // Method: GetField
+ //
+ // Purpose: Fetch the (boxed) value of named static field.
+ //
+ Object* GetStaticField(System::String* clsName,
+ System::String* fieldSpec);
+
+ //
+ // Method: SetField
+ //
+ // Purpose: Replace the (boxed) value of named field of a given object.
+ //
+ void SetField(Object* obj, System::String* fieldSpec, Object* val);
+
+ //
+ // Method: SetStaticField
+ //
+ // Purpose: Replace the (boxed) value of named field of a given object.
+ //
+ void SetStaticField(System::String* clsName,
+ System::String* fieldSpec,
+ Object* val);
+
+
+ //
+ // Method: NewString
+ //
+ // Purpose: construct a System.String object copy in a manner that avoids
+ // COM Interop from deconstructing it to a BSTR.
+ //
+ System::Object* NewString( System::String* s);
+
+ //
+ // Method: NewArgArray
+ //
+ // Purpose: create a new array for holding (boxed) arguments to constructors/
+ // methods.
+ //
+ Array* NewArgArray(int sz);
+
+ //
+ // Method: SetArg
+ //
+ // Purpose: set an entry in the argument vector.
+ //
+ void SetArg(Object* arr[], Object* val, int idx);
+
+ //
+ // Method: GetArg
+ //
+ // Purpose: get an entry in the argument vector.
+ //
+ Object* GetArg(Object* arr[], int idx);
+
+ System::Type* InvokeBridge::GetType(System::String* typeName);
+
+protected:
+ System::Collections::ArrayList __gc* m_assemblies;
+ Object* InvokeBridge::CreateInstance(TypeName* typeName,
+ Object* args[]);
+};
+
+} /* namespace */
diff --git a/rts/dotnet/InvokerClient.h b/rts/dotnet/InvokerClient.h
new file mode 100644
index 0000000000..122f455c01
--- /dev/null
+++ b/rts/dotnet/InvokerClient.h
@@ -0,0 +1,180 @@
+/*
+ * InvokerClient interface defns for use with gcc.
+ *
+ * Note: These declarations mirror those of the InvokeBridge
+ * class declaration.
+ *
+ */
+
+#include <windows.h>
+#include <wtypes.h>
+#include <oaidl.h>
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#ifndef STDCALL
+#define STDCALL __stdcall
+#endif
+
+extern const CLSID CLSID_InvokeBridge;
+extern const IID IID_IUnknown;
+extern const IID IID_NULL;
+extern const IID IID_InvokeBridge;
+
+#ifdef WANT_UUID_DECLS
+const CLSID CLSID_InvokeBridge = { 0x39D497D9,0x60E0,0x3525,{0xB7,0xF2,0x7B,0xC0,0x96,0xD3,0xA2,0xA3}};
+//const IID IID_NULL = {0x00000000L, 0x0000, 0x0000, {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}};
+//const IID IID_IUnknown = {0x00000000L, 0x0000, 0x0000, {0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46}};
+const IID IID_InvokeBridge = { 0xAFF5FFCA, 0xC5C2, 0x3D5B, {0xAF, 0xD5, 0xED, 0x8E, 0x4B, 0x38, 0xDB, 0x7B}};
+ //0x3A85D703, 0xFAE4,0x3C5E, {0x9F,0x7E,0x20,0x98,0x31,0xCD,0x61,0x7A}};
+#endif
+
+#ifndef __InvokeBridge_INTERFACE_DEFINED__
+#define __InvokeBridge_INTERFACE_DEFINED__
+#undef INTERFACE
+#define INTERFACE InvokeBridge
+DECLARE_INTERFACE(InvokeBridge)
+{
+ STDMETHOD(QueryInterface)(THIS_ REFIID,PVOID*) PURE;
+ STDMETHOD_(ULONG,AddRef)(THIS) PURE;
+ STDMETHOD_(ULONG,Release)(THIS) PURE;
+ STDMETHOD(GetTypeInfoCount)(THIS_ UINT*) PURE;
+ STDMETHOD(GetTypeInfo)(THIS_ UINT,LCID,LPTYPEINFO*) PURE;
+ STDMETHOD(GetIDsOfNames)(THIS_ REFIID,LPOLESTR*,UINT,LCID,DISPID*) PURE;
+ STDMETHOD(Invoke)(THIS_ DISPID,REFIID,LCID,WORD,DISPPARAMS*,VARIANT*,EXCEPINFO*,UINT*) PURE;
+
+ STDMETHOD(ToString)(THIS_ BSTR*) PURE;
+ STDMETHOD(Equals)(THIS_ BSTR*) PURE;
+ STDMETHOD(GetHashCode)(THIS_ long*) PURE;
+  STDMETHOD(GetType)(THIS_ IUnknown**) PURE;
+ STDMETHOD(CreateObject)(THIS_ BSTR,BSTR,SAFEARRAY*, VARIANT*) PURE;
+ STDMETHOD(InvokeMethod)(THIS_ VARIANT,BSTR,SAFEARRAY*,VARIANT*) PURE;
+ STDMETHOD(InvokeStaticMethod)(THIS_ BSTR,BSTR,SAFEARRAY*,VARIANT*) PURE;
+
+ HRESULT ( STDCALL *GetField )(
+ InvokeBridge * This,
+ /* [in] */ VARIANT obj,
+ /* [in] */ BSTR fieldSpec,
+ /* [retval][out] */ VARIANT *pRetVal);
+
+ HRESULT ( STDCALL *GetStaticField )(
+ InvokeBridge * This,
+ /* [in] */ BSTR clsName,
+ /* [in] */ BSTR fieldSpec,
+ /* [retval][out] */ VARIANT *pRetVal);
+
+ HRESULT ( STDCALL *SetField )(
+ InvokeBridge * This,
+ /* [in] */ VARIANT obj,
+ /* [in] */ BSTR fieldSpec,
+ /* [in] */ VARIANT val);
+
+ HRESULT ( STDCALL *SetStaticField )(
+ InvokeBridge * This,
+ /* [in] */ BSTR clsName,
+ /* [in] */ BSTR fieldSpec,
+ /* [in] */ VARIANT val);
+
+ HRESULT ( STDCALL *NewString )(
+ InvokeBridge * This,
+ /* [in] */ BSTR s,
+ /* [retval][out] */VARIANT* pRetVal);
+
+ HRESULT ( STDCALL *NewArgArray )(
+ InvokeBridge * This,
+ /* [in] */ long sz,
+ /* [retval][out] */IUnknown **pRetVal);
+
+ HRESULT ( STDCALL *SetArg )(
+ InvokeBridge * This,
+ /* [in] */ SAFEARRAY * arr,
+ /* [in] */ VARIANT val,
+ /* [in] */ long idx);
+
+ HRESULT ( STDCALL *GetArg )(
+ InvokeBridge * This,
+ /* [in] */ SAFEARRAY * arr,
+ /* [in] */ long idx,
+ /* [retval][out] */ VARIANT *pRetVal);
+
+ HRESULT ( STDCALL *GetType_2 )(
+ InvokeBridge * This,
+ /* [in] */ BSTR typeName,
+ /* [retval][out] */ IUnknown **pRetVal);
+};
+#endif
+
+#define InvokeBridge_QueryInterface(This,riid,ppvObject) \
+ (This)->lpVtbl->QueryInterface(This,riid,ppvObject)
+
+#define InvokeBridge_AddRef(This) \
+ (This)->lpVtbl->AddRef(This)
+
+#define InvokeBridge_Release(This) \
+ (This)->lpVtbl->Release(This)
+
+#define InvokeBridge_GetTypeInfoCount(This,pctinfo) \
+ (This)->lpVtbl->GetTypeInfoCount(This,pctinfo)
+
+#define InvokeBridge_GetTypeInfo(This,iTInfo,lcid,ppTInfo) \
+ (This)->lpVtbl->GetTypeInfo(This,iTInfo,lcid,ppTInfo)
+
+#define InvokeBridge_GetIDsOfNames(This,riid,rgszNames,cNames,lcid,rgDispId) \
+ (This)->lpVtbl->GetIDsOfNames(This,riid,rgszNames,cNames,lcid,rgDispId)
+
+#define InvokeBridge_Invoke(This,dispIdMember,riid,lcid,wFlags,pDispParams,pVarResult,pExcepInfo,puArgErr) \
+ (This)->lpVtbl->Invoke(This,dispIdMember,riid,lcid,wFlags,pDispParams,pVarResult,pExcepInfo,puArgErr)
+
+#define InvokeBridge_get_ToString(This,pRetVal) \
+ (This)->lpVtbl->get_ToString(This,pRetVal)
+
+#define InvokeBridge_Equals(This,obj,pRetVal) \
+ (This)->lpVtbl->Equals(This,obj,pRetVal)
+
+#define InvokeBridge_GetHashCode(This,pRetVal) \
+ (This)->lpVtbl->GetHashCode(This,pRetVal)
+
+#define InvokeBridge_GetType(This,pRetVal) \
+ (This)->lpVtbl->GetType(This,pRetVal)
+
+#define InvokeBridge_CreateObject(This,assemName,objSpec,args,pRetVal) \
+ (This)->lpVtbl->CreateObject(This,assemName,objSpec,args,pRetVal)
+
+#define InvokeBridge_InvokeMethod(This,obj,methSpec,args,pRetVal) \
+ (This)->lpVtbl->InvokeMethod(This,obj,methSpec,args,pRetVal)
+
+#define InvokeBridge_InvokeStaticMethod(This,assemName,methSpec,args,pRetVal) \
+ (This)->lpVtbl->InvokeStaticMethod(This,assemName,methSpec,args,pRetVal)
+
+#define InvokeBridge_GetField(This,obj,fieldSpec,pRetVal) \
+ (This)->lpVtbl->GetField(This,obj,fieldSpec,pRetVal)
+
+#define InvokeBridge_GetStaticField(This,clsName,fieldSpec,pRetVal) \
+ (This)->lpVtbl->GetStaticField(This,clsName,fieldSpec,pRetVal)
+
+#define InvokeBridge_SetField(This,obj,fieldSpec,val) \
+ (This)->lpVtbl->SetField(This,obj,fieldSpec,val)
+
+#define InvokeBridge_SetStaticField(This,clsName,fieldSpec,val) \
+ (This)->lpVtbl->SetStaticField(This,clsName,fieldSpec,val)
+
+#define InvokeBridge_NewString(This,s,pRetVal) \
+ (This)->lpVtbl->NewString(This,s,pRetVal)
+
+#define InvokeBridge_NewArgArray(This,sz,pRetVal) \
+ (This)->lpVtbl->NewArgArray(This,sz,pRetVal)
+
+#define InvokeBridge_SetArg(This,arr,val,idx) \
+ (This)->lpVtbl->SetArg(This,arr,val,idx)
+
+#define InvokeBridge_GetArg(This,arr,idx,pRetVal) \
+ (This)->lpVtbl->GetArg(This,arr,idx,pRetVal)
+
+#define InvokeBridge_GetType_2(This,typeName,pRetVal) \
+ (This)->lpVtbl->GetType_2(This,typeName,pRetVal)
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/rts/dotnet/Makefile b/rts/dotnet/Makefile
new file mode 100644
index 0000000000..95b6c38890
--- /dev/null
+++ b/rts/dotnet/Makefile
@@ -0,0 +1,53 @@
+#
+# .NET interop for GHC.
+#
+# (c) 2003, sof.
+#
+TOP=../..
+include $(TOP)/mk/boilerplate.mk
+
+all :: Invoker.dll Invoke.o
+
+#
+# To compile the dotnet interop bits, you need to have the
+# .NET Framework SDK or VS.NET installed. The following
+# apps are used:
+#
+MCPP=cl
+TLBEXP=tlbexp
+REGASM=regasm
+GACUTIL=gacutil
+
+Invoker.dll : Invoker.obj
+ $(MCPP) /LD /clr /o Invoker.dll Invoker.obj
+ $(TLBEXP) Invoker.dll
+ $(REGASM) Invoker.dll
+ $(GACUTIL) /i Invoker.dll
+
+Invoker.obj : Invoker.cpp Invoker.h
+ $(MCPP) /LD /clr /c Invoker.cpp
+
+CLEAN_FILES += $(wildcard *.obj *.dll *.tlb)
+
+# ToDo:
+# - switch to /ir (i.e., copy it into the GAC.)
+# - sort out installation story.
+
+# drop the assembly
+remove :
+ $(GACUTIL) /u Invoker
+
+#
+# NOTE: For DotnetCc a version of gcc later than gcc-2.95 is
+# required (I'm using the gcc-3.2 snapshot that comes with mingw-2)
+#
+ifeq "$(DotnetCc)" ""
+DotnetCc=$(CC)
+endif
+DotnetCcOpts=$(CC_OPTS) $(DOTNET_EXTRA_CC_OPTS)
+SRC_CC_OPTS += -I$(TOP)/includes
+
+Invoke.o : Invoke.c
+ $(DotnetCc) $(DotnetCcOpts) -c $< -o $@
+
+include $(TOP)/mk/target.mk
diff --git a/rts/dotnet/invoker.snk b/rts/dotnet/invoker.snk
new file mode 100644
index 0000000000..05a222178a
--- /dev/null
+++ b/rts/dotnet/invoker.snk
Binary files differ
diff --git a/rts/ghc-frontpanel.glade b/rts/ghc-frontpanel.glade
new file mode 100644
index 0000000000..9b73afce47
--- /dev/null
+++ b/rts/ghc-frontpanel.glade
@@ -0,0 +1,1622 @@
+<?xml version="1.0" standalone="no"?> <!--*- mode: xml -*-->
+<!DOCTYPE glade-interface SYSTEM "http://glade.gnome.org/glade-2.0.dtd">
+
+<glade-interface>
+
+<widget class="GtkWindow" id="GHC Front Panel">
+ <property name="visible">True</property>
+ <property name="title" translatable="yes">GHC Front Panel</property>
+ <property name="type">GTK_WINDOW_TOPLEVEL</property>
+ <property name="window_position">GTK_WIN_POS_NONE</property>
+ <property name="modal">False</property>
+ <property name="default_width">450</property>
+ <property name="default_height">600</property>
+ <property name="resizable">True</property>
+ <property name="destroy_with_parent">False</property>
+ <property name="decorated">True</property>
+ <property name="skip_taskbar_hint">False</property>
+ <property name="skip_pager_hint">False</property>
+ <property name="type_hint">GDK_WINDOW_TYPE_HINT_NORMAL</property>
+ <property name="gravity">GDK_GRAVITY_NORTH_WEST</property>
+ <property name="focus_on_map">True</property>
+
+ <child>
+ <widget class="GtkVBox" id="vbox1">
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">0</property>
+
+ <child>
+ <widget class="GtkHBox" id="hbox1">
+ <property name="border_width">10</property>
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">10</property>
+
+ <child>
+ <widget class="GtkVBox" id="vbox4">
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">10</property>
+
+ <child>
+ <widget class="GtkFrame" id="frame3">
+ <property name="visible">True</property>
+ <property name="label_xalign">0</property>
+ <property name="label_yalign">0.5</property>
+ <property name="shadow_type">GTK_SHADOW_ETCHED_IN</property>
+
+ <child>
+ <widget class="GtkHBox" id="hbox3">
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">0</property>
+
+ <child>
+ <widget class="GtkLabel" id="label40">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Mb</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkVRuler" id="map_ruler">
+ <property name="visible">True</property>
+ <property name="metric">GTK_PIXELS</property>
+ <property name="lower">0</property>
+ <property name="upper">10</property>
+ <property name="position">1.40845072269</property>
+ <property name="max_size">10</property>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkDrawingArea" id="memmap">
+ <property name="visible">True</property>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+ </widget>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label1">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Memory Map</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_LEFT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="type">label_item</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkFrame" id="frame8">
+ <property name="visible">True</property>
+ <property name="label_xalign">0</property>
+ <property name="label_yalign">0.5</property>
+ <property name="shadow_type">GTK_SHADOW_ETCHED_IN</property>
+
+ <child>
+ <widget class="GtkVBox" id="vbox14">
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">0</property>
+
+ <child>
+ <widget class="GtkTable" id="table4">
+ <property name="visible">True</property>
+ <property name="n_rows">2</property>
+ <property name="n_columns">3</property>
+ <property name="homogeneous">False</property>
+ <property name="row_spacing">0</property>
+ <property name="column_spacing">0</property>
+
+ <child>
+ <widget class="GtkVRuler" id="gen_ruler">
+ <property name="visible">True</property>
+ <property name="metric">GTK_PIXELS</property>
+ <property name="lower">0</property>
+ <property name="upper">10</property>
+ <property name="position">1.69934999943</property>
+ <property name="max_size">10</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options">fill</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkHBox" id="gen_hbox">
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">0</property>
+
+ <child>
+ <placeholder/>
+ </child>
+
+ <child>
+ <placeholder/>
+ </child>
+
+ <child>
+ <placeholder/>
+ </child>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">1</property>
+ <property name="bottom_attach">2</property>
+ <property name="y_options">fill</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkDrawingArea" id="generations">
+ <property name="visible">True</property>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options">fill</property>
+ <property name="y_options">fill</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label39">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Mb</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+ </widget>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label41">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Generations</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_LEFT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="type">label_item</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkFrame" id="frame7">
+ <property name="visible">True</property>
+ <property name="label_xalign">0</property>
+ <property name="label_yalign">0.5</property>
+ <property name="shadow_type">GTK_SHADOW_ETCHED_IN</property>
+
+ <child>
+ <widget class="GtkTable" id="table3">
+ <property name="border_width">2</property>
+ <property name="visible">True</property>
+ <property name="n_rows">3</property>
+ <property name="n_columns">3</property>
+ <property name="homogeneous">False</property>
+ <property name="row_spacing">0</property>
+ <property name="column_spacing">0</property>
+
+ <child>
+ <widget class="GtkHRuler" id="res_hruler">
+ <property name="visible">True</property>
+ <property name="metric">GTK_PIXELS</property>
+ <property name="lower">0</property>
+ <property name="upper">10</property>
+ <property name="position">8.35443019867</property>
+ <property name="max_size">10</property>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">1</property>
+ <property name="bottom_attach">2</property>
+ <property name="y_options">fill</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkVRuler" id="res_vruler">
+ <property name="visible">True</property>
+ <property name="metric">GTK_PIXELS</property>
+ <property name="lower">0</property>
+ <property name="upper">10</property>
+ <property name="position">9.69925022125</property>
+ <property name="max_size">10</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">2</property>
+ <property name="bottom_attach">3</property>
+ <property name="x_options">fill</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkDrawingArea" id="res_drawingarea">
+ <property name="visible">True</property>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">2</property>
+ <property name="bottom_attach">3</property>
+ <property name="x_options">fill</property>
+ <property name="y_options">fill</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label37">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Secs</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label38">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Mb</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">2</property>
+ <property name="bottom_attach">3</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+ </widget>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label42">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Residency</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_LEFT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="type">label_item</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkVBox" id="vbox5">
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">10</property>
+
+ <child>
+ <widget class="GtkFrame" id="frame5">
+ <property name="visible">True</property>
+ <property name="label_xalign">0</property>
+ <property name="label_yalign">0.5</property>
+ <property name="shadow_type">GTK_SHADOW_ETCHED_IN</property>
+
+ <child>
+ <widget class="GtkVBox" id="vbox6">
+ <property name="border_width">5</property>
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">0</property>
+
+ <child>
+ <placeholder/>
+ </child>
+
+ <child>
+ <placeholder/>
+ </child>
+
+ <child>
+ <widget class="GtkTable" id="table1">
+ <property name="visible">True</property>
+ <property name="n_rows">4</property>
+ <property name="n_columns">3</property>
+ <property name="homogeneous">False</property>
+ <property name="row_spacing">0</property>
+ <property name="column_spacing">7</property>
+
+ <child>
+ <widget class="GtkLabel" id="label12">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Allocated</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_RIGHT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">1</property>
+ <property name="bottom_attach">2</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label13">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Live</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_RIGHT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label14">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Allocation Rate</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_RIGHT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">3</property>
+ <property name="bottom_attach">4</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label15">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes"> Footprint</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_RIGHT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">2</property>
+ <property name="bottom_attach">3</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label16">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">M/sec</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">3</property>
+ <property name="bottom_attach">4</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label17">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">M</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_LEFT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">7.45058015283e-09</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">2</property>
+ <property name="bottom_attach">3</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label18">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">M</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">7.45058015283e-09</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">1</property>
+ <property name="bottom_attach">2</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label19">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">M</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">7.45058015283e-09</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">2</property>
+ <property name="right_attach">3</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="live_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes"></property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="allocated_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes"></property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">1</property>
+ <property name="bottom_attach">2</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="footprint_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes"></property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">2</property>
+ <property name="bottom_attach">3</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="alloc_rate_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes"></property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">3</property>
+ <property name="bottom_attach">4</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+ </widget>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label43">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Stats</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_LEFT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="type">label_item</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkFrame" id="frame9">
+ <property name="visible">True</property>
+ <property name="label_xalign">0</property>
+ <property name="label_yalign">0.5</property>
+ <property name="shadow_type">GTK_SHADOW_ETCHED_IN</property>
+
+ <child>
+ <widget class="GtkTable" id="table5">
+ <property name="border_width">6</property>
+ <property name="visible">True</property>
+ <property name="n_rows">9</property>
+ <property name="n_columns">2</property>
+ <property name="homogeneous">False</property>
+ <property name="row_spacing">0</property>
+ <property name="column_spacing">10</property>
+
+ <child>
+ <widget class="GtkLabel" id="label20">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Running</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label21">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Blocked on I/O (Read)</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">1</property>
+ <property name="bottom_attach">2</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label22">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Blocked on MVar</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">3</property>
+ <property name="bottom_attach">4</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label24">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Blocked on throwTo</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">4</property>
+ <property name="bottom_attach">5</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label26">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Blocked on Black Hole</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">5</property>
+ <property name="bottom_attach">6</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label25">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Sleeping</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">6</property>
+ <property name="bottom_attach">7</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label27">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Blocked on I/O (Write)</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">2</property>
+ <property name="bottom_attach">3</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="running_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">label28</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">0</property>
+ <property name="bottom_attach">1</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="blockread_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">label29</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">1</property>
+ <property name="bottom_attach">2</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="blockwrite_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">label30</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">2</property>
+ <property name="bottom_attach">3</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="blockmvar_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">label31</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">3</property>
+ <property name="bottom_attach">4</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="blockthrowto_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">label32</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">4</property>
+ <property name="bottom_attach">5</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="blockbh_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">label33</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">5</property>
+ <property name="bottom_attach">6</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="sleeping_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">label34</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">6</property>
+ <property name="bottom_attach">7</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkHSeparator" id="hseparator1">
+ <property name="visible">True</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">7</property>
+ <property name="bottom_attach">8</property>
+ <property name="x_options">fill</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkHSeparator" id="hseparator2">
+ <property name="visible">True</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">7</property>
+ <property name="bottom_attach">8</property>
+ <property name="x_options">fill</property>
+ <property name="y_options">fill</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label35">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Total</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">1</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">0</property>
+ <property name="right_attach">1</property>
+ <property name="top_attach">8</property>
+ <property name="bottom_attach">9</property>
+ <property name="x_options">fill</property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="total_label">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">label36</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_CENTER</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="left_attach">1</property>
+ <property name="right_attach">2</property>
+ <property name="top_attach">8</property>
+ <property name="bottom_attach">9</property>
+ <property name="x_options"></property>
+ <property name="y_options"></property>
+ </packing>
+ </child>
+ </widget>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label44">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Threads</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_LEFT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="type">label_item</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkFrame" id="frame6">
+ <property name="visible">True</property>
+ <property name="label_xalign">0</property>
+ <property name="label_yalign">0.5</property>
+ <property name="shadow_type">GTK_SHADOW_ETCHED_IN</property>
+
+ <child>
+ <widget class="GtkVBox" id="vbox7">
+ <property name="border_width">5</property>
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">10</property>
+
+ <child>
+ <widget class="GtkVBox" id="vbox9">
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">0</property>
+
+ <child>
+ <widget class="GtkRadioButton" id="cont_radio">
+ <property name="visible">True</property>
+ <property name="can_focus">True</property>
+ <property name="label" translatable="yes">Continuous</property>
+ <property name="use_underline">True</property>
+ <property name="relief">GTK_RELIEF_NORMAL</property>
+ <property name="focus_on_click">True</property>
+ <property name="active">True</property>
+ <property name="inconsistent">False</property>
+ <property name="draw_indicator">True</property>
+ <signal name="clicked" handler="on_cont_radio_clicked"/>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkRadioButton" id="stop_before_radio">
+ <property name="visible">True</property>
+ <property name="can_focus">True</property>
+ <property name="label" translatable="yes">Stop before GC</property>
+ <property name="use_underline">True</property>
+ <property name="relief">GTK_RELIEF_NORMAL</property>
+ <property name="focus_on_click">True</property>
+ <property name="active">False</property>
+ <property name="inconsistent">False</property>
+ <property name="draw_indicator">True</property>
+ <property name="group">cont_radio</property>
+ <signal name="clicked" handler="on_stop_before_radio_clicked"/>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkRadioButton" id="stop_after_radio">
+ <property name="visible">True</property>
+ <property name="can_focus">True</property>
+ <property name="label" translatable="yes">Stop after GC</property>
+ <property name="use_underline">True</property>
+ <property name="relief">GTK_RELIEF_NORMAL</property>
+ <property name="focus_on_click">True</property>
+ <property name="active">False</property>
+ <property name="inconsistent">False</property>
+ <property name="draw_indicator">True</property>
+ <property name="group">cont_radio</property>
+ <signal name="clicked" handler="on_stop_after_radio_clicked"/>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkRadioButton" id="stop_both_radio">
+ <property name="visible">True</property>
+ <property name="can_focus">True</property>
+ <property name="label" translatable="yes">Stop before &amp; after GC</property>
+ <property name="use_underline">True</property>
+ <property name="relief">GTK_RELIEF_NORMAL</property>
+ <property name="focus_on_click">True</property>
+ <property name="active">False</property>
+ <property name="inconsistent">False</property>
+ <property name="draw_indicator">True</property>
+ <property name="group">cont_radio</property>
+ <signal name="clicked" handler="on_stop_both_radio_clicked"/>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkVBox" id="vbox8">
+ <property name="visible">True</property>
+ <property name="homogeneous">False</property>
+ <property name="spacing">0</property>
+
+ <child>
+ <widget class="GtkButton" id="stop_but">
+ <property name="visible">True</property>
+ <property name="can_focus">True</property>
+ <property name="label" translatable="yes">Stop</property>
+ <property name="use_underline">True</property>
+ <property name="relief">GTK_RELIEF_NORMAL</property>
+ <property name="focus_on_click">True</property>
+ <signal name="clicked" handler="on_stop_but_clicked"/>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkButton" id="continue_but">
+ <property name="visible">True</property>
+ <property name="can_focus">True</property>
+ <property name="label" translatable="yes">Continue</property>
+ <property name="use_underline">True</property>
+ <property name="relief">GTK_RELIEF_NORMAL</property>
+ <property name="focus_on_click">True</property>
+ <signal name="clicked" handler="on_continue_but_clicked"/>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+ </widget>
+ </child>
+
+ <child>
+ <widget class="GtkLabel" id="label45">
+ <property name="visible">True</property>
+ <property name="label" translatable="yes">Updates</property>
+ <property name="use_underline">False</property>
+ <property name="use_markup">False</property>
+ <property name="justify">GTK_JUSTIFY_LEFT</property>
+ <property name="wrap">False</property>
+ <property name="selectable">False</property>
+ <property name="xalign">0.5</property>
+ <property name="yalign">0.5</property>
+ <property name="xpad">0</property>
+ <property name="ypad">0</property>
+ <property name="ellipsize">PANGO_ELLIPSIZE_NONE</property>
+ <property name="width_chars">-1</property>
+ <property name="single_line_mode">False</property>
+ <property name="angle">0</property>
+ </widget>
+ <packing>
+ <property name="type">label_item</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkButton" id="quit_but">
+ <property name="visible">True</property>
+ <property name="can_focus">True</property>
+ <property name="label" translatable="yes">Quit</property>
+ <property name="use_underline">True</property>
+ <property name="relief">GTK_RELIEF_NORMAL</property>
+ <property name="focus_on_click">True</property>
+ <signal name="clicked" handler="on_quit_but_clicked"/>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ <property name="pack_type">GTK_PACK_END</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ <property name="pack_type">GTK_PACK_END</property>
+ </packing>
+ </child>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">True</property>
+ <property name="fill">True</property>
+ </packing>
+ </child>
+
+ <child>
+ <widget class="GtkStatusbar" id="statusbar">
+ <property name="visible">True</property>
+ <property name="has_resize_grip">True</property>
+ </widget>
+ <packing>
+ <property name="padding">0</property>
+ <property name="expand">False</property>
+ <property name="fill">False</property>
+ </packing>
+ </child>
+ </widget>
+ </child>
+</widget>
+
+</glade-interface>
diff --git a/rts/gmp/.gdbinit b/rts/gmp/.gdbinit
new file mode 100644
index 0000000000..843c109e89
--- /dev/null
+++ b/rts/gmp/.gdbinit
@@ -0,0 +1,34 @@
+# Copyright (C) 1999 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+define pz
+set __gmpz_dump ($)
+end
+
+define pq
+set __gmpz_dump ($->_mp_num)
+echo /
+set __gmpz_dump ($->_mp_den)
+end
+
+define pf
+set __gmpf_dump ($)
+end
+
diff --git a/rts/gmp/AUTHORS b/rts/gmp/AUTHORS
new file mode 100644
index 0000000000..1fa057af6c
--- /dev/null
+++ b/rts/gmp/AUTHORS
@@ -0,0 +1,12 @@
+Authors of GNU MP (in chronological order)
+Torbjörn Granlund
+John Amanatides
+Paul Zimmermann
+Ken Weber
+Bennet Yee
+Andreas Schwab
+Robert Harley
+Linus Nordberg
+Kent Boortz
+Kevin Ryde
+Guillaume Hanrot
diff --git a/rts/gmp/COPYING b/rts/gmp/COPYING
new file mode 100644
index 0000000000..a6d7d0188a
--- /dev/null
+++ b/rts/gmp/COPYING
@@ -0,0 +1,336 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/rts/gmp/COPYING.LIB b/rts/gmp/COPYING.LIB
new file mode 100644
index 0000000000..c4792dd27a
--- /dev/null
+++ b/rts/gmp/COPYING.LIB
@@ -0,0 +1,515 @@
+
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL. It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+ This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it. You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations
+below.
+
+ When we speak of free software, we are referring to freedom of use,
+not price. Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+ To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights. These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+ For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you. You must make sure that they, too, receive or can get the source
+code. If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it. And you must show them these terms so they know their rights.
+
+ We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+ To protect each distributor, we want to make it very clear that
+there is no warranty for the free library. Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+^L
+ Finally, software patents pose a constant threat to the existence of
+any free program. We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder. Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+ Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License. This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License. We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+ When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library. The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom. The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+ We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License. It also provides other free software developers Less
+of an advantage over competing non-free programs. These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries. However, the Lesser license provides advantages in certain
+special circumstances.
+
+ For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it
+becomes
+a de-facto standard. To achieve this, non-free programs must be
+allowed to use the library. A more frequent case is that a free
+library does the same job as widely used non-free libraries. In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+ In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software. For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+ Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+ The precise terms and conditions for copying, distribution and
+modification follow. Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library". The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+^L
+ GNU LESSER GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+ A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+ The "Library", below, refers to any such software library or work
+which has been distributed under these terms. A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language. (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+ "Source code" for a work means the preferred form of the work for
+making modifications to it. For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control
+compilation
+and installation of the library.
+
+ Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it). Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+ 1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+ You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+ 2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) The modified work must itself be a software library.
+
+ b) You must cause the files modified to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ c) You must cause the whole of the work to be licensed at no
+ charge to all third parties under the terms of this License.
+
+ d) If a facility in the modified Library refers to a function or a
+ table of data to be supplied by an application program that uses
+ the facility, other than as an argument passed when the facility
+ is invoked, then you must make a good faith effort to ensure that,
+ in the event an application does not supply such function or
+ table, the facility still operates, and performs whatever part of
+ its purpose remains meaningful.
+
+ (For example, a function in a library to compute square roots has
+ a purpose that is entirely well-defined independent of the
+ application. Therefore, Subsection 2d requires that any
+ application-supplied function or table used by this function must
+ be optional: if the application does not supply it, the square
+ root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library. To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License. (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.) Do not make any other change in
+these notices.
+^L
+ Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+ This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+ 4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+ If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library". Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+ However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library". The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+ When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library. The
+threshold for this to be true is not precisely defined by law.
+
+ If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work. (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+ Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+^L
+ 6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+ You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License. You must supply a copy of this License. If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License. Also, you must do one
+of these things:
+
+ a) Accompany the work with the complete corresponding
+ machine-readable source code for the Library including whatever
+ changes were used in the work (which must be distributed under
+ Sections 1 and 2 above); and, if the work is an executable linked
+ with the Library, with the complete machine-readable "work that
+ uses the Library", as object code and/or source code, so that the
+ user can modify the Library and then relink to produce a modified
+ executable containing the modified Library. (It is understood
+ that the user who changes the contents of definitions files in the
+ Library will not necessarily be able to recompile the application
+ to use the modified definitions.)
+
+ b) Use a suitable shared library mechanism for linking with the
+ Library. A suitable mechanism is one that (1) uses at run time a
+ copy of the library already present on the user's computer system,
+ rather than copying library functions into the executable, and (2)
+ will operate properly with a modified version of the library, if
+ the user installs one, as long as the modified version is
+ interface-compatible with the version that the work was made with.
+
+ c) Accompany the work with a written offer, valid for at
+ least three years, to give the same user the materials
+ specified in Subsection 6a, above, for a charge no more
+ than the cost of performing this distribution.
+
+ d) If distribution of the work is made by offering access to copy
+ from a designated place, offer equivalent access to copy the above
+ specified materials from the same place.
+
+ e) Verify that the user has already received a copy of these
+ materials or that you have already sent this user a copy.
+
+ For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it. However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+ It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system. Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+^L
+ 7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+ a) Accompany the combined library with a copy of the same work
+ based on the Library, uncombined with any other library
+ facilities. This must be distributed under the terms of the
+ Sections above.
+
+ b) Give prominent notice with the combined library of the fact
+ that part of it is a work based on the Library, and explaining
+ where to find the accompanying uncombined form of the same work.
+
+ 8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License. Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License. However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+ 9. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Library or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+ 10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+^L
+ 11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all. For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply, and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License
+may add an explicit geographical distribution limitation excluding those
+countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation. If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+^L
+ 14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission. For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this. Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+ NO WARRANTY
+
+ 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+^L
+ How to Apply These Terms to Your New Libraries
+
+ If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change. You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms
+of the ordinary General Public License).
+
+ To apply these terms, attach the following notices to the library.
+It is safest to attach them to the start of each source file to most
+effectively convey the exclusion of warranty; and each file should
+have at least the "copyright" line and a pointer to where the full
+notice is found.
+
+
+ <one line to give the library's name and a brief idea of what it
+does.>
+ Copyright (C) <year> <name of author>
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+Also add information on how to contact you by electronic and paper
+mail.
+
+You should also get your employer (if you work as a programmer) or
+your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the
+ library `Frob' (a library for tweaking knobs) written by James
+Random Hacker.
+
+ <signature of Ty Coon>, 1 April 1990
+ Ty Coon, President of Vice
+
+That's all there is to it!
+
+
diff --git a/rts/gmp/INSTALL b/rts/gmp/INSTALL
new file mode 100644
index 0000000000..62faa1a2e3
--- /dev/null
+++ b/rts/gmp/INSTALL
@@ -0,0 +1,146 @@
+
+ INSTALLING GNU MP
+ =================
+
+
+These instructions are only for the impatient. Others should read the install
+instructions in the manual, gmp.info. Use
+
+ info -f ./gmp.info
+
+or in emacs
+
+ C-u C-h i gmp.info
+
+
+Here are some brief instructions on how to install GMP, and some examples to
+help you get started using GMP.
+
+First, you need to compile, and optionally install, GMP. Since you're
+impatient, try this:
+
+ ./configure; make
+
+If that fails, or you care about the performance of GMP, you need to read the
+full instructions in the chapter "Installing GMP", in the manual.
+
+Next, try some small test programs, for example the ones below.
+
+In GMP programs, all variables need to be initialized before they are
+assigned, and cleared out before program flow leaves the scope in which they
+were declared. Here is an example program that reads two numbers from the
+command line, multiplies them, and prints the result to stdout.
+
+
+ #include <stdio.h>
+ #include <gmp.h> /* All GMP programs need to include gmp.h */
+
+ main (int argc, char **argv)
+ {
+ mpz_t a, b, p;
+
+ if (argc != 3)
+ { printf ("Usage: %s <number> <number>\n", argv[0]); exit (1); }
+
+ /* Initialize variables */
+ mpz_init (a);
+ mpz_init (b);
+ mpz_init (p);
+
+ /* Assign a and b from base 10 strings in argv */
+ mpz_set_str (a, argv[1], 10);
+ mpz_set_str (b, argv[2], 10);
+
+ /* Multiply a and b and put the result in p */
+ mpz_mul (p, a, b);
+
+ /* Print p in base 10 */
+ mpz_out_str (stdout, 10, p);
+ fputc ('\n', stdout);
+
+ /* Clear out variables */
+ mpz_clear (a);
+ mpz_clear (b);
+ mpz_clear (p);
+ exit (0);
+ }
+
+
+This might look tedious, with all the initializing and clearing. Fortunately
+some of these operations can be combined, and other operations can often be
+avoided. An experienced GMP user might write:
+
+
+ #include <stdio.h>
+ #include <gmp.h>
+
+ main (int argc, char **argv)
+ {
+ mpz_t a, b, p;
+
+ if (argc != 3)
+ { printf ("Usage: %s <number> <number>\n", argv[0]); exit (1); }
+
+ /* Initialize and assign a and b from base 10 strings in argv */
+ mpz_init_set_str (a, argv[1], 10);
+ mpz_init_set_str (b, argv[2], 10);
+ /* Initialize p */
+ mpz_init (p);
+
+ /* Multiply a and b and put the result in p */
+ mpz_mul (p, a, b);
+
+ /* Print p in base 10 */
+ mpz_out_str (stdout, 10, p);
+ fputc ('\n', stdout);
+
+ /* Since we're about to exit, no need to clear out variables */
+ exit (0);
+ }
+
+
+Now you have to compile your test program, and link it with the GMP library.
+Assuming your working directory is still the gmp source directory, and your
+source file is called example.c, enter:
+
+ gcc -g -I. example.c .libs/libgmp.a
+
+After installing, the command becomes: "gcc -g example.c -lgmp". Also, GMP is
+libtool based so you can use that to link if you want.
+
+Now try to run the example:
+
+ ./a.out 98365871231256752134 319378318340103345227
+ 31415926535897932384618573336104570964418
+
+The functions used here all operate on signed integers, and have names
+starting with "mpz_". There are many more such functions than used in these
+examples. See the chapter "Integer Functions" in the manual, for a complete
+list.
+
+There are two other main classes of functions in GMP. They operate on
+rational numbers and floating-point numbers, respectively. The chapters
+"Rational Number Functions", and "Floating-point Functions" document these
+classes.
+
+To run a set of tests, do "make check". This will take a while.
+
+To create the printable documentation from the texinfo source, type "make
+gmp.dvi" or "make gmp.ps". This requires various "tex" commands.
+
+To install the library, do "make install" (then you can use -lgmp instead of
+.libs/libgmp.a).
+
+If you decide to use GMP, it is a good idea you at least read the chapter "GMP
+Basics" in the manual.
+
+Some known build problems are noted in the "Installing GMP" chapter of
+the manual. Please report other problems to bug-gmp@gnu.org.
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 78
+End:
diff --git a/rts/gmp/Makefile.am b/rts/gmp/Makefile.am
new file mode 100644
index 0000000000..b73b805c6e
--- /dev/null
+++ b/rts/gmp/Makefile.am
@@ -0,0 +1,197 @@
+## Process this file with automake to generate Makefile.in
+
+
+# Copyright (C) 1991, 1993, 1994, 1996, 1997, 1999, 2000 Free Software
+# Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# make check
+#
+# It'd be good if "make check" first did a "make all" or whatever to
+# build libgmp.la, but it's not clear how best to do this. Putting a
+# "check:" target is overridden by automake, and a "check-local:" runs
+# too late (due to depth-first subdirectory processing). For now it's
+# necessary to do "make && make check".
+#
+# MPF_OBJECTS etc
+#
+# Libtool needs all the .lo files passed to it if it's going to build
+# both a static and shared library. If a convenience library like
+# mpf/libmpf.la is passed then the resulting libgmp.a gets the PIC .lo
+# objects rather than the non-PIC .o's.
+#
+# Unfortunately this leads to the big lists of objects below. Something
+# like mpz/*.lo would probably work, but might risk missing something
+# out or getting something extra. The source files for each .lo are
+# listed in the Makefile.am's in the subdirectories.
+
+
+# Libtool -version-info for libgmp.la and libmp.la. See (libtool)Versioning
+#
+# 1. No interfaces changed, only implementations (good): Increment REVISION.
+#
+# 2. Interfaces added, none removed (good): Increment CURRENT, increment
+# AGE, set REVISION to 0.
+#
+# 3. Interfaces removed (BAD, breaks upward compatibility): Increment
+# CURRENT, set AGE and REVISION to 0.
+#
+# Do this separately for libgmp and libmp, only do it just before a release.
+#
+# GMP -version-info
+# release libgmp libmp
+# 2.0.x - -
+# 3.0 3:0:0 3:0:0
+# 3.0.1 3:1:0 3:0:0
+# 3.1 4:0:1 4:0:1
+# 3.1.1 4:1:1 4:1:1
+#
+#
+# Starting at 3:0:0 is a slight abuse of the versioning system, but it
+# ensures we're past soname libgmp.so.2, which is what has been used on
+# Debian GNU/Linux packages of gmp 2. Pretend gmp 2 was 2:0:0, so the
+# interface changes for gmp 3 mean 3:0:0 is right.
+
+LIBGMP_LT_CURRENT = 4
+LIBGMP_LT_REVISION = 1
+LIBGMP_LT_AGE = 1
+
+LIBMP_LT_CURRENT = 4
+LIBMP_LT_REVISION = 1
+LIBMP_LT_AGE = 1
+
+
+AUTOMAKE_OPTIONS = gnu check-news no-dependencies ansi2knr
+
+SUBDIRS = mpn mpz mpq mpf mpbsd mpfr tests demos tune
+
+include_HEADERS = gmp.h $(MPBSD_HEADERS_OPTION) $(MPFR_HEADERS_OPTION)
+EXTRA_HEADERS = mp.h
+
+lib_LTLIBRARIES = libgmp.la $(MPBSD_LTLIBRARIES_OPTION)
+
+EXTRA_DIST = .gdbinit gmp-impl.h longlong.h stack-alloc.h urandom.h doc macos
+
+DISTCLEANFILES = asm-syntax.h config.m4 @gmp_srclinks@
+
+
+MPF_OBJECTS = mpf/init.lo mpf/init2.lo mpf/set.lo mpf/set_ui.lo mpf/set_si.lo \
+ mpf/set_str.lo mpf/set_d.lo mpf/set_z.lo mpf/iset.lo mpf/iset_ui.lo \
+ mpf/iset_si.lo mpf/iset_str.lo mpf/iset_d.lo mpf/clear.lo mpf/get_str.lo \
+ mpf/dump.lo mpf/size.lo mpf/eq.lo mpf/reldiff.lo mpf/sqrt.lo mpf/random2.lo \
+ mpf/inp_str.lo mpf/out_str.lo mpf/add.lo mpf/add_ui.lo mpf/sub.lo \
+ mpf/sub_ui.lo mpf/ui_sub.lo mpf/mul.lo mpf/mul_ui.lo mpf/div.lo \
+ mpf/div_ui.lo mpf/cmp.lo mpf/cmp_ui.lo mpf/cmp_si.lo mpf/mul_2exp.lo \
+ mpf/div_2exp.lo mpf/abs.lo mpf/neg.lo mpf/set_q.lo mpf/get_d.lo \
+ mpf/set_dfl_prec.lo mpf/set_prc.lo mpf/set_prc_raw.lo mpf/get_prc.lo \
+ mpf/ui_div.lo mpf/sqrt_ui.lo mpf/floor.lo mpf/ceil.lo mpf/trunc.lo \
+ mpf/pow_ui.lo mpf/urandomb.lo mpf/swap.lo
+MPZ_OBJECTS = mpz/abs.lo mpz/add.lo mpz/add_ui.lo mpz/addmul_ui.lo mpz/and.lo \
+ mpz/array_init.lo mpz/bin_ui.lo mpz/bin_uiui.lo mpz/cdiv_q.lo \
+ mpz/cdiv_q_ui.lo mpz/cdiv_qr.lo mpz/cdiv_qr_ui.lo mpz/cdiv_r.lo \
+ mpz/cdiv_r_ui.lo mpz/cdiv_ui.lo mpz/clear.lo mpz/clrbit.lo mpz/cmp.lo \
+ mpz/cmp_si.lo mpz/cmp_ui.lo mpz/cmpabs.lo mpz/cmpabs_ui.lo mpz/com.lo \
+ mpz/divexact.lo mpz/dump.lo mpz/fac_ui.lo mpz/fdiv_q.lo mpz/fdiv_q_2exp.lo \
+ mpz/fdiv_q_ui.lo mpz/fdiv_qr.lo mpz/fdiv_qr_ui.lo mpz/fdiv_r.lo \
+ mpz/fdiv_r_2exp.lo mpz/fdiv_r_ui.lo mpz/fdiv_ui.lo mpz/fib_ui.lo \
+ mpz/fits_sint_p.lo mpz/fits_slong_p.lo mpz/fits_sshort_p.lo \
+ mpz/fits_uint_p.lo mpz/fits_ulong_p.lo mpz/fits_ushort_p.lo mpz/gcd.lo \
+ mpz/gcd_ui.lo mpz/gcdext.lo mpz/get_d.lo mpz/get_si.lo mpz/get_str.lo \
+ mpz/get_ui.lo mpz/getlimbn.lo mpz/hamdist.lo mpz/init.lo mpz/inp_raw.lo \
+ mpz/inp_str.lo mpz/invert.lo mpz/ior.lo mpz/iset.lo mpz/iset_d.lo \
+ mpz/iset_si.lo mpz/iset_str.lo mpz/iset_ui.lo mpz/jacobi.lo \
+ mpz/kronsz.lo mpz/kronuz.lo mpz/kronzs.lo mpz/kronzu.lo \
+ mpz/lcm.lo mpz/legendre.lo \
+ mpz/mod.lo mpz/mul.lo mpz/mul_2exp.lo mpz/mul_si.lo mpz/mul_ui.lo \
+ mpz/neg.lo mpz/nextprime.lo mpz/out_raw.lo mpz/out_str.lo mpz/perfpow.lo mpz/perfsqr.lo \
+ mpz/popcount.lo mpz/pow_ui.lo mpz/powm.lo mpz/powm_ui.lo mpz/pprime_p.lo \
+ mpz/random.lo mpz/random2.lo mpz/realloc.lo mpz/remove.lo mpz/root.lo \
+ mpz/rrandomb.lo \
+ mpz/scan0.lo mpz/scan1.lo mpz/set.lo mpz/set_d.lo mpz/set_f.lo mpz/set_q.lo \
+ mpz/set_si.lo mpz/set_str.lo mpz/set_ui.lo mpz/setbit.lo mpz/size.lo \
+ mpz/sizeinbase.lo mpz/sqrt.lo mpz/sqrtrem.lo mpz/sub.lo mpz/sub_ui.lo \
+ mpz/swap.lo mpz/tdiv_ui.lo mpz/tdiv_q.lo mpz/tdiv_q_2exp.lo mpz/tdiv_q_ui.lo \
+ mpz/tdiv_qr.lo mpz/tdiv_qr_ui.lo mpz/tdiv_r.lo mpz/tdiv_r_2exp.lo \
+ mpz/tdiv_r_ui.lo mpz/tstbit.lo mpz/ui_pow_ui.lo mpz/urandomb.lo \
+ mpz/urandomm.lo mpz/xor.lo
+MPQ_OBJECTS = mpq/add.lo mpq/canonicalize.lo mpq/clear.lo mpq/cmp.lo \
+ mpq/cmp_ui.lo mpq/div.lo mpq/get_d.lo mpq/get_den.lo mpq/get_num.lo \
+ mpq/init.lo mpq/inv.lo mpq/mul.lo mpq/neg.lo mpq/out_str.lo \
+ mpq/set.lo mpq/set_den.lo \
+ mpq/set_num.lo mpq/set_si.lo mpq/set_ui.lo mpq/sub.lo mpq/equal.lo \
+ mpq/set_z.lo mpq/set_d.lo mpq/swap.lo
+MPN_OBJECTS = @mpn_objs_in_libgmp@
+
+MPBSD_OBJECTS = mpbsd/add.lo mpbsd/tdiv_qr.lo mpbsd/move.lo mpbsd/powm.lo \
+ mpbsd/sub.lo mpbsd/cmp.lo mpbsd/mfree.lo mpbsd/mtox.lo mpbsd/realloc.lo \
+ mpbsd/gcd.lo mpbsd/itom.lo mpbsd/min.lo mpbsd/mul.lo mpbsd/mout.lo \
+ mpbsd/pow_ui.lo mpbsd/sdiv.lo mpbsd/sqrtrem.lo mpbsd/xtom.lo
+
+# FIXME: Add mpfr/rnd_mode.lo when it's clean.
+MPFR_OBJECTS = mpfr/add.lo mpfr/div_2exp.lo mpfr/neg.lo mpfr/set_dfl_prec.lo \
+ mpfr/set_str_raw.lo mpfr/agm.lo mpfr/get_str.lo mpfr/print_raw.lo \
+ mpfr/set_dfl_rnd.lo mpfr/sqrt.lo mpfr/clear.lo mpfr/init.lo \
+ mpfr/set_f.lo mpfr/sub.lo mpfr/cmp.lo mpfr/mul.lo mpfr/round.lo \
+ mpfr/set_prec.lo mpfr/cmp_ui.lo mpfr/mul_2exp.lo mpfr/set.lo mpfr/set_si.lo \
+ mpfr/div.lo mpfr/mul_ui.lo mpfr/set_d.lo mpfr/pow.lo mpfr/out_str.lo \
+ mpfr/pi.lo mpfr/set_z.lo mpfr/add_ulp.lo mpfr/log2.lo mpfr/random.lo \
+ mpfr/log.lo mpfr/exp.lo mpfr/div_ui.lo mpfr/zeta.lo mpfr/karadiv.lo \
+ mpfr/karasqrt.lo mpfr/print_rnd_mode.lo
+
+
+if WANT_MPFR
+MPFR_HEADERS_OPTION = mpfr/mpfr.h
+MPFR_OBJECTS_OPTION = $(MPFR_OBJECTS)
+MPFR_LIBADD_OPTION = -lm
+endif
+libgmp_la_SOURCES = assert.c compat.c errno.c memory.c mp_set_fns.c \
+ mp_clz_tab.c mp_minv_tab.c \
+ rand.c randclr.c randlc.c randlc2x.c randraw.c randsd.c \
+ randsdui.c version.c stack-alloc.c mp_bpl.c extract-dbl.c insert-dbl.c
+libgmp_la_DEPENDENCIES = \
+ $(MPF_OBJECTS) $(MPZ_OBJECTS) $(MPN_OBJECTS) $(MPQ_OBJECTS) \
+ $(MPFR_OBJECTS_OPTION)
+libgmp_la_LIBADD = $(libgmp_la_DEPENDENCIES) $(MPFR_LIBADD_OPTION)
+libgmp_la_LDFLAGS = \
+ -version-info $(LIBGMP_LT_CURRENT):$(LIBGMP_LT_REVISION):$(LIBGMP_LT_AGE)
+
+
+if WANT_MPBSD
+MPBSD_HEADERS_OPTION = mp.h
+MPBSD_LTLIBRARIES_OPTION = libmp.la
+endif
+libmp_la_SOURCES = assert.c errno.c memory.c mp_bpl.c mp_clz_tab.c \
+ mp_minv_tab.c mp_set_fns.c stack-alloc.c
+libmp_la_DEPENDENCIES = $(MPBSD_OBJECTS) $(MPN_OBJECTS) \
+ mpz/add.lo mpz/clear.lo mpz/cmp.lo mpz/init.lo mpz/mod.lo mpz/mul.lo \
+ mpz/mul_2exp.lo mpz/realloc.lo mpz/set.lo mpz/set_ui.lo mpz/tdiv_r.lo \
+ mpz/sub.lo
+libmp_la_LIBADD = $(libmp_la_DEPENDENCIES)
+libmp_la_LDFLAGS = \
+ -version-info $(LIBMP_LT_CURRENT):$(LIBMP_LT_REVISION):$(LIBMP_LT_AGE)
+
+
+info_TEXINFOS = gmp.texi
+
+
+# Don't ship CVS directories or emacs backups.
+dist-hook:
+ -find $(distdir) \( -name CVS -type d \) -o -name "*.~*" \
+ | xargs rm -rf
diff --git a/rts/gmp/Makefile.in b/rts/gmp/Makefile.in
new file mode 100644
index 0000000000..e63383e7a7
--- /dev/null
+++ b/rts/gmp/Makefile.in
@@ -0,0 +1,932 @@
+# Makefile.in generated automatically by automake 1.4a from Makefile.am
+
+# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+SHELL = @SHELL@
+
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+VPATH = @srcdir@
+prefix = @prefix@
+exec_prefix = @exec_prefix@
+
+bindir = @bindir@
+sbindir = @sbindir@
+libexecdir = @libexecdir@
+datadir = @datadir@
+sysconfdir = @sysconfdir@
+sharedstatedir = @sharedstatedir@
+localstatedir = @localstatedir@
+libdir = @libdir@
+infodir = @infodir@
+mandir = @mandir@
+includedir = @includedir@
+oldincludedir = /usr/include
+
+DESTDIR =
+
+pkgdatadir = $(datadir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+
+top_builddir = .
+
+ACLOCAL = @ACLOCAL@
+AUTOCONF = @AUTOCONF@
+AUTOMAKE = @AUTOMAKE@
+AUTOHEADER = @AUTOHEADER@
+
+INSTALL = @INSTALL@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_FLAG =
+transform = @program_transform_name@
+
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+
+@SET_MAKE@
+build_alias = @build_alias@
+build_triplet = @build@
+host_alias = @host_alias@
+host_triplet = @host@
+target_alias = @target_alias@
+target_triplet = @target@
+AMDEP = @AMDEP@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AWK = @AWK@
+CALLING_CONVENTIONS_OBJS = @CALLING_CONVENTIONS_OBJS@
+CC = @CC@
+CCAS = @CCAS@
+CPP = @CPP@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+EXEEXT = @EXEEXT@
+LIBTOOL = @LIBTOOL@
+LN_S = @LN_S@
+M4 = @M4@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+PACKAGE = @PACKAGE@
+RANLIB = @RANLIB@
+SPEED_CYCLECOUNTER_OBJS = @SPEED_CYCLECOUNTER_OBJS@
+STRIP = @STRIP@
+U = @U@
+VERSION = @VERSION@
+gmp_srclinks = @gmp_srclinks@
+install_sh = @install_sh@
+mpn_objects = @mpn_objects@
+mpn_objs_in_libgmp = @mpn_objs_in_libgmp@
+
+# Copyright (C) 1991, 1993, 1994, 1996, 1997, 1999, 2000 Free Software
+# Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+# make check
+#
+# It'd be good if "make check" first did a "make all" or whatever to
+# build libgmp.la, but it's not clear how best to do this. Putting a
+# "check:" target is overridden by automake, and a "check-local:" runs
+# too late (due to depth-first subdirectory processing). For now it's
+# necessary to do "make && make check".
+#
+# MPF_OBJECTS etc
+#
+# Libtool needs all the .lo files passed to it if it's going to build
+# both a static and shared library. If a convenience library like
+# mpf/libmpf.la is passed then the resulting libgmp.a gets the PIC .lo
+# objects rather than the non-PIC .o's.
+#
+# Unfortunately this leads to the big lists of objects below. Something
+# like mpz/*.lo would probably work, but might risk missing something
+# out or getting something extra. The source files for each .lo are
+# listed in the Makefile.am's in the subdirectories.
+
+# Libtool -version-info for libgmp.la and libmp.la. See (libtool)Versioning
+#
+# 1. No interfaces changed, only implementations (good): Increment REVISION.
+#
+# 2. Interfaces added, none removed (good): Increment CURRENT, increment
+# AGE, set REVISION to 0.
+#
+# 3. Interfaces removed (BAD, breaks upward compatibility): Increment
+# CURRENT, set AGE and REVISION to 0.
+#
+# Do this separately for libgmp and libmp, only do it just before a release.
+#
+# GMP -version-info
+# release libgmp libmp
+# 2.0.x - -
+# 3.0 3:0:0 3:0:0
+# 3.0.1 3:1:0 3:0:0
+# 3.1 4:0:1 4:0:1
+# 3.1.1 4:1:1 4:1:1
+#
+#
+# Starting at 3:0:0 is a slight abuse of the versioning system, but it
+# ensures we're past soname libgmp.so.2, which is what has been used on
+# Debian GNU/Linux packages of gmp 2. Pretend gmp 2 was 2:0:0, so the
+# interface changes for gmp 3 mean 3:0:0 is right.
+
+
+LIBGMP_LT_CURRENT = 4
+LIBGMP_LT_REVISION = 1
+LIBGMP_LT_AGE = 1
+
+LIBMP_LT_CURRENT = 4
+LIBMP_LT_REVISION = 1
+LIBMP_LT_AGE = 1
+
+AUTOMAKE_OPTIONS = gnu check-news no-dependencies ansi2knr
+
+SUBDIRS = mpn mpz
+
+include_HEADERS = gmp.h $(MPBSD_HEADERS_OPTION) $(MPFR_HEADERS_OPTION)
+EXTRA_HEADERS = mp.h
+
+lib_LTLIBRARIES = libgmp.la $(MPBSD_LTLIBRARIES_OPTION)
+
+EXTRA_DIST = .gdbinit gmp-impl.h longlong.h stack-alloc.h urandom.h doc macos
+
+DISTCLEANFILES = asm-syntax.h config.m4 @gmp_srclinks@
+
+MPZ_OBJECTS = mpz/abs.lo mpz/add.lo mpz/add_ui.lo mpz/addmul_ui.lo mpz/and.lo \
+ mpz/array_init.lo mpz/bin_ui.lo mpz/bin_uiui.lo mpz/cdiv_q.lo \
+ mpz/cdiv_q_ui.lo mpz/cdiv_qr.lo mpz/cdiv_qr_ui.lo mpz/cdiv_r.lo \
+ mpz/cdiv_r_ui.lo mpz/cdiv_ui.lo mpz/clear.lo mpz/clrbit.lo mpz/cmp.lo \
+ mpz/cmp_si.lo mpz/cmp_ui.lo mpz/cmpabs.lo mpz/cmpabs_ui.lo mpz/com.lo \
+ mpz/divexact.lo mpz/dump.lo mpz/fac_ui.lo mpz/fdiv_q.lo mpz/fdiv_q_2exp.lo \
+ mpz/fdiv_q_ui.lo mpz/fdiv_qr.lo mpz/fdiv_qr_ui.lo mpz/fdiv_r.lo \
+ mpz/fdiv_r_2exp.lo mpz/fdiv_r_ui.lo mpz/fdiv_ui.lo mpz/fib_ui.lo \
+ mpz/fits_sint_p.lo mpz/fits_slong_p.lo mpz/fits_sshort_p.lo \
+ mpz/fits_uint_p.lo mpz/fits_ulong_p.lo mpz/fits_ushort_p.lo mpz/gcd.lo \
+ mpz/gcd_ui.lo mpz/gcdext.lo mpz/get_d.lo mpz/get_si.lo mpz/get_str.lo \
+ mpz/get_ui.lo mpz/getlimbn.lo mpz/hamdist.lo mpz/init.lo mpz/inp_raw.lo \
+ mpz/inp_str.lo mpz/invert.lo mpz/ior.lo mpz/iset.lo mpz/iset_d.lo \
+ mpz/iset_si.lo mpz/iset_str.lo mpz/iset_ui.lo mpz/jacobi.lo \
+ mpz/kronsz.lo mpz/kronuz.lo mpz/kronzs.lo mpz/kronzu.lo \
+ mpz/lcm.lo mpz/legendre.lo \
+ mpz/mod.lo mpz/mul.lo mpz/mul_2exp.lo mpz/mul_si.lo mpz/mul_ui.lo \
+ mpz/neg.lo mpz/nextprime.lo mpz/out_raw.lo mpz/out_str.lo mpz/perfpow.lo mpz/perfsqr.lo \
+ mpz/popcount.lo mpz/pow_ui.lo mpz/powm.lo mpz/powm_ui.lo mpz/pprime_p.lo \
+ mpz/random.lo mpz/random2.lo mpz/realloc.lo mpz/remove.lo mpz/root.lo \
+ mpz/rrandomb.lo \
+ mpz/scan0.lo mpz/scan1.lo mpz/set.lo mpz/set_d.lo mpz/set_f.lo mpz/set_q.lo \
+ mpz/set_si.lo mpz/set_str.lo mpz/set_ui.lo mpz/setbit.lo mpz/size.lo \
+ mpz/sizeinbase.lo mpz/sqrt.lo mpz/sqrtrem.lo mpz/sub.lo mpz/sub_ui.lo \
+ mpz/swap.lo mpz/tdiv_ui.lo mpz/tdiv_q.lo mpz/tdiv_q_2exp.lo mpz/tdiv_q_ui.lo \
+ mpz/tdiv_qr.lo mpz/tdiv_qr_ui.lo mpz/tdiv_r.lo mpz/tdiv_r_2exp.lo \
+ mpz/tdiv_r_ui.lo mpz/tstbit.lo mpz/ui_pow_ui.lo mpz/urandomb.lo \
+ mpz/urandomm.lo mpz/xor.lo
+
+MPN_OBJECTS = @mpn_objs_in_libgmp@
+
+MPBSD_OBJECTS = mpbsd/add.lo mpbsd/tdiv_qr.lo mpbsd/move.lo mpbsd/powm.lo \
+ mpbsd/sub.lo mpbsd/cmp.lo mpbsd/mfree.lo mpbsd/mtox.lo mpbsd/realloc.lo \
+ mpbsd/gcd.lo mpbsd/itom.lo mpbsd/min.lo mpbsd/mul.lo mpbsd/mout.lo \
+ mpbsd/pow_ui.lo mpbsd/sdiv.lo mpbsd/sqrtrem.lo mpbsd/xtom.lo
+
+
+
+@WANT_MPFR_TRUE@MPFR_HEADERS_OPTION = @WANT_MPFR_TRUE@mpfr/mpfr.h
+@WANT_MPFR_TRUE@MPFR_OBJECTS_OPTION = @WANT_MPFR_TRUE@$(MPFR_OBJECTS)
+@WANT_MPFR_TRUE@MPFR_LIBADD_OPTION = @WANT_MPFR_TRUE@-lm
+libgmp_la_SOURCES = assert.c compat.c errno.c memory.c mp_set_fns.c \
+ mp_clz_tab.c mp_minv_tab.c \
+ version.c stack-alloc.c mp_bpl.c extract-dbl.c insert-dbl.c
+
+libgmp_la_DEPENDENCIES = \
+ $(MPF_OBJECTS) $(MPZ_OBJECTS) $(MPN_OBJECTS) $(MPQ_OBJECTS) \
+ $(MPFR_OBJECTS_OPTION)
+
+libgmp_la_LIBADD = $(libgmp_la_DEPENDENCIES) $(MPFR_LIBADD_OPTION)
+libgmp_la_LDFLAGS = \
+ -version-info $(LIBGMP_LT_CURRENT):$(LIBGMP_LT_REVISION):$(LIBGMP_LT_AGE)
+
+
+@WANT_MPBSD_TRUE@MPBSD_HEADERS_OPTION = @WANT_MPBSD_TRUE@mp.h
+@WANT_MPBSD_TRUE@MPBSD_LTLIBRARIES_OPTION = @WANT_MPBSD_TRUE@libmp.la
+libmp_la_SOURCES = assert.c errno.c memory.c mp_bpl.c mp_clz_tab.c \
+ mp_minv_tab.c mp_set_fns.c stack-alloc.c
+
+libmp_la_DEPENDENCIES = $(MPBSD_OBJECTS) $(MPN_OBJECTS) \
+ mpz/add.lo mpz/clear.lo mpz/cmp.lo mpz/init.lo mpz/mod.lo mpz/mul.lo \
+ mpz/mul_2exp.lo mpz/realloc.lo mpz/set.lo mpz/set_ui.lo mpz/tdiv_r.lo \
+ mpz/sub.lo
+
+libmp_la_LIBADD = $(libmp_la_DEPENDENCIES)
+libmp_la_LDFLAGS = \
+ -version-info $(LIBMP_LT_CURRENT):$(LIBMP_LT_REVISION):$(LIBMP_LT_AGE)
+
+
+info_TEXINFOS = gmp.texi
+subdir = .
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs
+CONFIG_HEADER = config.h
+CONFIG_CLEAN_FILES =
+LTLIBRARIES = $(lib_LTLIBRARIES)
+
+
+DEFS = @DEFS@ -I. -I$(srcdir) -I.
+CPPFLAGS = @CPPFLAGS@
+LDFLAGS = @LDFLAGS@
+LIBS = @LIBS@
+ANSI2KNR = @ANSI2KNR@
+am_libgmp_la_OBJECTS = assert$U.lo compat$U.lo errno$U.lo memory$U.lo \
+mp_set_fns$U.lo mp_clz_tab$U.lo mp_minv_tab$U.lo rand$U.lo randclr$U.lo \
+randlc$U.lo randlc2x$U.lo randraw$U.lo randsd$U.lo randsdui$U.lo \
+version$U.lo stack-alloc$U.lo mp_bpl$U.lo extract-dbl$U.lo \
+insert-dbl$U.lo
+libgmp_la_OBJECTS = $(am_libgmp_la_OBJECTS)
+am_libmp_la_OBJECTS = assert$U.lo errno$U.lo memory$U.lo mp_bpl$U.lo \
+mp_clz_tab$U.lo mp_minv_tab$U.lo mp_set_fns$U.lo stack-alloc$U.lo
+libmp_la_OBJECTS = $(am_libmp_la_OBJECTS)
+COMPILE = $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CFLAGS = @CFLAGS@
+CCLD = $(CC)
+LINK = $(LIBTOOL) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+DIST_SOURCES = $(libgmp_la_SOURCES) $(libmp_la_SOURCES)
+TEXI2DVI = texi2dvi
+# INFO_DEPS = gmp.info
+DVIS = gmp.dvi
+TEXINFOS = gmp.texi
+HEADERS = $(include_HEADERS)
+
+DIST_COMMON = README $(EXTRA_HEADERS) $(include_HEADERS) ./stamp-h.in \
+AUTHORS COPYING COPYING.LIB ChangeLog INSTALL Makefile.am Makefile.in \
+NEWS acconfig.h acinclude.m4 aclocal.m4 ansi2knr.1 ansi2knr.c \
+config.guess config.in config.sub configure configure.in depcomp \
+install-sh ltconfig ltmain.sh mdate-sh missing mkinstalldirs stamp-vti \
+texinfo.tex version.texi
+
+
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+
+GZIP_ENV = --best
+depcomp =
+SOURCES = $(libgmp_la_SOURCES) $(libmp_la_SOURCES)
+OBJECTS = $(am_libgmp_la_OBJECTS) $(am_libmp_la_OBJECTS)
+
+all: all-redirect
+.SUFFIXES:
+.SUFFIXES: .c .dvi .info .lo .o .obj .ps .texi .texinfo .txi
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4)
+ cd $(top_srcdir) && $(AUTOMAKE) --gnu Makefile
+
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+ cd $(top_builddir) \
+ && CONFIG_FILES=$@ CONFIG_HEADERS= $(SHELL) ./config.status
+
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ configure.in acinclude.m4
+ cd $(srcdir) && $(ACLOCAL)
+
+config.status: $(srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+ $(SHELL) ./config.status --recheck
+$(srcdir)/configure: @MAINTAINER_MODE_TRUE@$(srcdir)/configure.in $(ACLOCAL_M4) $(CONFIGURE_DEPENDENCIES)
+ cd $(srcdir) && $(AUTOCONF)
+
+config.h: stamp-h
+ @if test ! -f $@; then \
+ rm -f stamp-h; \
+ $(MAKE) stamp-h; \
+ else :; fi
+stamp-h: $(srcdir)/config.in $(top_builddir)/config.status
+ @rm -f stamp-h stamp-hT
+ @echo timestamp > stamp-hT 2> /dev/null
+ cd $(top_builddir) \
+ && CONFIG_FILES= CONFIG_HEADERS=config.h:config.in \
+ $(SHELL) ./config.status
+ @mv stamp-hT stamp-h
+$(srcdir)/config.in: @MAINTAINER_MODE_TRUE@$(srcdir)/./stamp-h.in
+ @if test ! -f $@; then \
+ rm -f $(srcdir)/./stamp-h.in; \
+ $(MAKE) $(srcdir)/./stamp-h.in; \
+ else :; fi
+$(srcdir)/./stamp-h.in: $(top_srcdir)/configure.in $(ACLOCAL_M4) acconfig.h
+ @rm -f $(srcdir)/./stamp-h.in $(srcdir)/./stamp-h.inT
+ @echo timestamp > $(srcdir)/./stamp-h.inT 2> /dev/null
+ cd $(top_srcdir) && $(AUTOHEADER)
+ @mv $(srcdir)/./stamp-h.inT $(srcdir)/./stamp-h.in
+
+mostlyclean-hdr:
+
+clean-hdr:
+
+distclean-hdr:
+ -rm -f config.h
+
+maintainer-clean-hdr:
+
+mostlyclean-libLTLIBRARIES:
+
+clean-libLTLIBRARIES:
+ -test -z "$(lib_LTLIBRARIES)" || rm -f $(lib_LTLIBRARIES)
+
+distclean-libLTLIBRARIES:
+
+maintainer-clean-libLTLIBRARIES:
+
+install-libLTLIBRARIES: $(lib_LTLIBRARIES)
+ @$(NORMAL_INSTALL)
+ $(mkinstalldirs) $(DESTDIR)$(libdir)
+ @list='$(lib_LTLIBRARIES)'; for p in $$list; do \
+ if test -f $$p; then \
+ echo " $(LIBTOOL) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$p $(DESTDIR)$(libdir)/$$p"; \
+ $(LIBTOOL) --mode=install $(INSTALL) $(INSTALL_STRIP_FLAG) $$p $(DESTDIR)$(libdir)/$$p; \
+ else :; fi; \
+ done
+
+uninstall-libLTLIBRARIES:
+ @$(NORMAL_UNINSTALL)
+ @list='$(lib_LTLIBRARIES)'; for p in $$list; do \
+ echo " $(LIBTOOL) --mode=uninstall rm -f $(DESTDIR)$(libdir)/$$p"; \
+ $(LIBTOOL) --mode=uninstall rm -f $(DESTDIR)$(libdir)/$$p; \
+ done
+
+mostlyclean-compile:
+ -rm -f *.o core *.core
+ -rm -f *.$(OBJEXT)
+
+clean-compile:
+
+distclean-compile:
+ -rm -f *.tab.c
+
+maintainer-clean-compile:
+
+mostlyclean-libtool:
+ -rm -f *.lo
+
+clean-libtool:
+ -rm -rf .libs _libs
+
+distclean-libtool:
+
+maintainer-clean-libtool:
+
+mostlyclean-krextra:
+
+clean-krextra:
+ -rm -f ansi2knr
+
+distclean-krextra:
+
+maintainer-clean-krextra:
+ansi2knr: ansi2knr.$(OBJEXT)
+ $(LINK) ansi2knr.$(OBJEXT) $(LIBS)
+ansi2knr.$(OBJEXT): $(CONFIG_HEADER)
+
+
+mostlyclean-kr:
+ -rm -f *_.c
+
+clean-kr:
+
+distclean-kr:
+
+maintainer-clean-kr:
+
+gmp.dll: libgmp.a
+ dllwrap -mno-cygwin --target=i386-unknown-mingw32 \
+ --export-all --dllname gmp.dll --output-lib=libgmp_imp.a \
+ -o gmp.dll libgmp.a
+
+libgmp.la: $(libgmp_la_OBJECTS) $(libgmp_la_DEPENDENCIES)
+ $(LINK) -rpath $(libdir) $(libgmp_la_LDFLAGS) $(libgmp_la_OBJECTS) $(libgmp_la_LIBADD) $(LIBS)
+
+libmp.la: $(libmp_la_OBJECTS) $(libmp_la_DEPENDENCIES)
+ $(LINK) -rpath $(libdir) $(libmp_la_LDFLAGS) $(libmp_la_OBJECTS) $(libmp_la_LIBADD) $(LIBS)
+.c.o:
+ $(COMPILE) -c $<
+.c.obj:
+ $(COMPILE) -c `cygpath -w $<`
+.c.lo:
+ $(LTCOMPILE) -c -o $@ $<
+assert_.c: assert.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/assert.c; then echo $(srcdir)/assert.c; else echo assert.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > assert_.c
+compat_.c: compat.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/compat.c; then echo $(srcdir)/compat.c; else echo compat.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > compat_.c
+errno_.c: errno.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/errno.c; then echo $(srcdir)/errno.c; else echo errno.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > errno_.c
+extract-dbl_.c: extract-dbl.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/extract-dbl.c; then echo $(srcdir)/extract-dbl.c; else echo extract-dbl.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > extract-dbl_.c
+insert-dbl_.c: insert-dbl.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/insert-dbl.c; then echo $(srcdir)/insert-dbl.c; else echo insert-dbl.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > insert-dbl_.c
+memory_.c: memory.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/memory.c; then echo $(srcdir)/memory.c; else echo memory.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > memory_.c
+mp_bpl_.c: mp_bpl.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mp_bpl.c; then echo $(srcdir)/mp_bpl.c; else echo mp_bpl.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > mp_bpl_.c
+mp_clz_tab_.c: mp_clz_tab.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mp_clz_tab.c; then echo $(srcdir)/mp_clz_tab.c; else echo mp_clz_tab.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > mp_clz_tab_.c
+mp_minv_tab_.c: mp_minv_tab.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mp_minv_tab.c; then echo $(srcdir)/mp_minv_tab.c; else echo mp_minv_tab.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > mp_minv_tab_.c
+mp_set_fns_.c: mp_set_fns.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/mp_set_fns.c; then echo $(srcdir)/mp_set_fns.c; else echo mp_set_fns.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > mp_set_fns_.c
+rand_.c: rand.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/rand.c; then echo $(srcdir)/rand.c; else echo rand.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > rand_.c
+randclr_.c: randclr.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/randclr.c; then echo $(srcdir)/randclr.c; else echo randclr.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > randclr_.c
+randlc_.c: randlc.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/randlc.c; then echo $(srcdir)/randlc.c; else echo randlc.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > randlc_.c
+randlc2x_.c: randlc2x.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/randlc2x.c; then echo $(srcdir)/randlc2x.c; else echo randlc2x.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > randlc2x_.c
+randraw_.c: randraw.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/randraw.c; then echo $(srcdir)/randraw.c; else echo randraw.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > randraw_.c
+randsd_.c: randsd.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/randsd.c; then echo $(srcdir)/randsd.c; else echo randsd.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > randsd_.c
+randsdui_.c: randsdui.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/randsdui.c; then echo $(srcdir)/randsdui.c; else echo randsdui.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > randsdui_.c
+stack-alloc_.c: stack-alloc.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/stack-alloc.c; then echo $(srcdir)/stack-alloc.c; else echo stack-alloc.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > stack-alloc_.c
+version_.c: version.c $(ANSI2KNR)
+ $(CPP) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/version.c; then echo $(srcdir)/version.c; else echo version.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > version_.c
+assert_.$(OBJEXT) assert_.lo compat_.$(OBJEXT) compat_.lo \
+errno_.$(OBJEXT) errno_.lo extract-dbl_.$(OBJEXT) extract-dbl_.lo \
+insert-dbl_.$(OBJEXT) insert-dbl_.lo memory_.$(OBJEXT) memory_.lo \
+mp_bpl_.$(OBJEXT) mp_bpl_.lo mp_clz_tab_.$(OBJEXT) mp_clz_tab_.lo \
+mp_minv_tab_.$(OBJEXT) mp_minv_tab_.lo mp_set_fns_.$(OBJEXT) \
+mp_set_fns_.lo rand_.$(OBJEXT) rand_.lo randclr_.$(OBJEXT) randclr_.lo \
+randlc_.$(OBJEXT) randlc_.lo randlc2x_.$(OBJEXT) randlc2x_.lo \
+randraw_.$(OBJEXT) randraw_.lo randsd_.$(OBJEXT) randsd_.lo \
+randsdui_.$(OBJEXT) randsdui_.lo stack-alloc_.$(OBJEXT) stack-alloc_.lo \
+version_.$(OBJEXT) version_.lo : $(ANSI2KNR)
+
+$(srcdir)/version.texi: @MAINTAINER_MODE_TRUE@stamp-vti
+ @:
+
+$(srcdir)/stamp-vti: gmp.texi $(top_srcdir)/configure.in
+ @echo "@set UPDATED `$(SHELL) $(srcdir)/mdate-sh $(srcdir)/gmp.texi`" > vti.tmp
+ @echo "@set EDITION $(VERSION)" >> vti.tmp
+ @echo "@set VERSION $(VERSION)" >> vti.tmp
+ @cmp -s vti.tmp $(srcdir)/version.texi \
+ || (echo "Updating $(srcdir)/version.texi"; \
+ cp vti.tmp $(srcdir)/version.texi)
+ -@rm -f vti.tmp
+ @cp $(srcdir)/version.texi $@
+
+mostlyclean-vti:
+ -rm -f vti.tmp
+
+clean-vti:
+
+distclean-vti:
+
+maintainer-clean-vti:
+ -@MAINTAINER_MODE_TRUE@rm -f $(srcdir)/stamp-vti $(srcdir)/version.texi
+
+# gmp.info: gmp.texi version.texi
+# gmp.dvi: gmp.texi version.texi
+
+
+DVIPS = dvips
+
+.texi.info:
+ @cd $(srcdir) && rm -f $@ $@-[0-9] $@-[0-9][0-9]
+ cd $(srcdir) \
+ && $(MAKEINFO) `echo $< | sed 's,.*/,,'`
+
+.texi.dvi:
+ TEXINPUTS=$(srcdir):$$TEXINPUTS \
+ MAKEINFO='$(MAKEINFO) -I $(srcdir)' $(TEXI2DVI) $<
+
+.texi:
+ @cd $(srcdir) && rm -f $@ $@-[0-9] $@-[0-9][0-9]
+ cd $(srcdir) \
+ && $(MAKEINFO) `echo $< | sed 's,.*/,,'`
+
+.texinfo.info:
+ @cd $(srcdir) && rm -f $@ $@-[0-9] $@-[0-9][0-9]
+ cd $(srcdir) \
+ && $(MAKEINFO) `echo $< | sed 's,.*/,,'`
+
+.texinfo:
+ @cd $(srcdir) && rm -f $@ $@-[0-9] $@-[0-9][0-9]
+ cd $(srcdir) \
+ && $(MAKEINFO) `echo $< | sed 's,.*/,,'`
+
+.texinfo.dvi:
+ TEXINPUTS=$(srcdir):$$TEXINPUTS \
+ MAKEINFO='$(MAKEINFO) -I $(srcdir)' $(TEXI2DVI) $<
+
+.txi.info:
+ @cd $(srcdir) && rm -f $@ $@-[0-9] $@-[0-9][0-9]
+ cd $(srcdir) \
+ && $(MAKEINFO) `echo $< | sed 's,.*/,,'`
+
+.txi.dvi:
+ TEXINPUTS=$(srcdir):$$TEXINPUTS \
+ MAKEINFO='$(MAKEINFO) -I $(srcdir)' $(TEXI2DVI) $<
+
+.txi:
+ @cd $(srcdir) && rm -f $@ $@-[0-9] $@-[0-9][0-9]
+ cd $(srcdir) \
+ && $(MAKEINFO) `echo $< | sed 's,.*/,,'`
+.dvi.ps:
+ $(DVIPS) $< -o $@
+
+install-info-am: $(INFO_DEPS)
+ @$(NORMAL_INSTALL)
+ $(mkinstalldirs) $(DESTDIR)$(infodir)
+ @list='$(INFO_DEPS)'; \
+ for file in $$list; do \
+ d=$(srcdir); \
+ for ifile in `CDPATH=: && cd $$d && echo $$file $$file-[0-9] $$file-[0-9][0-9]`; do \
+ if test -f $$d/$$ifile; then \
+ echo " $(INSTALL_DATA) $$d/$$ifile $(DESTDIR)$(infodir)/$$ifile"; \
+ $(INSTALL_DATA) $$d/$$ifile $(DESTDIR)$(infodir)/$$ifile; \
+ else : ; fi; \
+ done; \
+ done
+ @$(POST_INSTALL)
+ @if $(SHELL) -c 'install-info --version | sed 1q | fgrep -s -v -i debian' >/dev/null 2>&1; then \
+ list='$(INFO_DEPS)'; \
+ for file in $$list; do \
+ echo " install-info --info-dir=$(DESTDIR)$(infodir) $(DESTDIR)$(infodir)/$$file";\
+ install-info --info-dir=$(DESTDIR)$(infodir) $(DESTDIR)$(infodir)/$$file || :;\
+ done; \
+ else : ; fi
+
+uninstall-info:
+ $(PRE_UNINSTALL)
+ @if $(SHELL) -c 'install-info --version | sed 1q | fgrep -s -v -i debian' >/dev/null 2>&1; then \
+ list='$(INFO_DEPS)'; \
+ for file in $$list; do \
+ echo " install-info --info-dir=$(DESTDIR)$(infodir) --remove $(DESTDIR)$(infodir)/$$file"; \
+ install-info --info-dir=$(DESTDIR)$(infodir) --remove $(DESTDIR)$(infodir)/$$file; \
+ done; \
+ else :; fi
+ @$(NORMAL_UNINSTALL)
+ @list='$(INFO_DEPS)'; \
+ for file in $$list; do \
+ (if cd $(DESTDIR)$(infodir); then \
+ echo " rm -f $$file $$file-[0-9] $$file-[0-9][0-9])"; \
+ rm -f $$file $$file-[0-9] $$file-[0-9][0-9]; \
+ else :; fi); \
+ done
+
+dist-info: $(INFO_DEPS)
+ list='$(INFO_DEPS)'; \
+ for base in $$list; do \
+ d=$(srcdir); \
+ for file in `CDPATH=: && cd $$d && eval echo $$base*`; do \
+ test -f $(distdir)/$$file \
+ || cp -p $$d/$$file $(distdir)/$$file; \
+ done; \
+ done
+
+mostlyclean-aminfo:
+ -rm -f gmp.aux gmp.cp gmp.cps gmp.dvi gmp.fn gmp.fns gmp.pgs gmp.ky \
+ gmp.kys gmp.ps gmp.log gmp.pg gmp.toc gmp.tp gmp.tps gmp.vr \
+ gmp.vrs gmp.op gmp.tr gmp.cv gmp.cn gmp.cm gmp.ov
+
+clean-aminfo:
+
+distclean-aminfo:
+
+maintainer-clean-aminfo:
+ cd $(srcdir) && for i in $(INFO_DEPS); do \
+ rm -f $$i; \
+ if test "`echo $$i-[0-9]*`" != "$$i-[0-9]*"; then \
+ rm -f $$i-[0-9]*; \
+ fi; \
+ done
+
+install-includeHEADERS: $(include_HEADERS)
+ @$(NORMAL_INSTALL)
+ $(mkinstalldirs) $(DESTDIR)$(includedir)
+ @list='$(include_HEADERS)'; for p in $$list; do \
+ if test -f "$$p"; then d= ; else d="$(srcdir)/"; fi; \
+ f="`echo $$p | sed -e 's|^.*/||'`"; \
+ echo " $(INSTALL_DATA) $$d$$p $(DESTDIR)$(includedir)/$$f"; \
+ $(INSTALL_DATA) $$d$$p $(DESTDIR)$(includedir)/$$f; \
+ done
+
+uninstall-includeHEADERS:
+ @$(NORMAL_UNINSTALL)
+ @list='$(include_HEADERS)'; for p in $$list; do \
+ f="`echo $$p | sed -e 's|^.*/||'`"; \
+ echo " rm -f $(DESTDIR)$(includedir)/$$f"; \
+ rm -f $(DESTDIR)$(includedir)/$$f; \
+ done
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+# (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+
+all-recursive install-data-recursive install-exec-recursive \
+installdirs-recursive install-recursive uninstall-recursive \
+check-recursive installcheck-recursive info-recursive dvi-recursive:
+ @set fnord $(MAKEFLAGS); amf=$$2; \
+ dot_seen=no; \
+ target=`echo $@ | sed s/-recursive//`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ echo "Making $$target in $$subdir"; \
+ if test "$$subdir" = "."; then \
+ dot_seen=yes; \
+ local_target="$$target-am"; \
+ else \
+ local_target="$$target"; \
+ fi; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
+ done; \
+ if test "$$dot_seen" = "no"; then \
+ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+ fi; test -z "$$fail"
+
+mostlyclean-recursive clean-recursive distclean-recursive \
+maintainer-clean-recursive:
+ @set fnord $(MAKEFLAGS); amf=$$2; \
+ dot_seen=no; \
+ rev=''; list='$(SUBDIRS)'; for subdir in $$list; do \
+ rev="$$subdir $$rev"; \
+ if test "$$subdir" = "."; then dot_seen=yes; else :; fi; \
+ done; \
+ test "$$dot_seen" = "no" && rev=". $$rev"; \
+ target=`echo $@ | sed s/-recursive//`; \
+ for subdir in $$rev; do \
+ echo "Making $$target in $$subdir"; \
+ if test "$$subdir" = "."; then \
+ local_target="$$target-am"; \
+ else \
+ local_target="$$target"; \
+ fi; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
+ done && test -z "$$fail"
+tags-recursive:
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+ done
+
+tags: TAGS
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+ list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ mkid -f$$here/ID $$unique $(LISP)
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES) config.in $(TAGS_DEPENDENCIES) \
+ $(TAGS_FILES) $(LISP)
+ tags=; \
+ here=`pwd`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ if test "$$subdir" = .; then :; else \
+ test -f $$subdir/TAGS && tags="$$tags -i $$here/$$subdir/TAGS"; \
+ fi; \
+ done; \
+ list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ test -z "$(ETAGS_ARGS)config.in$$unique$(LISP)$$tags" \
+ || etags $(ETAGS_ARGS) $$tags config.in $$unique $(LISP)
+
+mostlyclean-tags:
+
+clean-tags:
+
+distclean-tags:
+ -rm -f TAGS ID
+
+maintainer-clean-tags:
+
+distdir = $(PACKAGE)-$(VERSION)
+top_distdir = $(distdir)
+
+
+# This target untars the dist file and tries a VPATH configuration. Then
+# it guarantees that the distribution is self-contained by making another
+# tarfile.
+distcheck: dist
+ -chmod -R a+w $(distdir) > /dev/null 2>&1; rm -rf $(distdir)
+ GZIP=$(GZIP_ENV) gunzip -c $(distdir).tar.gz | $(AMTAR) xf -
+ chmod -R a-w $(distdir); chmod a+w $(distdir)
+ mkdir $(distdir)/=build
+ mkdir $(distdir)/=inst
+ chmod a-w $(distdir)
+ dc_install_base=`CDPATH=: && cd $(distdir)/=inst && pwd` \
+ && cd $(distdir)/=build \
+ && ../configure --srcdir=.. --prefix=$$dc_install_base \
+ && $(MAKE) $(AM_MAKEFLAGS) \
+ && $(MAKE) $(AM_MAKEFLAGS) dvi \
+ && $(MAKE) $(AM_MAKEFLAGS) check \
+ && $(MAKE) $(AM_MAKEFLAGS) install \
+ && $(MAKE) $(AM_MAKEFLAGS) installcheck \
+ && $(MAKE) $(AM_MAKEFLAGS) uninstall \
+ && test `find $$dc_install_base -type f -print | wc -l` -le 1 \
+ && $(MAKE) $(AM_MAKEFLAGS) dist \
+ && $(MAKE) $(AM_MAKEFLAGS) distclean \
+ && rm -f $(distdir).tar.gz \
+ && test `find . -type f -print | wc -l` -eq 0
+ -chmod -R a+w $(distdir) > /dev/null 2>&1; rm -rf $(distdir)
+ @banner="$(distdir).tar.gz is ready for distribution"; \
+ dashes=`echo "$$banner" | sed s/./=/g`; \
+ echo "$$dashes"; \
+ echo "$$banner"; \
+ echo "$$dashes"
+dist: distdir
+ -find $(distdir) -type d ! -perm -777 -exec chmod a+rwx {} \; -o \
+ ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \
+ ! -type d ! -perm -400 -exec chmod a+r {} \; -o \
+ ! -type d ! -perm -444 -exec $(SHELL) $(install_sh) -c -m a+r {} {} \; \
+ || chmod -R a+r $(distdir)
+ $(AMTAR) chof - $(distdir) | GZIP=$(GZIP_ENV) gzip -c > $(distdir).tar.gz
+ -chmod -R a+w $(distdir) > /dev/null 2>&1; rm -rf $(distdir)
+dist-all: distdir
+ -find $(distdir) -type d ! -perm -777 -exec chmod a+rwx {} \; -o \
+ ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \
+ ! -type d ! -perm -400 -exec chmod a+r {} \; -o \
+ ! -type d ! -perm -444 -exec $(SHELL) $(install_sh) -c -m a+r {} {} \; \
+ || chmod -R a+r $(distdir)
+ $(AMTAR) chof - $(distdir) | GZIP=$(GZIP_ENV) gzip -c > $(distdir).tar.gz
+ -chmod -R a+w $(distdir) > /dev/null 2>&1; rm -rf $(distdir)
+distdir: $(DISTFILES)
+ @if sed 15q $(srcdir)/NEWS | fgrep -e "$(VERSION)" > /dev/null; then :; else \
+ echo "NEWS not updated; not releasing" 1>&2; \
+ exit 1; \
+ fi
+ -chmod -R a+w $(distdir) > /dev/null 2>&1; rm -rf $(distdir)
+ mkdir $(distdir)
+ $(mkinstalldirs) $(distdir)/mpfr
+ @for file in $(DISTFILES); do \
+ d=$(srcdir); \
+ if test -d $$d/$$file; then \
+ cp -pR $$d/$$file $(distdir); \
+ else \
+ test -f $(distdir)/$$file \
+ || cp -p $$d/$$file $(distdir)/$$file || :; \
+ fi; \
+ done
+ for subdir in $(SUBDIRS); do \
+ if test "$$subdir" = .; then :; else \
+ test -d $(distdir)/$$subdir \
+ || mkdir $(distdir)/$$subdir \
+ || exit 1; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir=../$(distdir) distdir=../$(distdir)/$$subdir distdir) \
+ || exit 1; \
+ fi; \
+ done
+ $(MAKE) $(AM_MAKEFLAGS) top_distdir="$(top_distdir)" distdir="$(distdir)" dist-info
+ $(MAKE) $(AM_MAKEFLAGS) top_distdir="$(top_distdir)" distdir="$(distdir)" dist-hook
+info-am: $(INFO_DEPS)
+info: info-recursive
+dvi-am: $(DVIS)
+dvi: dvi-recursive
+check-am: all-am
+check: check-recursive
+installcheck-am:
+installcheck: installcheck-recursive
+all-recursive-am: config.h
+ $(MAKE) $(AM_MAKEFLAGS) all-recursive
+
+install-exec-am: install-libLTLIBRARIES
+install-exec: install-exec-recursive
+
+install-data-am: install-info-am install-includeHEADERS
+install-data: install-data-recursive
+
+install-am: all-am
+ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+install: install-recursive
+uninstall-am: uninstall-libLTLIBRARIES uninstall-info \
+ uninstall-includeHEADERS
+uninstall: uninstall-recursive
+all-am: Makefile $(INFO_DEPS) $(ANSI2KNR) $(LTLIBRARIES) $(HEADERS) \
+ config.h
+all-redirect: all-recursive-am
+install-strip:
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_STRIP_FLAG=-s install
+installdirs: installdirs-recursive
+installdirs-am:
+ $(mkinstalldirs) $(DESTDIR)$(libdir) $(DESTDIR)$(infodir) \
+ $(DESTDIR)$(includedir)
+
+
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+ -rm -f Makefile $(CONFIG_CLEAN_FILES)
+ -rm -f config.cache config.log stamp-h stamp-h[0-9]*
+ -test -z "$(DISTCLEANFILES)" || rm -f $(DISTCLEANFILES)
+
+maintainer-clean-generic:
+ -rm -f Makefile.in
+mostlyclean-am: mostlyclean-hdr mostlyclean-libLTLIBRARIES \
+ mostlyclean-compile mostlyclean-libtool \
+ mostlyclean-krextra mostlyclean-kr mostlyclean-vti \
+ mostlyclean-aminfo mostlyclean-tags mostlyclean-generic
+
+mostlyclean: mostlyclean-recursive
+
+clean-am: clean-hdr clean-libLTLIBRARIES clean-compile clean-libtool \
+ clean-krextra clean-kr clean-vti clean-aminfo \
+ clean-tags clean-generic mostlyclean-am
+
+clean: clean-recursive
+
+distclean-am: distclean-hdr distclean-libLTLIBRARIES distclean-compile \
+ distclean-libtool distclean-krextra distclean-kr \
+ distclean-vti distclean-aminfo distclean-tags \
+ distclean-generic clean-am
+ -rm -f libtool
+
+distclean: distclean-recursive
+ -rm -f config.status
+
+maintainer-clean-am: maintainer-clean-hdr \
+ maintainer-clean-libLTLIBRARIES \
+ maintainer-clean-compile maintainer-clean-libtool \
+ maintainer-clean-krextra maintainer-clean-kr \
+ maintainer-clean-vti maintainer-clean-aminfo \
+ maintainer-clean-tags maintainer-clean-generic \
+ distclean-am
+ @echo "This command is intended for maintainers to use;"
+ @echo "it deletes files that may require special tools to rebuild."
+
+maintainer-clean: maintainer-clean-recursive
+ -rm -f config.status
+
+.PHONY: mostlyclean-hdr distclean-hdr clean-hdr maintainer-clean-hdr \
+mostlyclean-libLTLIBRARIES distclean-libLTLIBRARIES \
+clean-libLTLIBRARIES maintainer-clean-libLTLIBRARIES \
+uninstall-libLTLIBRARIES install-libLTLIBRARIES mostlyclean-compile \
+distclean-compile clean-compile maintainer-clean-compile \
+mostlyclean-libtool distclean-libtool clean-libtool \
+maintainer-clean-libtool mostlyclean-krextra distclean-krextra \
+clean-krextra maintainer-clean-krextra mostlyclean-kr distclean-kr \
+clean-kr maintainer-clean-kr mostlyclean-vti distclean-vti clean-vti \
+maintainer-clean-vti install-info-am uninstall-info mostlyclean-aminfo \
+distclean-aminfo clean-aminfo maintainer-clean-aminfo \
+uninstall-includeHEADERS install-includeHEADERS install-recursive \
+uninstall-recursive install-data-recursive uninstall-data-recursive \
+install-exec-recursive uninstall-exec-recursive installdirs-recursive \
+uninstalldirs-recursive all-recursive check-recursive \
+installcheck-recursive info-recursive dvi-recursive \
+mostlyclean-recursive distclean-recursive clean-recursive \
+maintainer-clean-recursive tags tags-recursive mostlyclean-tags \
+distclean-tags clean-tags maintainer-clean-tags distdir info-am info \
+dvi-am dvi check check-am installcheck-am installcheck all-recursive-am \
+install-exec-am install-exec install-data-am install-data install-am \
+install uninstall-am uninstall all-redirect all-am all install-strip \
+installdirs-am installdirs mostlyclean-generic distclean-generic \
+clean-generic maintainer-clean-generic clean mostlyclean distclean \
+maintainer-clean
+
+
+# Don't ship CVS directories or emacs backups.
+dist-hook:
+ -find $(distdir) \( -name CVS -type d \) -o -name "*.~*" \
+ | xargs rm -rf
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/rts/gmp/NEWS b/rts/gmp/NEWS
new file mode 100644
index 0000000000..3b549d59f3
--- /dev/null
+++ b/rts/gmp/NEWS
@@ -0,0 +1,136 @@
+Changes between MP version 3.1 and 3.1.1
+
+* Bug fixes for division (rare), mpf_get_str, FFT, and miscellaneous minor
+ things.
+
+Changes between MP version 3.0 and 3.1
+
+* Bug fixes.
+* Improved `make check' running more tests.
+* Tuned algorithm cutoff points for many machines. This will improve speed for
+ a lot of operations, in some cases by a large amount.
+* Major speed improvements: Alpha 21264.
+* Some speed improvements: Cray vector computers, AMD K6 and Athlon, Intel P5
+ and Pentium Pro/II/III.
+* The mpf_get_prec function now works as it did in GMP 2.
+* New utilities for auto-tuning and speed measuring.
+* Multiplication now optionally uses FFT for very large operands. (To enable
+ it, pass --enable-fft to configure.)
+* Support for new systems: Solaris running on x86, FreeBSD 5, HP-UX 11, Cray
+ vector computers, Rhapsody, Nextstep/Openstep, MacOS.
+* Support for shared libraries on 32-bit HPPA.
+* New integer functions: mpz_mul_si, mpz_odd_p, mpz_even_p.
+* New Kronecker symbol functions: mpz_kronecker_si, mpz_kronecker_ui,
+ mpz_si_kronecker, mpz_ui_kronecker.
+* New rational functions: mpq_out_str, mpq_swap.
+* New float functions: mpf_swap.
+* New mpn functions: mpn_divexact_by3c, mpn_tdiv_qr.
+* New EXPERIMENTAL function layer for accurate floating-point arithmetic, mpfr.
+ To try it, pass --enable-mpfr to configure. See the mpfr subdirectory for
+ more information; it is not documented in the main GMP manual.
+
+Changes between MP version 3.0 and 3.0.1
+
+* Memory leaks in gmp_randinit and mpz_probab_prime_p fixed.
+* Documentation for gmp_randinit fixed. Misc documentation errors fixed.
+
+Changes between MP version 2.0 and 3.0
+
+* Source level compatibility with past releases (except mpn_gcd).
+* Bug fixes.
+* Much improved speed thanks to both host independent and host dependent
+ optimizations.
+* Switch to autoconf/automake/libtool.
+* Support for building libgmp as a shared library.
+* Multiplication and squaring using 3-way Toom-Cook.
+* Division using the Burnikel-Ziegler method.
+* New functions computing binomial coefficients: mpz_bin_ui, mpz_bin_uiui.
+* New function computing Fibonacci numbers: mpz_fib_ui.
+* New random number generators: mpf_urandomb, mpz_rrandomb, mpz_urandomb,
+ mpz_urandomm, gmp_randclear, gmp_randinit, gmp_randinit_lc_2exp, gmp_randseed,
+ gmp_randseed_ui.
+* New function for quickly extracting limbs: mpz_getlimbn.
+* New functions performing integer size tests: mpz_fits_sint_p,
+ mpz_fits_slong_p, mpz_fits_sshort_p, mpz_fits_uint_p, mpz_fits_ulong_p,
+ mpz_fits_ushort_p.
+* New mpf functions: mpf_ceil, mpf_floor, mpf_pow_ui, mpf_trunc.
+* New mpq function: mpq_set_d.
+* New mpz functions: mpz_addmul_ui, mpz_cmpabs, mpz_cmpabs_ui, mpz_lcm,
+ mpz_nextprime, mpz_perfect_power_p, mpz_remove, mpz_root, mpz_swap,
+ mpz_tdiv_ui, mpz_tstbit, mpz_xor.
+* New mpn function: mpn_divexact_by3.
+* New CPU support: DEC Alpha 21264, AMD K6 and Athlon, HPPA 2.0 and 64,
+ Intel Pentium Pro and Pentium-II/III, Sparc 64, PowerPC 64.
+* Almost 10 times faster mpz_invert and mpn_gcdext.
+* The interface of mpn_gcd has changed.
+* Better support for MIPS R4x000 and R5000 under Irix 6.
+* Improved support for SPARCv8 and SPARCv9 processors.
+
+Changes between MP version 2.0 and 2.0.2
+
+* Many bug fixes.
+
+Changes between MP version 1.3.2 and 2.0
+
+* Division routines in the mpz class have changed. There are three classes of
+  functions, which round the quotient to -infinity, 0, and +infinity,
+ respectively. The first class of functions have names that begin with
+ mpz_fdiv (f is short for floor), the second class' names begin with mpz_tdiv
+ (t is short for trunc), and the third class' names begin with mpz_cdiv (c is
+ short for ceil).
+
+ The old division routines beginning with mpz_m are similar to the new
+ mpz_fdiv, with the exception that some of the new functions return useful
+ values.
+
+ The old function names can still be used. All the old functions names will
+ now do floor division, not trunc division as some of them used to. This was
+ changed to make the functions more compatible with common mathematical
+ practice.
+
+ The mpz_mod and mpz_mod_ui functions now compute the mathematical mod
+ function. I.e., the sign of the 2nd argument is ignored.
+
+* The mpq assignment functions do not canonicalize their results. A new
+ function, mpq_canonicalize must be called by the user if the result is not
+ known to be canonical.
+* The mpn functions are now documented. These functions are intended for
+ very time critical applications, or applications that need full control over
+ memory allocation. Note that the mpn interface is irregular and hard to
+ use.
+* New functions for arbitrary precision floating point arithmetic. Names
+ begin with `mpf_'. Associated type mpf_t.
+* New and improved mpz functions, including much faster GCD, fast exact
+ division (mpz_divexact), bit scan (mpz_scan0 and mpz_scan1), and number
+ theoretical functions like Jacobi (mpz_jacobi) and multiplicative inverse
+ (mpz_invert).
+* New variable types (mpz_t and mpq_t) are available that makes syntax of
+ mpz and mpq calls nicer (no need for & before variables). The MP_INT and
+ MP_RAT types are still available for compatibility.
+* Uses GNU configure. This makes it possible to choose target architecture
+ and CPU variant, and to compile into a separate object directory.
+* Carefully optimized assembly for important inner loops. Support for DEC
+ Alpha, Amd 29000, HPPA 1.0 and 1.1, Intel Pentium and generic x86, Intel
+ i960, Motorola MC68000, MC68020, MC88100, and MC88110, Motorola/IBM
+ PowerPC, National NS32000, IBM POWER, MIPS R3000, R4000, SPARCv7,
+ SuperSPARC, generic SPARCv8, and DEC VAX. Some support also for ARM,
+ Clipper, IBM ROMP (RT), and Pyramid AP/XP.
+* Faster. Thanks to the assembler code, new algorithms, and general tuning.
+ In particular, the speed on machines without GCC is improved.
+* Support for machines without alloca.
+* Now under the LGPL.
+
+INCOMPATIBILITIES BETWEEN GMP 1 AND GMP 2
+
+* mpq assignment functions do not canonicalize their results.
+* mpz division functions round differently.
+* mpz mod functions now really compute mod.
+* mpz_powm and mpz_powm_ui now really use mod for reduction.
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/README b/rts/gmp/README
new file mode 100644
index 0000000000..177c97eb12
--- /dev/null
+++ b/rts/gmp/README
@@ -0,0 +1,84 @@
+
+ THE GNU MP LIBRARY
+
+
+GNU MP is a library for arbitrary precision arithmetic, operating on signed
+integers, rational numbers, and floating point numbers. It has a rich set of
+functions, and the functions have a regular interface.
+
+GNU MP is designed to be as fast as possible, both for small operands and huge
+operands. The speed is achieved by using fullwords as the basic arithmetic
+type, by using fast algorithms, with carefully optimized assembly code for the
+most common inner loops for lots of CPUs, and by a general emphasis on speed
+(instead of simplicity or elegance).
+
+GNU MP is believed to be faster than any other similar library. Its advantage
+increases with operand sizes for certain operations, since GNU MP in many
+cases has asymptotically faster algorithms.
+
+GNU MP is free software and may be freely copied on the terms contained in the
+files COPYING.LIB and COPYING (most of GNU MP is under the former, some under
+the latter).
+
+
+
+ OVERVIEW OF GNU MP
+
+There are five classes of functions in GNU MP.
+
+ 1. Signed integer arithmetic functions (mpz). These functions are intended
+ to be easy to use, with their regular interface. The associated type is
+ `mpz_t'.
+
+ 2. Rational arithmetic functions (mpq). For now, just a small set of
+ functions necessary for basic rational arithmetics. The associated type
+ is `mpq_t'.
+
+ 3. Floating-point arithmetic functions (mpf). If the C type `double'
+ doesn't give enough precision for your application, declare your
+ variables as `mpf_t' instead, set the precision to any number desired,
+ and call the functions in the mpf class for the arithmetic operations.
+
+ 4. Positive-integer, hard-to-use, very low overhead functions are in the
+ mpn class. No memory management is performed. The caller must ensure
+ enough space is available for the results. The set of functions is not
+ regular, nor is the calling interface. These functions accept input
+ arguments in the form of pairs consisting of a pointer to the least
+ significant word, and an integral size telling how many limbs (= words)
+ the pointer points to.
+
+ Almost all calculations, in the entire package, are made by calling these
+ low-level functions.
+
+ 5. Berkeley MP compatible functions.
+
+ To use these functions, include the file "mp.h". You can test if you are
+ using the GNU version by testing if the symbol __GNU_MP__ is defined.
+
+For more information on how to use GNU MP, please refer to the documentation.
+It is composed from the file gmp.texi, and can be displayed on the screen or
+printed. How to do that, as well how to build the library, is described in
+the INSTALL file in this directory.
+
+
+
+ REPORTING BUGS
+
+If you find a bug in the library, please make sure to tell us about it!
+
+You should first check the GNU MP web pages at http://www.swox.com/gmp/,
+under "Status of the current release". There will be patches for all known
+serious bugs there.
+
+Report bugs to bug-gmp@gnu.org. What information is needed in a good bug
+report is described in the manual. The same address can be used for
+suggesting modifications and enhancements.
+
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 78
+End:
diff --git a/rts/gmp/acconfig.h b/rts/gmp/acconfig.h
new file mode 100644
index 0000000000..dfb1b0b039
--- /dev/null
+++ b/rts/gmp/acconfig.h
@@ -0,0 +1,92 @@
+/*
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+@TOP@
+
+/* Define if a limb is long long. */
+#undef _LONG_LONG_LIMB
+
+/* Define if we have native implementation of function. */
+#undef HAVE_NATIVE_
+#undef HAVE_NATIVE_mpn_add
+#undef HAVE_NATIVE_mpn_add_1
+#undef HAVE_NATIVE_mpn_add_n
+#undef HAVE_NATIVE_mpn_add_nc
+#undef HAVE_NATIVE_mpn_addmul_1
+#undef HAVE_NATIVE_mpn_addmul_1c
+#undef HAVE_NATIVE_mpn_addsub_n
+#undef HAVE_NATIVE_mpn_addsub_nc
+#undef HAVE_NATIVE_mpn_and_n
+#undef HAVE_NATIVE_mpn_andn_n
+#undef HAVE_NATIVE_mpn_bdivmod
+#undef HAVE_NATIVE_mpn_cmp
+#undef HAVE_NATIVE_mpn_com_n
+#undef HAVE_NATIVE_mpn_copyd
+#undef HAVE_NATIVE_mpn_copyi
+#undef HAVE_NATIVE_mpn_divexact_by3c
+#undef HAVE_NATIVE_mpn_divrem
+#undef HAVE_NATIVE_mpn_divrem_1
+#undef HAVE_NATIVE_mpn_divrem_1c
+#undef HAVE_NATIVE_mpn_divrem_2
+#undef HAVE_NATIVE_mpn_divrem_newton
+#undef HAVE_NATIVE_mpn_divrem_classic
+#undef HAVE_NATIVE_mpn_dump
+#undef HAVE_NATIVE_mpn_gcd
+#undef HAVE_NATIVE_mpn_gcd_1
+#undef HAVE_NATIVE_mpn_gcdext
+#undef HAVE_NATIVE_mpn_get_str
+#undef HAVE_NATIVE_mpn_hamdist
+#undef HAVE_NATIVE_mpn_invert_limb
+#undef HAVE_NATIVE_mpn_ior_n
+#undef HAVE_NATIVE_mpn_iorn_n
+#undef HAVE_NATIVE_mpn_lshift
+#undef HAVE_NATIVE_mpn_mod_1
+#undef HAVE_NATIVE_mpn_mod_1c
+#undef HAVE_NATIVE_mpn_mul
+#undef HAVE_NATIVE_mpn_mul_1
+#undef HAVE_NATIVE_mpn_mul_1c
+#undef HAVE_NATIVE_mpn_mul_basecase
+#undef HAVE_NATIVE_mpn_mul_n
+#undef HAVE_NATIVE_mpn_nand_n
+#undef HAVE_NATIVE_mpn_nior_n
+#undef HAVE_NATIVE_mpn_perfect_square_p
+#undef HAVE_NATIVE_mpn_popcount
+#undef HAVE_NATIVE_mpn_preinv_mod_1
+#undef HAVE_NATIVE_mpn_random2
+#undef HAVE_NATIVE_mpn_random
+#undef HAVE_NATIVE_mpn_rawrandom
+#undef HAVE_NATIVE_mpn_rshift
+#undef HAVE_NATIVE_mpn_scan0
+#undef HAVE_NATIVE_mpn_scan1
+#undef HAVE_NATIVE_mpn_set_str
+#undef HAVE_NATIVE_mpn_sqrtrem
+#undef HAVE_NATIVE_mpn_sqr_basecase
+#undef HAVE_NATIVE_mpn_sub
+#undef HAVE_NATIVE_mpn_sub_1
+#undef HAVE_NATIVE_mpn_sub_n
+#undef HAVE_NATIVE_mpn_sub_nc
+#undef HAVE_NATIVE_mpn_submul_1
+#undef HAVE_NATIVE_mpn_submul_1c
+#undef HAVE_NATIVE_mpn_udiv_w_sdiv
+#undef HAVE_NATIVE_mpn_umul_ppmm
+#undef HAVE_NATIVE_mpn_udiv_qrnnd
+#undef HAVE_NATIVE_mpn_xor_n
+#undef HAVE_NATIVE_mpn_xnor_n
diff --git a/rts/gmp/acinclude.m4 b/rts/gmp/acinclude.m4
new file mode 100644
index 0000000000..a02394a963
--- /dev/null
+++ b/rts/gmp/acinclude.m4
@@ -0,0 +1,835 @@
+dnl GMP specific autoconf macros
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl GMP_HEADER_GETVAL(NAME,FILE)
+dnl ----------------------------
+dnl Expand to the value of a "#define NAME" from the given FILE.
+dnl The regexps here aren't very rugged, but are enough for gmp.
+dnl /dev/null as a parameter prevents a hang if $2 is accidentally omitted.
+
+define(GMP_HEADER_GETVAL,
+[patsubst(patsubst(
+esyscmd([grep "^#define $1 " $2 /dev/null 2>/dev/null]),
+[^.*$1[ ]+],[]),
+[[
+ ]*$],[])])
+
+
+dnl GMP_VERSION
+dnl -----------
+dnl The gmp version number, extracted from the #defines in gmp.h.
+dnl Two digits like 3.0 if patchlevel <= 0, or three digits like 3.0.1 if
+dnl patchlevel > 0.
+
+define(GMP_VERSION,
+[GMP_HEADER_GETVAL(__GNU_MP_VERSION,gmp.h)[]dnl
+.GMP_HEADER_GETVAL(__GNU_MP_VERSION_MINOR,gmp.h)[]dnl
+ifelse(m4_eval(GMP_HEADER_GETVAL(__GNU_MP_VERSION_PATCHLEVEL,gmp.h) > 0),1,
+[.GMP_HEADER_GETVAL(__GNU_MP_VERSION_PATCHLEVEL,gmp.h)])])
+
+
+dnl GMP_PROG_M4()
+dnl -------------
+dnl
+dnl Find a working m4, either in $PATH or likely locations, and setup $M4
+dnl and an AC_SUBST accordingly. If $M4 is already set then it's a user
+dnl choice and is accepted with no checks. GMP_PROG_M4 is like
+dnl AC_PATH_PROG or AC_CHECK_PROG, but it tests each m4 found to see if
+dnl it's good enough.
+dnl
+dnl See mpn/asm-defs.m4 for details on the known bad m4s.
+
+AC_DEFUN(GMP_PROG_M4,
+[AC_CACHE_CHECK([for suitable m4],
+ gmp_cv_prog_m4,
+[if test -n "$M4"; then
+ gmp_cv_prog_m4="$M4"
+else
+ cat >conftest.m4 <<\EOF
+dnl must protect this against being expanded during autoconf m4!
+[define(dollarhash,``$][#'')dnl
+ifelse(dollarhash(x),1,`define(t1,Y)',
+``bad: $][# not supported (SunOS /usr/bin/m4)
+'')dnl
+ifelse(eval(89),89,`define(t2,Y)',
+`bad: eval() doesnt support 8 or 9 in a constant (OpenBSD 2.6 m4)
+')dnl
+ifelse(t1`'t2,YY,`good
+')dnl]
+EOF
+ echo "trying m4" 1>&AC_FD_CC
+ gmp_tmp_val="`(m4 conftest.m4) 2>&AC_FD_CC`"
+ echo "$gmp_tmp_val" 1>&AC_FD_CC
+ if test "$gmp_tmp_val" = good; then
+ gmp_cv_prog_m4="m4"
+ else
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+dnl $ac_dummy forces splitting on constant user-supplied paths.
+dnl POSIX.2 word splitting is done only on the output of word expansions,
+dnl not every word. This closes a longstanding sh security hole.
+ ac_dummy="$PATH:/usr/5bin"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ echo "trying $ac_dir/m4" 1>&AC_FD_CC
+ gmp_tmp_val="`($ac_dir/m4 conftest.m4) 2>&AC_FD_CC`"
+ echo "$gmp_tmp_val" 1>&AC_FD_CC
+ if test "$gmp_tmp_val" = good; then
+ gmp_cv_prog_m4="$ac_dir/m4"
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+ if test -z "$gmp_cv_prog_m4"; then
+ AC_MSG_ERROR([No usable m4 in \$PATH or /usr/5bin (see config.log for reasons).])
+ fi
+ fi
+ rm -f conftest.m4
+fi])
+M4="$gmp_cv_prog_m4"
+AC_SUBST(M4)
+])
+
+
+dnl GMP_PROG_CC_FIND([CC_LIST], [REQ_64BIT_CC])
+dnl Find first working compiler in CC_LIST.
+dnl If REQ_64BIT_CC is "yes", the compiler is required to be able to
+dnl produce 64-bit code.
+dnl NOTE: If a compiler needs any special flags for producing 64-bit code,
+dnl these have to be found in shell variable `gmp_cflags64_{cc}', where `{cc}'
+dnl is the name of the compiler.
+dnl Set CC to the name of the first working compiler.
+dnl If a 64-bit compiler is found, set CC64 to the name of the compiler and
+dnl CFLAGS64 to flags to use.
+dnl This macro does not test if any of the compilers found is a GNU compiler.
+dnl To do this, when you have finally made up your mind on which one to use,
+dnl and set CC accordingly, invoke [GMP_PROG_CC_SELECT]. That macro will
+dnl also make sure that your selection of CFLAGS is valid.
+dnl
+AC_DEFUN(GMP_PROG_CC_FIND,
+[AC_BEFORE([$0], [CC_PROG_CPP])
+ifelse([$1], , gmp_cc_list="gcc cc", gmp_cc_list="[$1]")
+ifelse([$2], , gmp_req_64bit_cc="no", gmp_req_64bit_cc="[$2]")
+
+CC32=
+CC64=
+for c in $gmp_cc_list; do
+ # Avoid cache hits.
+ unset CC
+ unset ac_cv_prog_CC
+ AC_CHECK_TOOL(CC, $c, $c)
+ if test -n "$CC"; then
+ eval c_flags=\$gmp_cflags_$c
+ GMP_PROG_CC_WORKS($CC, $c_flags,
+ gmp_prog_cc_works=yes,
+ gmp_prog_cc_works=no)
+
+ if test "$gmp_prog_cc_works" != "yes"; then
+ continue
+ fi
+
+ # Save first working compiler, whether 32- or 64-bit capable.
+ if test -z "$CC32"; then
+ CC32="$CC"
+ fi
+ if test "$gmp_req_64bit_cc" = "yes"; then
+ eval c_flags=\$gmp_cflags64_$c
+
+ # Verify that the compiler works in 64-bit mode as well.
+ # /usr/ucb/cc on Solaris 7 can *compile* in 64-bit mode, but not link.
+ GMP_PROG_CC_WORKS($c, $c_flags,
+ gmp_prog_cc_works=yes,
+ gmp_prog_cc_works=no)
+
+ if test "$gmp_prog_cc_works" = "yes"; then
+ GMP_CHECK_CC_64BIT($c, $c_flags)
+ if test "$gmp_cv_cc_64bit" = "yes"; then
+ test -z "$CC64" && CC64="$c"
+ test -z "$CFLAGS64" && CFLAGS64="$c_flags"
+ # We have CC64 so we're done.
+ break
+ fi
+ fi
+ else
+ # We have CC32, and we don't need a 64-bit compiler so we're done.
+ break
+ fi
+ fi
+done
+CC="$CC32"
+])dnl
+
+dnl GMP_PROG_CC_SELECT
+dnl Check that `CC' works with `CFLAGS'. Check if `CC' is a GNU compiler.
+dnl Cache the result as `ac_cv_prog_CC'.
+AC_DEFUN(GMP_PROG_CC_SELECT,
+[AC_BEFORE([$0], [CC_PROG_CPP])
+AC_PROG_CC_WORKS
+AC_PROG_CC_GNU
+
+if test "$ac_cv_prog_gcc" = "yes"; then
+ GCC=yes
+else
+ GCC=
+fi
+
+# Set CFLAGS if not already set.
+if test -z "$CFLAGS"; then
+ CFLAGS="-g"
+ if test "$GCC" = "yes"; then
+ CFLAGS="$CFLAGS -O2"
+ fi
+fi
+
+AC_SUBST(CC)
+AC_CACHE_VAL(ac_cv_prog_CC, ac_cv_prog_CC="$CC")
+AC_PROVIDE([AC_PROG_CC])
+])dnl
+
+dnl GMP_CHECK_CC_64BIT(cc, cflags64)
+dnl Find out if `CC' can produce 64-bit code.
+dnl Requires NM to be set to nm for target.
+dnl FIXME: Cache result.
+AC_DEFUN(GMP_CHECK_CC_64BIT,
+[
+ gmp_tmp_CC_save="$CC"
+ CC="[$1]"
+ AC_MSG_CHECKING([whether the C compiler ($CC) is 64-bit capable])
+ if test -z "$NM"; then
+ echo; echo ["configure: $0: fatal: need nm"]
+ exit 1
+ fi
+ gmp_tmp_CFLAGS_save="$CFLAGS"
+ CFLAGS="[$2]"
+
+ case "$target" in
+ hppa2.0*-*-*)
+ # FIXME: If gcc is installed under another name than "gcc", we will
+ # test the wrong thing.
+ if test "$CC" != "gcc"; then
+ dnl Let compiler version A.10.32.30 or higher be ok.
+ dnl Bad compiler output:
+ dnl ccom: HP92453-01 G.10.32.05 HP C Compiler
+ dnl Good compiler output:
+ dnl ccom: HP92453-01 A.10.32.30 HP C Compiler
+ echo >conftest.c
+ gmp_tmp_vs=`$CC $CFLAGS -V -c -o conftest.o conftest.c 2>&1 | grep "^ccom:"`
+ rm conftest*
+ gmp_tmp_v1=`echo $gmp_tmp_vs | sed 's/.* .\.\(.*\)\..*\..* HP C.*/\1/'`
+ gmp_tmp_v2=`echo $gmp_tmp_vs | sed 's/.* .\..*\.\(.*\)\..* HP C.*/\1/'`
+ gmp_tmp_v3=`echo $gmp_tmp_vs | sed 's/.* .\..*\..*\.\(.*\) HP C.*/\1/'`
+ gmp_cv_cc_64bit=no
+ test -n "$gmp_tmp_v1" && test "$gmp_tmp_v1" -ge "10" \
+ && test -n "$gmp_tmp_v2" && test "$gmp_tmp_v2" -ge "32" \
+ && test -n "$gmp_tmp_v3" && test "$gmp_tmp_v3" -ge "30" \
+ && gmp_cv_cc_64bit=yes
+ else # gcc
+ # FIXME: Compile a minimal file and determine if the resulting object
+ # file is an ELF file. If so, gcc can produce 64-bit code.
+ # Do we have file(1) for target?
+ gmp_cv_cc_64bit=no
+ fi
+ ;;
+ mips-sgi-irix6.*)
+ # We use `-n32' to cc and `-mabi=n32' to gcc, resulting in 64-bit
+ # arithmetic but not 64-bit pointers, so the general test for sizeof
+ # (void *) is not valid.
+ # Simply try to compile an empty main. If that succeeds return
+ # true.
+ AC_TRY_COMPILE( , ,
+ gmp_cv_cc_64bit=yes, gmp_cv_cc_64bit=no,
+ gmp_cv_cc_64bit=no)
+ ;;
+ *-*-*)
+ # Allocate an array of size sizeof (void *) and use nm to determine its
+ # size. We depend on the first declared variable being put at address 0.
+ cat >conftest.c <<EOF
+[char arr[sizeof (void *)]={0};
+char post=0;]
+EOF
+ gmp_compile="$CC $CFLAGS -c conftest.c 1>&AC_FD_CC"
+ if AC_TRY_EVAL(gmp_compile); then
+ changequote(<,>)dnl
+ gmp_tmp_val=`$NM conftest.o | grep post | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ changequote([, ])dnl
+ if test "$gmp_tmp_val" = "8"; then
+ gmp_cv_cc_64bit=yes
+ else
+ gmp_cv_cc_64bit=no
+ fi
+ else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.$ac_ext >&AC_FD_CC
+ gmp_cv_cc_64bit=no
+ fi
+ rm -f conftest*
+ ;;
+ esac
+
+ CC="$gmp_tmp_CC_save"
+ CFLAGS="$gmp_tmp_CFLAGS_save"
+ AC_MSG_RESULT($gmp_cv_cc_64bit)
+])dnl
+
+dnl GMP_INIT([M4-DEF-FILE])
+dnl
+AC_DEFUN(GMP_INIT,
+[ifelse([$1], , gmp_configm4=config.m4, gmp_configm4="[$1]")
+gmp_tmpconfigm4=cnfm4.tmp
+gmp_tmpconfigm4i=cnfm4i.tmp
+gmp_tmpconfigm4p=cnfm4p.tmp
+test -f $gmp_tmpconfigm4 && rm $gmp_tmpconfigm4
+test -f $gmp_tmpconfigm4i && rm $gmp_tmpconfigm4i
+test -f $gmp_tmpconfigm4p && rm $gmp_tmpconfigm4p
+])dnl
+
+dnl GMP_FINISH
+dnl ----------
+dnl Create config.m4 from its accumulated parts.
+dnl
+dnl __CONFIG_M4_INCLUDED__ is used so that a second or subsequent include
+dnl of config.m4 is harmless.
+dnl
+dnl A separate ifdef on the angle bracket quoted part ensures the quoting
+dnl style there is respected. The basic defines from gmp_tmpconfigm4 are
+dnl fully quoted but are still put under an ifdef in case any have been
+dnl redefined by one of the m4 include files.
+dnl
+dnl Doing a big ifdef within asm-defs.m4 and/or other macro files wouldn't
+dnl work, since it'd interpret parentheses and quotes in dnl comments, and
+dnl having a whole file as a macro argument would overflow the string space
+dnl on BSD m4.
+
+AC_DEFUN(GMP_FINISH,
+[AC_REQUIRE([GMP_INIT])
+echo "creating $gmp_configm4"
+echo ["dnl $gmp_configm4. Generated automatically by configure."] > $gmp_configm4
+if test -f $gmp_tmpconfigm4; then
+ echo ["changequote(<,>)dnl"] >> $gmp_configm4
+ echo ["ifdef(<__CONFIG_M4_INCLUDED__>,,<"] >> $gmp_configm4
+ cat $gmp_tmpconfigm4 >> $gmp_configm4
+ echo [">)"] >> $gmp_configm4
+ echo ["changequote(\`,')dnl"] >> $gmp_configm4
+ rm $gmp_tmpconfigm4
+fi
+echo ["ifdef(\`__CONFIG_M4_INCLUDED__',,\`"] >> $gmp_configm4
+if test -f $gmp_tmpconfigm4i; then
+ cat $gmp_tmpconfigm4i >> $gmp_configm4
+ rm $gmp_tmpconfigm4i
+fi
+if test -f $gmp_tmpconfigm4p; then
+ cat $gmp_tmpconfigm4p >> $gmp_configm4
+ rm $gmp_tmpconfigm4p
+fi
+echo ["')"] >> $gmp_configm4
+echo ["define(\`__CONFIG_M4_INCLUDED__')"] >> $gmp_configm4
+])dnl
+
+dnl GMP_INCLUDE(FILE)
+AC_DEFUN(GMP_INCLUDE,
+[AC_REQUIRE([GMP_INIT])
+echo ["include(\`$1')"] >> $gmp_tmpconfigm4i
+])dnl
+
+dnl GMP_SINCLUDE(FILE)
+AC_DEFUN(GMP_SINCLUDE,
+[AC_REQUIRE([GMP_INIT])
+echo ["sinclude(\`$1')"] >> $gmp_tmpconfigm4i
+])dnl
+
+dnl GMP_DEFINE(MACRO, DEFINITION [, LOCATION])
+dnl [ Define M4 macro MACRO as DEFINITION in temporary file. ]
+dnl [ If LOCATION is `POST', the definition will appear after any ]
+dnl [ include() directives inserted by GMP_INCLUDE/GMP_SINCLUDE. ]
+dnl [ Mind the quoting! No shell variables will get expanded. ]
+dnl [ Don't forget to invoke GMP_FINISH to create file config.m4. ]
+dnl [ config.m4 uses `<' and '>' as quote characters for all defines. ]
+AC_DEFUN(GMP_DEFINE,
+[AC_REQUIRE([GMP_INIT])
+echo ['define(<$1>, <$2>)'] >> ifelse([$3], [POST], $gmp_tmpconfigm4p, $gmp_tmpconfigm4)
+])dnl
+
+dnl GMP_DEFINE_RAW(STRING, [, LOCATION])
+dnl [ Put STRING in temporary file. ]
+dnl [ If LOCATION is `POST', the definition will appear after any ]
+dnl [ include() directives inserted by GMP_INCLUDE/GMP_SINCLUDE. ]
+dnl [ Don't forget to invoke GMP_FINISH to create file config.m4. ]
+AC_DEFUN(GMP_DEFINE_RAW,
+[AC_REQUIRE([GMP_INIT])
+echo [$1] >> ifelse([$2], [POST], $gmp_tmpconfigm4p, $gmp_tmpconfigm4)
+])dnl
+
+dnl GMP_CHECK_ASM_LABEL_SUFFIX
+dnl Should a label have a colon or not?
+AC_DEFUN(GMP_CHECK_ASM_LABEL_SUFFIX,
+[AC_CACHE_CHECK([what assembly label suffix to use],
+ gmp_cv_check_asm_label_suffix,
+[case "$target" in
+ *-*-hpux*) gmp_cv_check_asm_label_suffix=[""] ;;
+ *) gmp_cv_check_asm_label_suffix=[":"] ;;
+esac
+])
+echo ["define(<LABEL_SUFFIX>, <\$][1$gmp_cv_check_asm_label_suffix>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_UNDERSCORE([ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND]])
+dnl Shamelessly borrowed from glibc.
+AC_DEFUN(GMP_CHECK_ASM_UNDERSCORE,
+[AC_CACHE_CHECK([if symbols are prefixed by underscore],
+ gmp_cv_check_asm_underscore,
+[cat > conftest.$ac_ext <<EOF
+dnl This sometimes fails to find confdefs.h, for some reason.
+dnl [#]line __oline__ "[$]0"
+[#]line __oline__ "configure"
+#include "confdefs.h"
+int underscore_test() {
+return; }
+EOF
+if AC_TRY_EVAL(ac_compile); then
+ if grep _underscore_test conftest* >/dev/null; then
+ gmp_cv_check_asm_underscore=yes
+ else
+ gmp_cv_check_asm_underscore=no
+ fi
+else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.$ac_ext >&AC_FD_CC
+fi
+rm -f conftest*
+])
+if test "$gmp_cv_check_asm_underscore" = "yes"; then
+ GMP_DEFINE(GSYM_PREFIX, [_])
+ ifelse([$1], , :, [$1])
+else
+ GMP_DEFINE(GSYM_PREFIX, [])
+ ifelse([$2], , :, [$2])
+fi
+])dnl
+
+dnl GMP_CHECK_ASM_ALIGN_LOG([ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND]])
+dnl Is parameter to `.align' logarithmic?
+dnl Requires NM to be set to nm for target.
+AC_DEFUN(GMP_CHECK_ASM_ALIGN_LOG,
+[AC_REQUIRE([GMP_CHECK_ASM_GLOBL])
+AC_REQUIRE([GMP_CHECK_ASM_DATA])
+AC_REQUIRE([GMP_CHECK_ASM_LABEL_SUFFIX])
+AC_CACHE_CHECK([if .align assembly directive is logarithmic],
+ gmp_cv_check_asm_align_log,
+[if test -z "$NM"; then
+ echo; echo ["configure: $0: fatal: need nm"]
+ exit 1
+fi
+cat > conftest.s <<EOF
+ $gmp_cv_check_asm_data
+ .align 4
+ $gmp_cv_check_asm_globl foo
+ .byte 1
+ .align 4
+foo$gmp_cv_check_asm_label_suffix
+ .byte 2
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+if AC_TRY_EVAL(ac_assemble); then
+ changequote(<,>)
+ gmp_tmp_val=`$NM conftest.o | grep foo | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ changequote([, ])dnl
+ if test "$gmp_tmp_val" = "10" || test "$gmp_tmp_val" = "16"; then
+ gmp_cv_check_asm_align_log=yes
+ else
+ gmp_cv_check_asm_align_log=no
+ fi
+else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.s >&AC_FD_CC
+fi
+rm -f conftest*
+])
+GMP_DEFINE_RAW(["define(<ALIGN_LOGARITHMIC>,<$gmp_cv_check_asm_align_log>)"])
+if test "$gmp_cv_check_asm_align_log" = "yes"; then
+ ifelse([$1], , :, [$1])
+else
+ ifelse([$2], , :, [$2])
+fi
+])dnl
+
+
+dnl GMP_CHECK_ASM_ALIGN_FILL_0x90
+dnl -----------------------------
+dnl Determine whether a ",0x90" suffix works on a .align directive.
+dnl This is only meant for use on x86, where 0x90 is a "nop".
+dnl
+dnl Old gas, eg. 1.92.3 - needs ",0x90" or else the fill is an invalid 0x00.
+dnl New gas, eg. 2.91 - generates the good multibyte nop fills even when
+dnl ",0x90" is given.
+dnl Solaris 2.6 as - doesn't allow ",0x90", gives a fatal error.
+dnl Solaris 2.8 as - gives a warning for ",0x90", no ill effect.
+dnl
+dnl Note that both solaris "as"s only care about ",0x90" if they actually
+dnl have to use it to fill something, hence the .byte in the sample. It's
+dnl only the second .align that provokes an error or warning.
+dnl
+dnl We prefer to suppress the warning from solaris 2.8 to stop anyone
+dnl worrying something might be wrong.
+
+AC_DEFUN(GMP_CHECK_ASM_ALIGN_FILL_0x90,
+[AC_CACHE_CHECK([if the .align directive accepts an 0x90 fill in .text],
+ gmp_cv_check_asm_align_fill_0x90,
+[AC_REQUIRE([GMP_CHECK_ASM_TEXT])
+cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ .align 4, 0x90
+ .byte 0
+ .align 4, 0x90
+EOF
+gmp_tmp_val="`$CCAS $CFLAGS conftest.s 2>&1`"
+if test $? = 0; then
+ echo "$gmp_tmp_val" 1>&AC_FD_CC
+ if echo "$gmp_tmp_val" | grep "Warning: Fill parameter ignored for executable section"; then
+    echo "Suppressing this warning by omitting 0x90" 1>&AC_FD_CC
+ gmp_cv_check_asm_align_fill_0x90=no
+ else
+ gmp_cv_check_asm_align_fill_0x90=yes
+ fi
+else
+ echo "Non-zero exit code" 1>&AC_FD_CC
+ echo "$gmp_tmp_val" 1>&AC_FD_CC
+ gmp_cv_check_asm_align_fill_0x90=no
+fi
+rm -f conftest*
+])
+GMP_DEFINE_RAW(
+["define(<ALIGN_FILL_0x90>,<$gmp_cv_check_asm_align_fill_0x90>)"])
+])
+
+
+dnl GMP_CHECK_ASM_TEXT
+AC_DEFUN(GMP_CHECK_ASM_TEXT,
+[AC_CACHE_CHECK([how to switch to text section], gmp_cv_check_asm_text,
+[case "$target" in
+ *-*-aix*)
+ changequote({, })
+ gmp_cv_check_asm_text={".csect .text[PR]"}
+ changequote([, ])
+ ;;
+ *-*-hpux*) gmp_cv_check_asm_text=[".code"] ;;
+ *) gmp_cv_check_asm_text=[".text"] ;;
+esac
+])
+echo ["define(<TEXT>, <$gmp_cv_check_asm_text>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_DATA
+dnl Can we say `.data'?
+AC_DEFUN(GMP_CHECK_ASM_DATA,
+[AC_CACHE_CHECK([how to switch to data section], gmp_cv_check_asm_data,
+[case "$target" in
+ *-*-aix*)
+ changequote({, })
+ gmp_cv_check_asm_data={".csect .data[RW]"}
+ changequote([, ])
+ ;;
+ *) gmp_cv_check_asm_data=[".data"] ;;
+esac
+])
+echo ["define(<DATA>, <$gmp_cv_check_asm_data>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_GLOBL
+dnl Can we say `.global'?
+AC_DEFUN(GMP_CHECK_ASM_GLOBL,
+[AC_CACHE_CHECK([how to export a symbol], gmp_cv_check_asm_globl,
+[case "$target" in
+ *-*-hpux*) gmp_cv_check_asm_globl=[".export"] ;;
+ *) gmp_cv_check_asm_globl=[".globl"] ;;
+esac
+])
+echo ["define(<GLOBL>, <$gmp_cv_check_asm_globl>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_TYPE
+dnl Can we say `.type'?
+AC_DEFUN(GMP_CHECK_ASM_TYPE,
+[AC_CACHE_CHECK([how the .type assembly directive should be used],
+gmp_cv_check_asm_type,
+[ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+for gmp_tmp_prefix in @ \# %; do
+ echo " .type sym,${gmp_tmp_prefix}function" > conftest.s
+ if AC_TRY_EVAL(ac_assemble); then
+ gmp_cv_check_asm_type="[.type \$][1,${gmp_tmp_prefix}\$][2]"
+ break
+ fi
+done
+if test -z "$gmp_cv_check_asm_type"; then
+ gmp_cv_check_asm_type="[dnl]"
+fi
+])
+echo ["define(<TYPE>, <$gmp_cv_check_asm_type>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_SIZE
+dnl Can we say `.size'?
+AC_DEFUN(GMP_CHECK_ASM_SIZE,
+[AC_CACHE_CHECK([if the .size assembly directive works], gmp_cv_check_asm_size,
+[ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+echo ' .size sym,1' > conftest.s
+if AC_TRY_EVAL(ac_assemble); then
+ gmp_cv_check_asm_size="[.size \$][1,\$][2]"
+else
+ gmp_cv_check_asm_size="[dnl]"
+fi
+])
+echo ["define(<SIZE>, <$gmp_cv_check_asm_size>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_LSYM_PREFIX
+dnl What is the prefix for a local label?
+dnl Requires NM to be set to nm for target.
+AC_DEFUN(GMP_CHECK_ASM_LSYM_PREFIX,
+[AC_REQUIRE([GMP_CHECK_ASM_LABEL_SUFFIX])
+AC_CACHE_CHECK([what prefix to use for a local label],
+gmp_cv_check_asm_lsym_prefix,
+[if test -z "$NM"; then
+ echo; echo ["$0: fatal: need nm"]
+ exit 1
+fi
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+gmp_cv_check_asm_lsym_prefix="L"
+for gmp_tmp_pre in L .L $ L$; do
+ cat > conftest.s <<EOF
+dummy${gmp_cv_check_asm_label_suffix}
+${gmp_tmp_pre}gurkmacka${gmp_cv_check_asm_label_suffix}
+ .byte 0
+EOF
+ if AC_TRY_EVAL(ac_assemble); then
+ $NM conftest.o >/dev/null 2>&1
+ gmp_rc=$?
+ if test "$gmp_rc" != "0"; then
+ echo "configure: $NM failure, using default"
+ break
+ fi
+ if $NM conftest.o | grep gurkmacka >/dev/null; then true; else
+ gmp_cv_check_asm_lsym_prefix="$gmp_tmp_pre"
+ break
+ fi
+ else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.s >&AC_FD_CC
+ # Use default.
+ fi
+done
+rm -f conftest*
+])
+echo ["define(<LSYM_PREFIX>, <${gmp_cv_check_asm_lsym_prefix}>)"] >> $gmp_tmpconfigm4
+])
+
+dnl GMP_CHECK_ASM_W32
+dnl How to [define] a 32-bit word.
+dnl Requires NM to be set to nm for target.
+AC_DEFUN(GMP_CHECK_ASM_W32,
+[AC_REQUIRE([GMP_CHECK_ASM_DATA])
+AC_REQUIRE([GMP_CHECK_ASM_GLOBL])
+AC_REQUIRE([GMP_CHECK_ASM_LABEL_SUFFIX])
+AC_CACHE_CHECK([how to [define] a 32-bit word],
+ gmp_cv_check_asm_w32,
+[if test -z "$NM"; then
+ echo; echo ["configure: $0: fatal: need nm"]
+ exit 1
+fi
+
+# FIXME: HPUX puts first symbol at 0x40000000, breaking our assumption
+# that it's at 0x0. We'll have to declare another symbol before the
+# .long/.word and look at the distance between the two symbols. The
+# only problem is that the sed expression(s) barfs (on Solaris, for
+# example) for the symbol with value 0. For now, HPUX uses .word.
+
+case "$target" in
+ *-*-hpux*)
+ gmp_cv_check_asm_w32=".word"
+ ;;
+ *-*-*)
+ ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+ for gmp_tmp_op in .long .word; do
+ cat > conftest.s <<EOF
+ $gmp_cv_check_asm_data
+ $gmp_cv_check_asm_globl foo
+ $gmp_tmp_op 0
+foo${gmp_cv_check_asm_label_suffix}
+ .byte 0
+EOF
+ if AC_TRY_EVAL(ac_assemble); then
+ changequote(<,>)
+ gmp_tmp_val=`$NM conftest.o | grep foo | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ changequote([, ])dnl
+ if test "$gmp_tmp_val" = "4"; then
+ gmp_cv_check_asm_w32="$gmp_tmp_op"
+ break
+ fi
+ fi
+ done
+ ;;
+esac
+
+if test -z "$gmp_cv_check_asm_w32"; then
+ echo; echo ["configure: $0: fatal: do not know how to define a 32-bit word"]
+ exit 1
+fi
+rm -f conftest*
+])
+echo ["define(<W32>, <$gmp_cv_check_asm_w32>)"] >> $gmp_tmpconfigm4
+])
+
+dnl GMP_CHECK_ASM_MMX([ACTION-IF-FOUND, [ACTION-IF-NOT-FOUND]])
+dnl Can we assemble MMX insns?
+AC_DEFUN(GMP_CHECK_ASM_MMX,
+[AC_REQUIRE([GMP_CHECK_ASM_TEXT])
+AC_CACHE_CHECK([if the assembler knows about MMX instructions],
+ gmp_cv_check_asm_mmx,
+[cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ por %mm0, %mm0
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+if AC_TRY_EVAL(ac_assemble); then
+ gmp_cv_check_asm_mmx=yes
+else
+ gmp_cv_check_asm_mmx=no
+fi
+rm -f conftest*
+])
+if test "$gmp_cv_check_asm_mmx" = "yes"; then
+ ifelse([$1], , :, [$1])
+else
+ AC_MSG_WARN([+----------------------------------------------------------])
+ AC_MSG_WARN([| WARNING WARNING WARNING])
+ AC_MSG_WARN([| Target CPU has MMX code, but it can't be assembled by])
+ AC_MSG_WARN([| $CCAS $CFLAGS])
+ AC_MSG_WARN([| Non-MMX replacements will be used.])
+ AC_MSG_WARN([| This will be an inferior build.])
+ AC_MSG_WARN([+----------------------------------------------------------])
+ ifelse([$2], , :, [$2])
+fi
+])dnl
+
+dnl GMP_CHECK_ASM_SHLDL_CL([ACTION-IF-FOUND, [ACTION-IF-NOT-FOUND]])
+AC_DEFUN(GMP_CHECK_ASM_SHLDL_CL,
+[AC_REQUIRE([GMP_CHECK_ASM_TEXT])
+AC_CACHE_CHECK([if the assembler takes cl with shldl],
+ gmp_cv_check_asm_shldl_cl,
+[cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ shldl %cl, %eax, %ebx
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+if AC_TRY_EVAL(ac_assemble); then
+ gmp_cv_check_asm_shldl_cl=yes
+else
+ gmp_cv_check_asm_shldl_cl=no
+fi
+rm -f conftest*
+])
+if test "$gmp_cv_check_asm_shldl_cl" = "yes"; then
+ ifelse([$1], , :, [$1])
+else
+ ifelse([$2], , :, [$2])
+fi
+])dnl
+
+dnl GMP_PROG_CC_WORKS(CC, CFLAGS, ACTION-IF-WORKS, [ACTION-IF-NOT-WORKS])
+dnl Check if CC can compile and link. Perform various target specific tests.
+dnl FIXME: Require `$target'.
+AC_DEFUN(GMP_PROG_CC_WORKS,
+[AC_LANG_C dnl Note: Destructive.
+CC="[$1]"
+CFLAGS="[$2]"
+AC_MSG_CHECKING([if the C compiler ($CC) works with flags $CFLAGS])
+
+# Simple test for all targets.
+AC_TRY_COMPILER([int main(){return(0);}],
+ tmp_works, tmp_cross)
+
+# Target specific tests.
+if test "$tmp_works" = "yes"; then
+ case "$target" in
+ *-*-aix*) # Returning a funcptr.
+ AC_TRY_COMPILE( , [} void *g(); void *f() { return g(); } int bar(){],
+ tmp_works=yes, tmp_works=no)
+ ;;
+ esac
+fi
+
+if test "$tmp_works" = "yes"; then
+ [$3]
+else
+ ifelse([$4], , :, [$4])
+fi
+
+AC_MSG_RESULT($tmp_works)
+])dnl
+
+
+dnl GMP_C_ANSI2KNR
+dnl --------------
+dnl Setup to use ansi2knr if necessary.
+dnl
+dnl The test here is simply that if an ANSI style function works then
+dnl ansi2knr isn't needed. The normal tests for whether $CC works mean we
+dnl don't need to worry here about anything badly broken.
+dnl
+dnl AM_C_PROTOTYPES is the normal way to set up ansi2knr, but (in automake
+dnl March 2000) it gives the wrong answer on a C++ compiler because its
+dnl test requires that the compiler accept both ANSI and K&R, or otherwise
+dnl ansi2knr is used. A C++ compiler fails on the K&R part, which makes
+dnl AM_C_PROTOTYPES think it needs ansi2knr! GMP has no bare K&R so we
+dnl only need ANSI or K&R to work, not both.
+
+AC_DEFUN(GMP_C_ANSI2KNR,
+[AC_CACHE_CHECK([if ansi2knr should be used],
+ gmp_cv_c_ansi2knr,
+[cat >conftest.c <<EOF
+int main (int argc, char *argv[]) { return 0; }
+EOF
+if AC_TRY_EVAL(ac_compile); then
+ gmp_cv_c_ansi2knr=no
+else
+ gmp_cv_c_ansi2knr=yes
+fi
+rm -f conftest.*
+])
+if test $gmp_cv_c_ansi2knr = no; then
+ U= ANSI2KNR=
+else
+ U=_ ANSI2KNR=./ansi2knr
+ # Ensure some checks needed by ansi2knr itself.
+ AC_HEADER_STDC
+ AC_CHECK_HEADERS(string.h)
+fi
+AC_SUBST(U)
+AC_SUBST(ANSI2KNR)
+])
+
+
+dnl Deal with bad synchronization of Autoconf with Libtool.
+AC_DEFUN(AC_CANONICAL_BUILD, [_AC_CANONICAL_BUILD])
+AC_DEFUN(AC_CHECK_TOOL_PREFIX, [_AC_CHECK_TOOL_PREFIX])
diff --git a/rts/gmp/aclocal.m4 b/rts/gmp/aclocal.m4
new file mode 100644
index 0000000000..086c77915c
--- /dev/null
+++ b/rts/gmp/aclocal.m4
@@ -0,0 +1,1963 @@
+dnl aclocal.m4 generated automatically by aclocal 1.4a
+
+dnl Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc.
+dnl This file is free software; the Free Software Foundation
+dnl gives unlimited permission to copy and/or distribute it,
+dnl with or without modifications, as long as this notice is preserved.
+
+dnl This program is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+dnl even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+dnl PARTICULAR PURPOSE.
+
+dnl GMP specific autoconf macros
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl GMP_HEADER_GETVAL(NAME,FILE)
+dnl ----------------------------
+dnl Expand to the value of a "#define NAME" from the given FILE.
+dnl The regexps here aren't very rugged, but are enough for gmp.
+dnl /dev/null as a parameter prevents a hang if $2 is accidentally omitted.
+
+define(GMP_HEADER_GETVAL,
+[patsubst(patsubst(
+esyscmd([grep "^#define $1 " $2 /dev/null 2>/dev/null]),
+[^.*$1[ ]+],[]),
+[[
+ ]*$],[])])
+
+
+dnl GMP_VERSION
+dnl -----------
+dnl The gmp version number, extracted from the #defines in gmp.h.
+dnl Two digits like 3.0 if patchlevel <= 0, or three digits like 3.0.1 if
+dnl patchlevel > 0.
+
+define(GMP_VERSION,
+[GMP_HEADER_GETVAL(__GNU_MP_VERSION,gmp.h)[]dnl
+.GMP_HEADER_GETVAL(__GNU_MP_VERSION_MINOR,gmp.h)[]dnl
+ifelse(m4_eval(GMP_HEADER_GETVAL(__GNU_MP_VERSION_PATCHLEVEL,gmp.h) > 0),1,
+[.GMP_HEADER_GETVAL(__GNU_MP_VERSION_PATCHLEVEL,gmp.h)])])
+
+
+dnl GMP_PROG_M4()
+dnl -------------
+dnl
+dnl Find a working m4, either in $PATH or likely locations, and setup $M4
+dnl and an AC_SUBST accordingly. If $M4 is already set then it's a user
+dnl choice and is accepted with no checks. GMP_PROG_M4 is like
+dnl AC_PATH_PROG or AC_CHECK_PROG, but it tests each m4 found to see if
+dnl it's good enough.
+dnl
+dnl See mpn/asm-defs.m4 for details on the known bad m4s.
+
+AC_DEFUN(GMP_PROG_M4,
+[AC_CACHE_CHECK([for suitable m4],
+ gmp_cv_prog_m4,
+[if test -n "$M4"; then
+ gmp_cv_prog_m4="$M4"
+else
+ cat >conftest.m4 <<\EOF
+dnl must protect this against being expanded during autoconf m4!
+[define(dollarhash,``$][#'')dnl
+ifelse(dollarhash(x),1,`define(t1,Y)',
+``bad: $][# not supported (SunOS /usr/bin/m4)
+'')dnl
+ifelse(eval(89),89,`define(t2,Y)',
+`bad: eval() doesnt support 8 or 9 in a constant (OpenBSD 2.6 m4)
+')dnl
+ifelse(t1`'t2,YY,`good
+')dnl]
+EOF
+ echo "trying m4" 1>&AC_FD_CC
+ gmp_tmp_val="`(m4 conftest.m4) 2>&AC_FD_CC`"
+ echo "$gmp_tmp_val" 1>&AC_FD_CC
+ if test "$gmp_tmp_val" = good; then
+ gmp_cv_prog_m4="m4"
+ else
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+dnl $ac_dummy forces splitting on constant user-supplied paths.
+dnl POSIX.2 word splitting is done only on the output of word expansions,
+dnl not every word. This closes a longstanding sh security hole.
+ ac_dummy="$PATH:/usr/5bin"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ echo "trying $ac_dir/m4" 1>&AC_FD_CC
+ gmp_tmp_val="`($ac_dir/m4 conftest.m4) 2>&AC_FD_CC`"
+ echo "$gmp_tmp_val" 1>&AC_FD_CC
+ if test "$gmp_tmp_val" = good; then
+ gmp_cv_prog_m4="$ac_dir/m4"
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+ if test -z "$gmp_cv_prog_m4"; then
+ AC_MSG_ERROR([No usable m4 in \$PATH or /usr/5bin (see config.log for reasons).])
+ fi
+ fi
+ rm -f conftest.m4
+fi])
+M4="$gmp_cv_prog_m4"
+AC_SUBST(M4)
+])
+
+
+dnl GMP_PROG_CC_FIND([CC_LIST], [REQ_64BIT_CC])
+dnl Find first working compiler in CC_LIST.
+dnl If REQ_64BIT_CC is "yes", the compiler is required to be able to
+dnl produce 64-bit code.
+dnl NOTE: If a compiler needs any special flags for producing 64-bit code,
+dnl these have to be found in shell variable `gmp_cflags64_{cc}', where `{cc}'
+dnl is the name of the compiler.
+dnl Set CC to the name of the first working compiler.
+dnl If a 64-bit compiler is found, set CC64 to the name of the compiler and
+dnl CFLAGS64 to flags to use.
+dnl This macro does not test if any of the compilers found is a GNU compiler.
+dnl To do this, when you have finally made up your mind on which one to use,
+dnl and set CC accordingly, invoke [GMP_PROG_CC_SELECT]. That macro will
+dnl also make sure that your selection of CFLAGS is valid.
+dnl
+AC_DEFUN(GMP_PROG_CC_FIND,
+[AC_BEFORE([$0], [CC_PROG_CPP])
+ifelse([$1], , gmp_cc_list="gcc cc", gmp_cc_list="[$1]")
+ifelse([$2], , gmp_req_64bit_cc="no", gmp_req_64bit_cc="[$2]")
+
+CC32=
+CC64=
+for c in $gmp_cc_list; do
+ # Avoid cache hits.
+ unset CC
+ unset ac_cv_prog_CC
+ AC_CHECK_TOOL(CC, $c, $c)
+ if test -n "$CC"; then
+ eval c_flags=\$gmp_cflags_$c
+ GMP_PROG_CC_WORKS($CC, $c_flags,
+ gmp_prog_cc_works=yes,
+ gmp_prog_cc_works=no)
+
+ if test "$gmp_prog_cc_works" != "yes"; then
+ continue
+ fi
+
+ # Save first working compiler, whether 32- or 64-bit capable.
+ if test -z "$CC32"; then
+ CC32="$CC"
+ fi
+ if test "$gmp_req_64bit_cc" = "yes"; then
+ eval c_flags=\$gmp_cflags64_$c
+
+ # Verify that the compiler works in 64-bit mode as well.
+ # /usr/ucb/cc on Solaris 7 can *compile* in 64-bit mode, but not link.
+ GMP_PROG_CC_WORKS($c, $c_flags,
+ gmp_prog_cc_works=yes,
+ gmp_prog_cc_works=no)
+
+ if test "$gmp_prog_cc_works" = "yes"; then
+ GMP_CHECK_CC_64BIT($c, $c_flags)
+ if test "$gmp_cv_cc_64bit" = "yes"; then
+ test -z "$CC64" && CC64="$c"
+ test -z "$CFLAGS64" && CFLAGS64="$c_flags"
+ # We have CC64 so we're done.
+ break
+ fi
+ fi
+ else
+ # We have CC32, and we don't need a 64-bit compiler so we're done.
+ break
+ fi
+ fi
+done
+CC="$CC32"
+])dnl
+
+dnl GMP_PROG_CC_SELECT
+dnl Check that `CC' works with `CFLAGS'. Check if `CC' is a GNU compiler.
+dnl Cache the result as `ac_cv_prog_CC'.
+AC_DEFUN(GMP_PROG_CC_SELECT,
+[AC_BEFORE([$0], [CC_PROG_CPP])
+AC_PROG_CC_WORKS
+AC_PROG_CC_GNU
+
+if test "$ac_cv_prog_gcc" = "yes"; then
+ GCC=yes
+else
+ GCC=
+fi
+
+# Set CFLAGS if not already set.
+if test -z "$CFLAGS"; then
+ CFLAGS="-g"
+ if test "$GCC" = "yes"; then
+ CFLAGS="$CFLAGS -O2"
+ fi
+fi
+
+AC_SUBST(CC)
+AC_CACHE_VAL(ac_cv_prog_CC, ac_cv_prog_CC="$CC")
+AC_PROVIDE([AC_PROG_CC])
+])dnl
+
+dnl GMP_CHECK_CC_64BIT(cc, cflags64)
+dnl Find out if `CC' can produce 64-bit code.
+dnl Requires NM to be set to nm for target.
+dnl FIXME: Cache result.
+AC_DEFUN(GMP_CHECK_CC_64BIT,
+[
+ gmp_tmp_CC_save="$CC"
+ CC="[$1]"
+ AC_MSG_CHECKING([whether the C compiler ($CC) is 64-bit capable])
+ if test -z "$NM"; then
+ echo; echo ["configure: $0: fatal: need nm"]
+ exit 1
+ fi
+ gmp_tmp_CFLAGS_save="$CFLAGS"
+ CFLAGS="[$2]"
+
+ case "$target" in
+ hppa2.0*-*-*)
+ # FIXME: If gcc is installed under another name than "gcc", we will
+ # test the wrong thing.
+ if test "$CC" != "gcc"; then
+ dnl Let compiler version A.10.32.30 or higher be ok.
+ dnl Bad compiler output:
+ dnl ccom: HP92453-01 G.10.32.05 HP C Compiler
+ dnl Good compiler output:
+ dnl ccom: HP92453-01 A.10.32.30 HP C Compiler
+ echo >conftest.c
+ gmp_tmp_vs=`$CC $CFLAGS -V -c -o conftest.o conftest.c 2>&1 | grep "^ccom:"`
+ rm conftest*
+ gmp_tmp_v1=`echo $gmp_tmp_vs | sed 's/.* .\.\(.*\)\..*\..* HP C.*/\1/'`
+ gmp_tmp_v2=`echo $gmp_tmp_vs | sed 's/.* .\..*\.\(.*\)\..* HP C.*/\1/'`
+ gmp_tmp_v3=`echo $gmp_tmp_vs | sed 's/.* .\..*\..*\.\(.*\) HP C.*/\1/'`
+ gmp_cv_cc_64bit=no
+ test -n "$gmp_tmp_v1" && test "$gmp_tmp_v1" -ge "10" \
+ && test -n "$gmp_tmp_v2" && test "$gmp_tmp_v2" -ge "32" \
+ && test -n "$gmp_tmp_v3" && test "$gmp_tmp_v3" -ge "30" \
+ && gmp_cv_cc_64bit=yes
+ else # gcc
+ # FIXME: Compile a minimal file and determine if the resulting object
+ # file is an ELF file. If so, gcc can produce 64-bit code.
+ # Do we have file(1) for target?
+ gmp_cv_cc_64bit=no
+ fi
+ ;;
+ mips-sgi-irix6.*)
+ # We use `-n32' to cc and `-mabi=n32' to gcc, resulting in 64-bit
+ # arithmetic but not 64-bit pointers, so the general test for sizeof
+ # (void *) is not valid.
+ # Simply try to compile an empty main. If that succeeds return
+ # true.
+ AC_TRY_COMPILE( , ,
+ gmp_cv_cc_64bit=yes, gmp_cv_cc_64bit=no,
+ gmp_cv_cc_64bit=no)
+ ;;
+ *-*-*)
+ # Allocate an array of size sizeof (void *) and use nm to determine its
+ # size. We depend on the first declared variable being put at address 0.
+ cat >conftest.c <<EOF
+[char arr[sizeof (void *)]={0};
+char post=0;]
+EOF
+ gmp_compile="$CC $CFLAGS -c conftest.c 1>&AC_FD_CC"
+ if AC_TRY_EVAL(gmp_compile); then
+ changequote(<,>)dnl
+ gmp_tmp_val=`$NM conftest.o | grep post | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ changequote([, ])dnl
+ if test "$gmp_tmp_val" = "8"; then
+ gmp_cv_cc_64bit=yes
+ else
+ gmp_cv_cc_64bit=no
+ fi
+ else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.$ac_ext >&AC_FD_CC
+ gmp_cv_cc_64bit=no
+ fi
+ rm -f conftest*
+ ;;
+ esac
+
+ CC="$gmp_tmp_CC_save"
+ CFLAGS="$gmp_tmp_CFLAGS_save"
+ AC_MSG_RESULT($gmp_cv_cc_64bit)
+])dnl
+
+dnl GMP_INIT([M4-DEF-FILE])
+dnl
+AC_DEFUN(GMP_INIT,
+[ifelse([$1], , gmp_configm4=config.m4, gmp_configm4="[$1]")
+gmp_tmpconfigm4=cnfm4.tmp
+gmp_tmpconfigm4i=cnfm4i.tmp
+gmp_tmpconfigm4p=cnfm4p.tmp
+test -f $gmp_tmpconfigm4 && rm $gmp_tmpconfigm4
+test -f $gmp_tmpconfigm4i && rm $gmp_tmpconfigm4i
+test -f $gmp_tmpconfigm4p && rm $gmp_tmpconfigm4p
+])dnl
+
+dnl GMP_FINISH
+dnl ----------
+dnl Create config.m4 from its accumulated parts.
+dnl
+dnl __CONFIG_M4_INCLUDED__ is used so that a second or subsequent include
+dnl of config.m4 is harmless.
+dnl
+dnl A separate ifdef on the angle bracket quoted part ensures the quoting
+dnl style there is respected. The basic defines from gmp_tmpconfigm4 are
+dnl fully quoted but are still put under an ifdef in case any have been
+dnl redefined by one of the m4 include files.
+dnl
+dnl Doing a big ifdef within asm-defs.m4 and/or other macro files wouldn't
+dnl work, since it'd interpret parentheses and quotes in dnl comments, and
+dnl having a whole file as a macro argument would overflow the string space
+dnl on BSD m4.
+
+AC_DEFUN(GMP_FINISH,
+[AC_REQUIRE([GMP_INIT])
+echo "creating $gmp_configm4"
+echo ["dnl $gmp_configm4. Generated automatically by configure."] > $gmp_configm4
+if test -f $gmp_tmpconfigm4; then
+ echo ["changequote(<,>)dnl"] >> $gmp_configm4
+ echo ["ifdef(<__CONFIG_M4_INCLUDED__>,,<"] >> $gmp_configm4
+ cat $gmp_tmpconfigm4 >> $gmp_configm4
+ echo [">)"] >> $gmp_configm4
+ echo ["changequote(\`,')dnl"] >> $gmp_configm4
+ rm $gmp_tmpconfigm4
+fi
+echo ["ifdef(\`__CONFIG_M4_INCLUDED__',,\`"] >> $gmp_configm4
+if test -f $gmp_tmpconfigm4i; then
+ cat $gmp_tmpconfigm4i >> $gmp_configm4
+ rm $gmp_tmpconfigm4i
+fi
+if test -f $gmp_tmpconfigm4p; then
+ cat $gmp_tmpconfigm4p >> $gmp_configm4
+ rm $gmp_tmpconfigm4p
+fi
+echo ["')"] >> $gmp_configm4
+echo ["define(\`__CONFIG_M4_INCLUDED__')"] >> $gmp_configm4
+])dnl
+
+dnl GMP_INCLUDE(FILE)
+AC_DEFUN(GMP_INCLUDE,
+[AC_REQUIRE([GMP_INIT])
+echo ["include(\`$1')"] >> $gmp_tmpconfigm4i
+])dnl
+
+dnl GMP_SINCLUDE(FILE)
+AC_DEFUN(GMP_SINCLUDE,
+[AC_REQUIRE([GMP_INIT])
+echo ["sinclude(\`$1')"] >> $gmp_tmpconfigm4i
+])dnl
+
+dnl GMP_DEFINE(MACRO, DEFINITION [, LOCATION])
+dnl [ Define M4 macro MACRO as DEFINITION in temporary file. ]
+dnl [ If LOCATION is `POST', the definition will appear after any ]
+dnl [ include() directives inserted by GMP_INCLUDE/GMP_SINCLUDE. ]
+dnl [ Mind the quoting! No shell variables will get expanded. ]
+dnl [ Don't forget to invoke GMP_FINISH to create file config.m4. ]
+dnl [ config.m4 uses `<' and '>' as quote characters for all defines. ]
+AC_DEFUN(GMP_DEFINE,
+[AC_REQUIRE([GMP_INIT])
+echo ['define(<$1>, <$2>)'] >> ifelse([$3], [POST], $gmp_tmpconfigm4p, $gmp_tmpconfigm4)
+])dnl
+
+dnl GMP_DEFINE_RAW(STRING, [, LOCATION])
+dnl [ Put STRING in temporary file. ]
+dnl [ If LOCATION is `POST', the definition will appear after any ]
+dnl [ include() directives inserted by GMP_INCLUDE/GMP_SINCLUDE. ]
+dnl [ Don't forget to invoke GMP_FINISH to create file config.m4. ]
+AC_DEFUN(GMP_DEFINE_RAW,
+[AC_REQUIRE([GMP_INIT])
+echo [$1] >> ifelse([$2], [POST], $gmp_tmpconfigm4p, $gmp_tmpconfigm4)
+])dnl
+
+dnl GMP_CHECK_ASM_LABEL_SUFFIX
+dnl Should a label have a colon or not?
+AC_DEFUN(GMP_CHECK_ASM_LABEL_SUFFIX,
+[AC_CACHE_CHECK([what assembly label suffix to use],
+ gmp_cv_check_asm_label_suffix,
+[case "$target" in
+ *-*-hpux*) gmp_cv_check_asm_label_suffix=[""] ;;
+ *) gmp_cv_check_asm_label_suffix=[":"] ;;
+esac
+])
+echo ["define(<LABEL_SUFFIX>, <\$][1$gmp_cv_check_asm_label_suffix>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_UNDERSCORE([ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND]])
+dnl Shamelessly borrowed from glibc.
+AC_DEFUN(GMP_CHECK_ASM_UNDERSCORE,
+[AC_CACHE_CHECK([if symbols are prefixed by underscore],
+ gmp_cv_check_asm_underscore,
+[cat > conftest.$ac_ext <<EOF
+dnl This sometimes fails to find confdefs.h, for some reason.
+dnl [#]line __oline__ "[$]0"
+[#]line __oline__ "configure"
+#include "confdefs.h"
+int underscore_test() {
+return; }
+EOF
+if AC_TRY_EVAL(ac_compile); then
+ if grep _underscore_test conftest* >/dev/null; then
+ gmp_cv_check_asm_underscore=yes
+ else
+ gmp_cv_check_asm_underscore=no
+ fi
+else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.$ac_ext >&AC_FD_CC
+fi
+rm -f conftest*
+])
+if test "$gmp_cv_check_asm_underscore" = "yes"; then
+ GMP_DEFINE(GSYM_PREFIX, [_])
+ ifelse([$1], , :, [$1])
+else
+ GMP_DEFINE(GSYM_PREFIX, [])
+ ifelse([$2], , :, [$2])
+fi
+])dnl
+
+dnl GMP_CHECK_ASM_ALIGN_LOG([ACTION-IF-FOUND [, ACTION-IF-NOT-FOUND]])
+dnl Is parameter to `.align' logarithmic?
+dnl Requires NM to be set to nm for target.
+AC_DEFUN(GMP_CHECK_ASM_ALIGN_LOG,
+[AC_REQUIRE([GMP_CHECK_ASM_GLOBL])
+AC_REQUIRE([GMP_CHECK_ASM_DATA])
+AC_REQUIRE([GMP_CHECK_ASM_LABEL_SUFFIX])
+AC_CACHE_CHECK([if .align assembly directive is logarithmic],
+ gmp_cv_check_asm_align_log,
+[if test -z "$NM"; then
+ echo; echo ["configure: $0: fatal: need nm"]
+ exit 1
+fi
+cat > conftest.s <<EOF
+ $gmp_cv_check_asm_data
+ .align 4
+ $gmp_cv_check_asm_globl foo
+ .byte 1
+ .align 4
+foo$gmp_cv_check_asm_label_suffix
+ .byte 2
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+if AC_TRY_EVAL(ac_assemble); then
+ changequote(<,>)
+ gmp_tmp_val=`$NM conftest.o | grep foo | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ changequote([, ])dnl
+ if test "$gmp_tmp_val" = "10" || test "$gmp_tmp_val" = "16"; then
+ gmp_cv_check_asm_align_log=yes
+ else
+ gmp_cv_check_asm_align_log=no
+ fi
+else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.s >&AC_FD_CC
+fi
+rm -f conftest*
+])
+GMP_DEFINE_RAW(["define(<ALIGN_LOGARITHMIC>,<$gmp_cv_check_asm_align_log>)"])
+if test "$gmp_cv_check_asm_align_log" = "yes"; then
+ ifelse([$1], , :, [$1])
+else
+ ifelse([$2], , :, [$2])
+fi
+])dnl
+
+
+dnl GMP_CHECK_ASM_ALIGN_FILL_0x90
+dnl -----------------------------
+dnl Determine whether a ",0x90" suffix works on a .align directive.
+dnl This is only meant for use on x86, where 0x90 is a "nop".
+dnl
+dnl Old gas, eg. 1.92.3 - needs ",0x90" or else the fill is an invalid 0x00.
+dnl New gas, eg. 2.91 - generates the good multibyte nop fills even when
+dnl ",0x90" is given.
+dnl Solaris 2.6 as - doesn't allow ",0x90", gives a fatal error.
+dnl Solaris 2.8 as - gives a warning for ",0x90", no ill effect.
+dnl
+dnl Note that both solaris "as"s only care about ",0x90" if they actually
+dnl have to use it to fill something, hence the .byte in the sample. It's
+dnl only the second .align that provokes an error or warning.
+dnl
+dnl We prefer to suppress the warning from solaris 2.8 to stop anyone
+dnl worrying something might be wrong.
+
+AC_DEFUN(GMP_CHECK_ASM_ALIGN_FILL_0x90,
+[AC_CACHE_CHECK([if the .align directive accepts an 0x90 fill in .text],
+ gmp_cv_check_asm_align_fill_0x90,
+[AC_REQUIRE([GMP_CHECK_ASM_TEXT])
+cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ .align 4, 0x90
+ .byte 0
+ .align 4, 0x90
+EOF
+gmp_tmp_val="`$CCAS $CFLAGS conftest.s 2>&1`"
+if test $? = 0; then
+ echo "$gmp_tmp_val" 1>&AC_FD_CC
+ if echo "$gmp_tmp_val" | grep "Warning: Fill parameter ignored for executable section"; then
+ echo "Supressing this warning by omitting 0x90" 1>&AC_FD_CC
+ gmp_cv_check_asm_align_fill_0x90=no
+ else
+ gmp_cv_check_asm_align_fill_0x90=yes
+ fi
+else
+ echo "Non-zero exit code" 1>&AC_FD_CC
+ echo "$gmp_tmp_val" 1>&AC_FD_CC
+ gmp_cv_check_asm_align_fill_0x90=no
+fi
+rm -f conftest*
+])
+GMP_DEFINE_RAW(
+["define(<ALIGN_FILL_0x90>,<$gmp_cv_check_asm_align_fill_0x90>)"])
+])
+
+
+dnl GMP_CHECK_ASM_TEXT
+AC_DEFUN(GMP_CHECK_ASM_TEXT,
+[AC_CACHE_CHECK([how to switch to text section], gmp_cv_check_asm_text,
+[case "$target" in
+ *-*-aix*)
+ changequote({, })
+ gmp_cv_check_asm_text={".csect .text[PR]"}
+ changequote([, ])
+ ;;
+ *-*-hpux*) gmp_cv_check_asm_text=[".code"] ;;
+ *) gmp_cv_check_asm_text=[".text"] ;;
+esac
+])
+echo ["define(<TEXT>, <$gmp_cv_check_asm_text>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_DATA
+dnl Can we say `.data'?
+AC_DEFUN(GMP_CHECK_ASM_DATA,
+[AC_CACHE_CHECK([how to switch to data section], gmp_cv_check_asm_data,
+[case "$target" in
+ *-*-aix*)
+ changequote({, })
+ gmp_cv_check_asm_data={".csect .data[RW]"}
+ changequote([, ])
+ ;;
+ *) gmp_cv_check_asm_data=[".data"] ;;
+esac
+])
+echo ["define(<DATA>, <$gmp_cv_check_asm_data>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_GLOBL
+dnl Can we say `.global'?
+AC_DEFUN(GMP_CHECK_ASM_GLOBL,
+[AC_CACHE_CHECK([how to export a symbol], gmp_cv_check_asm_globl,
+[case "$target" in
+ *-*-hpux*) gmp_cv_check_asm_globl=[".export"] ;;
+ *) gmp_cv_check_asm_globl=[".globl"] ;;
+esac
+])
+echo ["define(<GLOBL>, <$gmp_cv_check_asm_globl>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_TYPE
+dnl Can we say `.type'?
+AC_DEFUN(GMP_CHECK_ASM_TYPE,
+[AC_CACHE_CHECK([how the .type assembly directive should be used],
+gmp_cv_check_asm_type,
+[ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+for gmp_tmp_prefix in @ \# %; do
+ echo " .type sym,${gmp_tmp_prefix}function" > conftest.s
+ if AC_TRY_EVAL(ac_assemble); then
+ gmp_cv_check_asm_type="[.type \$][1,${gmp_tmp_prefix}\$][2]"
+ break
+ fi
+done
+if test -z "$gmp_cv_check_asm_type"; then
+ gmp_cv_check_asm_type="[dnl]"
+fi
+])
+echo ["define(<TYPE>, <$gmp_cv_check_asm_type>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_SIZE
+dnl Can we say `.size'?
+AC_DEFUN(GMP_CHECK_ASM_SIZE,
+[AC_CACHE_CHECK([if the .size assembly directive works], gmp_cv_check_asm_size,
+[ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+echo ' .size sym,1' > conftest.s
+if AC_TRY_EVAL(ac_assemble); then
+ gmp_cv_check_asm_size="[.size \$][1,\$][2]"
+else
+ gmp_cv_check_asm_size="[dnl]"
+fi
+])
+echo ["define(<SIZE>, <$gmp_cv_check_asm_size>)"] >> $gmp_tmpconfigm4
+])dnl
+
+dnl GMP_CHECK_ASM_LSYM_PREFIX
+dnl What is the prefix for a local label?
+dnl Requires NM to be set to nm for target.
+AC_DEFUN(GMP_CHECK_ASM_LSYM_PREFIX,
+[AC_REQUIRE([GMP_CHECK_ASM_LABEL_SUFFIX])
+AC_CACHE_CHECK([what prefix to use for a local label],
+gmp_cv_check_asm_lsym_prefix,
+[if test -z "$NM"; then
+ echo; echo ["$0: fatal: need nm"]
+ exit 1
+fi
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+gmp_cv_check_asm_lsym_prefix="L"
+for gmp_tmp_pre in L .L $ L$; do
+ cat > conftest.s <<EOF
+dummy${gmp_cv_check_asm_label_suffix}
+${gmp_tmp_pre}gurkmacka${gmp_cv_check_asm_label_suffix}
+ .byte 0
+EOF
+ if AC_TRY_EVAL(ac_assemble); then
+ $NM conftest.o >/dev/null 2>&1
+ gmp_rc=$?
+ if test "$gmp_rc" != "0"; then
+ echo "configure: $NM failure, using default"
+ break
+ fi
+ if $NM conftest.o | grep gurkmacka >/dev/null; then true; else
+ gmp_cv_check_asm_lsym_prefix="$gmp_tmp_pre"
+ break
+ fi
+ else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.s >&AC_FD_CC
+ # Use default.
+ fi
+done
+rm -f conftest*
+])
+echo ["define(<LSYM_PREFIX>, <${gmp_cv_check_asm_lsym_prefix}>)"] >> $gmp_tmpconfigm4
+])
+
+dnl GMP_CHECK_ASM_W32
+dnl How to [define] a 32-bit word.
+dnl Requires NM to be set to nm for target.
+AC_DEFUN(GMP_CHECK_ASM_W32,
+[AC_REQUIRE([GMP_CHECK_ASM_DATA])
+AC_REQUIRE([GMP_CHECK_ASM_GLOBL])
+AC_REQUIRE([GMP_CHECK_ASM_LABEL_SUFFIX])
+AC_CACHE_CHECK([how to [define] a 32-bit word],
+ gmp_cv_check_asm_w32,
+[if test -z "$NM"; then
+ echo; echo ["configure: $0: fatal: need nm"]
+ exit 1
+fi
+
+# FIXME: HPUX puts first symbol at 0x40000000, breaking our assumption
+# that it's at 0x0. We'll have to declare another symbol before the
+# .long/.word and look at the distance between the two symbols. The
+# only problem is that the sed expression(s) barfs (on Solaris, for
+# example) for the symbol with value 0. For now, HPUX uses .word.
+
+case "$target" in
+ *-*-hpux*)
+ gmp_cv_check_asm_w32=".word"
+ ;;
+ *-*-*)
+ ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+ for gmp_tmp_op in .long .word; do
+ cat > conftest.s <<EOF
+ $gmp_cv_check_asm_data
+ $gmp_cv_check_asm_globl foo
+ $gmp_tmp_op 0
+foo${gmp_cv_check_asm_label_suffix}
+ .byte 0
+EOF
+ if AC_TRY_EVAL(ac_assemble); then
+ changequote(<,>)
+ gmp_tmp_val=`$NM conftest.o | grep foo | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ changequote([, ])dnl
+ if test "$gmp_tmp_val" = "4"; then
+ gmp_cv_check_asm_w32="$gmp_tmp_op"
+ break
+ fi
+ fi
+ done
+ ;;
+esac
+
+if test -z "$gmp_cv_check_asm_w32"; then
+ echo; echo ["configure: $0: fatal: do not know how to define a 32-bit word"]
+ exit 1
+fi
+rm -f conftest*
+])
+echo ["define(<W32>, <$gmp_cv_check_asm_w32>)"] >> $gmp_tmpconfigm4
+])
+
+dnl GMP_CHECK_ASM_MMX([ACTION-IF-FOUND, [ACTION-IF-NOT-FOUND]])
+dnl Can we assemble MMX insns?
+AC_DEFUN(GMP_CHECK_ASM_MMX,
+[AC_REQUIRE([GMP_CHECK_ASM_TEXT])
+AC_CACHE_CHECK([if the assembler knows about MMX instructions],
+ gmp_cv_check_asm_mmx,
+[cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ por %mm0, %mm0
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+if AC_TRY_EVAL(ac_assemble); then
+ gmp_cv_check_asm_mmx=yes
+else
+ gmp_cv_check_asm_mmx=no
+fi
+rm -f conftest*
+])
+if test "$gmp_cv_check_asm_mmx" = "yes"; then
+ ifelse([$1], , :, [$1])
+else
+ AC_MSG_WARN([+----------------------------------------------------------])
+ AC_MSG_WARN([| WARNING WARNING WARNING])
+ AC_MSG_WARN([| Target CPU has MMX code, but it can't be assembled by])
+ AC_MSG_WARN([| $CCAS $CFLAGS])
+ AC_MSG_WARN([| Non-MMX replacements will be used.])
+ AC_MSG_WARN([| This will be an inferior build.])
+ AC_MSG_WARN([+----------------------------------------------------------])
+ ifelse([$2], , :, [$2])
+fi
+])dnl
+
+dnl GMP_CHECK_ASM_SHLDL_CL([ACTION-IF-FOUND, [ACTION-IF-NOT-FOUND]])
+AC_DEFUN(GMP_CHECK_ASM_SHLDL_CL,
+[AC_REQUIRE([GMP_CHECK_ASM_TEXT])
+AC_CACHE_CHECK([if the assembler takes cl with shldl],
+ gmp_cv_check_asm_shldl_cl,
+[cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ shldl %cl, %eax, %ebx
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&AC_FD_CC"
+if AC_TRY_EVAL(ac_assemble); then
+ gmp_cv_check_asm_shldl_cl=yes
+else
+ gmp_cv_check_asm_shldl_cl=no
+fi
+rm -f conftest*
+])
+if test "$gmp_cv_check_asm_shldl_cl" = "yes"; then
+ ifelse([$1], , :, [$1])
+else
+ ifelse([$2], , :, [$2])
+fi
+])dnl
+
+dnl GMP_PROG_CC_WORKS(CC, CFLAGS, ACTION-IF-WORKS, [ACTION-IF-NOT-WORKS])
+dnl Check if CC can compile and link. Perform various target specific tests.
+dnl FIXME: Require `$target'.
+AC_DEFUN(GMP_PROG_CC_WORKS,
+[AC_LANG_C dnl Note: Destructive.
+CC="[$1]"
+CFLAGS="[$2]"
+AC_MSG_CHECKING([if the C compiler ($CC) works with flags $CFLAGS])
+
+# Simple test for all targets.
+AC_TRY_COMPILER([int main(){return(0);}],
+ tmp_works, tmp_cross)
+
+# Target specific tests.
+if test "$tmp_works" = "yes"; then
+ case "$target" in
+ *-*-aix*) # Returning a funcptr.
+ AC_TRY_COMPILE( , [} void *g(); void *f() { return g(); } int bar(){],
+ tmp_works=yes, tmp_works=no)
+ ;;
+ esac
+fi
+
+if test "$tmp_works" = "yes"; then
+ [$3]
+else
+ ifelse([$4], , :, [$4])
+fi
+
+AC_MSG_RESULT($tmp_works)
+])dnl
+
+
+dnl GMP_C_ANSI2KNR
+dnl --------------
+dnl Setup to use ansi2knr if necessary.
+dnl
+dnl The test here is simply that if an ANSI style function works then
+dnl ansi2knr isn't needed. The normal tests for whether $CC works mean we
+dnl don't need to worry here about anything badly broken.
+dnl
+dnl AM_C_PROTOTYPES is the normal way to set up ansi2knr, but (in automake
+dnl March 2000) it gives the wrong answer on a C++ compiler because its
+dnl test requires that the compiler accept both ANSI and K&R, or otherwise
+dnl ansi2knr is used. A C++ compiler fails on the K&R part, which makes
+dnl AM_C_PROTOTYPES think it needs ansi2knr! GMP has no bare K&R so we
+dnl only need ANSI or K&R to work, not both.
+
+AC_DEFUN(GMP_C_ANSI2KNR,
+[AC_CACHE_CHECK([if ansi2knr should be used],
+ gmp_cv_c_ansi2knr,
+[cat >conftest.c <<EOF
+int main (int argc, char *argv[]) { return 0; }
+EOF
+if AC_TRY_EVAL(ac_compile); then
+ gmp_cv_c_ansi2knr=no
+else
+ gmp_cv_c_ansi2knr=yes
+fi
+rm -f conftest.*
+])
+if test $gmp_cv_c_ansi2knr = no; then
+ U= ANSI2KNR=
+else
+ U=_ ANSI2KNR=./ansi2knr
+ # Ensure some checks needed by ansi2knr itself.
+ AC_HEADER_STDC
+ AC_CHECK_HEADERS(string.h)
+fi
+AC_SUBST(U)
+AC_SUBST(ANSI2KNR)
+])
+
+
+dnl Deal with bad synchronization of Autoconf with Libtool.
+AC_DEFUN(AC_CANONICAL_BUILD, [_AC_CANONICAL_BUILD])
+AC_DEFUN(AC_CHECK_TOOL_PREFIX, [_AC_CHECK_TOOL_PREFIX])
+
+
+# serial 1
+
+AC_DEFUN(AM_C_PROTOTYPES,
+[AC_REQUIRE([AM_PROG_CC_STDC])
+AC_REQUIRE([AC_PROG_CPP])
+AC_MSG_CHECKING([for function prototypes])
+if test "$am_cv_prog_cc_stdc" != no; then
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(PROTOTYPES,1,[Define if compiler has function prototypes])
+ U= ANSI2KNR=
+else
+ AC_MSG_RESULT(no)
+ U=_ ANSI2KNR=./ansi2knr
+ # Ensure some checks needed by ansi2knr itself.
+ AC_HEADER_STDC
+ AC_CHECK_HEADERS(string.h)
+fi
+AC_SUBST(U)dnl
+AC_SUBST(ANSI2KNR)dnl
+])
+
+
+# serial 1
+
+# @defmac AC_PROG_CC_STDC
+# @maindex PROG_CC_STDC
+# @ovindex CC
+# If the C compiler in not in ANSI C mode by default, try to add an option
+# to output variable @code{CC} to make it so. This macro tries various
+# options that select ANSI C on some system or another. It considers the
+# compiler to be in ANSI C mode if it handles function prototypes correctly.
+#
+# If you use this macro, you should check after calling it whether the C
+# compiler has been set to accept ANSI C; if not, the shell variable
+# @code{am_cv_prog_cc_stdc} is set to @samp{no}. If you wrote your source
+# code in ANSI C, you can make an un-ANSIfied copy of it by using the
+# program @code{ansi2knr}, which comes with Ghostscript.
+# @end defmac
+
+AC_DEFUN(AM_PROG_CC_STDC,
+[AC_REQUIRE([AC_PROG_CC])
+AC_BEFORE([$0], [AC_C_INLINE])
+AC_BEFORE([$0], [AC_C_CONST])
+dnl Force this before AC_PROG_CPP. Some cpp's, eg on HPUX, require
+dnl a magic option to avoid problems with ANSI preprocessor commands
+dnl like #elif.
+dnl FIXME: can't do this because then AC_AIX won't work due to a
+dnl circular dependency.
+dnl AC_BEFORE([$0], [AC_PROG_CPP])
+AC_MSG_CHECKING(for ${CC-cc} option to accept ANSI C)
+AC_CACHE_VAL(am_cv_prog_cc_stdc,
+[am_cv_prog_cc_stdc=no
+ac_save_CC="$CC"
+# Don't try gcc -ansi; that turns off useful extensions and
+# breaks some systems' header files.
+# AIX -qlanglvl=ansi
+# Ultrix and OSF/1 -std1
+# HP-UX 10.20 and later -Ae
+# HP-UX older versions -Aa -D_HPUX_SOURCE
+# SVR4 -Xc -D__EXTENSIONS__
+for ac_arg in "" -qlanglvl=ansi -std1 -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__"
+do
+ CC="$ac_save_CC $ac_arg"
+ AC_TRY_COMPILE(
+[#include <stdarg.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. */
+struct buf { int x; };
+FILE * (*rcsopen) (struct buf *, struct stat *, int);
+static char *e (p, i)
+ char **p;
+ int i;
+{
+ return p[i];
+}
+static char *f (char * (*g) (char **, int), char **p, ...)
+{
+ char *s;
+ va_list v;
+ va_start (v,p);
+ s = g (p, va_arg (v,int));
+ va_end (v);
+ return s;
+}
+int test (int i, double x);
+struct s1 {int (*f) (int a);};
+struct s2 {int (*f) (double a);};
+int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int);
+int argc;
+char **argv;
+], [
+return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1];
+],
+[am_cv_prog_cc_stdc="$ac_arg"; break])
+done
+CC="$ac_save_CC"
+])
+if test -z "$am_cv_prog_cc_stdc"; then
+ AC_MSG_RESULT([none needed])
+else
+ AC_MSG_RESULT($am_cv_prog_cc_stdc)
+fi
+case "x$am_cv_prog_cc_stdc" in
+ x|xno) ;;
+ *) CC="$CC $am_cv_prog_cc_stdc" ;;
+esac
+])
+
+# Do all the work for Automake. This macro actually does too much --
+# some checks are only needed if your package does certain things.
+# But this isn't really a big deal.
+
+# serial 1
+
+dnl Usage:
+dnl AM_INIT_AUTOMAKE(package,version, [no-define])
+
+AC_DEFUN(AM_INIT_AUTOMAKE,
+[AC_REQUIRE([AC_PROG_INSTALL])
+dnl We require 2.13 because we rely on SHELL being computed by configure.
+AC_PREREQ([2.13])
+PACKAGE=[$1]
+AC_SUBST(PACKAGE)
+VERSION=[$2]
+AC_SUBST(VERSION)
+dnl test to see if srcdir already configured
+if test "`CDPATH=: && cd $srcdir && pwd`" != "`pwd`" &&
+ test -f $srcdir/config.status; then
+ AC_MSG_ERROR([source directory already configured; run "make distclean" there first])
+fi
+ifelse([$3],,
+AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of package])
+AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version number of package]))
+AC_REQUIRE([AM_SANITY_CHECK])
+AC_REQUIRE([AC_ARG_PROGRAM])
+AM_MISSING_PROG(ACLOCAL, aclocal)
+AM_MISSING_PROG(AUTOCONF, autoconf)
+AM_MISSING_PROG(AUTOMAKE, automake)
+AM_MISSING_PROG(AUTOHEADER, autoheader)
+AM_MISSING_PROG(MAKEINFO, makeinfo)
+AM_MISSING_PROG(AMTAR, tar)
+AM_MISSING_INSTALL_SH
+dnl We need awk for the "check" target. The system "awk" is bad on
+dnl some platforms.
+AC_REQUIRE([AC_PROG_AWK])
+AC_REQUIRE([AC_PROG_MAKE_SET])
+AC_REQUIRE([AM_DEP_TRACK])
+AC_REQUIRE([AM_SET_DEPDIR])
+ifdef([AC_PROVIDE_AC_PROG_CC], [AM_DEPENDENCIES(CC)], [
+ define([AC_PROG_CC], defn([AC_PROG_CC])[AM_DEPENDENCIES(CC)])])
+ifdef([AC_PROVIDE_AC_PROG_CXX], [AM_DEPENDENCIES(CXX)], [
+ define([AC_PROG_CXX], defn([AC_PROG_CXX])[AM_DEPENDENCIES(CXX)])])
+])
+
+#
+# Check to make sure that the build environment is sane.
+#
+
+AC_DEFUN(AM_SANITY_CHECK,
+[AC_MSG_CHECKING([whether build environment is sane])
+# Just in case
+sleep 1
+echo timestamp > conftestfile
+# Do `set' in a subshell so we don't clobber the current shell's
+# arguments. Must try -L first in case configure is actually a
+# symlink; some systems play weird games with the mod time of symlinks
+# (eg FreeBSD returns the mod time of the symlink's containing
+# directory).
+if (
+ set X `ls -Lt $srcdir/configure conftestfile 2> /dev/null`
+ if test "[$]*" = "X"; then
+ # -L didn't work.
+ set X `ls -t $srcdir/configure conftestfile`
+ fi
+ if test "[$]*" != "X $srcdir/configure conftestfile" \
+ && test "[$]*" != "X conftestfile $srcdir/configure"; then
+
+ # If neither matched, then we have a broken ls. This can happen
+ # if, for instance, CONFIG_SHELL is bash and it inherits a
+ # broken ls alias from the environment. This has actually
+ # happened. Such a system could not be considered "sane".
+ AC_MSG_ERROR([ls -t appears to fail. Make sure there is not a broken
+alias in your environment])
+ fi
+
+ test "[$]2" = conftestfile
+ )
+then
+ # Ok.
+ :
+else
+ AC_MSG_ERROR([newly created file is older than distributed files!
+Check your system clock])
+fi
+rm -f conftest*
+AC_MSG_RESULT(yes)])
+
+dnl AM_MISSING_PROG(NAME, PROGRAM)
+AC_DEFUN(AM_MISSING_PROG, [
+AC_REQUIRE([AM_MISSING_HAS_RUN])
+$1=${$1-"${am_missing_run}$2"}
+AC_SUBST($1)])
+
+dnl Like AM_MISSING_PROG, but only looks for install-sh.
+dnl AM_MISSING_INSTALL_SH()
+AC_DEFUN(AM_MISSING_INSTALL_SH, [
+AC_REQUIRE([AM_MISSING_HAS_RUN])
+if test -z "$install_sh"; then
+ install_sh="$ac_aux_dir/install-sh"
+ test -f "$install_sh" || install_sh="$ac_aux_dir/install.sh"
+ test -f "$install_sh" || install_sh="${am_missing_run}${ac_auxdir}/install-sh"
+ dnl FIXME: an evil hack: we remove the SHELL invocation from
+ dnl install_sh because automake adds it back in. Sigh.
+ install_sh="`echo $install_sh | sed -e 's/\${SHELL}//'`"
+fi
+AC_SUBST(install_sh)])
+
+dnl AM_MISSING_HAS_RUN.
+dnl Define MISSING if not defined so far and test if it supports --run.
+dnl If it does, set am_missing_run to use it, otherwise, to nothing.
+AC_DEFUN([AM_MISSING_HAS_RUN], [
+test x"${MISSING+set}" = xset || \
+ MISSING="\${SHELL} `CDPATH=: && cd $ac_aux_dir && pwd`/missing"
+dnl Use eval to expand $SHELL
+if eval "$MISSING --run :"; then
+ am_missing_run="$MISSING --run "
+else
+ am_missing_run=
+ am_backtick='`'
+ AC_MSG_WARN([${am_backtick}missing' script is too old or missing])
+fi
+])
+
+dnl See how the compiler implements dependency checking.
+dnl Usage:
+dnl AM_DEPENDENCIES(NAME)
+dnl NAME is "CC", "CXX" or "OBJC".
+
+dnl We try a few techniques and use that to set a single cache variable.
+
+AC_DEFUN(AM_DEPENDENCIES,[
+AC_REQUIRE([AM_SET_DEPDIR])
+AC_REQUIRE([AM_OUTPUT_DEPENDENCY_COMMANDS])
+ifelse([$1],CC,[
+AC_REQUIRE([AC_PROG_CC])
+AC_REQUIRE([AC_PROG_CPP])
+depcc="$CC"
+depcpp="$CPP"],[$1],CXX,[
+AC_REQUIRE([AC_PROG_CXX])
+AC_REQUIRE([AC_PROG_CXXCPP])
+depcc="$CXX"
+depcpp="$CXXCPP"],[$1],OBJC,[
+am_cv_OBJC_dependencies_compiler_type=gcc],[
+AC_REQUIRE([AC_PROG_][$1])
+depcc="$[$1]"
+depcpp=""])
+AC_MSG_CHECKING([dependency style of $depcc])
+AC_CACHE_VAL(am_cv_[$1]_dependencies_compiler_type,[
+if test -z "$AMDEP"; then
+ echo '#include "conftest.h"' > conftest.c
+ echo 'int i;' > conftest.h
+
+ am_cv_[$1]_dependencies_compiler_type=none
+ for depmode in `sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < "$am_depcomp"`; do
+ case "$depmode" in
+ nosideeffect)
+ # after this tag, mechanisms are not by side-effect, so they'll
+ # only be used when explicitly requested
+ if test "x$enable_dependency_tracking" = xyes; then
+ continue
+ else
+ break
+ fi
+ ;;
+ none) break ;;
+ esac
+ if depmode="$depmode" \
+ source=conftest.c object=conftest.o \
+ depfile=conftest.Po tmpdepfile=conftest.TPo \
+ $SHELL $am_depcomp $depcc -c conftest.c 2>/dev/null &&
+ grep conftest.h conftest.Po > /dev/null 2>&1; then
+ am_cv_[$1]_dependencies_compiler_type="$depmode"
+ break
+ fi
+ done
+
+ rm -f conftest.*
+else
+ am_cv_[$1]_dependencies_compiler_type=none
+fi
+])
+AC_MSG_RESULT($am_cv_[$1]_dependencies_compiler_type)
+[$1]DEPMODE="depmode=$am_cv_[$1]_dependencies_compiler_type"
+AC_SUBST([$1]DEPMODE)
+])
+
+dnl Choose a directory name for dependency files.
+dnl This macro is AC_REQUIREd in AM_DEPENDENCIES
+
+AC_DEFUN(AM_SET_DEPDIR,[
+if test -d .deps || mkdir .deps 2> /dev/null || test -d .deps; then
+ DEPDIR=.deps
+else
+ DEPDIR=_deps
+fi
+AC_SUBST(DEPDIR)
+])
+
+AC_DEFUN(AM_DEP_TRACK,[
+AC_ARG_ENABLE(dependency-tracking,
+[ --disable-dependency-tracking Speeds up one-time builds
+ --enable-dependency-tracking Do not reject slow dependency extractors])
+if test "x$enable_dependency_tracking" = xno; then
+ AMDEP="#"
+else
+ am_depcomp="$ac_aux_dir/depcomp"
+ if test ! -f "$am_depcomp"; then
+ AMDEP="#"
+ else
+ AMDEP=
+ fi
+fi
+AC_SUBST(AMDEP)
+if test -z "$AMDEP"; then
+ AMDEPBACKSLASH='\'
+else
+ AMDEPBACKSLASH=
+fi
+pushdef([subst], defn([AC_SUBST]))
+subst(AMDEPBACKSLASH)
+popdef([subst])
+])
+
+dnl Generate code to set up dependency tracking.
+dnl This macro should only be invoked once -- use via AC_REQUIRE.
+dnl Usage:
+dnl AM_OUTPUT_DEPENDENCY_COMMANDS
+
+dnl
+dnl This code is only required when automatic dependency tracking
+dnl is enabled. FIXME. This creates each `.P' file that we will
+dnl need in order to bootstrap the dependency handling code.
+AC_DEFUN(AM_OUTPUT_DEPENDENCY_COMMANDS,[
+AC_OUTPUT_COMMANDS([
+test x"$AMDEP" != x"" ||
+for mf in $CONFIG_FILES; do
+ case "$mf" in
+ Makefile) dirpart=.;;
+ */Makefile) dirpart=`echo "$mf" | sed -e 's|/[^/]*$||'`;;
+ *) continue;;
+ esac
+ grep '^DEP_FILES *= *[^ #]' < "$mf" > /dev/null || continue
+ # Extract the definition of DEP_FILES from the Makefile without
+ # running `make'.
+ DEPDIR=`sed -n -e '/^DEPDIR = / s///p' < "$mf"`
+ test -z "$DEPDIR" && continue
+ # When using ansi2knr, U may be empty or an underscore; expand it
+ U=`sed -n -e '/^U = / s///p' < "$mf"`
+ test -d "$dirpart/$DEPDIR" || mkdir "$dirpart/$DEPDIR"
+ # We invoke sed twice because it is the simplest approach to
+ # changing $(DEPDIR) to its actual value in the expansion.
+ for file in `sed -n -e '
+ /^DEP_FILES = .*\\\\$/ {
+ s/^DEP_FILES = //
+ :loop
+ s/\\\\$//
+ p
+ n
+ /\\\\$/ b loop
+ p
+ }
+ /^DEP_FILES = / s/^DEP_FILES = //p' < "$mf" | \
+ sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do
+ # Make sure the directory exists.
+ test -f "$dirpart/$file" && continue
+ fdir=`echo "$file" | sed -e 's|/[^/]*$||'`
+ $ac_aux_dir/mkinstalldirs "$dirpart/$fdir" > /dev/null 2>&1
+ # echo "creating $dirpart/$file"
+ echo '# dummy' > "$dirpart/$file"
+ done
+done
+], [AMDEP="$AMDEP"
+ac_aux_dir="$ac_aux_dir"])])
+
+# Like AC_CONFIG_HEADER, but automatically create stamp file.
+
+AC_DEFUN(AM_CONFIG_HEADER,
+[AC_PREREQ([2.12])
+AC_CONFIG_HEADER([$1])
+dnl When config.status generates a header, we must update the stamp-h file.
+dnl This file resides in the same directory as the config header
+dnl that is generated. We must strip everything past the first ":",
+dnl and everything past the last "/".
+AC_OUTPUT_COMMANDS(changequote(<<,>>)dnl
+ifelse(patsubst(<<$1>>, <<[^ ]>>, <<>>), <<>>,
+<<test -z "<<$>>CONFIG_HEADERS" || echo timestamp > patsubst(<<$1>>, <<^\([^:]*/\)?.*>>, <<\1>>)stamp-h<<>>dnl>>,
+<<am_indx=1
+for am_file in <<$1>>; do
+ case " <<$>>CONFIG_HEADERS " in
+ *" <<$>>am_file "*<<)>>
+ echo timestamp > `echo <<$>>am_file | sed -e 's%:.*%%' -e 's%[^/]*$%%'`stamp-h$am_indx
+ ;;
+ esac
+ am_indx=`expr "<<$>>am_indx" + 1`
+done<<>>dnl>>)
+changequote([,]))])
+
+# Add --enable-maintainer-mode option to configure.
+# From Jim Meyering
+
+# serial 1
+
+AC_DEFUN(AM_MAINTAINER_MODE,
+[AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
+ dnl maintainer-mode is disabled by default
+ AC_ARG_ENABLE(maintainer-mode,
+[ --enable-maintainer-mode enable make rules and dependencies not useful
+ (and sometimes confusing) to the casual installer],
+ USE_MAINTAINER_MODE=$enableval,
+ USE_MAINTAINER_MODE=no)
+ AC_MSG_RESULT($USE_MAINTAINER_MODE)
+ AM_CONDITIONAL(MAINTAINER_MODE, test $USE_MAINTAINER_MODE = yes)
+ MAINT=$MAINTAINER_MODE_TRUE
+ AC_SUBST(MAINT)dnl
+]
+)
+
+# Define a conditional.
+
+AC_DEFUN(AM_CONDITIONAL,
+[AC_SUBST($1_TRUE)
+AC_SUBST($1_FALSE)
+if $2; then
+ $1_TRUE=
+ $1_FALSE='#'
+else
+ $1_TRUE='#'
+ $1_FALSE=
+fi])
+
+
+# serial 42 AC_PROG_LIBTOOL
+AC_DEFUN(AC_PROG_LIBTOOL,
+[AC_REQUIRE([AC_LIBTOOL_SETUP])dnl
+
+# Save cache, so that ltconfig can load it
+AC_CACHE_SAVE
+
+# Actually configure libtool. ac_aux_dir is where install-sh is found.
+AR="$AR" CC="$CC" CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC="$MAGIC" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig --no-reexec \
+$libtool_flags --no-verify --build="$build" $ac_aux_dir/ltmain.sh $lt_target \
+|| AC_MSG_ERROR([libtool configure failed])
+
+# Reload cache, that may have been modified by ltconfig
+AC_CACHE_LOAD
+
+# This can be used to rebuild libtool when needed
+LIBTOOL_DEPS="$ac_aux_dir/ltconfig $ac_aux_dir/ltmain.sh"
+
+# Always use our own libtool.
+LIBTOOL='$(SHELL) $(top_builddir)/libtool'
+AC_SUBST(LIBTOOL)dnl
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+])
+
+AC_DEFUN(AC_LIBTOOL_SETUP,
+[AC_PREREQ(2.13)dnl
+AC_REQUIRE([AC_ENABLE_SHARED])dnl
+AC_REQUIRE([AC_ENABLE_STATIC])dnl
+AC_REQUIRE([AC_ENABLE_FAST_INSTALL])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+AC_REQUIRE([AC_PROG_CC])dnl
+AC_REQUIRE([AC_PROG_LD])dnl
+AC_REQUIRE([AC_PROG_LD_RELOAD_FLAG])dnl
+AC_REQUIRE([AC_PROG_NM])dnl
+AC_REQUIRE([AC_PROG_LN_S])dnl
+AC_REQUIRE([AC_DEPLIBS_CHECK_METHOD])dnl
+AC_REQUIRE([AC_OBJEXT])dnl
+AC_REQUIRE([AC_EXEEXT])dnl
+dnl
+
+# Only perform the check for file, if the check method requires it
+case "$deplibs_check_method" in
+file_magic*)
+ if test "$file_magic_cmd" = '${MAGIC}'; then
+ AC_PATH_MAGIC
+ fi
+ ;;
+esac
+
+case "$target" in
+NONE) lt_target="$host" ;;
+*) lt_target="$target" ;;
+esac
+
+AC_CHECK_TOOL(RANLIB, ranlib, :)
+AC_CHECK_TOOL(STRIP, strip, :)
+
+# Check for any special flags to pass to ltconfig.
+libtool_flags="--cache-file=$cache_file"
+test "$enable_shared" = no && libtool_flags="$libtool_flags --disable-shared"
+test "$enable_static" = no && libtool_flags="$libtool_flags --disable-static"
+test "$enable_fast_install" = no && libtool_flags="$libtool_flags --disable-fast-install"
+test "$ac_cv_prog_gcc" = yes && libtool_flags="$libtool_flags --with-gcc"
+test "$ac_cv_prog_gnu_ld" = yes && libtool_flags="$libtool_flags --with-gnu-ld"
+ifdef([AC_PROVIDE_AC_LIBTOOL_DLOPEN],
+[libtool_flags="$libtool_flags --enable-dlopen"])
+ifdef([AC_PROVIDE_AC_LIBTOOL_WIN32_DLL],
+[libtool_flags="$libtool_flags --enable-win32-dll"])
+AC_ARG_ENABLE(libtool-lock,
+ [ --disable-libtool-lock avoid locking (might break parallel builds)])
+test "x$enable_libtool_lock" = xno && libtool_flags="$libtool_flags --disable-lock"
+test x"$silent" = xyes && libtool_flags="$libtool_flags --silent"
+
+AC_ARG_WITH(pic,
+ [ --with-pic try to use only PIC/non-PIC objects [default=use both]],
+ pic_mode="$withval", pic_mode=default)
+test x"$pic_mode" = xyes && libtool_flags="$libtool_flags --prefer-pic"
+test x"$pic_mode" = xno && libtool_flags="$libtool_flags --prefer-non-pic"
+
+# Some flags need to be propagated to the compiler or linker for good
+# libtool support.
+case "$lt_target" in
+*-*-irix6*)
+ # Find out which ABI we are using.
+ echo '[#]line __oline__ "configure"' > conftest.$ac_ext
+ if AC_TRY_EVAL(ac_compile); then
+ case "`/usr/bin/file conftest.o`" in
+ *32-bit*)
+ LD="${LD-ld} -32"
+ ;;
+ *N32*)
+ LD="${LD-ld} -n32"
+ ;;
+ *64-bit*)
+ LD="${LD-ld} -64"
+ ;;
+ esac
+ fi
+ rm -rf conftest*
+ ;;
+
+*-*-sco3.2v5*)
+ # On SCO OpenServer 5, we need -belf to get full-featured binaries.
+ SAVE_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS -belf"
+ AC_CACHE_CHECK([whether the C compiler needs -belf], lt_cv_cc_needs_belf,
+ [AC_LANG_SAVE
+ AC_LANG_C
+ AC_TRY_LINK([],[],[lt_cv_cc_needs_belf=yes],[lt_cv_cc_needs_belf=no])
+ AC_LANG_RESTORE])
+ if test x"$lt_cv_cc_needs_belf" != x"yes"; then
+ # this is probably gcc 2.8.0, egcs 1.0 or newer; no need for -belf
+ CFLAGS="$SAVE_CFLAGS"
+ fi
+ ;;
+
+ifdef([AC_PROVIDE_AC_LIBTOOL_WIN32_DLL],
+[*-*-cygwin* | *-*-mingw*)
+ AC_CHECK_TOOL(DLLTOOL, dlltool, false)
+ AC_CHECK_TOOL(AS, as, false)
+ AC_CHECK_TOOL(OBJDUMP, objdump, false)
+
+ # recent cygwin and mingw systems supply a stub DllMain which the user
+ # can override, but on older systems we have to supply one
+ AC_CACHE_CHECK([if libtool should supply DllMain function], lt_cv_need_dllmain,
+ [AC_TRY_LINK([],
+ [extern int __attribute__((__stdcall__)) DllMain(void*, int, void*);
+ DllMain (0, 0, 0);],
+ [lt_cv_need_dllmain=no],[lt_cv_need_dllmain=yes])])
+
+ case "$lt_target/$CC" in
+ *-*-cygwin*/gcc*-mno-cygwin*|*-*-mingw*)
+ # old mingw systems require "-dll" to link a DLL, while more recent ones
+ # require "-mdll"
+ SAVE_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS -mdll"
+ AC_CACHE_CHECK([how to link DLLs], lt_cv_cc_dll_switch,
+ [AC_TRY_LINK([], [], [lt_cv_cc_dll_switch=-mdll],[lt_cv_cc_dll_switch=-dll])])
+ CFLAGS="$SAVE_CFLAGS" ;;
+ *-*-cygwin*)
+ # cygwin systems need to pass --dll to the linker, and not link
+ # crt.o which will require a WinMain@16 definition.
+ lt_cv_cc_dll_switch="-Wl,--dll -nostartfiles" ;;
+ esac
+ ;;
+ ])
+esac
+])
+
+# AC_LIBTOOL_DLOPEN - enable checks for dlopen support
+AC_DEFUN(AC_LIBTOOL_DLOPEN, [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])])
+
+# AC_LIBTOOL_WIN32_DLL - declare package support for building win32 dll's
+AC_DEFUN(AC_LIBTOOL_WIN32_DLL, [AC_BEFORE([$0], [AC_LIBTOOL_SETUP])])
+
+# AC_ENABLE_SHARED - implement the --enable-shared flag
+# Usage: AC_ENABLE_SHARED[(DEFAULT)]
+# Where DEFAULT is either `yes' or `no'. If omitted, it defaults to
+# `yes'.
+AC_DEFUN(AC_ENABLE_SHARED, [dnl
+define([AC_ENABLE_SHARED_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(shared,
+changequote(<<, >>)dnl
+<< --enable-shared[=PKGS] build shared libraries [default=>>AC_ENABLE_SHARED_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case "$enableval" in
+yes) enable_shared=yes ;;
+no) enable_shared=no ;;
+*)
+ enable_shared=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_shared=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac],
+enable_shared=AC_ENABLE_SHARED_DEFAULT)dnl
+])
+
+# AC_DISABLE_SHARED - set the default shared flag to --disable-shared
+AC_DEFUN(AC_DISABLE_SHARED, [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_SHARED(no)])
+
+# AC_ENABLE_STATIC - implement the --enable-static flag
+# Usage: AC_ENABLE_STATIC[(DEFAULT)]
+# Where DEFAULT is either `yes' or `no'. If omitted, it defaults to
+# `yes'.
+AC_DEFUN(AC_ENABLE_STATIC, [dnl
+define([AC_ENABLE_STATIC_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(static,
+changequote(<<, >>)dnl
+<< --enable-static[=PKGS] build static libraries [default=>>AC_ENABLE_STATIC_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case "$enableval" in
+yes) enable_static=yes ;;
+no) enable_static=no ;;
+*)
+ enable_static=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_static=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac],
+enable_static=AC_ENABLE_STATIC_DEFAULT)dnl
+])
+
+# AC_DISABLE_STATIC - set the default static flag to --disable-static
+AC_DEFUN(AC_DISABLE_STATIC, [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_STATIC(no)])
+
+
+# AC_ENABLE_FAST_INSTALL - implement the --enable-fast-install flag
+# Usage: AC_ENABLE_FAST_INSTALL[(DEFAULT)]
+# Where DEFAULT is either `yes' or `no'. If omitted, it defaults to
+# `yes'.
+AC_DEFUN(AC_ENABLE_FAST_INSTALL, [dnl
+define([AC_ENABLE_FAST_INSTALL_DEFAULT], ifelse($1, no, no, yes))dnl
+AC_ARG_ENABLE(fast-install,
+changequote(<<, >>)dnl
+<< --enable-fast-install[=PKGS] optimize for fast installation [default=>>AC_ENABLE_FAST_INSTALL_DEFAULT],
+changequote([, ])dnl
+[p=${PACKAGE-default}
+case "$enableval" in
+yes) enable_fast_install=yes ;;
+no) enable_fast_install=no ;;
+*)
+ enable_fast_install=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_fast_install=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac],
+enable_fast_install=AC_ENABLE_FAST_INSTALL_DEFAULT)dnl
+])
+
+# AC_ENABLE_FAST_INSTALL - set the default to --disable-fast-install
+AC_DEFUN(AC_DISABLE_FAST_INSTALL, [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+AC_ENABLE_FAST_INSTALL(no)])
+
+
+# AC_PATH_TOOL_PREFIX - find a file program which can recognise shared library
+AC_DEFUN(AC_PATH_TOOL_PREFIX,
+[AC_MSG_CHECKING([for $1])
+AC_CACHE_VAL(lt_cv_path_MAGIC,
+[case "$MAGIC" in
+ /*)
+ lt_cv_path_MAGIC="$MAGIC" # Let the user override the test with a path.
+ ;;
+ ?:/*)
+ ac_cv_path_MAGIC="$MAGIC" # Let the user override the test with a dos path.
+ ;;
+ *)
+ ac_save_MAGIC="$MAGIC"
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+dnl $ac_dummy forces splitting on constant user-supplied paths.
+dnl POSIX.2 word splitting is done only on the output of word expansions,
+dnl not every word. This closes a longstanding sh security hole.
+ ac_dummy="ifelse([$2], , $PATH, [$2])"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$1; then
+ lt_cv_path_MAGIC="$ac_dir/$1"
+ if test -n "$file_magic_test_file"; then
+ case "$deplibs_check_method" in
+ "file_magic "*)
+ file_magic_regex="`expr \"$deplibs_check_method\" : \"file_magic \(.*\)\"`"
+ MAGIC="$lt_cv_path_MAGIC"
+ if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+ egrep "$file_magic_regex" > /dev/null; then
+ :
+ else
+ cat <<EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such. This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem. Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+EOF
+ fi ;;
+ esac
+ fi
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+ MAGIC="$ac_save_MAGIC"
+ ;;
+esac])
+MAGIC="$lt_cv_path_MAGIC"
+if test -n "$MAGIC"; then
+ AC_MSG_RESULT($MAGIC)
+else
+ AC_MSG_RESULT(no)
+fi
+])
+
+
+# AC_PATH_MAGIC - find a file program which can recognise a shared library
+AC_DEFUN(AC_PATH_MAGIC,
+[AC_REQUIRE([AC_CHECK_TOOL_PREFIX])dnl
+AC_PATH_TOOL_PREFIX(${ac_tool_prefix}file, /usr/bin:$PATH)
+if test -z "$lt_cv_path_MAGIC"; then
+ if test -n "$ac_tool_prefix"; then
+ AC_PATH_TOOL_PREFIX(file, /usr/bin:$PATH)
+ else
+ MAGIC=:
+ fi
+fi
+])
+
+
+# AC_PROG_LD - find the path to the GNU or non-GNU linker
+AC_DEFUN(AC_PROG_LD,
+[AC_ARG_WITH(gnu-ld,
+[ --with-gnu-ld assume the C compiler uses GNU ld [default=no]],
+test "$withval" = no || with_gnu_ld=yes, with_gnu_ld=no)
+AC_REQUIRE([AC_PROG_CC])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_REQUIRE([AC_CANONICAL_BUILD])dnl
+ac_prog=ld
+if test "$ac_cv_prog_gcc" = yes; then
+ # Check if gcc -print-prog-name=ld gives a path.
+ AC_MSG_CHECKING([for ld used by GCC])
+ case $lt_target in
+ *-*-mingw*)
+ # gcc leaves a trailing carriage return which upsets mingw
+ ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
+ *)
+ ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
+ esac
+ case "$ac_prog" in
+ # Accept absolute paths.
+changequote(,)dnl
+ [\\/]* | [A-Za-z]:[\\/]*)
+ re_direlt='/[^/][^/]*/\.\./'
+changequote([,])dnl
+ # Canonicalize the path of ld
+ ac_prog=`echo $ac_prog| sed 's%\\\\%/%g'`
+ while echo $ac_prog | grep "$re_direlt" > /dev/null 2>&1; do
+ ac_prog=`echo $ac_prog| sed "s%$re_direlt%/%"`
+ done
+ test -z "$LD" && LD="$ac_prog"
+ ;;
+ "")
+ # If it fails, then pretend we aren't using GCC.
+ ac_prog=ld
+ ;;
+ *)
+ # If it is relative, then search for the first ld in PATH.
+ with_gnu_ld=unknown
+ ;;
+ esac
+elif test "$with_gnu_ld" = yes; then
+ AC_MSG_CHECKING([for GNU ld])
+else
+ AC_MSG_CHECKING([for non-GNU ld])
+fi
+AC_CACHE_VAL(ac_cv_path_LD,
+[if test -z "$LD"; then
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+ for ac_dir in $PATH; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
+ ac_cv_path_LD="$ac_dir/$ac_prog"
+ # Check to see if the program is GNU ld. I'd rather use --version,
+ # but apparently some GNU ld's only accept -v.
+ # Break only if it was the GNU/non-GNU ld that we prefer.
+ if "$ac_cv_path_LD" -v 2>&1 < /dev/null | egrep '(GNU|with BFD)' > /dev/null; then
+ test "$with_gnu_ld" != no && break
+ else
+ test "$with_gnu_ld" != yes && break
+ fi
+ fi
+ done
+ IFS="$ac_save_ifs"
+else
+ ac_cv_path_LD="$LD" # Let the user override the test with a path.
+fi])
+LD="$ac_cv_path_LD"
+if test -n "$LD"; then
+ AC_MSG_RESULT($LD)
+else
+ AC_MSG_RESULT(no)
+fi
+test -z "$LD" && AC_MSG_ERROR([no acceptable ld found in \$PATH])
+AC_PROG_LD_GNU
+])
+
+AC_DEFUN(AC_PROG_LD_GNU,
+[AC_CACHE_CHECK([if the linker ($LD) is GNU ld], ac_cv_prog_gnu_ld,
+[# I'd rather use --version here, but apparently some GNU ld's only accept -v.
+if $LD -v 2>&1 </dev/null | egrep '(GNU|with BFD)' 1>&5; then
+ ac_cv_prog_gnu_ld=yes
+else
+ ac_cv_prog_gnu_ld=no
+fi])
+with_gnu_ld=$ac_cv_prog_gnu_ld
+])
+
+# AC_PROG_LD_RELOAD_FLAG - find reload flag for linker
+# -- PORTME Some linkers may need a different reload flag.
+AC_DEFUN(AC_PROG_LD_RELOAD_FLAG,
+[AC_CACHE_CHECK([for $LD option to reload object files], lt_cv_ld_reload_flag,
+[lt_cv_ld_reload_flag='-r'])
+reload_flag=$lt_cv_ld_reload_flag
+test -n "$reload_flag" && reload_flag=" $reload_flag"
+])
+
+# AC_DEPLIBS_CHECK_METHOD - how to check for library dependencies
+# -- PORTME fill in with the dynamic library characteristics
+AC_DEFUN(AC_DEPLIBS_CHECK_METHOD,
+[AC_CACHE_CHECK([how to recognise dependant libraries],
+lt_cv_deplibs_check_method,
+[lt_cv_file_magic_cmd='${MAGIC}'
+lt_cv_file_magic_test_file=
+lt_cv_deplibs_check_method='unknown'
+# Need to set the preceding variable on all platforms that support
+# interlibrary dependencies.
+# 'none' -- dependencies not supported.
+# `unknown' -- same as none, but documents that we really don't know.
+# 'pass_all' -- all dependencies passed with no checks.
+# 'test_compile' -- check by making test program.
+# 'file_magic [regex]' -- check by looking for files in library path
+# which responds to the $file_magic_cmd with a given egrep regex.
+# If you have `file' or equivalent on your system and you're not sure
+# whether `pass_all' will *always* work, you probably want this one.
+
+case "$host_os" in
+aix4* | beos*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+bsdi4*)
+ changequote(,)dnl
+ lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib)'
+ changequote([, ])dnl
+ lt_cv_file_magic_test_file=/shlib/libc.so
+ ;;
+
+cygwin* | mingw*)
+ lt_cv_deplibs_check_method='file_magic file format pei*-i386(.*architecture: i386)?'
+ lt_cv_file_magic_cmd='${OBJDUMP} -f'
+ ;;
+
+freebsd*)
+ case "$version_type" in
+ freebsd-elf*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+ esac
+ ;;
+
+gnu*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+irix5* | irix6*)
+ case "$host_os" in
+ irix5*)
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method="file_magic ELF 32-bit MSB dynamic lib MIPS - version 1"
+ ;;
+ *)
+ case "$LD" in
+ *-32|*"-32 ") libmagic=32-bit;;
+ *-n32|*"-n32 ") libmagic=N32;;
+ *-64|*"-64 ") libmagic=64-bit;;
+ *) libmagic=never-match;;
+ esac
+ # this will be overridden with pass_all, but let us keep it just in case
+ changequote(,)dnl
+ lt_cv_deplibs_check_method="file_magic ELF ${libmagic} MSB mips-[1234] dynamic lib MIPS - version 1"
+ changequote([, ])dnl
+ ;;
+ esac
+ lt_cv_file_magic_test_file=`echo /lib${libsuff}/libc.so*`
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+ case "$host_cpu" in
+ alpha* | i*86 | powerpc* | sparc* )
+ lt_cv_deplibs_check_method=pass_all ;;
+ *)
+ # glibc up to 2.1.1 does not perform some relocations on ARM
+ changequote(,)dnl
+ lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [LM]SB (shared object|dynamic lib )' ;;
+ changequote([, ])dnl
+ esac
+ lt_cv_file_magic_test_file=`echo /lib/libc.so* /lib/libc-*.so`
+ ;;
+
+osf3* | osf4* | osf5*)
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method='file_magic COFF format alpha shared library'
+ lt_cv_file_magic_test_file=/shlib/libc.so
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+sco3.2v5*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+solaris*)
+ lt_cv_deplibs_check_method=pass_all
+ lt_cv_file_magic_test_file=/lib/libc.so
+ ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+ case "$host_vendor" in
+ ncr)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+ motorola)
+ changequote(,)dnl
+ lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib) M[0-9][0-9]* Version [0-9]'
+ changequote([, ])dnl
+ lt_cv_file_magic_test_file=`echo /usr/lib/libc.so*`
+ ;;
+ esac
+ ;;
+esac
+])
+file_magic_cmd=$lt_cv_file_magic_cmd
+deplibs_check_method=$lt_cv_deplibs_check_method
+])
+
+
+# AC_PROG_NM - find the path to a BSD-compatible name lister
+AC_DEFUN(AC_PROG_NM,
+[AC_MSG_CHECKING([for BSD-compatible nm])
+AC_CACHE_VAL(ac_cv_path_NM,
+[if test -n "$NM"; then
+ # Let the user override the test.
+ ac_cv_path_NM="$NM"
+else
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+ for ac_dir in $PATH /usr/ccs/bin /usr/ucb /bin; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/nm || test -f $ac_dir/nm$ac_exeext ; then
+ # Check to see if the nm accepts a BSD-compat flag.
+ # Adding the `sed 1q' prevents false positives on HP-UX, which says:
+ # nm: unknown option "B" ignored
+ if ($ac_dir/nm -B /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
+ ac_cv_path_NM="$ac_dir/nm -B"
+ break
+ elif ($ac_dir/nm -p /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
+ ac_cv_path_NM="$ac_dir/nm -p"
+ break
+ else
+ ac_cv_path_NM=${ac_cv_path_NM="$ac_dir/nm"} # keep the first match, but
+ continue # so that we can try to find one that supports BSD flags
+ fi
+ fi
+ done
+ IFS="$ac_save_ifs"
+ test -z "$ac_cv_path_NM" && ac_cv_path_NM=nm
+fi])
+NM="$ac_cv_path_NM"
+AC_MSG_RESULT([$NM])
+])
+
+# AC_CHECK_LIBM - check for math library
+AC_DEFUN(AC_CHECK_LIBM,
+[AC_REQUIRE([AC_CANONICAL_HOST])dnl
+LIBM=
+case "$lt_target" in
+*-*-beos* | *-*-cygwin*)
+ # These system don't have libm
+ ;;
+*-ncr-sysv4.3*)
+ AC_CHECK_LIB(mw, _mwvalidcheckl, LIBM="-lmw")
+ AC_CHECK_LIB(m, main, LIBM="$LIBM -lm")
+ ;;
+*)
+ AC_CHECK_LIB(m, main, LIBM="-lm")
+ ;;
+esac
+])
+
+# AC_LIBLTDL_CONVENIENCE[(dir)] - sets LIBLTDL to the link flags for
+# the libltdl convenience library, adds --enable-ltdl-convenience to
+# the configure arguments. Note that LIBLTDL is not AC_SUBSTed, nor
+# is AC_CONFIG_SUBDIRS called. If DIR is not provided, it is assumed
+# to be `${top_builddir}/libltdl'. Make sure you start DIR with
+# '${top_builddir}/' (note the single quotes!) if your package is not
+# flat, and, if you're not using automake, define top_builddir as
+# appropriate in the Makefiles.
+AC_DEFUN(AC_LIBLTDL_CONVENIENCE, [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+ case "$enable_ltdl_convenience" in
+ no) AC_MSG_ERROR([this package needs a convenience libltdl]) ;;
+ "") enable_ltdl_convenience=yes
+ ac_configure_args="$ac_configure_args --enable-ltdl-convenience" ;;
+ esac
+ LIBLTDL=ifelse($#,1,$1,['${top_builddir}/libltdl'])/libltdlc.la
+ INCLTDL=ifelse($#,1,-I$1,['-I${top_srcdir}/libltdl'])
+])
+
+# AC_LIBLTDL_INSTALLABLE[(dir)] - sets LIBLTDL to the link flags for
+# the libltdl installable library, and adds --enable-ltdl-install to
+# the configure arguments. Note that LIBLTDL is not AC_SUBSTed, nor
+# is AC_CONFIG_SUBDIRS called. If DIR is not provided, it is assumed
+# to be `${top_builddir}/libltdl'. Make sure you start DIR with
+# '${top_builddir}/' (note the single quotes!) if your package is not
+# flat, and, if you're not using automake, define top_builddir as
+# appropriate in the Makefiles.
+# In the future, this macro may have to be called after AC_PROG_LIBTOOL.
+AC_DEFUN(AC_LIBLTDL_INSTALLABLE, [AC_BEFORE([$0],[AC_LIBTOOL_SETUP])dnl
+ AC_CHECK_LIB(ltdl, main,
+ [test x"$enable_ltdl_install" != xyes && enable_ltdl_install=no],
+ [if test x"$enable_ltdl_install" = xno; then
+ AC_MSG_WARN([libltdl not installed, but installation disabled])
+ else
+ enable_ltdl_install=yes
+ fi
+ ])
+ if test x"$enable_ltdl_install" = x"yes"; then
+ ac_configure_args="$ac_configure_args --enable-ltdl-install"
+ LIBLTDL=ifelse($#,1,$1,['${top_builddir}/libltdl'])/libltdl.la
+ INCLTDL=ifelse($#,1,-I$1,['-I${top_srcdir}/libltdl'])
+ else
+ ac_configure_args="$ac_configure_args --enable-ltdl-install=no"
+ LIBLTDL="-lltdl"
+ INCLTDL=
+ fi
+])
+
+dnl old names
+AC_DEFUN(AM_PROG_LIBTOOL, [indir([AC_PROG_LIBTOOL])])dnl
+AC_DEFUN(AM_ENABLE_SHARED, [indir([AC_ENABLE_SHARED], $@)])dnl
+AC_DEFUN(AM_ENABLE_STATIC, [indir([AC_ENABLE_STATIC], $@)])dnl
+AC_DEFUN(AM_DISABLE_SHARED, [indir([AC_DISABLE_SHARED], $@)])dnl
+AC_DEFUN(AM_DISABLE_STATIC, [indir([AC_DISABLE_STATIC], $@)])dnl
+AC_DEFUN(AM_PROG_LD, [indir([AC_PROG_LD])])dnl
+AC_DEFUN(AM_PROG_NM, [indir([AC_PROG_NM])])dnl
+
+dnl This is just to silence aclocal about the macro not being used
+ifelse([AC_DISABLE_FAST_INSTALL])dnl
+
diff --git a/rts/gmp/ansi2knr.1 b/rts/gmp/ansi2knr.1
new file mode 100644
index 0000000000..f9ee5a631c
--- /dev/null
+++ b/rts/gmp/ansi2knr.1
@@ -0,0 +1,36 @@
+.TH ANSI2KNR 1 "19 Jan 1996"
+.SH NAME
+ansi2knr \- convert ANSI C to Kernighan & Ritchie C
+.SH SYNOPSIS
+.I ansi2knr
+[--varargs] input_file [output_file]
+.SH DESCRIPTION
+If no output_file is supplied, output goes to stdout.
+.br
+There are no error messages.
+.sp
+.I ansi2knr
+recognizes function definitions by seeing a non-keyword identifier at the left
+margin, followed by a left parenthesis, with a right parenthesis as the last
+character on the line, and with a left brace as the first token on the
+following line (ignoring possible intervening comments). It will recognize a
+multi-line header provided that no intervening line ends with a left or right
+brace or a semicolon. These algorithms ignore whitespace and comments, except
+that the function name must be the first thing on the line.
+.sp
+The following constructs will confuse it:
+.br
+ - Any other construct that starts at the left margin and follows the
+above syntax (such as a macro or function call).
+.br
+ - Some macros that tinker with the syntax of the function header.
+.sp
+The --varargs switch is obsolete, and is recognized only for
+backwards compatibility. The present version of
+.I ansi2knr
+will always attempt to convert a ... argument to va_alist and va_dcl.
+.SH AUTHOR
+L. Peter Deutsch <ghost@aladdin.com> wrote the original ansi2knr and
+continues to maintain the current version; most of the code in the current
+version is his work. ansi2knr also includes contributions by Francois
+Pinard <pinard@iro.umontreal.ca> and Jim Avera <jima@netcom.com>.
diff --git a/rts/gmp/ansi2knr.c b/rts/gmp/ansi2knr.c
new file mode 100644
index 0000000000..937c731886
--- /dev/null
+++ b/rts/gmp/ansi2knr.c
@@ -0,0 +1,677 @@
+/* Copyright (C) 1989, 1997, 1998, 1999 Aladdin Enterprises. All rights reserved. */
+
+/* Convert ANSI C function definitions to K&R ("traditional C") syntax */
+
+/*
+ansi2knr is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY. No author or distributor accepts responsibility to anyone for the
+consequences of using it or for whether it serves any particular purpose or
+works at all, unless he says so in writing. Refer to the GNU General Public
+License (the "GPL") for full details.
+
+Everyone is granted permission to copy, modify and redistribute ansi2knr,
+but only under the conditions described in the GPL. A copy of this license
+is supposed to have been given to you along with ansi2knr so you can know
+your rights and responsibilities. It should be in a file named COPYLEFT,
+or, if there is no file named COPYLEFT, a file named COPYING. Among other
+things, the copyright notice and this notice must be preserved on all
+copies.
+
+We explicitly state here what we believe is already implied by the GPL: if
+the ansi2knr program is distributed as a separate set of sources and a
+separate executable file which are aggregated on a storage medium together
+with another program, this in itself does not bring the other program under
+the GPL, nor does the mere fact that such a program or the procedures for
+constructing it invoke the ansi2knr executable bring any other part of the
+program under the GPL.
+*/
+
+/*
+ * Usage:
+ ansi2knr [--filename FILENAME] [INPUT_FILE [OUTPUT_FILE]]
+ * --filename provides the file name for the #line directive in the output,
+ * overriding input_file (if present).
+ * If no input_file is supplied, input is read from stdin.
+ * If no output_file is supplied, output goes to stdout.
+ * There are no error messages.
+ *
+ * ansi2knr recognizes function definitions by seeing a non-keyword
+ * identifier at the left margin, followed by a left parenthesis,
+ * with a right parenthesis as the last character on the line,
+ * and with a left brace as the first token on the following line
+ * (ignoring possible intervening comments), except that a line
+ * consisting of only
+ * identifier1(identifier2)
+ * will not be considered a function definition unless identifier2 is
+ * the word "void", and a line consisting of
+ * identifier1(identifier2, <<arbitrary>>)
+ * will not be considered a function definition.
+ * ansi2knr will recognize a multi-line header provided
+ * that no intervening line ends with a left or right brace or a semicolon.
+ * These algorithms ignore whitespace and comments, except that
+ * the function name must be the first thing on the line.
+ * The following constructs will confuse it:
+ * - Any other construct that starts at the left margin and
+ * follows the above syntax (such as a macro or function call).
+ * - Some macros that tinker with the syntax of function headers.
+ */
+
+/*
+ * The original and principal author of ansi2knr is L. Peter Deutsch
+ * <ghost@aladdin.com>. Other authors are noted in the change history
+ * that follows (in reverse chronological order):
+ lpd 1999-04-12 added minor fixes from Pavel Roskin
+ <pavel_roskin@geocities.com> for clean compilation with
+ gcc -W -Wall
+ lpd 1999-03-22 added hack to recognize lines consisting of
+ identifier1(identifier2, xxx) as *not* being procedures
+ lpd 1999-02-03 made indentation of preprocessor commands consistent
+ lpd 1999-01-28 fixed two bugs: a '/' in an argument list caused an
+ endless loop; quoted strings within an argument list
+ confused the parser
+ lpd 1999-01-24 added a check for write errors on the output,
+ suggested by Jim Meyering <meyering@ascend.com>
+ lpd 1998-11-09 added further hack to recognize identifier(void)
+ as being a procedure
+ lpd 1998-10-23 added hack to recognize lines consisting of
+ identifier1(identifier2) as *not* being procedures
+ lpd 1997-12-08 made input_file optional; only closes input and/or
+ output file if not stdin or stdout respectively; prints
+ usage message on stderr rather than stdout; adds
+ --filename switch (changes suggested by
+ <ceder@lysator.liu.se>)
+ lpd 1996-01-21 added code to cope with not HAVE_CONFIG_H and with
+ compilers that don't understand void, as suggested by
+ Tom Lane
+ lpd 1996-01-15 changed to require that the first non-comment token
+ on the line following a function header be a left brace,
+ to reduce sensitivity to macros, as suggested by Tom Lane
+ <tgl@sss.pgh.pa.us>
+ lpd 1995-06-22 removed #ifndefs whose sole purpose was to define
+ undefined preprocessor symbols as 0; changed all #ifdefs
+ for configuration symbols to #ifs
+ lpd 1995-04-05 changed copyright notice to make it clear that
+ including ansi2knr in a program does not bring the entire
+ program under the GPL
+ lpd 1994-12-18 added conditionals for systems where ctype macros
+ don't handle 8-bit characters properly, suggested by
+ Francois Pinard <pinard@iro.umontreal.ca>;
+ removed --varargs switch (this is now the default)
+ lpd 1994-10-10 removed CONFIG_BROKETS conditional
+ lpd 1994-07-16 added some conditionals to help GNU `configure',
+ suggested by Francois Pinard <pinard@iro.umontreal.ca>;
+ properly erase prototype args in function parameters,
+ contributed by Jim Avera <jima@netcom.com>;
+ correct error in writeblanks (it shouldn't erase EOLs)
+ lpd 1989-xx-xx original version
+ */
+
+/* Most of the conditionals here are to make ansi2knr work with */
+/* or without the GNU configure machinery. */
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <ctype.h>
+
+#if HAVE_CONFIG_H
+
+/*
+ For properly autoconfiguring ansi2knr, use AC_CONFIG_HEADER(config.h).
+ This will define HAVE_CONFIG_H and so activate the following lines.
+ */
+
+# if STDC_HEADERS || HAVE_STRING_H
+# include <string.h>
+# else
+# include <strings.h>
+# endif
+
+#else /* not HAVE_CONFIG_H */
+
+/* Otherwise do it the hard way */
+
+# ifdef BSD
+# include <strings.h>
+# else
+# ifdef VMS
+ extern int strlen(), strncmp();
+# else
+# include <string.h>
+# endif
+# endif
+
+#endif /* not HAVE_CONFIG_H */
+
+#if STDC_HEADERS
+# include <stdlib.h>
+#else
+/*
+ malloc and free should be declared in stdlib.h,
+ but if you've got a K&R compiler, they probably aren't.
+ */
+# ifdef MSDOS
+# include <malloc.h>
+# else
+# ifdef VMS
+ extern char *malloc();
+ extern void free();
+# else
+ extern char *malloc();
+ extern int free();
+# endif
+# endif
+
+#endif
+
+/* Define NULL (for *very* old compilers). */
+#ifndef NULL
+# define NULL (0)
+#endif
+
+/*
+ * The ctype macros don't always handle 8-bit characters correctly.
+ * Compensate for this here.
+ */
+#ifdef isascii
+# undef HAVE_ISASCII /* just in case */
+# define HAVE_ISASCII 1
+#else
+#endif
+#if STDC_HEADERS || !HAVE_ISASCII
+# define is_ascii(c) 1
+#else
+# define is_ascii(c) isascii(c)
+#endif
+
+#define is_space(c) (is_ascii(c) && isspace(c))
+#define is_alpha(c) (is_ascii(c) && isalpha(c))
+#define is_alnum(c) (is_ascii(c) && isalnum(c))
+
+/* Scanning macros */
+#define isidchar(ch) (is_alnum(ch) || (ch) == '_')
+#define isidfirstchar(ch) (is_alpha(ch) || (ch) == '_')
+
+/* Forward references */
+char *skipspace();
+char *scanstring();
+int writeblanks();
+int test1();
+int convert1();
+
+/* The main program */
+int
+main(argc, argv)
+ int argc;
+ char *argv[];
+{ FILE *in = stdin;
+ FILE *out = stdout;
+ char *filename = 0;
+ char *program_name = argv[0];
+ char *output_name = 0;
+#define bufsize 5000 /* arbitrary size */
+ char *buf;
+ char *line;
+ char *more;
+ char *usage =
+ "Usage: ansi2knr [--filename FILENAME] [INPUT_FILE [OUTPUT_FILE]]\n";
+ /*
+ * In previous versions, ansi2knr recognized a --varargs switch.
+ * If this switch was supplied, ansi2knr would attempt to convert
+ * a ... argument to va_alist and va_dcl; if this switch was not
+ * supplied, ansi2knr would simply drop any such arguments.
+ * Now, ansi2knr always does this conversion, and we only
+ * check for this switch for backward compatibility.
+ */
+ int convert_varargs = 1;
+ int output_error;
+
+ while ( argc > 1 && argv[1][0] == '-' ) {
+ if ( !strcmp(argv[1], "--varargs") ) {
+ convert_varargs = 1;
+ argc--;
+ argv++;
+ continue;
+ }
+ if ( !strcmp(argv[1], "--filename") && argc > 2 ) {
+ filename = argv[2];
+ argc -= 2;
+ argv += 2;
+ continue;
+ }
+ fprintf(stderr, "%s: Unrecognized switch: %s\n", program_name,
+ argv[1]);
+ fprintf(stderr, usage);
+ exit(1);
+ }
+ switch ( argc )
+ {
+ default:
+ fprintf(stderr, usage);
+ exit(0);
+ case 3:
+ output_name = argv[2];
+ out = fopen(output_name, "w");
+ if ( out == NULL ) {
+ fprintf(stderr, "%s: Cannot open output file %s\n",
+ program_name, output_name);
+ exit(1);
+ }
+ /* falls through */
+ case 2:
+ in = fopen(argv[1], "r");
+ if ( in == NULL ) {
+ fprintf(stderr, "%s: Cannot open input file %s\n",
+ program_name, argv[1]);
+ exit(1);
+ }
+ if ( filename == 0 )
+ filename = argv[1];
+ /* falls through */
+ case 1:
+ break;
+ }
+ if ( filename )
+ fprintf(out, "#line 1 \"%s\"\n", filename);
+ buf = malloc(bufsize);
+ if ( buf == NULL )
+ {
+ fprintf(stderr, "Unable to allocate read buffer!\n");
+ exit(1);
+ }
+ line = buf;
+ while ( fgets(line, (unsigned)(buf + bufsize - line), in) != NULL )
+ {
+test: line += strlen(line);
+ switch ( test1(buf) )
+ {
+ case 2: /* a function header */
+ convert1(buf, out, 1, convert_varargs);
+ break;
+ case 1: /* a function */
+ /* Check for a { at the start of the next line. */
+ more = ++line;
+f: if ( line >= buf + (bufsize - 1) ) /* overflow check */
+ goto wl;
+ if ( fgets(line, (unsigned)(buf + bufsize - line), in) == NULL )
+ goto wl;
+ switch ( *skipspace(more, 1) )
+ {
+ case '{':
+ /* Definitely a function header. */
+ convert1(buf, out, 0, convert_varargs);
+ fputs(more, out);
+ break;
+ case 0:
+ /* The next line was blank or a comment: */
+ /* keep scanning for a non-comment. */
+ line += strlen(line);
+ goto f;
+ default:
+ /* buf isn't a function header, but */
+ /* more might be. */
+ fputs(buf, out);
+ strcpy(buf, more);
+ line = buf;
+ goto test;
+ }
+ break;
+ case -1: /* maybe the start of a function */
+ if ( line != buf + (bufsize - 1) ) /* overflow check */
+ continue;
+ /* falls through */
+ default: /* not a function */
+wl: fputs(buf, out);
+ break;
+ }
+ line = buf;
+ }
+ if ( line != buf )
+ fputs(buf, out);
+ free(buf);
+ if ( output_name ) {
+ output_error = ferror(out);
+ output_error |= fclose(out);
+ } else { /* out == stdout */
+ fflush(out);
+ output_error = ferror(out);
+ }
+ if ( output_error ) {
+ fprintf(stderr, "%s: error writing to %s\n", program_name,
+ (output_name ? output_name : "stdout"));
+ exit(1);
+ }
+ if ( in != stdin )
+ fclose(in);
+ return 0;
+}
+
+/* Skip over whitespace and comments, in either direction. */
+char *
+skipspace(p, dir)
+ register char *p;
+ register int dir; /* 1 for forward, -1 for backward */
+{ for ( ; ; )
+ { while ( is_space(*p) )
+ p += dir;
+ if ( !(*p == '/' && p[dir] == '*') )
+ break;
+ p += dir; p += dir;
+ while ( !(*p == '*' && p[dir] == '/') )
+ { if ( *p == 0 )
+ return p; /* multi-line comment?? */
+ p += dir;
+ }
+ p += dir; p += dir;
+ }
+ return p;
+}
+
+/* Scan over a quoted string, in either direction. */
+char *
+scanstring(p, dir)
+ register char *p;
+ register int dir;
+{
+ for (p += dir; ; p += dir)
+ if (*p == '"' && p[-dir] != '\\')
+ return p + dir;
+}
+
+/*
+ * Write blanks over part of a string.
+ * Don't overwrite end-of-line characters.
+ */
+int
+writeblanks(start, end)
+ char *start;
+ char *end;
+{ char *p;
+ for ( p = start; p < end; p++ )
+ if ( *p != '\r' && *p != '\n' )
+ *p = ' ';
+ return 0;
+}
+
+/*
+ * Test whether the string in buf is a function definition.
+ * The string may contain and/or end with a newline.
+ * Return as follows:
+ * 0 - definitely not a function definition;
+ * 1 - definitely a function definition;
+ * 2 - definitely a function prototype (NOT USED);
+ * -1 - may be the beginning of a function definition,
+ * append another line and look again.
+ * The reason we don't attempt to convert function prototypes is that
+ * Ghostscript's declaration-generating macros look too much like
+ * prototypes, and confuse the algorithms.
+ */
+int
+test1(buf)
+ char *buf;
+{ register char *p = buf;
+ char *bend;
+ char *endfn;
+ int contin;
+
+ if ( !isidfirstchar(*p) )
+ return 0; /* no name at left margin */
+ bend = skipspace(buf + strlen(buf) - 1, -1);
+ switch ( *bend )
+ {
+ case ';': contin = 0 /*2*/; break;
+ case ')': contin = 1; break;
+ case '{': return 0; /* not a function */
+ case '}': return 0; /* not a function */
+ default: contin = -1;
+ }
+ while ( isidchar(*p) )
+ p++;
+ endfn = p;
+ p = skipspace(p, 1);
+ if ( *p++ != '(' )
+ return 0; /* not a function */
+ p = skipspace(p, 1);
+ if ( *p == ')' )
+ return 0; /* no parameters */
+ /* Check that the apparent function name isn't a keyword. */
+ /* We only need to check for keywords that could be followed */
+ /* by a left parenthesis (which, unfortunately, is most of them). */
+ { static char *words[] =
+ { "asm", "auto", "case", "char", "const", "double",
+ "extern", "float", "for", "if", "int", "long",
+ "register", "return", "short", "signed", "sizeof",
+ "static", "switch", "typedef", "unsigned",
+ "void", "volatile", "while", 0
+ };
+ char **key = words;
+ char *kp;
+ unsigned len = endfn - buf;
+
+ while ( (kp = *key) != 0 )
+ { if ( strlen(kp) == len && !strncmp(kp, buf, len) )
+ return 0; /* name is a keyword */
+ key++;
+ }
+ }
+ {
+ char *id = p;
+ int len;
+ /*
+ * Check for identifier1(identifier2) and not
+ * identifier1(void), or identifier1(identifier2, xxxx).
+ */
+
+ while ( isidchar(*p) )
+ p++;
+ len = p - id;
+ p = skipspace(p, 1);
+ if (*p == ',' ||
+ (*p == ')' && (len != 4 || strncmp(id, "void", 4)))
+ )
+ return 0; /* not a function */
+ }
+ /*
+ * If the last significant character was a ), we need to count
+ * parentheses, because it might be part of a formal parameter
+ * that is a procedure.
+ */
+ if (contin > 0) {
+ int level = 0;
+
+ for (p = skipspace(buf, 1); *p; p = skipspace(p + 1, 1))
+ level += (*p == '(' ? 1 : *p == ')' ? -1 : 0);
+ if (level > 0)
+ contin = -1;
+ }
+ return contin;
+}
+
+/* Convert a recognized function definition or header to K&R syntax. */
+int
+convert1(buf, out, header, convert_varargs)
+ char *buf;
+ FILE *out;
+ int header; /* Boolean */
+ int convert_varargs; /* Boolean */
+{ char *endfn;
+ register char *p;
+ /*
+ * The breaks table contains pointers to the beginning and end
+ * of each argument.
+ */
+ char **breaks;
+ unsigned num_breaks = 2; /* for testing */
+ char **btop;
+ char **bp;
+ char **ap;
+ char *vararg = 0;
+
+ /* Pre-ANSI implementations don't agree on whether strchr */
+ /* is called strchr or index, so we open-code it here. */
+ for ( endfn = buf; *(endfn++) != '('; )
+ ;
+top: p = endfn;
+ breaks = (char **)malloc(sizeof(char *) * num_breaks * 2);
+ if ( breaks == NULL )
+ { /* Couldn't allocate break table, give up */
+ fprintf(stderr, "Unable to allocate break table!\n");
+ fputs(buf, out);
+ return -1;
+ }
+ btop = breaks + num_breaks * 2 - 2;
+ bp = breaks;
+ /* Parse the argument list */
+ do
+ { int level = 0;
+ char *lp = NULL;
+ char *rp = NULL;
+ char *end = NULL;
+
+ if ( bp >= btop )
+ { /* Filled up break table. */
+ /* Allocate a bigger one and start over. */
+ free((char *)breaks);
+ num_breaks <<= 1;
+ goto top;
+ }
+ *bp++ = p;
+ /* Find the end of the argument */
+ for ( ; end == NULL; p++ )
+ { switch(*p)
+ {
+ case ',':
+ if ( !level ) end = p;
+ break;
+ case '(':
+ if ( !level ) lp = p;
+ level++;
+ break;
+ case ')':
+ if ( --level < 0 ) end = p;
+ else rp = p;
+ break;
+ case '/':
+ if (p[1] == '*')
+ p = skipspace(p, 1) - 1;
+ break;
+ case '"':
+ p = scanstring(p, 1) - 1;
+ break;
+ default:
+ ;
+ }
+ }
+ /* Erase any embedded prototype parameters. */
+ if ( lp && rp )
+ writeblanks(lp + 1, rp);
+ p--; /* back up over terminator */
+ /* Find the name being declared. */
+ /* This is complicated because of procedure and */
+ /* array modifiers. */
+ for ( ; ; )
+ { p = skipspace(p - 1, -1);
+ switch ( *p )
+ {
+ case ']': /* skip array dimension(s) */
+ case ')': /* skip procedure args OR name */
+ { int level = 1;
+ while ( level )
+ switch ( *--p )
+ {
+ case ']': case ')':
+ level++;
+ break;
+ case '[': case '(':
+ level--;
+ break;
+ case '/':
+ if (p > buf && p[-1] == '*')
+ p = skipspace(p, -1) + 1;
+ break;
+ case '"':
+ p = scanstring(p, -1) + 1;
+ break;
+ default: ;
+ }
+ }
+ if ( *p == '(' && *skipspace(p + 1, 1) == '*' )
+ { /* We found the name being declared */
+ while ( !isidfirstchar(*p) )
+ p = skipspace(p, 1) + 1;
+ goto found;
+ }
+ break;
+ default:
+ goto found;
+ }
+ }
+found: if ( *p == '.' && p[-1] == '.' && p[-2] == '.' )
+ { if ( convert_varargs )
+ { *bp++ = "va_alist";
+ vararg = p-2;
+ }
+ else
+ { p++;
+ if ( bp == breaks + 1 ) /* sole argument */
+ writeblanks(breaks[0], p);
+ else
+ writeblanks(bp[-1] - 1, p);
+ bp--;
+ }
+ }
+ else
+ { while ( isidchar(*p) ) p--;
+ *bp++ = p+1;
+ }
+ p = end;
+ }
+ while ( *p++ == ',' );
+ *bp = p;
+ /* Make a special check for 'void' arglist */
+ if ( bp == breaks+2 )
+ { p = skipspace(breaks[0], 1);
+ if ( !strncmp(p, "void", 4) )
+ { p = skipspace(p+4, 1);
+ if ( p == breaks[2] - 1 )
+ { bp = breaks; /* yup, pretend arglist is empty */
+ writeblanks(breaks[0], p + 1);
+ }
+ }
+ }
+ /* Put out the function name and left parenthesis. */
+ p = buf;
+ while ( p != endfn ) putc(*p, out), p++;
+ /* Put out the declaration. */
+ if ( header )
+ { fputs(");", out);
+ for ( p = breaks[0]; *p; p++ )
+ if ( *p == '\r' || *p == '\n' )
+ putc(*p, out);
+ }
+ else
+ { for ( ap = breaks+1; ap < bp; ap += 2 )
+ { p = *ap;
+ while ( isidchar(*p) )
+ putc(*p, out), p++;
+ if ( ap < bp - 1 )
+ fputs(", ", out);
+ }
+ fputs(") ", out);
+ /* Put out the argument declarations */
+ for ( ap = breaks+2; ap <= bp; ap += 2 )
+ (*ap)[-1] = ';';
+ if ( vararg != 0 )
+ { *vararg = 0;
+ fputs(breaks[0], out); /* any prior args */
+ fputs("va_dcl", out); /* the final arg */
+ fputs(bp[0], out);
+ }
+ else
+ fputs(breaks[0], out);
+ }
+ free((char *)breaks);
+ return 0;
+}
diff --git a/rts/gmp/assert.c b/rts/gmp/assert.c
new file mode 100644
index 0000000000..65eccfa30b
--- /dev/null
+++ b/rts/gmp/assert.c
@@ -0,0 +1,52 @@
+/* GMP assertion failure handler. */
+
+/*
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+int
+#if __STDC__
+__gmp_assert_fail (const char *filename, int linenum,
+ const char *expr)
+#else
+__gmp_assert_fail (filename, linenum, expr)
+char *filename;
+int linenum;
+char *expr;
+#endif
+{
+ if (filename != NULL && filename[0] != '\0')
+ {
+ fprintf (stderr, "%s:", filename);
+ if (linenum != -1)
+ fprintf (stderr, "%d: ", linenum);
+ }
+
+ fprintf (stderr, "GNU MP assertion failed: %s\n", expr);
+ abort();
+
+ /*NOTREACHED*/
+ return 0;
+}
diff --git a/rts/gmp/compat.c b/rts/gmp/compat.c
new file mode 100644
index 0000000000..ab7529f52f
--- /dev/null
+++ b/rts/gmp/compat.c
@@ -0,0 +1,46 @@
+/* Old function entrypoints retained for binary compatibility. */
+
+/*
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+/* mpn_divexact_by3 was a function in gmp 3.0, but as of gmp 3.1 it's a
+ macro calling mpn_divexact_by3c. */
+int
+__MPN (divexact_by3) (mp_ptr dst, mp_srcptr src, mp_size_t size)
+{
+ mpn_divexact_by3 (dst, src, size);
+}
+
+
+/* mpn_divmod_1 was a function in gmp 3.0 and earlier, but marked obsolete
+ in gmp 2 and 3. As of gmp 3.1 it's a macro calling mpn_divrem_1. */
+int
+__MPN (divmod_1) (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
+{
+ mpn_divmod_1 (dst, src, size, divisor);
+}
+
+
diff --git a/rts/gmp/config.guess b/rts/gmp/config.guess
new file mode 100644
index 0000000000..08018f497d
--- /dev/null
+++ b/rts/gmp/config.guess
@@ -0,0 +1,1373 @@
+#! /bin/sh
+# Attempt to guess a canonical system name.
+#
+# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000
+# Free Software Foundation, Inc.
+#
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Written by Per Bothner <bothner@cygnus.com>.
+# Please send patches to <config-patches@gnu.org>.
+#
+# This script attempts to guess a canonical system name similar to
+# config.sub. If it succeeds, it prints the system name on stdout, and
+# exits with 0. Otherwise, it exits with 1.
+#
+# The plan is that this can be called by configure scripts if you
+# don't specify an explicit system type (host/target name).
+#
+# Only a few systems have been added to this list; please add others
+# (but try to keep the structure clean).
+#
+
+
+# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
+# (ghazi@noc.rutgers.edu 8/24/94.)
+if (test -f /.attbin/uname) >/dev/null 2>&1 ; then
+ PATH=$PATH:/.attbin ; export PATH
+fi
+
+UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
+UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
+UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown
+UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
+
+dummy=dummy-$$
+trap 'rm -f $dummy.c $dummy.o $dummy ${dummy}1.s ${dummy}2.c ; exit 1' 1 2 15
+
+# Use $HOST_CC if defined. $CC may point to a cross-compiler
+if test x"$CC_FOR_BUILD" = x; then
+ if test x"$HOST_CC" != x; then
+ CC_FOR_BUILD="$HOST_CC"
+ else
+ if test x"$CC" != x; then
+ CC_FOR_BUILD="$CC"
+ else
+ echo 'dummy(){}' >$dummy.c
+ for c in cc c89 gcc; do
+ ($c $dummy.c -c) >/dev/null 2>&1
+ if test $? = 0; then
+ CC_FOR_BUILD="$c"; break
+ fi
+ done
+ rm -f $dummy.c $dummy.o
+ if test x"$CC_FOR_BUILD" = x; then
+ CC_FOR_BUILD=no_compiler_found
+ fi
+ fi
+ fi
+fi
+
+
+# First make a best effort at recognizing x86 CPU type and leave it in X86CPU.
+# If we fail, set X86CPU to UNAME_MACHINE
+#
+# DJGPP v2 (or 2.03 at least) always gives "pc" for uname -m, and the
+# OEM for uname -s. Eg. pc:MS-DOS:6:2 on MS-DOS 6.21. The list of
+# possible OEMs is in src/libc/dos/dos/getdos_v.c of djlsr203.zip, but
+# just pc:*:*:* seems ok.
+
+case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
+ i?86:*:*:* | i86pc:*:*:* | pc:*:*:*)
+ case "${UNAME_MACHINE}" in
+ i86pc | pc) UNAME_MACHINE=i386 ;;
+ esac
+ cat <<EOF >${dummy}1.s
+ .globl cpuid
+ .globl _cpuid
+cpuid:
+_cpuid:
+ pushl %esi
+ pushl %ebx
+ movl 16(%esp),%eax
+ .byte 0x0f
+ .byte 0xa2
+ movl 12(%esp),%esi
+ movl %ebx,(%esi)
+ movl %edx,4(%esi)
+ movl %ecx,8(%esi)
+ popl %ebx
+ popl %esi
+ ret
+EOF
+ cat <<EOF >${dummy}2.c
+main ()
+{
+ char vendor_string[13];
+ char dummy_string[12];
+ long fms;
+ int family, model;
+ char *modelstr;
+
+ cpuid (vendor_string, 0);
+ vendor_string[12] = 0;
+
+ fms = cpuid (dummy_string, 1);
+
+ family = (fms >> 8) & 15;
+ model = (fms >> 4) & 15;
+
+ modelstr = "i486";
+ if (strcmp (vendor_string, "GenuineIntel") == 0)
+ {
+ switch (family)
+ {
+ case 5:
+ if (model <= 2)
+ modelstr = "pentium";
+ else if (model >= 4)
+ modelstr = "pentiummmx";
+ break;
+ case 6:
+ if (model == 1)
+ modelstr = "pentiumpro";
+ else if (model <= 6)
+ modelstr = "pentium2";
+ else
+ modelstr = "pentium3";
+ break;
+ }
+ }
+ else if (strcmp (vendor_string, "AuthenticAMD") == 0)
+ {
+ switch (family)
+ {
+ case 5:
+ if (model <= 3)
+ modelstr = "k5";
+ else if (model <= 7)
+ modelstr = "k6";
+ else if (model <= 8)
+ modelstr = "k62";
+ else if (model <= 9)
+ modelstr = "k63";
+ break;
+ case 6:
+ modelstr = "athlon";
+ break;
+ }
+ }
+ else if (strcmp (vendor_string, "CyrixInstead") == 0)
+ {
+ /* Should recognize Cyrix' processors too. */
+ }
+
+ printf ("%s\n", modelstr);
+ return 0;
+}
+EOF
+ $CC_FOR_BUILD ${dummy}1.s ${dummy}2.c -o $dummy >/dev/null 2>&1
+ if test "$?" = 0 ; then
+ X86CPU=`./$dummy`
+ fi
+
+
+ # Default to believing uname -m if the program fails to compile or
+ # run. Will fail to run on 386 since cpuid was only added on 486.
+ if test -z "$X86CPU"
+ then
+ X86CPU="$UNAME_MACHINE"
+ fi
+ rm -f ${dummy}1.s ${dummy}2.c $dummy
+ ;;
+esac
+
+# Note: order is significant - the case branches are not exclusive.
+
+case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
+ alpha:OSF1:*:*)
+ if test $UNAME_RELEASE = "V4.0"; then
+ UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
+ fi
+ # A Vn.n version is a released version.
+ # A Tn.n version is a released field test version.
+ # A Xn.n version is an unreleased experimental baselevel.
+ # 1.2 uses "1.2" for uname -r.
+ cat <<EOF >$dummy.s
+ .data
+\$Lformat:
+ .byte 37,100,45,37,120,10,0 # "%d-%x\n"
+
+ .text
+ .globl main
+ .align 4
+ .ent main
+main:
+ .frame \$30,16,\$26,0
+ ldgp \$29,0(\$27)
+ .prologue 1
+ .long 0x47e03d80 # implver \$0
+ lda \$2,-1
+ .long 0x47e20c21 # amask \$2,\$1
+ lda \$16,\$Lformat
+ mov \$0,\$17
+ not \$1,\$18
+ jsr \$26,printf
+ ldgp \$29,0(\$26)
+ mov 0,\$16
+ jsr \$26,exit
+ .end main
+EOF
+ $CC_FOR_BUILD $dummy.s -o $dummy 2>/dev/null
+ if test "$?" = 0 ; then
+ case `./$dummy` in
+ 0-0)
+ UNAME_MACHINE="alpha"
+ ;;
+ 1-0)
+ UNAME_MACHINE="alphaev5"
+ ;;
+ 1-1)
+ UNAME_MACHINE="alphaev56"
+ ;;
+ 1-101)
+ UNAME_MACHINE="alphapca56"
+ ;;
+ 2-303)
+ UNAME_MACHINE="alphaev6"
+ ;;
+ 2-307)
+ UNAME_MACHINE="alphaev67"
+ ;;
+ esac
+ fi
+ rm -f $dummy.s $dummy
+ echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[VTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+ exit 0 ;;
+ alpha:NetBSD:*:* | alpha:FreeBSD:*:*)
+ cat <<EOF >$dummy.s
+ .globl main
+ .ent main
+main:
+ .frame \$30,0,\$26,0
+ .prologue 0
+ .long 0x47e03d80 # implver $0
+ lda \$2,259
+ .long 0x47e20c21 # amask $2,$1
+ srl \$1,8,\$2
+ sll \$2,2,\$2
+ sll \$0,3,\$0
+ addl \$1,\$0,\$0
+ addl \$2,\$0,\$0
+ ret \$31,(\$26),1
+ .end main
+EOF
+ $CC_FOR_BUILD $dummy.s -o $dummy 2>/dev/null
+ if test "$?" = 0 ; then
+ ./$dummy
+ case "$?" in
+ 7)
+ UNAME_MACHINE="alpha"
+ ;;
+ 15)
+ UNAME_MACHINE="alphaev5"
+ ;;
+ 14)
+ UNAME_MACHINE="alphaev56"
+ ;;
+ 10)
+ UNAME_MACHINE="alphapca56"
+ ;;
+ 16)
+ UNAME_MACHINE="alphaev6"
+ ;;
+ esac
+ fi
+ rm -f $dummy.s $dummy
+ echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM}${UNAME_RELEASE} | sed -e 's/^[VTX]//' -e 's/[-(].*//' | tr [[A-Z]] [[a-z]]`
+ exit 0 ;;
+ Alpha\ *:Windows_NT*:*)
+ # How do we know it's Interix rather than the generic POSIX subsystem?
+ # Should we change UNAME_MACHINE based on the output of uname instead
+ # of the specific Alpha model?
+ echo alpha-pc-interix
+ exit 0 ;;
+ 21064:Windows_NT:50:3)
+ echo alpha-dec-winnt3.5
+ exit 0 ;;
+ Amiga*:UNIX_System_V:4.0:*)
+ echo m68k-cbm-sysv4
+ exit 0;;
+ amiga:NetBSD:*:*)
+ echo m68k-cbm-netbsd${UNAME_RELEASE}
+ exit 0 ;;
+ amiga:OpenBSD:*:*)
+ echo m68k-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ *:[Aa]miga[Oo][Ss]:*:*)
+ echo ${UNAME_MACHINE}-unknown-amigaos
+ exit 0 ;;
+ arc64:OpenBSD:*:*)
+ echo mips64el-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ arc:OpenBSD:*:*)
+ echo mipsel-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ hkmips:OpenBSD:*:*)
+ echo mips-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ pmax:OpenBSD:*:*)
+ echo mipsel-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ sgi:OpenBSD:*:*)
+ echo mips-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ wgrisc:OpenBSD:*:*)
+ echo mipsel-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ *:OS/390:*:*)
+ echo i370-ibm-openedition
+ exit 0 ;;
+ arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
+ echo arm-acorn-riscix${UNAME_RELEASE}
+ exit 0;;
+ arm32:NetBSD:*:*)
+ echo arm-unknown-netbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+ exit 0 ;;
+ SR2?01:HI-UX/MPP:*:*)
+ echo hppa1.1-hitachi-hiuxmpp
+ exit 0;;
+ Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
+ # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
+ if test "`(/bin/universe) 2>/dev/null`" = att ; then
+ echo pyramid-pyramid-sysv3
+ else
+ echo pyramid-pyramid-bsd
+ fi
+ exit 0 ;;
+ NILE*:*:*:dcosx)
+ echo pyramid-pyramid-svr4
+ exit 0 ;;
+ sun4H:SunOS:5.*:*)
+ echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+ exit 0 ;;
+ sun4[md]:SunOS:5.*:*)
+ echo sparcv8-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+ exit 0 ;;
+ sun4u:SunOS:5.*:*)
+ echo sparcv9-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+ exit 0 ;;
+ sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
+ echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+ exit 0 ;;
+ i386:SunOS:5.*:*)
+ echo ${X86CPU}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+ exit 0 ;;
+ sun4[md]:SunOS:*:*)
+ case "`/usr/bin/arch -k`" in
+ Series*|S4*)
+ UNAME_RELEASE=`uname -v`
+ ;;
+ esac
+ # Japanese Language versions have a version number like `4.1.3-JL'.
+ echo sparcv8-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
+ exit 0 ;;
+ sun4*:SunOS:*:*)
+ case "`/usr/bin/arch -k`" in
+ Series*|S4*)
+ UNAME_RELEASE=`uname -v`
+ ;;
+ esac
+ # Japanese Language versions have a version number like `4.1.3-JL'.
+ echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
+ exit 0 ;;
+ sun3*:SunOS:*:*)
+ echo m68k-sun-sunos${UNAME_RELEASE}
+ exit 0 ;;
+ sun*:*:4.2BSD:*)
+ UNAME_RELEASE=`(head -1 /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
+ test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
+ case "`/bin/arch`" in
+ sun3)
+ echo m68k-sun-sunos${UNAME_RELEASE}
+ ;;
+ sun4)
+ echo sparc-sun-sunos${UNAME_RELEASE}
+ ;;
+ esac
+ exit 0 ;;
+ aushp:SunOS:*:*)
+ echo sparc-auspex-sunos${UNAME_RELEASE}
+ exit 0 ;;
+ atari*:NetBSD:*:*)
+ echo m68k-atari-netbsd${UNAME_RELEASE}
+ exit 0 ;;
+ atari*:OpenBSD:*:*)
+ echo m68k-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ # The situation for MiNT is a little confusing. The machine name
+ # can be virtually everything (everything which is not
+ # "atarist" or "atariste" at least should have a processor
+ # > m68000). The system name ranges from "MiNT" over "FreeMiNT"
+ # to the lowercase version "mint" (or "freemint"). Finally
+ # the system name "TOS" denotes a system which is actually not
+ # MiNT. But MiNT is downward compatible to TOS, so this should
+ # be no problem.
+ atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
+ echo m68k-atari-mint${UNAME_RELEASE}
+ exit 0 ;;
+ atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
+ echo m68k-atari-mint${UNAME_RELEASE}
+ exit 0 ;;
+ *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
+ echo m68k-atari-mint${UNAME_RELEASE}
+ exit 0 ;;
+ milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
+ echo m68k-milan-mint${UNAME_RELEASE}
+ exit 0 ;;
+ hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
+ echo m68k-hades-mint${UNAME_RELEASE}
+ exit 0 ;;
+ *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
+ echo m68k-unknown-mint${UNAME_RELEASE}
+ exit 0 ;;
+ sun3*:NetBSD:*:*)
+ echo m68k-sun-netbsd${UNAME_RELEASE}
+ exit 0 ;;
+ sun3*:OpenBSD:*:*)
+ echo m68k-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ mac68k:NetBSD:*:*)
+ echo m68k-apple-netbsd${UNAME_RELEASE}
+ exit 0 ;;
+ mac68k:OpenBSD:*:*)
+ echo m68k-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ macppc:NetBSD:*:*)
+ echo powerpc-apple-netbsd${UNAME_RELEASE}
+ exit 0 ;;
+ mvme68k:OpenBSD:*:*)
+ echo m68k-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ mvme88k:OpenBSD:*:*)
+ echo m88k-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ powerpc:machten:*:*)
+ echo powerpc-apple-machten${UNAME_RELEASE}
+ exit 0 ;;
+ RISC*:Mach:*:*)
+ echo mips-dec-mach_bsd4.3
+ exit 0 ;;
+ RISC*:ULTRIX:*:*)
+ echo mips-dec-ultrix${UNAME_RELEASE}
+ exit 0 ;;
+ VAX*:ULTRIX*:*:*)
+ echo vax-dec-ultrix${UNAME_RELEASE}
+ exit 0 ;;
+ 2020:CLIX:*:* | 2430:CLIX:*:*)
+ echo clipper-intergraph-clix${UNAME_RELEASE}
+ exit 0 ;;
+ mips:*:*:UMIPS | mips:*:*:RISCos)
+ sed 's/^ //' << EOF >$dummy.c
+#ifdef __cplusplus
+#include <stdio.h> /* for printf() prototype */
+ int main (int argc, char *argv[]) {
+#else
+ int main (argc, argv) int argc; char *argv[]; {
+#endif
+ #if defined (host_mips) && defined (MIPSEB)
+ #if defined (SYSTYPE_SYSV)
+ printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
+ #endif
+ #if defined (SYSTYPE_SVR4)
+ printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0);
+ #endif
+ #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
+ printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0);
+ #endif
+ #endif
+ exit (-1);
+ }
+EOF
+ $CC_FOR_BUILD $dummy.c -o $dummy \
+ && ./$dummy `echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` \
+ && rm $dummy.c $dummy && exit 0
+ rm -f $dummy.c $dummy
+ echo mips-mips-riscos${UNAME_RELEASE}
+ exit 0 ;;
+ Night_Hawk:Power_UNIX:*:*)
+ echo powerpc-harris-powerunix
+ exit 0 ;;
+ m88k:CX/UX:7*:*)
+ echo m88k-harris-cxux7
+ exit 0 ;;
+ m88k:*:4*:R4*)
+ echo m88k-motorola-sysv4
+ exit 0 ;;
+ m88k:*:3*:R3*)
+ echo m88k-motorola-sysv3
+ exit 0 ;;
+ AViiON:dgux:*:*)
+ # DG/UX returns AViiON for all architectures
+ UNAME_PROCESSOR=`/usr/bin/uname -p`
+ if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ]
+ then
+ if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \
+ [ ${TARGET_BINARY_INTERFACE}x = x ]
+ then
+ echo m88k-dg-dgux${UNAME_RELEASE}
+ else
+ echo m88k-dg-dguxbcs${UNAME_RELEASE}
+ fi
+ else
+ echo i586-dg-dgux${UNAME_RELEASE}
+ fi
+ exit 0 ;;
+ M88*:DolphinOS:*:*) # DolphinOS (SVR3)
+ echo m88k-dolphin-sysv3
+ exit 0 ;;
+ M88*:*:R3*:*)
+ # Delta 88k system running SVR3
+ echo m88k-motorola-sysv3
+ exit 0 ;;
+ XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
+ echo m88k-tektronix-sysv3
+ exit 0 ;;
+ Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
+ echo m68k-tektronix-bsd
+ exit 0 ;;
+ *:IRIX*:*:*)
+ echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
+ exit 0 ;;
+ ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
+ echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id
+ exit 0 ;; # Note that: echo "'`uname -s`'" gives 'AIX '
+ i?86:AIX:*:*)
+ echo i386-ibm-aix
+ exit 0 ;;
+ *:AIX:2:3)
+ if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
+ sed 's/^ //' << EOF >$dummy.c
+ #include <sys/systemcfg.h>
+
+ main()
+ {
+ if (!__power_pc())
+ exit(1);
+ puts("powerpc-ibm-aix3.2.5");
+ exit(0);
+ }
+EOF
+ $CC_FOR_BUILD $dummy.c -o $dummy && ./$dummy && rm $dummy.c $dummy && exit 0
+ rm -f $dummy.c $dummy
+ echo rs6000-ibm-aix3.2.5
+ elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
+ echo rs6000-ibm-aix3.2.4
+ else
+ echo rs6000-ibm-aix3.2
+ fi
+ exit 0 ;;
+ *:AIX:*:4)
+ sed 's/^ //' << EOF >$dummy.c
+ #include <stdio.h>
+ #include <sys/systemcfg.h>
+ main ()
+ {
+ if (_system_configuration.architecture == POWER_RS
+ || _system_configuration.implementation == POWER_601)
+ puts ("power");
+ else
+ {
+ if (_system_configuration.width == 64)
+ puts ("powerpc64");
+ else
+ puts ("powerpc");
+ }
+ exit (0);
+ }
+EOF
+ $CC_FOR_BUILD $dummy.c -o $dummy
+ IBM_ARCH=`./$dummy`
+ rm -f $dummy.c $dummy
+ if [ -x /usr/bin/oslevel ] ; then
+ IBM_REV=`/usr/bin/oslevel`
+ else
+ IBM_REV=4.${UNAME_RELEASE}
+ fi
+ echo ${IBM_ARCH}-ibm-aix${IBM_REV}
+ exit 0 ;;
+ *:AIX:*:*)
+ echo rs6000-ibm-aix
+ exit 0 ;;
+ ibmrt:4.4BSD:*|romp-ibm:BSD:*)
+ echo romp-ibm-bsd4.4
+ exit 0 ;;
+ ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and
+ echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to
+ exit 0 ;; # report: romp-ibm BSD 4.3
+ *:BOSX:*:*)
+ echo rs6000-bull-bosx
+ exit 0 ;;
+ DPX/2?00:B.O.S.:*:*)
+ echo m68k-bull-sysv3
+ exit 0 ;;
+ 9000/[34]??:4.3bsd:1.*:*)
+ echo m68k-hp-bsd
+ exit 0 ;;
+ hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
+ echo m68k-hp-bsd4.4
+ exit 0 ;;
+ 9000/[34678]??:HP-UX:*:*)
+ case "${UNAME_MACHINE}" in
+ 9000/31? ) HP_ARCH=m68000 ;;
+ 9000/[34]?? ) HP_ARCH=m68k ;;
+ 9000/[678][0-9][0-9])
+ sed 's/^ //' << EOF >$dummy.c
+
+ #define _HPUX_SOURCE
+ #include <stdlib.h>
+ #include <unistd.h>
+
+ int main ()
+ {
+ #if defined(_SC_KERNEL_BITS)
+ long bits = sysconf(_SC_KERNEL_BITS);
+ #endif
+ long cpu = sysconf (_SC_CPU_VERSION);
+
+ switch (cpu)
+ {
+ case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
+ case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
+ case CPU_PA_RISC2_0:
+ #if defined(_SC_KERNEL_BITS)
+ switch (bits)
+ {
+ case 64: puts ("hppa2.0w"); break;
+ case 32: puts ("hppa2.0n"); break;
+ default: puts ("hppa2.0"); break;
+ } break;
+ #else /* !defined(_SC_KERNEL_BITS) */
+ puts ("hppa2.0"); break;
+ #endif
+ default: puts ("hppa1.0"); break;
+ }
+ exit (0);
+ }
+EOF
+ (CCOPTS= $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null ) && HP_ARCH=`./$dummy`
+ rm -f $dummy.c $dummy
+ esac
+ HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
+ echo ${HP_ARCH}-hp-hpux${HPUX_REV}
+ exit 0 ;;
+ 3050*:HI-UX:*:*)
+ sed 's/^ //' << EOF >$dummy.c
+ #include <unistd.h>
+ int
+ main ()
+ {
+ long cpu = sysconf (_SC_CPU_VERSION);
+ /* The order matters, because CPU_IS_HP_MC68K erroneously returns
+ true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct
+ results, however. */
+ if (CPU_IS_PA_RISC (cpu))
+ {
+ switch (cpu)
+ {
+ case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
+ case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
+ case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
+ default: puts ("hppa-hitachi-hiuxwe2"); break;
+ }
+ }
+ else if (CPU_IS_HP_MC68K (cpu))
+ puts ("m68k-hitachi-hiuxwe2");
+ else puts ("unknown-hitachi-hiuxwe2");
+ exit (0);
+ }
+EOF
+ $CC_FOR_BUILD $dummy.c -o $dummy && ./$dummy && rm $dummy.c $dummy && exit 0
+ rm -f $dummy.c $dummy
+ echo unknown-hitachi-hiuxwe2
+ exit 0 ;;
+ 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
+ echo hppa1.1-hp-bsd
+ exit 0 ;;
+ 9000/8??:4.3bsd:*:*)
+ echo hppa1.0-hp-bsd
+ exit 0 ;;
+ *9??*:MPE/iX:*:*)
+ echo hppa1.0-hp-mpeix
+ exit 0 ;;
+ hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
+ echo hppa1.1-hp-osf
+ exit 0 ;;
+ hp8??:OSF1:*:*)
+ echo hppa1.0-hp-osf
+ exit 0 ;;
+ i?86:OSF1:*:*)
+ if [ -x /usr/sbin/sysversion ] ; then
+ echo ${UNAME_MACHINE}-unknown-osf1mk
+ else
+ echo ${UNAME_MACHINE}-unknown-osf1
+ fi
+ exit 0 ;;
+ parisc*:Lites*:*:*)
+ echo hppa1.1-hp-lites
+ exit 0 ;;
+ hppa*:OpenBSD:*:*)
+ echo hppa-unknown-openbsd
+ exit 0 ;;
+ C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
+ echo c1-convex-bsd
+ exit 0 ;;
+ C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
+ if getsysinfo -f scalar_acc
+ then echo c32-convex-bsd
+ else echo c2-convex-bsd
+ fi
+ exit 0 ;;
+ C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
+ echo c34-convex-bsd
+ exit 0 ;;
+ C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
+ echo c38-convex-bsd
+ exit 0 ;;
+ C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
+ echo c4-convex-bsd
+ exit 0 ;;
+ CRAY*X-MP:*:*:*)
+ echo xmp-cray-unicos
+ exit 0 ;;
+ CRAY*Y-MP:*:*:*)
+ echo ymp-cray-unicos${UNAME_RELEASE}
+ exit 0 ;;
+ CRAY*[A-Z]90:*:*:*)
+ echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
+ | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
+ -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/
+ exit 0 ;;
+ CRAY*TS:*:*:*)
+ echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+ exit 0 ;;
+ CRAY*T3D:*:*:*)
+ echo alpha-cray-unicos
+ exit 0 ;;
+ CRAY*T3E:*:*:*)
+ echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+ exit 0 ;;
+ CRAY*SV1:*:*:*)
+ echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+ exit 0 ;;
+ CRAY-2:*:*:*)
+ echo cray2-cray-unicos
+ exit 0 ;;
+ F300:UNIX_System_V:*:*)
+ FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+ FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
+ echo "f300-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+ exit 0 ;;
+ F301:UNIX_System_V:*:*)
+ echo f301-fujitsu-uxpv`echo $UNAME_RELEASE | sed 's/ .*//'`
+ exit 0 ;;
+ hp3[0-9][05]:NetBSD:*:*)
+ echo m68k-hp-netbsd${UNAME_RELEASE}
+ exit 0 ;;
+ hp300:OpenBSD:*:*)
+ echo m68k-unknown-openbsd${UNAME_RELEASE}
+ exit 0 ;;
+ i?86:BSD/386:*:* | i?86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
+ echo ${X86CPU}-pc-bsdi${UNAME_RELEASE}
+ exit 0 ;;
+ sparc*:BSD/OS:*:*)
+ echo sparc-unknown-bsdi${UNAME_RELEASE}
+ exit 0 ;;
+ *:BSD/OS:*:*)
+ echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
+ exit 0 ;;
+ i386:FreeBSD:*:*)
+ echo ${X86CPU}-pc-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
+ exit 0 ;;
+ *:FreeBSD:*:*)
+ echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
+ exit 0 ;;
+ i386:NetBSD:*:*)
+ echo ${X86CPU}-pc-netbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+ exit 0 ;;
+ *:NetBSD:*:*)
+ echo ${UNAME_MACHINE}-unknown-netbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+ exit 0 ;;
+ i386:OpenBSD:*:*)
+ echo ${X86CPU}-pc-openbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+ exit 0 ;;
+ *:OpenBSD:*:*)
+ echo ${UNAME_MACHINE}-unknown-openbsd`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+ exit 0 ;;
+ i*:CYGWIN*:*)
+ echo ${X86CPU}-pc-cygwin
+ exit 0 ;;
+ i*:MINGW*:*)
+ echo ${UNAME_MACHINE}-pc-mingw32
+ exit 0 ;;
+ i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
+ # How do we know it's Interix rather than the generic POSIX subsystem?
+ # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
+	# change UNAME_MACHINE based on the output of uname instead of i386?
+ echo i386-pc-interix
+ exit 0 ;;
+ i*:UWIN*:*)
+ echo ${UNAME_MACHINE}-pc-uwin
+ exit 0 ;;
+ p*:CYGWIN*:*)
+ echo powerpcle-unknown-cygwin
+ exit 0 ;;
+ prep*:SunOS:5.*:*)
+ echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+ exit 0 ;;
+ *:GNU:*:*)
+ echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
+ exit 0 ;;
+ *:Linux:*:*)
+
+ # The BFD linker knows what the default object file format is, so
+ # first see if it will tell us. cd to the root directory to prevent
+ # problems with other programs or directories called `ld' in the path.
+ ld_help_string=`cd /; ld --help 2>&1`
+ ld_supported_emulations=`echo $ld_help_string \
+ | sed -ne '/supported emulations:/!d
+ s/[ ][ ]*/ /g
+ s/.*supported emulations: *//
+ s/ .*//
+ p'`
+ case "$ld_supported_emulations" in
+ *ia64)
+ echo "${UNAME_MACHINE}-unknown-linux"
+ exit 0
+ ;;
+ i?86linux)
+ echo "${X86CPU}-pc-linux-gnuaout"
+ exit 0
+ ;;
+ i?86coff)
+ echo "${X86CPU}-pc-linux-gnucoff"
+ exit 0
+ ;;
+ sparclinux)
+ echo "${UNAME_MACHINE}-unknown-linux-gnuaout"
+ exit 0
+ ;;
+ armlinux)
+ echo "${UNAME_MACHINE}-unknown-linux-gnuaout"
+ exit 0
+ ;;
+ elf32arm*)
+ echo "${UNAME_MACHINE}-unknown-linux-gnuoldld"
+ exit 0
+ ;;
+ armelf_linux*)
+ echo "${UNAME_MACHINE}-unknown-linux-gnu"
+ exit 0
+ ;;
+ m68klinux)
+ echo "${UNAME_MACHINE}-unknown-linux-gnuaout"
+ exit 0
+ ;;
+ elf32ppc | elf32ppclinux)
+ # Determine Lib Version
+ cat >$dummy.c <<EOF
+#include <features.h>
+#if defined(__GLIBC__)
+extern char __libc_version[];
+extern char __libc_release[];
+#endif
+main(argc, argv)
+ int argc;
+ char *argv[];
+{
+#if defined(__GLIBC__)
+ printf("%s %s\n", __libc_version, __libc_release);
+#else
+ printf("unkown\n");
+#endif
+ return 0;
+}
+EOF
+ LIBC=""
+ $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null
+ if test "$?" = 0 ; then
+ ./$dummy | grep 1\.99 > /dev/null
+ if test "$?" = 0 ; then
+ LIBC="libc1"
+ fi
+ fi
+ rm -f $dummy.c $dummy
+ echo powerpc-unknown-linux-gnu${LIBC}
+ exit 0
+ ;;
+ esac
+
+ if test "${UNAME_MACHINE}" = "alpha" ; then
+ cat <<EOF >$dummy.s
+ .data
+ \$Lformat:
+ .byte 37,100,45,37,120,10,0 # "%d-%x\n"
+
+ .text
+ .globl main
+ .align 4
+ .ent main
+ main:
+ .frame \$30,16,\$26,0
+ ldgp \$29,0(\$27)
+ .prologue 1
+ .long 0x47e03d80 # implver \$0
+ lda \$2,-1
+ .long 0x47e20c21 # amask \$2,\$1
+ lda \$16,\$Lformat
+ mov \$0,\$17
+ not \$1,\$18
+ jsr \$26,printf
+ ldgp \$29,0(\$26)
+ mov 0,\$16
+ jsr \$26,exit
+ .end main
+EOF
+ LIBC=""
+ $CC_FOR_BUILD $dummy.s -o $dummy 2>/dev/null
+ if test "$?" = 0 ; then
+ case `./$dummy` in
+ 0-0)
+ UNAME_MACHINE="alpha"
+ ;;
+ 1-0)
+ UNAME_MACHINE="alphaev5"
+ ;;
+ 1-1)
+ UNAME_MACHINE="alphaev56"
+ ;;
+ 1-101)
+ UNAME_MACHINE="alphapca56"
+ ;;
+ 2-303)
+ UNAME_MACHINE="alphaev6"
+ ;;
+ 2-307)
+ UNAME_MACHINE="alphaev67"
+ ;;
+ esac
+
+ objdump --private-headers $dummy | \
+ grep ld.so.1 > /dev/null
+ if test "$?" = 0 ; then
+ LIBC="libc1"
+ fi
+ fi
+ rm -f $dummy.s $dummy
+ echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} ; exit 0
+ elif test "${UNAME_MACHINE}" = "mips" ; then
+ cat >$dummy.c <<EOF
+#ifdef __cplusplus
+#include <stdio.h> /* for printf() prototype */
+ int main (int argc, char *argv[]) {
+#else
+ int main (argc, argv) int argc; char *argv[]; {
+#endif
+#ifdef __MIPSEB__
+ printf ("%s-unknown-linux-gnu\n", argv[1]);
+#endif
+#ifdef __MIPSEL__
+ printf ("%sel-unknown-linux-gnu\n", argv[1]);
+#endif
+ return 0;
+}
+EOF
+ $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null && ./$dummy "${UNAME_MACHINE}" && rm $dummy.c $dummy && exit 0
+ rm -f $dummy.c $dummy
+ elif test "${UNAME_MACHINE}" = "s390"; then
+ echo s390-ibm-linux && exit 0
+ else
+ # Either a pre-BFD a.out linker (linux-gnuoldld)
+ # or one that does not give us useful --help.
+ # GCC wants to distinguish between linux-gnuoldld and linux-gnuaout.
+ # If ld does not provide *any* "supported emulations:"
+ # that means it is gnuoldld.
+ echo "$ld_help_string" | grep >/dev/null 2>&1 "supported emulations:"
+ test $? != 0 && echo "${X86CPU}-pc-linux-gnuoldld" && exit 0
+
+ case "${UNAME_MACHINE}" in
+ i?86)
+ VENDOR=pc;
+ UNAME_MACHINE=${X86CPU}
+ ;;
+ *)
+ VENDOR=unknown;
+ ;;
+ esac
+ # Determine whether the default compiler is a.out or elf
+ cat >$dummy.c <<EOF
+#include <features.h>
+#ifdef __cplusplus
+#include <stdio.h> /* for printf() prototype */
+ int main (int argc, char *argv[]) {
+#else
+ int main (argc, argv) int argc; char *argv[]; {
+#endif
+#ifdef __ELF__
+# ifdef __GLIBC__
+# if __GLIBC__ >= 2
+ printf ("%s-${VENDOR}-linux-gnu\n", argv[1]);
+# else
+ printf ("%s-${VENDOR}-linux-gnulibc1\n", argv[1]);
+# endif
+# else
+ printf ("%s-${VENDOR}-linux-gnulibc1\n", argv[1]);
+# endif
+#else
+ printf ("%s-${VENDOR}-linux-gnuaout\n", argv[1]);
+#endif
+ return 0;
+}
+EOF
+ $CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null && ./$dummy "${UNAME_MACHINE}" && rm $dummy.c $dummy && exit 0
+ rm -f $dummy.c $dummy
+ fi ;;
+# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. earlier versions
+# are messed up and put the nodename in both sysname and nodename.
+ i?86:DYNIX/ptx:4*:*)
+ echo i386-sequent-sysv4
+ exit 0 ;;
+ i?86:UNIX_SV:4.2MP:2.*)
+ # Unixware is an offshoot of SVR4, but it has its own version
+ # number series starting with 2...
+ # I am not positive that other SVR4 systems won't match this,
+ # I just have to hope. -- rms.
+ # Use sysv4.2uw... so that sysv4* matches it.
+ echo ${X86CPU}-pc-sysv4.2uw${UNAME_VERSION}
+ exit 0 ;;
+ i?86:*:4.*:* | i?86:SYSTEM_V:4.*:*)
+ UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
+ if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
+ echo ${X86CPU}-univel-sysv${UNAME_REL}
+ else
+ echo ${X86CPU}-pc-sysv${UNAME_REL}
+ fi
+ exit 0 ;;
+ i?86:*:5:7*)
+ # Fixed at (any) Pentium or better
+ UNAME_MACHINE=i586
+ if [ ${UNAME_SYSTEM} = "UnixWare" ] ; then
+ echo ${X86CPU}-sco-sysv${UNAME_RELEASE}uw${UNAME_VERSION}
+ else
+ echo ${X86CPU}-pc-sysv${UNAME_RELEASE}
+ fi
+ exit 0 ;;
+ i?86:*:3.2:*)
+ if test -f /usr/options/cb.name; then
+ UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
+ echo ${X86CPU}-pc-isc$UNAME_REL
+ elif /bin/uname -X 2>/dev/null >/dev/null ; then
+ UNAME_REL=`(/bin/uname -X|egrep Release|sed -e 's/.*= //')`
+ (/bin/uname -X|egrep i80486 >/dev/null) && UNAME_MACHINE=i486
+ (/bin/uname -X|egrep '^Machine.*Pentium' >/dev/null) \
+ && UNAME_MACHINE=i586
+ (/bin/uname -X|egrep '^Machine.*Pent ?II' >/dev/null) \
+ && UNAME_MACHINE=i686
+ (/bin/uname -X|egrep '^Machine.*Pentium Pro' >/dev/null) \
+ && UNAME_MACHINE=i686
+ echo ${X86CPU}-pc-sco$UNAME_REL
+ else
+ echo ${X86CPU}-pc-sysv32
+ fi
+ exit 0 ;;
+ i?86:*DOS:*:*)
+ echo ${X86CPU}-pc-msdosdjgpp
+ exit 0 ;;
+ pc:*:*:*)
+ # Left here for compatibility:
+	# uname -m always prints 'pc' for DJGPP, but it prints nothing about
+ # the processor, so we play safe by assuming i386.
+ echo i386-pc-msdosdjgpp
+ exit 0 ;;
+ Intel:Mach:3*:*)
+ echo i386-pc-mach3
+ exit 0 ;;
+ paragon:*:*:*)
+ echo i860-intel-osf1
+ exit 0 ;;
+ i860:*:4.*:*) # i860-SVR4
+ if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
+ echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
+ else # Add other i860-SVR4 vendors below as they are discovered.
+ echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4
+ fi
+ exit 0 ;;
+ mini*:CTIX:SYS*5:*)
+ # "miniframe"
+ echo m68010-convergent-sysv
+ exit 0 ;;
+ M68*:*:R3V[567]*:*)
+ test -r /sysV68 && echo 'm68k-motorola-sysv' && exit 0 ;;
+ 3[34]??:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 4850:*:4.0:3.0)
+ OS_REL=''
+ test -r /etc/.relid \
+ && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
+ /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+ && echo i486-ncr-sysv4.3${OS_REL} && exit 0
+ /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
+ && echo i586-ncr-sysv4.3${OS_REL} && exit 0 ;;
+ 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
+ /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+ && echo i486-ncr-sysv4 && exit 0 ;;
+ m68*:LynxOS:2.*:*)
+ echo m68k-unknown-lynxos${UNAME_RELEASE}
+ exit 0 ;;
+ mc68030:UNIX_System_V:4.*:*)
+ echo m68k-atari-sysv4
+ exit 0 ;;
+ i?86:LynxOS:2.*:* | i?86:LynxOS:3.[01]*:*)
+ echo i386-unknown-lynxos${UNAME_RELEASE}
+ exit 0 ;;
+ TSUNAMI:LynxOS:2.*:*)
+ echo sparc-unknown-lynxos${UNAME_RELEASE}
+ exit 0 ;;
+ rs6000:LynxOS:2.*:* | PowerPC:LynxOS:2.*:*)
+ echo rs6000-unknown-lynxos${UNAME_RELEASE}
+ exit 0 ;;
+ SM[BE]S:UNIX_SV:*:*)
+ echo mips-dde-sysv${UNAME_RELEASE}
+ exit 0 ;;
+ RM*:ReliantUNIX-*:*:*)
+ echo mips-sni-sysv4
+ exit 0 ;;
+ RM*:SINIX-*:*:*)
+ echo mips-sni-sysv4
+ exit 0 ;;
+ *:SINIX-*:*:*)
+ if uname -p 2>/dev/null >/dev/null ; then
+ UNAME_MACHINE=`(uname -p) 2>/dev/null`
+ echo ${UNAME_MACHINE}-sni-sysv4
+ else
+ echo ns32k-sni-sysv
+ fi
+ exit 0 ;;
+ PENTIUM:CPunix:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
+ # says <Richard.M.Bartel@ccMail.Census.GOV>
+ echo i586-unisys-sysv4
+ exit 0 ;;
+ *:UNIX_System_V:4*:FTX*)
+ # From Gerald Hewes <hewes@openmarket.com>.
+ # How about differentiating between stratus architectures? -djm
+ echo hppa1.1-stratus-sysv4
+ exit 0 ;;
+ *:*:*:FTX*)
+ # From seanf@swdc.stratus.com.
+ echo i860-stratus-sysv4
+ exit 0 ;;
+ mc68*:A/UX:*:*)
+ echo m68k-apple-aux${UNAME_RELEASE}
+ exit 0 ;;
+ news*:NEWS-OS:*:6*)
+ echo mips-sony-newsos6
+ exit 0 ;;
+ R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
+ if [ -d /usr/nec ]; then
+ echo mips-nec-sysv${UNAME_RELEASE}
+ else
+ echo mips-unknown-sysv${UNAME_RELEASE}
+ fi
+ exit 0 ;;
+ BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only.
+ echo powerpc-be-beos
+ exit 0 ;;
+ BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only.
+ echo powerpc-apple-beos
+ exit 0 ;;
+ BePC:BeOS:*:*) # BeOS running on Intel PC compatible.
+ echo i586-pc-beos
+ exit 0 ;;
+ SX-4:SUPER-UX:*:*)
+ echo sx4-nec-superux${UNAME_RELEASE}
+ exit 0 ;;
+ SX-5:SUPER-UX:*:*)
+ echo sx5-nec-superux${UNAME_RELEASE}
+ exit 0 ;;
+ Power*:Rhapsody:*:*)
+ echo powerpc-apple-rhapsody${UNAME_RELEASE}
+ exit 0 ;;
+ *:Rhapsody:*:*)
+ echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE}
+ exit 0 ;;
+ Power*:Mac*OS:*:*)
+ echo powerpc-apple-macos${UNAME_RELEASE}
+ exit 0 ;;
+ *:Mac*OS:*:*)
+ echo ${UNAME_MACHINE}-apple-macos${UNAME_RELEASE}
+ exit 0 ;;
+ *:Darwin:*:*)
+ echo `uname -p`-apple-darwin${UNAME_RELEASE}
+ exit 0 ;;
+ *:procnto*:*:* | *:QNX:[0123456789]*:*)
+ if test "${UNAME_MACHINE}" = "x86pc"; then
+ UNAME_MACHINE=pc
+ fi
+ echo `uname -p`-${UNAME_MACHINE}-nto-qnx
+ exit 0 ;;
+ *:QNX:*:4*)
+ echo i386-pc-qnx
+ exit 0 ;;
+ NSR-W:NONSTOP_KERNEL:*:*)
+ echo nsr-tandem-nsk${UNAME_RELEASE}
+ exit 0 ;;
+ BS2000:POSIX*:*:*)
+ echo bs2000-siemens-sysv
+ exit 0 ;;
+esac
+
+#echo '(No uname command or uname output not recognized.)' 1>&2
+#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2
+
+cat >$dummy.c <<EOF
+#ifdef _SEQUENT_
+# include <sys/types.h>
+# include <sys/utsname.h>
+#endif
+main ()
+{
+#if defined (sony)
+#if defined (MIPSEB)
+ /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed,
+ I don't know.... */
+ printf ("mips-sony-bsd\n"); exit (0);
+#else
+#include <sys/param.h>
+ printf ("m68k-sony-newsos%s\n",
+#ifdef NEWSOS4
+ "4"
+#else
+ ""
+#endif
+ ); exit (0);
+#endif
+#endif
+
+#if defined (__arm) && defined (__acorn) && defined (__unix)
+ printf ("arm-acorn-riscix"); exit (0);
+#endif
+
+#if defined (hp300) && !defined (hpux)
+ printf ("m68k-hp-bsd\n"); exit (0);
+#endif
+
+#if defined (NeXT)
+#if !defined (__ARCHITECTURE__)
+#define __ARCHITECTURE__ "m68k"
+#endif
+ int version;
+ version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
+ if (version < 4)
+ printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
+ else
+ printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
+ exit (0);
+#endif
+
+#if defined (MULTIMAX) || defined (n16)
+#if defined (UMAXV)
+ printf ("ns32k-encore-sysv\n"); exit (0);
+#else
+#if defined (CMU)
+ printf ("ns32k-encore-mach\n"); exit (0);
+#else
+ printf ("ns32k-encore-bsd\n"); exit (0);
+#endif
+#endif
+#endif
+
+#if defined (__386BSD__)
+ printf ("i386-pc-bsd\n"); exit (0);
+#endif
+
+#if defined (sequent)
+#if defined (i386)
+ printf ("i386-sequent-dynix\n"); exit (0);
+#endif
+#if defined (ns32000)
+ printf ("ns32k-sequent-dynix\n"); exit (0);
+#endif
+#endif
+
+#if defined (_SEQUENT_)
+ struct utsname un;
+
+ uname(&un);
+
+ if (strncmp(un.version, "V2", 2) == 0) {
+ printf ("i386-sequent-ptx2\n"); exit (0);
+ }
+ if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
+ printf ("i386-sequent-ptx1\n"); exit (0);
+ }
+ printf ("i386-sequent-ptx\n"); exit (0);
+
+#endif
+
+#if defined (vax)
+#if !defined (ultrix)
+ printf ("vax-dec-bsd\n"); exit (0);
+#else
+ printf ("vax-dec-ultrix\n"); exit (0);
+#endif
+#endif
+
+#if defined (alliant) && defined (i860)
+ printf ("i860-alliant-bsd\n"); exit (0);
+#endif
+
+ exit (1);
+}
+EOF
+
+$CC_FOR_BUILD $dummy.c -o $dummy 2>/dev/null && ./$dummy && rm $dummy.c $dummy && exit 0
+rm -f $dummy.c $dummy
+
+# Apollos put the system type in the environment.
+
+test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit 0; }
+
+# Convex versions that predate uname can use getsysinfo(1)
+
+if [ -x /usr/convex/getsysinfo ]
+then
+ case `getsysinfo -f cpu_type` in
+ c1*)
+ echo c1-convex-bsd
+ exit 0 ;;
+ c2*)
+ if getsysinfo -f scalar_acc
+ then echo c32-convex-bsd
+ else echo c2-convex-bsd
+ fi
+ exit 0 ;;
+ c34*)
+ echo c34-convex-bsd
+ exit 0 ;;
+ c38*)
+ echo c38-convex-bsd
+ exit 0 ;;
+ c4*)
+ echo c4-convex-bsd
+ exit 0 ;;
+ esac
+fi
+
+#echo '(Unable to guess system type)' 1>&2
+
+exit 1
diff --git a/rts/gmp/config.in b/rts/gmp/config.in
new file mode 100644
index 0000000000..8b2546ef16
--- /dev/null
+++ b/rts/gmp/config.in
@@ -0,0 +1,162 @@
+/* config.in. Generated automatically from configure.in by autoheader. */
+/*
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+
+/* Define if a limb is long long. */
+#undef _LONG_LONG_LIMB
+
+/* Define if we have native implementation of function. */
+#undef HAVE_NATIVE_
+#undef HAVE_NATIVE_mpn_add
+#undef HAVE_NATIVE_mpn_add_1
+#undef HAVE_NATIVE_mpn_add_n
+#undef HAVE_NATIVE_mpn_add_nc
+#undef HAVE_NATIVE_mpn_addmul_1
+#undef HAVE_NATIVE_mpn_addmul_1c
+#undef HAVE_NATIVE_mpn_addsub_n
+#undef HAVE_NATIVE_mpn_addsub_nc
+#undef HAVE_NATIVE_mpn_and_n
+#undef HAVE_NATIVE_mpn_andn_n
+#undef HAVE_NATIVE_mpn_bdivmod
+#undef HAVE_NATIVE_mpn_cmp
+#undef HAVE_NATIVE_mpn_com_n
+#undef HAVE_NATIVE_mpn_copyd
+#undef HAVE_NATIVE_mpn_copyi
+#undef HAVE_NATIVE_mpn_divexact_by3c
+#undef HAVE_NATIVE_mpn_divrem
+#undef HAVE_NATIVE_mpn_divrem_1
+#undef HAVE_NATIVE_mpn_divrem_1c
+#undef HAVE_NATIVE_mpn_divrem_2
+#undef HAVE_NATIVE_mpn_divrem_newton
+#undef HAVE_NATIVE_mpn_divrem_classic
+#undef HAVE_NATIVE_mpn_dump
+#undef HAVE_NATIVE_mpn_gcd
+#undef HAVE_NATIVE_mpn_gcd_1
+#undef HAVE_NATIVE_mpn_gcdext
+#undef HAVE_NATIVE_mpn_get_str
+#undef HAVE_NATIVE_mpn_hamdist
+#undef HAVE_NATIVE_mpn_invert_limb
+#undef HAVE_NATIVE_mpn_ior_n
+#undef HAVE_NATIVE_mpn_iorn_n
+#undef HAVE_NATIVE_mpn_lshift
+#undef HAVE_NATIVE_mpn_mod_1
+#undef HAVE_NATIVE_mpn_mod_1c
+#undef HAVE_NATIVE_mpn_mul
+#undef HAVE_NATIVE_mpn_mul_1
+#undef HAVE_NATIVE_mpn_mul_1c
+#undef HAVE_NATIVE_mpn_mul_basecase
+#undef HAVE_NATIVE_mpn_mul_n
+#undef HAVE_NATIVE_mpn_nand_n
+#undef HAVE_NATIVE_mpn_nior_n
+#undef HAVE_NATIVE_mpn_perfect_square_p
+#undef HAVE_NATIVE_mpn_popcount
+#undef HAVE_NATIVE_mpn_preinv_mod_1
+#undef HAVE_NATIVE_mpn_random2
+#undef HAVE_NATIVE_mpn_random
+#undef HAVE_NATIVE_mpn_rawrandom
+#undef HAVE_NATIVE_mpn_rshift
+#undef HAVE_NATIVE_mpn_scan0
+#undef HAVE_NATIVE_mpn_scan1
+#undef HAVE_NATIVE_mpn_set_str
+#undef HAVE_NATIVE_mpn_sqrtrem
+#undef HAVE_NATIVE_mpn_sqr_basecase
+#undef HAVE_NATIVE_mpn_sub
+#undef HAVE_NATIVE_mpn_sub_1
+#undef HAVE_NATIVE_mpn_sub_n
+#undef HAVE_NATIVE_mpn_sub_nc
+#undef HAVE_NATIVE_mpn_submul_1
+#undef HAVE_NATIVE_mpn_submul_1c
+#undef HAVE_NATIVE_mpn_udiv_w_sdiv
+#undef HAVE_NATIVE_mpn_umul_ppmm
+#undef HAVE_NATIVE_mpn_udiv_qrnnd
+#undef HAVE_NATIVE_mpn_xor_n
+#undef HAVE_NATIVE_mpn_xnor_n
+
+/* Define to 1 if you have the declaration of `optarg', and to 0 if you don't.
+ */
+#undef HAVE_DECL_OPTARG
+
+/* ./configure --enable-assert option, to enable some ASSERT()s */
+#undef WANT_ASSERT
+
+/* Define if you have the <sys/sysctl.h> header file. */
+#undef HAVE_SYS_SYSCTL_H
+
+/* Define if you have the `strtoul' function. */
+#undef HAVE_STRTOUL
+
+/* Name of package */
+#undef PACKAGE
+
+/* Define if you have the `sysctlbyname' function. */
+#undef HAVE_SYSCTLBYNAME
+
+/* Define if the system has the type `void'. */
+#undef HAVE_VOID
+
+/* Define if you have the `popen' function. */
+#undef HAVE_POPEN
+
+/* ./configure --disable-alloca option, to use stack-alloc.c, not alloca */
+#undef USE_STACK_ALLOC
+
+/* Define if cpp supports the ANSI # stringizing operator. */
+#undef HAVE_STRINGIZE
+
+/* Define if you have the <sys/time.h> header file. */
+#undef HAVE_SYS_TIME_H
+
+/* Define if you have the `sysconf' function. */
+#undef HAVE_SYSCONF
+
+/* Define if you have the `getpagesize' function. */
+#undef HAVE_GETPAGESIZE
+
+/* Define if you have the `processor_info' function. */
+#undef HAVE_PROCESSOR_INFO
+
+/* Version number of package */
+#undef VERSION
+
+/* Define if you have the `getopt_long' function. */
+#undef HAVE_GETOPT_LONG
+
+/* Define if you have the <getopt.h> header file. */
+#undef HAVE_GETOPT_H
+
+/* Define if you have the ANSI C header files. */
+#undef STDC_HEADERS
+
+/* Define if a speed_cyclecounter exists (for the tune programs) */
+#undef HAVE_SPEED_CYCLECOUNTER
+
+/* Define if mpn/tests has calling conventions checking for the CPU */
+#undef HAVE_CALLING_CONVENTIONS
+
+/* ./configure --enable-fft option, to enable FFTs for multiplication */
+#undef WANT_FFT
+
+/* Define if you have the <string.h> header file. */
+#undef HAVE_STRING_H
+
+/* Define if you have the <unistd.h> header file. */
+#undef HAVE_UNISTD_H
diff --git a/rts/gmp/config.sub b/rts/gmp/config.sub
new file mode 100644
index 0000000000..c4123f28ff
--- /dev/null
+++ b/rts/gmp/config.sub
@@ -0,0 +1,1273 @@
+#! /bin/sh
+# Configuration validation subroutine script, version 1.1.
+# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000
+# Free Software Foundation, Inc.
+#
+# This file is (in principle) common to ALL GNU software.
+# The presence of a machine in this file suggests that SOME GNU software
+# can handle that machine. It does not imply ALL GNU software can.
+#
+# This file is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Written by Per Bothner <bothner@cygnus.com>.
+# Please send patches to <config-patches@gnu.org>.
+#
+# Configuration subroutine to validate and canonicalize a configuration type.
+# Supply the specified configuration type as an argument.
+# If it is invalid, we print an error message on stderr and exit with code 1.
+# Otherwise, we print the canonical config type on stdout and succeed.
+
+# This file is supposed to be the same for all GNU packages
+# and recognize all the CPU types, system types and aliases
+# that are meaningful with *any* GNU software.
+# Each package is responsible for reporting which valid configurations
+# it does not support. The user should be able to distinguish
+# a failure to support a valid configuration from a meaningless
+# configuration.
+
+# The goal of this file is to map all the various variations of a given
+# machine specification into a single specification in the form:
+# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM
+# or in some cases, the newer four-part form:
+# CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
+# It is wrong to echo any other type of specification.
+
+if [ x$1 = x ]
+then
+ echo Configuration name missing. 1>&2
+ echo "Usage: $0 CPU-MFR-OPSYS" 1>&2
+ echo "or $0 ALIAS" 1>&2
+ echo where ALIAS is a recognized configuration type. 1>&2
+ exit 1
+fi
+
+# First pass through any local machine types.
+case $1 in
+ *local*)
+ echo $1
+ exit 0
+ ;;
+ *)
+ ;;
+esac
+
+# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any).
+# Here we must recognize all the valid KERNEL-OS combinations.
+maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
+case $maybe_os in
+ nto-qnx* | linux-gnu*)
+ os=-$maybe_os
+ basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
+ ;;
+ *)
+ basic_machine=`echo $1 | sed 's/-[^-]*$//'`
+ if [ $basic_machine != $1 ]
+ then os=`echo $1 | sed 's/.*-/-/'`
+ else os=; fi
+ ;;
+esac
+
+### Let's recognize common machines as not being operating systems so
+### that things like config.sub decstation-3100 work. We also
+### recognize some manufacturers as not being operating systems, so we
+### can provide default operating systems below.
+case $os in
+ -sun*os*)
+ # Prevent following clause from handling this invalid input.
+ ;;
+ -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \
+ -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \
+ -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \
+ -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
+ -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
+ -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
+ -apple)
+ os=
+ basic_machine=$1
+ ;;
+ -sim | -cisco | -oki | -wec | -winbond)
+ os=
+ basic_machine=$1
+ ;;
+ -scout)
+ ;;
+ -wrs)
+ os=-vxworks
+ basic_machine=$1
+ ;;
+ -hiux*)
+ os=-hiuxwe2
+ ;;
+ -sco5)
+ os=-sco3.2v5
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
+ -sco4)
+ os=-sco3.2v4
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
+ -sco3.2.[4-9]*)
+ os=`echo $os | sed -e 's/sco3.2./sco3.2v/'`
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
+ -sco3.2v[4-9]*)
+ # Don't forget version if it is 3.2v4 or newer.
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
+ -sco*)
+ os=-sco3.2v2
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
+ -udk*)
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
+ -isc)
+ os=-isc2.2
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
+ -clix*)
+ basic_machine=clipper-intergraph
+ ;;
+ -isc*)
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+ ;;
+ -lynx*)
+ os=-lynxos
+ ;;
+ -ptx*)
+ basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'`
+ ;;
+ -windowsnt*)
+ os=`echo $os | sed -e 's/windowsnt/winnt/'`
+ ;;
+ -psos*)
+ os=-psos
+ ;;
+ -mint | -mint[0-9]*)
+ basic_machine=m68k-atari
+ os=-mint
+ ;;
+esac
+
+# Decode aliases for certain CPU-COMPANY combinations.
+case $basic_machine in
+ # Recognize the basic CPU types without company name.
+ # Some are omitted here because they have special meanings below.
+ tahoe | i860 | ia64 | m32r | m68k | m68000 | m88k | ns32k | arc | arm \
+ | arme[lb] | pyramid | mn10200 | mn10300 | tron | a29k \
+ | 580 | i960 | h8300 \
+ | x86 | ppcbe | mipsbe | mipsle | shbe | shle | armbe | armle \
+ | hppa | hppa1.0 | hppa1.1 | hppa2.0 | hppa2.0w | hppa2.0n \
+ | alpha | alphaev[4-8] | alphaev56 | alphapca5[67] \
+ | alphaev6[78] \
+ | we32k | ns16k | clipper | i370 | sh | powerpc | powerpcle \
+ | 1750a | dsp16xx | pdp11 | mips16 | mips64 | mipsel | mips64el \
+ | mips64orion | mips64orionel | mipstx39 | mipstx39el \
+ | mips64vr4300 | mips64vr4300el | mips64vr4100 | mips64vr4100el \
+ | mips64vr5000 | miprs64vr5000el | mcore \
+ | sparc | sparclet | sparclite | sparc64 | sparcv9 | v850 | c4x \
+ | powerpc64 | sparcv8 | supersparc | microsparc | ultrasparc \
+ | thumb | d10v | fr30 | avr)
+ basic_machine=$basic_machine-unknown
+ ;;
+ m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | z8k | v70 | h8500 | w65 | pj | pjl)
+ ;;
+
+ # We use `pc' rather than `unknown'
+ # because (1) that's what they normally are, and
+ # (2) the word "unknown" tends to confuse beginning users.
+ i[34567]86 | pentium[23] | k[56] | k6[23] | athlon)
+ basic_machine=$basic_machine-pc
+ ;;
+ # Object if more than one company name word.
+ *-*-*)
+ echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
+ exit 1
+ ;;
+ # Recognize the basic CPU types with company name.
+ vax-* | tahoe-* | i[34567]86-* | pentium[23]-* | i860-* | ia64-* | m32r-* | m68k-* | m68000-* \
+ | m88k-* | sparc-* | ns32k-* | fx80-* | arc-* | arm-* | c[123]* \
+ | mips-* | pyramid-* | tron-* | a29k-* | romp-* | rs6000-* \
+ | power-* | none-* | 580-* | cray2-* | h8300-* | h8500-* | i960-* \
+ | xmp-* | ymp-* \
+ | x86-* | ppcbe-* | mipsbe-* | mipsle-* | shbe-* | shle-* | armbe-* | armle-* \
+ | hppa-* | hppa1.0-* | hppa1.1-* | hppa2.0-* | hppa2.0w-* | hppa2.0n-* \
+ | alpha-* | alphaev[4-8]-* | alphaev56-* | alphapca5[67]-* \
+ | alphaev6[78]-* \
+ | we32k-* | cydra-* | ns16k-* | pn-* | np1-* | xps100-* \
+ | clipper-* | orion-* \
+ | sparclite-* | pdp11-* | sh-* | powerpc-* | powerpcle-* \
+ | sparc64-* | sparcv9-* | sparc86x-* | mips16-* | mips64-* | mipsel-* \
+ | mips64el-* | mips64orion-* | mips64orionel-* \
+ | mips64vr4100-* | mips64vr4100el-* | mips64vr4300-* | mips64vr4300el-* \
+ | mipstx39-* | mipstx39el-* | mcore-* \
+ | f301-* | armv*-* | s390-* | sv1-* | t3e-* \
+ | m88110-* | m680[01234]0-* | m683?2-* | m68360-* | z8k-* | d10v-* \
+ | k[56]-* | k6[23]-* | athlon-* | powerpc64-* \
+ | sparcv8-* | supersparc-* | microsparc-* | ultrasparc-* \
+ | thumb-* | v850-* | d30v-* | tic30-* | c30-* | fr30-* )
+ ;;
+ # Recognize the various machine names and aliases which stand
+ # for a CPU type and a company and sometimes even an OS.
+ 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
+ basic_machine=m68000-att
+ ;;
+ 3b*)
+ basic_machine=we32k-att
+ ;;
+ a29khif)
+ basic_machine=a29k-amd
+ os=-udi
+ ;;
+ adobe68k)
+ basic_machine=m68010-adobe
+ os=-scout
+ ;;
+ alliant | fx80)
+ basic_machine=fx80-alliant
+ ;;
+ altos | altos3068)
+ basic_machine=m68k-altos
+ ;;
+ am29k)
+ basic_machine=a29k-none
+ os=-bsd
+ ;;
+ amdahl)
+ basic_machine=580-amdahl
+ os=-sysv
+ ;;
+ amiga | amiga-*)
+ basic_machine=m68k-cbm
+ ;;
+ amigaos | amigados)
+ basic_machine=m68k-cbm
+ os=-amigaos
+ ;;
+ amigaunix | amix)
+ basic_machine=m68k-cbm
+ os=-sysv4
+ ;;
+ apollo68)
+ basic_machine=m68k-apollo
+ os=-sysv
+ ;;
+ apollo68bsd)
+ basic_machine=m68k-apollo
+ os=-bsd
+ ;;
+ aux)
+ basic_machine=m68k-apple
+ os=-aux
+ ;;
+ balance)
+ basic_machine=ns32k-sequent
+ os=-dynix
+ ;;
+ convex-c1)
+ basic_machine=c1-convex
+ os=-bsd
+ ;;
+ convex-c2)
+ basic_machine=c2-convex
+ os=-bsd
+ ;;
+ convex-c32)
+ basic_machine=c32-convex
+ os=-bsd
+ ;;
+ convex-c34)
+ basic_machine=c34-convex
+ os=-bsd
+ ;;
+ convex-c38)
+ basic_machine=c38-convex
+ os=-bsd
+ ;;
+ cray | ymp)
+ basic_machine=ymp-cray
+ os=-unicos
+ ;;
+ cray2)
+ basic_machine=cray2-cray
+ os=-unicos
+ ;;
+ [ctj]90-cray)
+ basic_machine=c90-cray
+ os=-unicos
+ ;;
+ crds | unos)
+ basic_machine=m68k-crds
+ ;;
+ da30 | da30-*)
+ basic_machine=m68k-da30
+ ;;
+ decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn)
+ basic_machine=mips-dec
+ ;;
+ delta | 3300 | motorola-3300 | motorola-delta \
+ | 3300-motorola | delta-motorola)
+ basic_machine=m68k-motorola
+ ;;
+ delta88)
+ basic_machine=m88k-motorola
+ os=-sysv3
+ ;;
+ dpx20 | dpx20-*)
+ basic_machine=rs6000-bull
+ os=-bosx
+ ;;
+ dpx2* | dpx2*-bull)
+ basic_machine=m68k-bull
+ os=-sysv3
+ ;;
+ ebmon29k)
+ basic_machine=a29k-amd
+ os=-ebmon
+ ;;
+ elxsi)
+ basic_machine=elxsi-elxsi
+ os=-bsd
+ ;;
+ encore | umax | mmax)
+ basic_machine=ns32k-encore
+ ;;
+ es1800 | OSE68k | ose68k | ose | OSE)
+ basic_machine=m68k-ericsson
+ os=-ose
+ ;;
+ fx2800)
+ basic_machine=i860-alliant
+ ;;
+ genix)
+ basic_machine=ns32k-ns
+ ;;
+ gmicro)
+ basic_machine=tron-gmicro
+ os=-sysv
+ ;;
+ h3050r* | hiux*)
+ basic_machine=hppa1.1-hitachi
+ os=-hiuxwe2
+ ;;
+ h8300hms)
+ basic_machine=h8300-hitachi
+ os=-hms
+ ;;
+ h8300xray)
+ basic_machine=h8300-hitachi
+ os=-xray
+ ;;
+ h8500hms)
+ basic_machine=h8500-hitachi
+ os=-hms
+ ;;
+ harris)
+ basic_machine=m88k-harris
+ os=-sysv3
+ ;;
+ hp300-*)
+ basic_machine=m68k-hp
+ ;;
+ hp300bsd)
+ basic_machine=m68k-hp
+ os=-bsd
+ ;;
+ hp300hpux)
+ basic_machine=m68k-hp
+ os=-hpux
+ ;;
+ hp3k9[0-9][0-9] | hp9[0-9][0-9])
+ basic_machine=hppa1.0-hp
+ ;;
+ hp9k2[0-9][0-9] | hp9k31[0-9])
+ basic_machine=m68000-hp
+ ;;
+ hp9k3[2-9][0-9])
+ basic_machine=m68k-hp
+ ;;
+ hp9k6[0-9][0-9] | hp6[0-9][0-9])
+ basic_machine=hppa1.0-hp
+ ;;
+ hp9k7[0-79][0-9] | hp7[0-79][0-9])
+ basic_machine=hppa1.1-hp
+ ;;
+ hp9k78[0-9] | hp78[0-9])
+ basic_machine=hppa2.0-hp
+ ;;
+ hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893)
+ basic_machine=hppa2.0-hp
+ ;;
+ hp9k8[0-9][13679] | hp8[0-9][13679])
+ basic_machine=hppa1.1-hp
+ ;;
+ hp9k8[0-9][0-9] | hp8[0-9][0-9])
+ basic_machine=hppa1.0-hp
+ ;;
+ hppa-next)
+ os=-nextstep3
+ ;;
+ hppaosf)
+ basic_machine=hppa1.1-hp
+ os=-osf
+ ;;
+ hppro)
+ basic_machine=hppa1.1-hp
+ os=-proelf
+ ;;
+ i370-ibm* | ibm*)
+ basic_machine=i370-ibm
+ ;;
+# I'm not sure what "Sysv32" means. Should this be sysv3.2?
+ i[34567]86v32)
+ basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+ os=-sysv32
+ ;;
+ i[34567]86v4*)
+ basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+ os=-sysv4
+ ;;
+ i[34567]86v)
+ basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+ os=-sysv
+ ;;
+ i[34567]86sol2)
+ basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+ os=-solaris2
+ ;;
+ i386mach)
+ basic_machine=i386-mach
+ os=-mach
+ ;;
+ i386-vsta | vsta)
+ basic_machine=i386-unknown
+ os=-vsta
+ ;;
+ i386-go32 | go32)
+ basic_machine=i386-unknown
+ os=-go32
+ ;;
+ i386-mingw32 | mingw32)
+ basic_machine=i386-unknown
+ os=-mingw32
+ ;;
+ iris | iris4d)
+ basic_machine=mips-sgi
+ case $os in
+ -irix*)
+ ;;
+ *)
+ os=-irix4
+ ;;
+ esac
+ ;;
+ isi68 | isi)
+ basic_machine=m68k-isi
+ os=-sysv
+ ;;
+ macppc*)
+ basic_machine=powerpc-apple
+ ;;
+ m88k-omron*)
+ basic_machine=m88k-omron
+ ;;
+ magnum | m3230)
+ basic_machine=mips-mips
+ os=-sysv
+ ;;
+ merlin)
+ basic_machine=ns32k-utek
+ os=-sysv
+ ;;
+ miniframe)
+ basic_machine=m68000-convergent
+ ;;
+ *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*)
+ basic_machine=m68k-atari
+ os=-mint
+ ;;
+ mipsel*-linux*)
+ basic_machine=mipsel-unknown
+ os=-linux-gnu
+ ;;
+ mips*-linux*)
+ basic_machine=mips-unknown
+ os=-linux-gnu
+ ;;
+ mips3*-*)
+ basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`
+ ;;
+ mips3*)
+ basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown
+ ;;
+ mmix*)
+ basic_machine=mmix-knuth
+ os=-mmixware
+ ;;
+ monitor)
+ basic_machine=m68k-rom68k
+ os=-coff
+ ;;
+ msdos)
+ basic_machine=i386-unknown
+ os=-msdos
+ ;;
+ mvs)
+ basic_machine=i370-ibm
+ os=-mvs
+ ;;
+ ncr3000)
+ basic_machine=i486-ncr
+ os=-sysv4
+ ;;
+ netbsd386)
+ basic_machine=i386-unknown
+ os=-netbsd
+ ;;
+ netwinder)
+ basic_machine=armv4l-rebel
+ os=-linux
+ ;;
+ news | news700 | news800 | news900)
+ basic_machine=m68k-sony
+ os=-newsos
+ ;;
+ news1000)
+ basic_machine=m68030-sony
+ os=-newsos
+ ;;
+ news-3600 | risc-news)
+ basic_machine=mips-sony
+ os=-newsos
+ ;;
+ necv70)
+ basic_machine=v70-nec
+ os=-sysv
+ ;;
+ next | m*-next )
+ basic_machine=m68k-next
+ case $os in
+ -nextstep* )
+ ;;
+ -ns2*)
+ os=-nextstep2
+ ;;
+ *)
+ os=-nextstep3
+ ;;
+ esac
+ ;;
+ nh3000)
+ basic_machine=m68k-harris
+ os=-cxux
+ ;;
+ nh[45]000)
+ basic_machine=m88k-harris
+ os=-cxux
+ ;;
+ nindy960)
+ basic_machine=i960-intel
+ os=-nindy
+ ;;
+ mon960)
+ basic_machine=i960-intel
+ os=-mon960
+ ;;
+ np1)
+ basic_machine=np1-gould
+ ;;
+ nsr-tandem)
+ basic_machine=nsr-tandem
+ ;;
+ op50n-* | op60c-*)
+ basic_machine=hppa1.1-oki
+ os=-proelf
+ ;;
+ OSE68000 | ose68000)
+ basic_machine=m68000-ericsson
+ os=-ose
+ ;;
+ os68k)
+ basic_machine=m68k-none
+ os=-os68k
+ ;;
+ pa-hitachi)
+ basic_machine=hppa1.1-hitachi
+ os=-hiuxwe2
+ ;;
+ paragon)
+ basic_machine=i860-intel
+ os=-osf
+ ;;
+ pbd)
+ basic_machine=sparc-tti
+ ;;
+ pbb)
+ basic_machine=m68k-tti
+ ;;
+ pc532 | pc532-*)
+ basic_machine=ns32k-pc532
+ ;;
+ pentiummmx | p55)
+ basic_machine=pentiummmx-pc
+ ;;
+ pentium | p5 | i586)
+ basic_machine=pentium-pc
+ ;;
+ pentiumpro | p6)
+ basic_machine=pentiumpro-pc
+ ;;
+ pentiummmx-* | p55-*)
+ basic_machine=pentiummmx-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
+ pentium-* | p5-* | i586-*)
+ basic_machine=pentium-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
+ pentiumpro-* | p6-*)
+ basic_machine=pentiumpro-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
+ nexen)
+ # We don't have specific support for Nexgen yet, so just call it a Pentium
+ basic_machine=i586-nexgen
+ ;;
+ pn)
+ basic_machine=pn-gould
+ ;;
+ power) basic_machine=rs6000-ibm
+ ;;
+ ppc) basic_machine=powerpc-unknown
+ ;;
+ ppc-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
+ ppc64) basic_machine=powerpc64-unknown
+ ;;
+ ppc64-*)
+ basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
+ ppcle | powerpclittle | ppc-le | powerpc-little)
+ basic_machine=powerpcle-unknown
+ ;;
+ ppcle-* | powerpclittle-*)
+ basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'`
+ ;;
+ ps2)
+ basic_machine=i386-ibm
+ ;;
+ rom68k)
+ basic_machine=m68k-rom68k
+ os=-coff
+ ;;
+ rm[46]00)
+ basic_machine=mips-siemens
+ ;;
+ rtpc | rtpc-*)
+ basic_machine=romp-ibm
+ ;;
+ sa29200)
+ basic_machine=a29k-amd
+ os=-udi
+ ;;
+ sequent)
+ basic_machine=i386-sequent
+ ;;
+ sh)
+ basic_machine=sh-hitachi
+ os=-hms
+ ;;
+ sparclite-wrs)
+ basic_machine=sparclite-wrs
+ os=-vxworks
+ ;;
+ sps7)
+ basic_machine=m68k-bull
+ os=-sysv2
+ ;;
+ spur)
+ basic_machine=spur-unknown
+ ;;
+ st2000)
+ basic_machine=m68k-tandem
+ ;;
+ stratus)
+ basic_machine=i860-stratus
+ os=-sysv4
+ ;;
+ sun2)
+ basic_machine=m68000-sun
+ ;;
+ sun2os3)
+ basic_machine=m68000-sun
+ os=-sunos3
+ ;;
+ sun2os4)
+ basic_machine=m68000-sun
+ os=-sunos4
+ ;;
+ sun3os3)
+ basic_machine=m68k-sun
+ os=-sunos3
+ ;;
+ sun3os4)
+ basic_machine=m68k-sun
+ os=-sunos4
+ ;;
+ sun4os3)
+ basic_machine=sparc-sun
+ os=-sunos3
+ ;;
+ sun4os4)
+ basic_machine=sparc-sun
+ os=-sunos4
+ ;;
+ sun4sol2)
+ basic_machine=sparc-sun
+ os=-solaris2
+ ;;
+ sun3 | sun3-*)
+ basic_machine=m68k-sun
+ ;;
+ sun4)
+ basic_machine=sparc-sun
+ ;;
+ sun386 | sun386i | roadrunner)
+ basic_machine=i386-sun
+ ;;
+ sv1)
+ basic_machine=sv1-cray
+ os=-unicos
+ ;;
+ symmetry)
+ basic_machine=i386-sequent
+ os=-dynix
+ ;;
+ t3e)
+ basic_machine=t3e-cray
+ os=-unicos
+ ;;
+ tx39)
+ basic_machine=mipstx39-unknown
+ ;;
+ tx39el)
+ basic_machine=mipstx39el-unknown
+ ;;
+ tower | tower-32)
+ basic_machine=m68k-ncr
+ ;;
+ udi29k)
+ basic_machine=a29k-amd
+ os=-udi
+ ;;
+ ultra3)
+ basic_machine=a29k-nyu
+ os=-sym1
+ ;;
+ v810 | necv810)
+ basic_machine=v810-nec
+ os=-none
+ ;;
+ vaxv)
+ basic_machine=vax-dec
+ os=-sysv
+ ;;
+ vms)
+ basic_machine=vax-dec
+ os=-vms
+ ;;
+ vpp*|vx|vx-*)
+ basic_machine=f301-fujitsu
+ ;;
+ vxworks960)
+ basic_machine=i960-wrs
+ os=-vxworks
+ ;;
+ vxworks68)
+ basic_machine=m68k-wrs
+ os=-vxworks
+ ;;
+ vxworks29k)
+ basic_machine=a29k-wrs
+ os=-vxworks
+ ;;
+ w65*)
+ basic_machine=w65-wdc
+ os=-none
+ ;;
+ w89k-*)
+ basic_machine=hppa1.1-winbond
+ os=-proelf
+ ;;
+ xmp)
+ basic_machine=xmp-cray
+ os=-unicos
+ ;;
+ xps | xps100)
+ basic_machine=xps100-honeywell
+ ;;
+ z8k-*-coff)
+ basic_machine=z8k-unknown
+ os=-sim
+ ;;
+ none)
+ basic_machine=none-none
+ os=-none
+ ;;
+
+# Here we handle the default manufacturer of certain CPU types. It is in
+# some cases the only manufacturer, in others, it is the most popular.
+ w89k)
+ basic_machine=hppa1.1-winbond
+ ;;
+ op50n)
+ basic_machine=hppa1.1-oki
+ ;;
+ op60c)
+ basic_machine=hppa1.1-oki
+ ;;
+ mips)
+ if [ x$os = x-linux-gnu ]; then
+ basic_machine=mips-unknown
+ else
+ basic_machine=mips-mips
+ fi
+ ;;
+ romp)
+ basic_machine=romp-ibm
+ ;;
+ rs6000)
+ basic_machine=rs6000-ibm
+ ;;
+ vax)
+ basic_machine=vax-dec
+ ;;
+ pdp11)
+ basic_machine=pdp11-dec
+ ;;
+ we32k)
+ basic_machine=we32k-att
+ ;;
+ sparc | sparcv9)
+ basic_machine=sparc-sun
+ ;;
+ cydra)
+ basic_machine=cydra-cydrome
+ ;;
+ orion)
+ basic_machine=orion-highlevel
+ ;;
+ orion105)
+ basic_machine=clipper-highlevel
+ ;;
+ mac | mpw | mac-mpw)
+ basic_machine=m68k-apple
+ ;;
+ pmac | pmac-mpw)
+ basic_machine=powerpc-apple
+ ;;
+ c4x*)
+ basic_machine=c4x-none
+ os=-coff
+ ;;
+ *)
+ echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
+ exit 1
+ ;;
+esac
+
+# Here we canonicalize certain aliases for manufacturers.
+case $basic_machine in
+ *-digital*)
+ basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'`
+ ;;
+ *-commodore*)
+ basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'`
+ ;;
+ *)
+ ;;
+esac
+
+# Decode manufacturer-specific aliases for certain operating systems.
+
+if [ x"$os" != x"" ]
+then
+case $os in
+ # First match some system type aliases
+ # that might get confused with valid system types.
+ # -solaris* is a basic system type, with this one exception.
+ -solaris1 | -solaris1.*)
+ os=`echo $os | sed -e 's|solaris1|sunos4|'`
+ ;;
+ -solaris)
+ os=-solaris2
+ ;;
+ -svr4*)
+ os=-sysv4
+ ;;
+ -unixware*)
+ os=-sysv4.2uw
+ ;;
+ -gnu/linux*)
+ os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'`
+ ;;
+ # First accept the basic system types.
+ # The portable systems comes first.
+ # Each alternative MUST END IN A *, to match a version number.
+ # -sysv* is not here because it comes later, after sysvr4.
+ -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
+ | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\
+ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \
+ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
+ | -aos* \
+ | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
+ | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
+ | -hiux* | -386bsd* | -netbsd* | -openbsd* | -freebsd* | -riscix* \
+ | -lynxos* | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
+ | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
+ | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
+ | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
+ | -mingw32* | -linux-gnu* | -uxpv* | -beos* | -mpeix* | -udk* \
+ | -interix* | -uwin* | -rhapsody* | -darwin* | -opened* \
+ | -openstep* | -oskit*)
+ # Remember, each alternative MUST END IN *, to match a version number.
+ ;;
+ -qnx*)
+ case $basic_machine in
+ x86-* | i[34567]86-*)
+ ;;
+ *)
+ os=-nto$os
+ ;;
+ esac
+ ;;
+ -nto*)
+ os=-nto-qnx
+ ;;
+ -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \
+ | -windows* | -osx | -abug | -netware* | -os9* | -beos* \
+ | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*)
+ ;;
+ -mac*)
+ os=`echo $os | sed -e 's|mac|macos|'`
+ ;;
+ -linux*)
+ os=`echo $os | sed -e 's|linux|linux-gnu|'`
+ ;;
+ -sunos5*)
+ os=`echo $os | sed -e 's|sunos5|solaris2|'`
+ ;;
+ -sunos6*)
+ os=`echo $os | sed -e 's|sunos6|solaris3|'`
+ ;;
+ -opened*)
+ os=-openedition
+ ;;
+ -wince*)
+ os=-wince
+ ;;
+ -osfrose*)
+ os=-osfrose
+ ;;
+ -osf*)
+ os=-osf
+ ;;
+ -utek*)
+ os=-bsd
+ ;;
+ -dynix*)
+ os=-bsd
+ ;;
+ -acis*)
+ os=-aos
+ ;;
+ -386bsd)
+ os=-bsd
+ ;;
+ -ctix* | -uts*)
+ os=-sysv
+ ;;
+ -ns2 )
+ os=-nextstep2
+ ;;
+ -nsk)
+ os=-nsk
+ ;;
+ # Preserve the version number of sinix5.
+ -sinix5.*)
+ os=`echo $os | sed -e 's|sinix|sysv|'`
+ ;;
+ -sinix*)
+ os=-sysv4
+ ;;
+ -triton*)
+ os=-sysv3
+ ;;
+ -oss*)
+ os=-sysv3
+ ;;
+ -svr4)
+ os=-sysv4
+ ;;
+ -svr3)
+ os=-sysv3
+ ;;
+ -sysvr4)
+ os=-sysv4
+ ;;
+ # This must come after -sysvr4.
+ -sysv*)
+ ;;
+ -ose*)
+ os=-ose
+ ;;
+ -es1800*)
+ os=-ose
+ ;;
+ -xenix)
+ os=-xenix
+ ;;
+ -*mint | -*MiNT)
+ os=-mint
+ ;;
+ -none)
+ ;;
+ *)
+ # Get rid of the `-' at the beginning of $os.
+ os=`echo $os | sed 's/[^-]*-//'`
+ echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2
+ exit 1
+ ;;
+esac
+else
+
+# Here we handle the default operating systems that come with various machines.
+# The value should be what the vendor currently ships out the door with their
+# machine or put another way, the most popular os provided with the machine.
+
+# Note that if you're going to try to match "-MANUFACTURER" here (say,
+# "-sun"), then you have to tell the case statement up towards the top
+# that MANUFACTURER isn't an operating system. Otherwise, code above
+# will signal an error saying that MANUFACTURER isn't an operating
+# system, and we'll never get to this point.
+
+case $basic_machine in
+ *-acorn)
+ os=-riscix1.2
+ ;;
+ arm*-rebel)
+ os=-linux
+ ;;
+ arm*-semi)
+ os=-aout
+ ;;
+ pdp11-*)
+ os=-none
+ ;;
+ *-dec | vax-*)
+ os=-ultrix4.2
+ ;;
+ m68*-apollo)
+ os=-domain
+ ;;
+ i386-sun)
+ os=-sunos4.0.2
+ ;;
+ m68000-sun)
+ os=-sunos3
+ # This also exists in the configure program, but was not the
+ # default.
+ # os=-sunos4
+ ;;
+ m68*-cisco)
+ os=-aout
+ ;;
+ mips*-cisco)
+ os=-elf
+ ;;
+ mips*-*)
+ os=-elf
+ ;;
+ *-tti) # must be before sparc entry or we get the wrong os.
+ os=-sysv3
+ ;;
+ sparc-* | *-sun)
+ os=-sunos4.1.1
+ ;;
+ *-be)
+ os=-beos
+ ;;
+ *-ibm)
+ os=-aix
+ ;;
+ *-wec)
+ os=-proelf
+ ;;
+ *-winbond)
+ os=-proelf
+ ;;
+ *-oki)
+ os=-proelf
+ ;;
+ *-hp)
+ os=-hpux
+ ;;
+ *-hitachi)
+ os=-hiux
+ ;;
+ i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent)
+ os=-sysv
+ ;;
+ *-cbm)
+ os=-amigaos
+ ;;
+ *-dg)
+ os=-dgux
+ ;;
+ *-dolphin)
+ os=-sysv3
+ ;;
+ m68k-ccur)
+ os=-rtu
+ ;;
+ m88k-omron*)
+ os=-luna
+ ;;
+ *-next )
+ os=-nextstep
+ ;;
+ *-sequent)
+ os=-ptx
+ ;;
+ *-crds)
+ os=-unos
+ ;;
+ *-ns)
+ os=-genix
+ ;;
+ i370-*)
+ os=-mvs
+ ;;
+ *-next)
+ os=-nextstep3
+ ;;
+ *-gould)
+ os=-sysv
+ ;;
+ *-highlevel)
+ os=-bsd
+ ;;
+ *-encore)
+ os=-bsd
+ ;;
+ *-sgi)
+ os=-irix
+ ;;
+ *-siemens)
+ os=-sysv4
+ ;;
+ *-masscomp)
+ os=-rtu
+ ;;
+ f301-fujitsu)
+ os=-uxpv
+ ;;
+ *-rom68k)
+ os=-coff
+ ;;
+ *-*bug)
+ os=-coff
+ ;;
+ *-apple)
+ os=-macos
+ ;;
+ *-atari*)
+ os=-mint
+ ;;
+ *)
+ os=-none
+ ;;
+esac
+fi
+
+# Here we handle the case where we know the os, and the CPU type, but not the
+# manufacturer. We pick the logical manufacturer.
+vendor=unknown
+case $basic_machine in
+ *-unknown)
+ case $os in
+ -riscix*)
+ vendor=acorn
+ ;;
+ -sunos*)
+ vendor=sun
+ ;;
+ -aix*)
+ vendor=ibm
+ ;;
+ -beos*)
+ vendor=be
+ ;;
+ -hpux*)
+ vendor=hp
+ ;;
+ -mpeix*)
+ vendor=hp
+ ;;
+ -hiux*)
+ vendor=hitachi
+ ;;
+ -unos*)
+ vendor=crds
+ ;;
+ -dgux*)
+ vendor=dg
+ ;;
+ -luna*)
+ vendor=omron
+ ;;
+ -genix*)
+ vendor=ns
+ ;;
+ -mvs* | -opened*)
+ vendor=ibm
+ ;;
+ -ptx*)
+ vendor=sequent
+ ;;
+ -vxsim* | -vxworks*)
+ vendor=wrs
+ ;;
+ -aux*)
+ vendor=apple
+ ;;
+ -hms*)
+ vendor=hitachi
+ ;;
+ -mpw* | -macos*)
+ vendor=apple
+ ;;
+ -*mint | -*MiNT)
+ vendor=atari
+ ;;
+ esac
+ basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"`
+ ;;
+esac
+
+echo $basic_machine$os
diff --git a/rts/gmp/configure b/rts/gmp/configure
new file mode 100644
index 0000000000..8294680486
--- /dev/null
+++ b/rts/gmp/configure
@@ -0,0 +1,5216 @@
+#! /bin/sh
+# From configure.in Revision: 1.129.2.2
+# Guess values for system-dependent variables and create Makefiles.
+# Generated automatically using Autoconf version 2.14a.
+# Copyright (C) 1992, 93, 94, 95, 96, 98, 99, 2000
+# Free Software Foundation, Inc.
+#
+# This configure script is free software; the Free Software Foundation
+# gives unlimited permission to copy, distribute and modify it.
+
+# Defaults:
+ac_default_prefix=/usr/local
+# Any additions from configure.in:
+
+# Initialize some variables set by options.
+ac_init_help=false
+ac_init_version=false
+# The variables have the same names as the options, with
+# dashes changed to underlines.
+build=NONE
+cache_file=./config.cache
+exec_prefix=NONE
+host=NONE
+no_create=
+nonopt=NONE
+no_recursion=
+prefix=NONE
+program_prefix=NONE
+program_suffix=NONE
+program_transform_name=s,x,x,
+silent=
+site=
+srcdir=
+target=NONE
+verbose=
+x_includes=NONE
+x_libraries=NONE
+bindir='${exec_prefix}/bin'
+sbindir='${exec_prefix}/sbin'
+libexecdir='${exec_prefix}/libexec'
+datadir='${prefix}/share'
+sysconfdir='${prefix}/etc'
+sharedstatedir='${prefix}/com'
+localstatedir='${prefix}/var'
+libdir='${exec_prefix}/lib'
+includedir='${prefix}/include'
+oldincludedir='/usr/include'
+infodir='${prefix}/info'
+mandir='${prefix}/man'
+
+# Initialize some other variables.
+subdirs=
+MFLAGS= MAKEFLAGS=
+SHELL=${CONFIG_SHELL-/bin/sh}
+# Maximum number of lines to put in a shell here document.
+: ${ac_max_here_lines=48}
+# Sed expression to map a string onto a valid sh and CPP variable names.
+ac_tr_sh='sed -e y%*+%pp%;s%[^a-zA-Z0-9_]%_%g'
+ac_tr_cpp='sed -e y%*abcdefghijklmnopqrstuvwxyz%PABCDEFGHIJKLMNOPQRSTUVWXYZ%;s%[^A-Z0-9_]%_%g'
+
+ac_prev=
+for ac_option
+do
+ # If the previous option needs an argument, assign it.
+ if test -n "$ac_prev"; then
+ eval "$ac_prev=\$ac_option"
+ ac_prev=
+ continue
+ fi
+
+ ac_optarg=`echo "$ac_option" | sed -n 's/^[^=]*=//p'`
+
+ # Accept the important Cygnus configure options, so we can diagnose typos.
+
+ case "$ac_option" in
+
+ -bindir | --bindir | --bindi | --bind | --bin | --bi)
+ ac_prev=bindir ;;
+ -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*)
+ bindir="$ac_optarg" ;;
+
+ -build | --build | --buil | --bui | --bu)
+ ac_prev=build ;;
+ -build=* | --build=* | --buil=* | --bui=* | --bu=*)
+ build="$ac_optarg" ;;
+
+ -cache-file | --cache-file | --cache-fil | --cache-fi \
+ | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c)
+ ac_prev=cache_file ;;
+ -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \
+ | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*)
+ cache_file="$ac_optarg" ;;
+
+ -datadir | --datadir | --datadi | --datad | --data | --dat | --da)
+ ac_prev=datadir ;;
+ -datadir=* | --datadir=* | --datadi=* | --datad=* | --data=* | --dat=* \
+ | --da=*)
+ datadir="$ac_optarg" ;;
+
+ -disable-* | --disable-*)
+ ac_feature=`echo "$ac_option"|sed -e 's/-*disable-//'`
+ # Reject names that are not valid shell variable names.
+ if echo "$ac_feature" | grep '[^-a-zA-Z0-9_]' >/dev/null 2>&1; then
+ { echo "configure: error: invalid feature: $ac_feature" 1>&2; exit 1; }
+ fi
+ ac_feature=`echo $ac_feature| sed 's/-/_/g'`
+ eval "enable_${ac_feature}=no" ;;
+
+ -enable-* | --enable-*)
+ ac_feature=`echo "$ac_option"|sed -e 's/-*enable-//' -e 's/=.*//'`
+ # Reject names that are not valid shell variable names.
+ if echo "$ac_feature" | grep '[^-a-zA-Z0-9_]' >/dev/null 2>&1; then
+ { echo "configure: error: invalid feature: $ac_feature" 1>&2; exit 1; }
+ fi
+ ac_feature=`echo $ac_feature| sed 's/-/_/g'`
+ case "$ac_option" in
+ *=*) ac_optarg=`echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"`;;
+ *) ac_optarg=yes ;;
+ esac
+ eval "enable_${ac_feature}='$ac_optarg'" ;;
+
+ -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \
+ | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \
+ | --exec | --exe | --ex)
+ ac_prev=exec_prefix ;;
+ -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \
+ | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \
+ | --exec=* | --exe=* | --ex=*)
+ exec_prefix="$ac_optarg" ;;
+
+ -gas | --gas | --ga | --g)
+ # Obsolete; use --with-gas.
+ with_gas=yes ;;
+
+ -help | --help | --hel | --he | -h)
+ ac_init_help=: ;;
+ -host | --host | --hos | --ho)
+ ac_prev=host ;;
+ -host=* | --host=* | --hos=* | --ho=*)
+ host="$ac_optarg" ;;
+
+ -includedir | --includedir | --includedi | --included | --include \
+ | --includ | --inclu | --incl | --inc)
+ ac_prev=includedir ;;
+ -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \
+ | --includ=* | --inclu=* | --incl=* | --inc=*)
+ includedir="$ac_optarg" ;;
+
+ -infodir | --infodir | --infodi | --infod | --info | --inf)
+ ac_prev=infodir ;;
+ -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*)
+ infodir="$ac_optarg" ;;
+
+ -libdir | --libdir | --libdi | --libd)
+ ac_prev=libdir ;;
+ -libdir=* | --libdir=* | --libdi=* | --libd=*)
+ libdir="$ac_optarg" ;;
+
+ -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \
+ | --libexe | --libex | --libe)
+ ac_prev=libexecdir ;;
+ -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \
+ | --libexe=* | --libex=* | --libe=*)
+ libexecdir="$ac_optarg" ;;
+
+ -localstatedir | --localstatedir | --localstatedi | --localstated \
+ | --localstate | --localstat | --localsta | --localst \
+ | --locals | --local | --loca | --loc | --lo)
+ ac_prev=localstatedir ;;
+ -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \
+ | --localstate=* | --localstat=* | --localsta=* | --localst=* \
+ | --locals=* | --local=* | --loca=* | --loc=* | --lo=*)
+ localstatedir="$ac_optarg" ;;
+
+ -mandir | --mandir | --mandi | --mand | --man | --ma | --m)
+ ac_prev=mandir ;;
+ -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*)
+ mandir="$ac_optarg" ;;
+
+ -nfp | --nfp | --nf)
+ # Obsolete; use --without-fp.
+ with_fp=no ;;
+
+ -no-create | --no-create | --no-creat | --no-crea | --no-cre \
+ | --no-cr | --no-c)
+ no_create=yes ;;
+
+ -no-recursion | --no-recursion | --no-recursio | --no-recursi \
+ | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r)
+ no_recursion=yes ;;
+
+ -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \
+ | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \
+ | --oldin | --oldi | --old | --ol | --o)
+ ac_prev=oldincludedir ;;
+ -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \
+ | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \
+ | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*)
+ oldincludedir="$ac_optarg" ;;
+
+ -prefix | --prefix | --prefi | --pref | --pre | --pr | --p)
+ ac_prev=prefix ;;
+ -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*)
+ prefix="$ac_optarg" ;;
+
+ -program-prefix | --program-prefix | --program-prefi | --program-pref \
+ | --program-pre | --program-pr | --program-p)
+ ac_prev=program_prefix ;;
+ -program-prefix=* | --program-prefix=* | --program-prefi=* \
+ | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*)
+ program_prefix="$ac_optarg" ;;
+
+ -program-suffix | --program-suffix | --program-suffi | --program-suff \
+ | --program-suf | --program-su | --program-s)
+ ac_prev=program_suffix ;;
+ -program-suffix=* | --program-suffix=* | --program-suffi=* \
+ | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*)
+ program_suffix="$ac_optarg" ;;
+
+ -program-transform-name | --program-transform-name \
+ | --program-transform-nam | --program-transform-na \
+ | --program-transform-n | --program-transform- \
+ | --program-transform | --program-transfor \
+ | --program-transfo | --program-transf \
+ | --program-trans | --program-tran \
+ | --progr-tra | --program-tr | --program-t)
+ ac_prev=program_transform_name ;;
+ -program-transform-name=* | --program-transform-name=* \
+ | --program-transform-nam=* | --program-transform-na=* \
+ | --program-transform-n=* | --program-transform-=* \
+ | --program-transform=* | --program-transfor=* \
+ | --program-transfo=* | --program-transf=* \
+ | --program-trans=* | --program-tran=* \
+ | --progr-tra=* | --program-tr=* | --program-t=*)
+ program_transform_name="$ac_optarg" ;;
+
+ -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+ | -silent | --silent | --silen | --sile | --sil)
+ silent=yes ;;
+
+ -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
+ ac_prev=sbindir ;;
+ -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
+ | --sbi=* | --sb=*)
+ sbindir="$ac_optarg" ;;
+
+ -sharedstatedir | --sharedstatedir | --sharedstatedi \
+ | --sharedstated | --sharedstate | --sharedstat | --sharedsta \
+ | --sharedst | --shareds | --shared | --share | --shar \
+ | --sha | --sh)
+ ac_prev=sharedstatedir ;;
+ -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \
+ | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \
+ | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \
+ | --sha=* | --sh=*)
+ sharedstatedir="$ac_optarg" ;;
+
+ -site | --site | --sit)
+ ac_prev=site ;;
+ -site=* | --site=* | --sit=*)
+ site="$ac_optarg" ;;
+
+ -srcdir | --srcdir | --srcdi | --srcd | --src | --sr)
+ ac_prev=srcdir ;;
+ -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*)
+ srcdir="$ac_optarg" ;;
+
+ -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \
+ | --syscon | --sysco | --sysc | --sys | --sy)
+ ac_prev=sysconfdir ;;
+ -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \
+ | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*)
+ sysconfdir="$ac_optarg" ;;
+
+ -target | --target | --targe | --targ | --tar | --ta | --t)
+ ac_prev=target ;;
+ -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*)
+ target="$ac_optarg" ;;
+
+ -v | -verbose | --verbose | --verbos | --verbo | --verb)
+ verbose=yes ;;
+
+ -version | --version | --versio | --versi | --vers | -V)
+ ac_init_version=: ;;
+
+ -with-* | --with-*)
+ ac_package=`echo "$ac_option"|sed -e 's/-*with-//' -e 's/=.*//'`
+ # Reject names that are not valid shell variable names.
+ if echo "$ac_package" | grep '[^-a-zA-Z0-9_]' >/dev/null 2>&1; then
+ { echo "configure: error: invalid package: $ac_package" 1>&2; exit 1; }
+ fi
+ ac_package=`echo $ac_package| sed 's/-/_/g'`
+ case "$ac_option" in
+ *=*) ac_optarg=`echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"`;;
+ *) ac_optarg=yes ;;
+ esac
+ eval "with_${ac_package}='$ac_optarg'" ;;
+
+ -without-* | --without-*)
+ ac_package=`echo "$ac_option"|sed -e 's/-*without-//'`
+ # Reject names that are not valid shell variable names.
+ if echo "$ac_package" | grep '[^-a-zA-Z0-9_]' >/dev/null 2>&1; then
+ { echo "configure: error: invalid package: $ac_package" 1>&2; exit 1; }
+ fi
+ ac_package=`echo $ac_package| sed 's/-/_/g'`
+ eval "with_${ac_package}=no" ;;
+
+ --x)
+ # Obsolete; use --with-x.
+ with_x=yes ;;
+
+ -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \
+ | --x-incl | --x-inc | --x-in | --x-i)
+ ac_prev=x_includes ;;
+ -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \
+ | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*)
+ x_includes="$ac_optarg" ;;
+
+ -x-libraries | --x-libraries | --x-librarie | --x-librari \
+ | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l)
+ ac_prev=x_libraries ;;
+ -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \
+ | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*)
+ x_libraries="$ac_optarg" ;;
+
+ -*) { echo "configure: error: unrecognized option: $ac_option
+Try \`configure --help' for more information." 1>&2; exit 1; }
+ ;;
+
+ *=*)
+ ac_envvar=`echo "$ac_option" | sed -e 's/=.*//'`
+ # Reject names that are not valid shell variable names.
+ if echo "$ac_envvar" | grep '[^a-zA-Z0-9_]' >/dev/null 2>&1; then
+ { echo "configure: error: invalid variable name: $ac_envvar" 1>&2; exit 1; }
+ fi
+ ac_optarg=`echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"`
+ eval "$ac_envvar='$ac_optarg'"
+ export $ac_envvar ;;
+
+ *)
+ if echo "$ac_option" | grep '[^-a-zA-Z0-9.]' >/dev/null 2>&1; then
+ echo "configure: warning: invalid host type: $ac_option" 1>&2
+ fi
+ if test "x$nonopt" != xNONE; then
+ { echo "configure: error: can only configure for one host and one target at a time" 1>&2; exit 1; }
+ fi
+ nonopt="$ac_option"
+ ;;
+
+ esac
+done
+
+if test -n "$ac_prev"; then
+ { echo "configure: error: missing argument to --\`echo $ac_prev | sed 's/_/-/g'\`" 1>&2; exit 1; }
+fi
+if $ac_init_help; then
+ # Omit some internal or obsolete options to make the list less imposing.
+ # This message is too long to be a string in the A/UX 3.1 sh.
+ cat <<\EOF
+`configure' configures software source code packages to adapt to many kinds
+of systems.
+
+Usage: configure [OPTION]... [VAR=VALUE]... [HOST]
+
+To safely assign special values to environment variables (e.g., CC,
+CFLAGS...), give to `configure' the definition as VAR=VALUE.
+
+Defaults for the options are specified in brackets.
+
+Configuration:
+ -h, --help print this message
+ -V, --version print the version of autoconf that created configure
+ -q, --quiet, --silent do not print `checking...' messages
+ --cache-file=FILE cache test results in FILE
+ -n, --no-create do not create output files
+
+EOF
+
+ cat <<EOF
+Directories:
+ --prefix=PREFIX install architecture-independent files in PREFIX
+ [$ac_default_prefix]
+ --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX
+ [same as prefix]
+ --bindir=DIR user executables in DIR [EPREFIX/bin]
+ --sbindir=DIR system admin executables in DIR [EPREFIX/sbin]
+ --libexecdir=DIR program executables in DIR [EPREFIX/libexec]
+ --datadir=DIR read-only architecture-independent data in DIR
+ [PREFIX/share]
+ --sysconfdir=DIR read-only single-machine data in DIR [PREFIX/etc]
+ --sharedstatedir=DIR modifiable architecture-independent data in DIR
+ [PREFIX/com]
+ --localstatedir=DIR modifiable single-machine data in DIR [PREFIX/var]
+ --libdir=DIR object code libraries in DIR [EPREFIX/lib]
+ --includedir=DIR C header files in DIR [PREFIX/include]
+ --oldincludedir=DIR C header files for non-gcc in DIR [/usr/include]
+ --infodir=DIR info documentation in DIR [PREFIX/info]
+ --mandir=DIR man documentation in DIR [PREFIX/man]
+ --srcdir=DIR find the sources in DIR [configure dir or ..]
+EOF
+
+ cat <<\EOF
+
+Host type:
+ --build=BUILD configure for building on BUILD [BUILD=HOST]
+ --host=HOST configure for HOST [guessed]
+ --target=TARGET configure for TARGET [TARGET=HOST]
+EOF
+
+ cat <<\EOF
+
+Program names:
+ --program-prefix=PREFIX prepend PREFIX to installed program names
+ --program-suffix=SUFFIX append SUFFIX to installed program names
+ --program-transform-name=PROGRAM run sed PROGRAM on installed program names
+
+Optional Features:
+ --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no)
+ --enable-FEATURE=ARG include FEATURE ARG=yes
+ --disable-dependency-tracking Speeds up one-time builds
+ --enable-dependency-tracking Do not reject slow dependency extractors
+ --enable-maintainer-mode enable make rules and dependencies not useful
+ (and sometimes confusing) to the casual installer
+ --enable-assert enable ASSERT checking default=no
+ --enable-alloca use alloca for temp space default=yes
+ --enable-fft enable FFTs for multiplication default=no
+ --enable-mpbsd build Berkley MP compatibility library default=no
+ --enable-mpfr build MPFR default=no
+ --enable-shared=PKGS build shared libraries default=yes
+ --enable-static=PKGS build static libraries default=yes
+ --enable-fast-install=PKGS optimize for fast installation default=yes
+ --disable-libtool-lock avoid locking (might break parallel builds)
+
+Optional Packages:
+ --with-PACKAGE=ARG use PACKAGE ARG=yes
+ --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no)
+ --with-gnu-ld assume the C compiler uses GNU ld default=no
+ --with-pic try to use only PIC/non-PIC objects default=use both
+EOF
+ exit 0
+fi
+if $ac_init_version; then
+ cat <<\EOF
+Generated automatically using Autoconf version 2.14a.
+Copyright (C) 1992, 93, 94, 95, 96, 98, 99, 2000
+Free Software Foundation, Inc.
+
+This configure script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it.
+EOF
+ exit 0
+fi
+trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15
+
+# Keep a trace of the command line.
+# Strip out --no-create and --no-recursion so they do not pile up.
+# Also quote any args containing shell meta-characters.
+ac_configure_args=
+for ac_arg
+do
+ case "$ac_arg" in
+ -no-create | --no-create | --no-creat | --no-crea | --no-cre \
+ | --no-cr | --no-c) ;;
+ -no-recursion | --no-recursion | --no-recursio | --no-recursi \
+ | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) ;;
+ *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?\"\']*)
+ ac_arg=`echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"`
+ ac_configure_args="$ac_configure_args '$ac_arg'" ;;
+ *) ac_configure_args="$ac_configure_args $ac_arg" ;;
+ esac
+done
+
+# File descriptor usage:
+# 0 standard input
+# 1 file creation
+# 2 errors and warnings
+# 3 some systems may open it to /dev/tty
+# 4 used on the Kubota Titan
+# 6 checking for... messages and results
+# 5 compiler messages saved in config.log
+if test "$silent" = yes; then
+ exec 6>/dev/null
+else
+ exec 6>&1
+fi
+exec 5>./config.log
+
+echo "\
+This file contains any messages produced by compilers while
+running configure, to aid debugging if configure makes a mistake.
+
+It was created by configure version 2.14a, executed with
+ > $0 $ac_configure_args
+" 1>&5
+
+# NLS nuisances.
+# Only set these to C if already set. These must not be set unconditionally
+# because not all systems understand e.g. LANG=C (notably SCO).
+# Fixing LC_MESSAGES prevents Solaris sh from translating var values in `set'!
+# Non-C LC_CTYPE values break the ctype check.
+if test "${LANG+set}" = set; then LANG=C; export LANG; fi
+if test "${LC_ALL+set}" = set; then LC_ALL=C; export LC_ALL; fi
+if test "${LC_MESSAGES+set}" = set; then LC_MESSAGES=C; export LC_MESSAGES; fi
+if test "${LC_CTYPE+set}" = set; then LC_CTYPE=C; export LC_CTYPE; fi
+
+# confdefs.h avoids OS command line length limits that DEFS can exceed.
+rm -rf conftest* confdefs.h
+# AIX cpp loses on an empty file, so make sure it contains at least a newline.
+echo >confdefs.h
+
+# A filename unique to this package, relative to the directory that
+# configure is in, which we can look for to find out if srcdir is correct.
+ac_unique_file=
+
+# Find the source files, if location was not specified.
+if test -z "$srcdir"; then
+ ac_srcdir_defaulted=yes
+ # Try the directory containing this script, then its parent.
+ ac_prog=$0
+ ac_confdir=`echo "$ac_prog" | sed 's%/[^/][^/]*$%%'`
+ test "x$ac_confdir" = "x$ac_prog" && ac_confdir=.
+ srcdir=$ac_confdir
+ if test ! -r $srcdir/$ac_unique_file; then
+ srcdir=..
+ fi
+else
+ ac_srcdir_defaulted=no
+fi
+if test ! -r $srcdir/$ac_unique_file; then
+ if test "$ac_srcdir_defaulted" = yes; then
+ { echo "configure: error: cannot find sources in $ac_confdir or .." 1>&2; exit 1; }
+ else
+ { echo "configure: error: cannot find sources in $srcdir" 1>&2; exit 1; }
+ fi
+fi
+srcdir=`echo "$srcdir" | sed 's%\([^/]\)/*$%\1%'`
+
+# Prefer explicitly selected file to automatically selected ones.
+if test -z "$CONFIG_SITE"; then
+ if test "x$prefix" != xNONE; then
+ CONFIG_SITE="$prefix/share/config.site $prefix/etc/config.site"
+ else
+ CONFIG_SITE="$ac_default_prefix/share/config.site $ac_default_prefix/etc/config.site"
+ fi
+fi
+for ac_site_file in $CONFIG_SITE; do
+ if test -r "$ac_site_file"; then
+ echo "loading site script $ac_site_file"
+ . "$ac_site_file"
+ fi
+done
+
+if test -r "$cache_file"; then
+ echo "loading cache $cache_file"
+ test -f "$cache_file" && . $cache_file
+else
+ echo "creating cache $cache_file"
+ >$cache_file
+fi
+
+ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+ac_exeext=
+ac_objext=o
+# Factoring default headers for most tests.
+ac_includes_default="\
+#include <stdio.h>
+#include <sys/types.h>
+#if STDC_HEADERS
+# include <stdlib.h>
+# include <stddef.h>
+#else
+# if HAVE_STDLIB_H
+# include <stdlib.h>
+# endif
+#endif
+#if HAVE_STRING_H
+# if !STDC_HEADERS && HAVE_MEMORY_H
+# include <memory.h>
+# endif
+# include <string.h>
+#else
+# if HAVE_STRINGS_H
+# include <strings.h>
+# endif
+#endif
+#if HAVE_INTTYPES_H
+# include <inttypes.h>
+#endif
+#if HAVE_UNISTD_H
+# include <unistd.h>
+#endif"
+
+if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then
+ # Stardent Vistra SVR4 grep lacks -e, says Kaveh R. Ghazi.
+ if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then
+ ECHO_N= ECHO_C='
+' ECHO_T=' '
+ else
+ ECHO_N=-n ECHO_C= ECHO_T=
+ fi
+else
+ ECHO_N= ECHO_C='\c' ECHO_T=
+fi
+
+ac_aux_dir=
+for ac_dir in $srcdir $srcdir/.. $srcdir/../..; do
+ if test -f $ac_dir/install-sh; then
+ ac_aux_dir=$ac_dir
+ ac_install_sh="$ac_aux_dir/install-sh -c"
+ break
+ elif test -f $ac_dir/install.sh; then
+ ac_aux_dir=$ac_dir
+ ac_install_sh="$ac_aux_dir/install.sh -c"
+ break
+ elif test -f $ac_dir/shtool; then
+ ac_aux_dir=$ac_dir
+ ac_install_sh="$ac_aux_dir/shtool install -c"
+ break
+ fi
+done
+if test -z "$ac_aux_dir"; then
+ { echo "configure: error: cannot find install-sh or install.sh in $srcdir $srcdir/.. $srcdir/../.." 1>&2; exit 1; }
+fi
+ac_config_guess="$SHELL $ac_aux_dir/config.guess"
+ac_config_sub="$SHELL $ac_aux_dir/config.sub"
+ac_configure="$SHELL $ac_aux_dir/configure" # This should be Cygnus configure.
+
+echo $ECHO_N "checking host system type... $ECHO_C" 1>&6
+echo "configure:636: checking host system type" 1>&5
+if test "x$ac_cv_host" = "x" || (test "x$host" != "xNONE" && test "x$host" != "x$ac_cv_host_alias"); then
+
+ # Make sure we can run config.sub.
+ if $ac_config_sub sun4 >/dev/null 2>&1; then :; else
+ { echo "configure: error: cannot run $ac_config_sub" 1>&2; exit 1; }
+ fi
+
+ ac_cv_host_alias=$host
+ case "$ac_cv_host_alias" in
+ NONE)
+ case $nonopt in
+ NONE)
+ if ac_cv_host_alias=`$ac_config_guess`; then :
+ else { echo "configure: error: cannot guess host type; you must specify one" 1>&2; exit 1; }
+ fi ;; *) ac_cv_host_alias=$nonopt ;;
+ esac ;;
+ esac
+
+ ac_cv_host=`$ac_config_sub $ac_cv_host_alias` || exit 1
+ ac_cv_host_cpu=`echo $ac_cv_host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+ ac_cv_host_vendor=`echo $ac_cv_host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+ ac_cv_host_os=`echo $ac_cv_host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+else
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+fi
+
+echo "$ECHO_T""$ac_cv_host" 1>&6
+
+host=$ac_cv_host
+host_alias=$ac_cv_host_alias
+host_cpu=$ac_cv_host_cpu
+host_vendor=$ac_cv_host_vendor
+host_os=$ac_cv_host_os
+
+echo $ECHO_N "checking target system type... $ECHO_C" 1>&6
+echo "configure:672: checking target system type" 1>&5
+if test "x$ac_cv_target" = "x" || (test "x$target" != "xNONE" && test "x$target" != "x$ac_cv_target_alias"); then
+
+ # Make sure we can run config.sub.
+ if $ac_config_sub sun4 >/dev/null 2>&1; then :; else
+ { echo "configure: error: cannot run $ac_config_sub" 1>&2; exit 1; }
+ fi
+
+ ac_cv_target_alias=$target
+ case "$ac_cv_target_alias" in
+ NONE)
+ case $nonopt in
+ NONE)
+ ac_cv_target_alias=$host_alias ;;
+ *) ac_cv_target_alias=$nonopt ;;
+ esac ;;
+ esac
+
+ ac_cv_target=`$ac_config_sub $ac_cv_target_alias` || exit 1
+ ac_cv_target_cpu=`echo $ac_cv_target | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+ ac_cv_target_vendor=`echo $ac_cv_target | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+ ac_cv_target_os=`echo $ac_cv_target | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+else
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+fi
+
+echo "$ECHO_T""$ac_cv_target" 1>&6
+
+target=$ac_cv_target
+target_alias=$ac_cv_target_alias
+target_cpu=$ac_cv_target_cpu
+target_vendor=$ac_cv_target_vendor
+target_os=$ac_cv_target_os
+
+echo $ECHO_N "checking build system type... $ECHO_C" 1>&6
+echo "configure:707: checking build system type" 1>&5
+if test "x$ac_cv_build" = "x" || (test "x$build" != "xNONE" && test "x$build" != "x$ac_cv_build_alias"); then
+
+ # Make sure we can run config.sub.
+ if $ac_config_sub sun4 >/dev/null 2>&1; then :; else
+ { echo "configure: error: cannot run $ac_config_sub" 1>&2; exit 1; }
+ fi
+
+ ac_cv_build_alias=$build
+ case "$ac_cv_build_alias" in
+ NONE)
+ case $nonopt in
+ NONE)
+ ac_cv_build_alias=$host_alias ;;
+ *) ac_cv_build_alias=$nonopt ;;
+ esac ;;
+ esac
+
+ ac_cv_build=`$ac_config_sub $ac_cv_build_alias` || exit 1
+ ac_cv_build_cpu=`echo $ac_cv_build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+ ac_cv_build_vendor=`echo $ac_cv_build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+ ac_cv_build_os=`echo $ac_cv_build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+else
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+fi
+
+echo "$ECHO_T""$ac_cv_build" 1>&6
+
+build=$ac_cv_build
+build_alias=$ac_cv_build_alias
+build_cpu=$ac_cv_build_cpu
+build_vendor=$ac_cv_build_vendor
+build_os=$ac_cv_build_os
+
+# Do some error checking and defaulting for the host and target type.
+# The inputs are:
+# configure --host=HOST --target=TARGET --build=BUILD NONOPT
+#
+# The rules are:
+# 1. You are not allowed to specify --host, --target, and nonopt at the
+# same time.
+# 2. Host defaults to nonopt.
+# 3. If nonopt is not specified, then host defaults to the current host,
+# as determined by config.guess.
+# 4. Target and build default to nonopt.
+# 5. If nonopt is not specified, then target and build default to host.
+
+# The aliases save the names the user supplied, while $host etc.
+# will get canonicalized.
+case $host---$target---$nonopt in
+NONE---*---* | *---NONE---* | *---*---NONE) ;;
+*) { echo "configure: error: can only configure for one host and one target at a time" 1>&2; exit 1; } ;;
+esac
+
+test "$host_alias" != "$target_alias" &&
+ test "$program_prefix$program_suffix$program_transform_name" = \
+ NONENONEs,x,x, &&
+ program_prefix=${target_alias}-
+
+# Find a good install program. We prefer a C program (faster),
+# so one script is as good as another. But avoid the broken or
+# incompatible versions:
+# SysV /etc/install, /usr/sbin/install
+# SunOS /usr/etc/install
+# IRIX /sbin/install
+# AIX /bin/install
+# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag
+# AFS /usr/afsws/bin/install, which mishandles nonexistent args
+# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff"
+# ./install, which can be erroneously created by make from ./install.sh.
+echo $ECHO_N "checking for a BSD compatible install... $ECHO_C" 1>&6
+echo "configure:778: checking for a BSD compatible install" 1>&5
+if test -z "$INSTALL"; then
+if test "${ac_cv_path_install+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ IFS="${IFS= }"; ac_save_IFS="$IFS"; IFS=":"
+ for ac_dir in $PATH; do
+ # Account for people who put trailing slashes in PATH elements.
+ case "$ac_dir/" in
+ /|./|.//|/etc/*|/usr/sbin/*|/usr/etc/*|/sbin/*|/usr/afsws/bin/*|/usr/ucb/*) ;;
+ *)
+ # OSF1 and SCO ODT 3.0 have their own names for install.
+ # Don't use installbsd from OSF since it installs stuff as root
+ # by default.
+ for ac_prog in ginstall scoinst install; do
+ if test -f $ac_dir/$ac_prog; then
+ if test $ac_prog = install &&
+ grep dspmsg $ac_dir/$ac_prog >/dev/null 2>&1; then
+ # AIX install. It has an incompatible calling convention.
+ :
+ elif test $ac_prog = install &&
+ grep pwplus $ac_dir/$ac_prog >/dev/null 2>&1; then
+ # program-specific install script used by HP pwplus--don't use.
+ :
+ else
+ ac_cv_path_install="$ac_dir/$ac_prog -c"
+ break 2
+ fi
+ fi
+ done
+ ;;
+ esac
+ done
+ IFS="$ac_save_IFS"
+
+fi
+ if test "${ac_cv_path_install+set}" = set; then
+ INSTALL="$ac_cv_path_install"
+ else
+ # As a last resort, use the slow shell script. We don't cache a
+ # path for INSTALL within a source directory, because that will
+ # break other packages using the cache if that directory is
+ # removed, or if the path is relative.
+ INSTALL="$ac_install_sh"
+ fi
+fi
+echo "$ECHO_T""$INSTALL" 1>&6
+
+# Use test -z because SunOS4 sh mishandles braces in ${var-val}.
+# It thinks the first close brace ends the variable substitution.
+test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}'
+
+test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL}'
+
+test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644'
+
+echo $ECHO_N "checking whether build environment is sane... $ECHO_C" 1>&6
+echo "configure:835: checking whether build environment is sane" 1>&5
+# Just in case
+sleep 1
+echo timestamp > conftestfile
+# Do `set' in a subshell so we don't clobber the current shell's
+# arguments. Must try -L first in case configure is actually a
+# symlink; some systems play weird games with the mod time of symlinks
+# (eg FreeBSD returns the mod time of the symlink's containing
+# directory).
+if (
+ set X `ls -Lt $srcdir/configure conftestfile 2> /dev/null`
+ if test "$*" = "X"; then
+ # -L didn't work.
+ set X `ls -t $srcdir/configure conftestfile`
+ fi
+ if test "$*" != "X $srcdir/configure conftestfile" \
+ && test "$*" != "X conftestfile $srcdir/configure"; then
+
+ # If neither matched, then we have a broken ls. This can happen
+ # if, for instance, CONFIG_SHELL is bash and it inherits a
+ # broken ls alias from the environment. This has actually
+ # happened. Such a system could not be considered "sane".
+ { echo "configure: error: ls -t appears to fail. Make sure there is not a broken
+alias in your environment" 1>&2; exit 1; }
+ fi
+
+ test "$2" = conftestfile
+ )
+then
+ # Ok.
+ :
+else
+ { echo "configure: error: newly created file is older than distributed files!
+Check your system clock" 1>&2; exit 1; }
+fi
+rm -f conftest*
+echo "$ECHO_T""yes" 1>&6
+if test "$program_transform_name" = s,x,x,; then
+ program_transform_name=
+else
+ # Double any \ or $. echo might interpret backslashes.
+ cat <<\EOF >conftestsed
+s,\\,\\\\,g; s,\$,$$,g
+EOF
+ program_transform_name=`echo $program_transform_name | sed -f conftestsed`
+ rm -f conftestsed
+fi
+test "$program_prefix" != NONE &&
+ program_transform_name="s,^,${program_prefix},;$program_transform_name"
+# Use a double $ so make ignores it.
+test "$program_suffix" != NONE &&
+ program_transform_name="s,\$\$,${program_suffix},;$program_transform_name"
+
+# sed with no file args requires a program.
+test "$program_transform_name" = "" && program_transform_name="s,x,x,"
+
+test x"${MISSING+set}" = xset || \
+ MISSING="\${SHELL} `CDPATH=: && cd $ac_aux_dir && pwd`/missing"
+if eval "$MISSING --run :"; then
+ am_missing_run="$MISSING --run "
+else
+ am_missing_run=
+ am_backtick='`'
+ echo "configure: warning: ${am_backtick}missing' script is too old or missing" 1>&2
+fi
+
+for ac_prog in mawk gawk nawk awk
+do
+# Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+echo $ECHO_N "checking for $ac_word... $ECHO_C" 1>&6
+echo "configure:906: checking for $ac_word" 1>&5
+if test "${ac_cv_prog_AWK+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$AWK"; then
+ ac_cv_prog_AWK="$AWK" # Let the user override the test.
+else
+ for ac_path in `IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ac_dummy="$PATH"
+for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ echo "$ac_dir/$ac_word"
+ fi
+done
+IFS="$ac_save_ifs"
+`; do
+ ac_cv_prog_AWK="$ac_prog"
+ break
+ done
+fi
+fi
+AWK="$ac_cv_prog_AWK"
+if test -n "$AWK"; then
+ echo "$ECHO_T""$AWK" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+test -n "$AWK" && break
+done
+
+echo $ECHO_N "checking whether ${MAKE-make} sets \${MAKE}... $ECHO_C" 1>&6
+echo "configure:939: checking whether ${MAKE-make} sets \${MAKE}" 1>&5
+set dummy ${MAKE-make}; ac_make=`echo "$2" | sed 'y%./+-%__p_%'`
+if eval "test \"\${ac_cv_prog_make_${ac_make}_set+set}\" = set"; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftestmake <<\EOF
+all:
+ @echo 'ac_maketemp="${MAKE}"'
+EOF
+# GNU make sometimes prints "make[1]: Entering...", which would confuse us.
+eval `${MAKE-make} -f conftestmake 2>/dev/null | grep temp=`
+if test -n "$ac_maketemp"; then
+ eval ac_cv_prog_make_${ac_make}_set=yes
+else
+ eval ac_cv_prog_make_${ac_make}_set=no
+fi
+rm -f conftestmake
+fi
+if eval "test \"`echo '$ac_cv_prog_make_'${ac_make}_set`\" = yes"; then
+ echo "$ECHO_T""yes" 1>&6
+ SET_MAKE=
+else
+ echo "$ECHO_T""no" 1>&6
+ SET_MAKE="MAKE=${MAKE-make}"
+fi
+
+# Check whether --enable-dependency-tracking or --disable-dependency-tracking was given.
+if test "${enable_dependency_tracking+set}" = set; then
+ enableval="$enable_dependency_tracking"
+
+fi
+if test "x$enable_dependency_tracking" = xno; then
+ AMDEP="#"
+else
+ am_depcomp="$ac_aux_dir/depcomp"
+ if test ! -f "$am_depcomp"; then
+ AMDEP="#"
+ else
+ AMDEP=
+ fi
+fi
+
+if test -z "$AMDEP"; then
+ AMDEPBACKSLASH='\'
+else
+ AMDEPBACKSLASH=
+fi
+
+if test -d .deps || mkdir .deps 2> /dev/null || test -d .deps; then
+ DEPDIR=.deps
+else
+ DEPDIR=_deps
+fi
+
+PACKAGE=gmp
+
+VERSION=3.1.1
+
+if test "`CDPATH=: && cd $srcdir && pwd`" != "`pwd`" &&
+ test -f $srcdir/config.status; then
+ { echo "configure: error: source directory already configured; run "make distclean" there first" 1>&2; exit 1; }
+fi
+cat >>confdefs.h <<EOF
+#define PACKAGE "$PACKAGE"
+EOF
+
+cat >>confdefs.h <<EOF
+#define VERSION "$VERSION"
+EOF
+
+ACLOCAL=${ACLOCAL-"${am_missing_run}aclocal"}
+
+AUTOCONF=${AUTOCONF-"${am_missing_run}autoconf"}
+
+AUTOMAKE=${AUTOMAKE-"${am_missing_run}automake"}
+
+AUTOHEADER=${AUTOHEADER-"${am_missing_run}autoheader"}
+
+MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
+
+AMTAR=${AMTAR-"${am_missing_run}tar"}
+
+if test -z "$install_sh"; then
+ install_sh="$ac_aux_dir/install-sh"
+ test -f "$install_sh" || install_sh="$ac_aux_dir/install.sh"
+ test -f "$install_sh" || install_sh="${am_missing_run}${ac_auxdir}/install-sh"
+ install_sh="`echo $install_sh | sed -e 's/\${SHELL}//'`"
+fi
+
+echo $ECHO_N "checking whether to enable maintainer-specific portions of Makefiles... $ECHO_C" 1>&6
+echo "configure:1029: checking whether to enable maintainer-specific portions of Makefiles" 1>&5
+ # Check whether --enable-maintainer-mode or --disable-maintainer-mode was given.
+if test "${enable_maintainer_mode+set}" = set; then
+ enableval="$enable_maintainer_mode"
+ USE_MAINTAINER_MODE=$enableval
+else
+ USE_MAINTAINER_MODE=no
+fi
+ echo "$ECHO_T""$USE_MAINTAINER_MODE" 1>&6
+
+if test $USE_MAINTAINER_MODE = yes; then
+ MAINTAINER_MODE_TRUE=
+ MAINTAINER_MODE_FALSE='#'
+else
+ MAINTAINER_MODE_TRUE='#'
+ MAINTAINER_MODE_FALSE=
+fi
+ MAINT=$MAINTAINER_MODE_TRUE
+
+gmp_configm4="config.m4"
+gmp_tmpconfigm4=cnfm4.tmp
+gmp_tmpconfigm4i=cnfm4i.tmp
+gmp_tmpconfigm4p=cnfm4p.tmp
+test -f $gmp_tmpconfigm4 && rm $gmp_tmpconfigm4
+test -f $gmp_tmpconfigm4i && rm $gmp_tmpconfigm4i
+test -f $gmp_tmpconfigm4p && rm $gmp_tmpconfigm4p
+
+# Check whether --enable-assert or --disable-assert was given.
+if test "${enable_assert+set}" = set; then
+ enableval="$enable_assert"
+ case "${enableval}" in
+yes|no) ;;
+*) { echo "configure: error: bad value ${enableval} for --enable-assert, need yes or no" 1>&2; exit 1; } ;;
+esac
+else
+ enable_assert=no
+fi
+
+if test "$enable_assert" = "yes"; then
+ cat >>confdefs.h <<\EOF
+#define WANT_ASSERT 1
+EOF
+
+fi
+
+# Check whether --enable-alloca or --disable-alloca was given.
+if test "${enable_alloca+set}" = set; then
+ enableval="$enable_alloca"
+ case "${enableval}" in
+yes|no) ;;
+*) { echo "configure: error: bad value ${enableval} for --enable-alloca, need yes or no" 1>&2; exit 1; } ;;
+esac
+else
+ enable_alloca=yes
+fi
+
+if test "$enable_alloca" = "no"; then
+ cat >>confdefs.h <<\EOF
+#define USE_STACK_ALLOC 1
+EOF
+
+fi
+
+# Check whether --enable-fft or --disable-fft was given.
+if test "${enable_fft+set}" = set; then
+ enableval="$enable_fft"
+ case "${enableval}" in
+yes|no) ;;
+*) { echo "configure: error: bad value ${enableval} for --enable-fft, need yes or no" 1>&2; exit 1; } ;;
+esac
+else
+ enable_fft=no
+fi
+
+if test "$enable_fft" = "yes"; then
+ cat >>confdefs.h <<\EOF
+#define WANT_FFT 1
+EOF
+
+fi
+
+# Check whether --enable-mpbsd or --disable-mpbsd was given.
+if test "${enable_mpbsd+set}" = set; then
+ enableval="$enable_mpbsd"
+ case "${enableval}" in
+yes|no) ;;
+*) { echo "configure: error: bad value ${enableval} for --enable-mpbsd, need yes or no" 1>&2; exit 1; } ;;
+esac
+else
+ enable_mpbsd=no
+fi
+
+if test "$enable_mpbsd" = "yes"; then
+ WANT_MPBSD_TRUE=
+ WANT_MPBSD_FALSE='#'
+else
+ WANT_MPBSD_TRUE='#'
+ WANT_MPBSD_FALSE=
+fi
+
+# Check whether --enable-mpfr or --disable-mpfr was given.
+if test "${enable_mpfr+set}" = set; then
+ enableval="$enable_mpfr"
+ case "${enableval}" in
+yes|no) ;;
+*) { echo "configure: error: bad value ${enableval} for --enable-mpfr, need yes or no" 1>&2; exit 1; } ;;
+esac
+else
+ enable_mpfr=no
+fi
+
+if test "$enable_mpfr" = "yes"; then
+ WANT_MPFR_TRUE=
+ WANT_MPFR_FALSE='#'
+else
+ WANT_MPFR_TRUE='#'
+ WANT_MPFR_FALSE=
+fi
+
+os_64bit="no"
+cclist="gcc cc" # FIXME: Prefer c89 to cc.
+gmp_cflags_gcc="-g -O2"
+gmp_cflags64_gcc="-g -O2"
+gmp_cflags_cc="-g"
+gmp_cflags64_cc="-g"
+
+case "$target" in
+ # Alpha
+ alpha*-cray-unicos*)
+ # Don't perform any assembly syntax tests on this beast.
+ gmp_no_asm_syntax_testing=yes
+ cclist=cc
+ gmp_cflags_cc="$gmp_cflags_cc -O"
+ ;;
+ alpha*-*-osf*)
+ flavour=`echo $target_cpu | sed 's/^alpha//g'`
+ if test -n "$flavour"; then
+ case $flavour in # compilers don't seem to understand `ev67' and such.
+ ev6? | ev7*) flavour=ev6;;
+ esac
+ gmp_optcflags_gcc="-mcpu=$flavour"
+ # FIXME: We shouldn't fail fatally if none of these work, but that's
+ # how xoptcflags work and we don't have any other mechanism right now.
+ # Why do we need this here and not for alpha*-*-* below?
+ gmp_xoptcflags_gcc="-Wa,-arch,${flavour} -Wa,-m${flavour}"
+ gmp_optcflags_cc="-arch $flavour -tune $flavour"
+ fi
+ ;;
+ alpha*-*-*)
+ cclist="gcc"
+ flavour=`echo $target_cpu | sed 's/^alpha//g'`
+ if test -n "$flavour"; then
+ case $flavour in
+ ev6? | ev7*) flavour=ev6;;
+ esac
+ gmp_optcflags_gcc="-mcpu=$flavour"
+ fi
+ ;;
+ # Cray vector machines. This must come after alpha* so that we can
+ # recognize present and future vector processors with a wildcard.
+ *-cray-unicos*)
+ # Don't perform any assembly syntax tests on this beast.
+ gmp_no_asm_syntax_testing=yes
+ cclist=cc
+ # Don't inherit default gmp_cflags_cc value; it comes with -g which
+ # disables all optimization on Cray vector systems
+ gmp_cflags_cc="-O"
+ ;;
+
+ # AMD and Intel x86 configurations
+ i?86*-*-* | k[5-8]*-*-* | pentium*-*-* | athlon-*-*)
+ # Rumour has it -O2 used to give worse register allocation than just -O.
+ gmp_cflags_gcc="-g -O -fomit-frame-pointer"
+
+ case "${target}" in
+ i386*-*-*) gmp_optcflags_gcc="-mcpu=i386 -march=i386";;
+ i486*-*-*) gmp_optcflags_gcc="-mcpu=i486 -march=i486";;
+ i586*-*-* | pentium-*-* | pentiummmx-*-*)
+ gmp_optcflags_gcc="-mcpu=pentium -march=pentium";;
+
+ # -march=pentiumpro not used because mpz/powm.c (swox cvs rev 1.4)
+ # tickles a bug in gcc 2.95.2 (believed fixed in 2.96).
+ i686*-*-* | pentiumpro-*-* | pentium[23]-*-*)
+ gmp_optcflags_gcc="-mcpu=pentiumpro";;
+
+ k6*-*-*) gmp_optcflags_gcc="-mcpu=k6 -march=k6";;
+
+ # Athlon instruction costs are close to p6: 3 cycle load latency, 4-6
+ # cycle mul, 40 cycle div, pairable adc, ...
+ # FIXME: Change this when gcc gets something specific for Athlon.
+ # -march=pentiumpro not used, per i686 above.
+ athlon-*-*) gmp_optcflags_gcc="-mcpu=pentiumpro";;
+ esac
+ ;;
+
+ # Sparc
+ ultrasparc*-*-solaris2.[7-9] | sparcv9-*-solaris2.[7-9])
+ os_64bit=yes
+ gmp_cflags_gcc="$gmp_cflags_gcc -Wa,-xarch=v8plus"
+ gmp_xoptcflags_gcc="-mcpu=v9 -mcpu=v8 -mv8"
+ gmp_cflags64_gcc="$gmp_cflags64_gcc -m64 -mptr64 -Wa,-xarch=v9 -mcpu=v9"
+ gmp_cflags_cc="-xtarget=native -xarch=v8 -xO4"
+ gmp_cflags64_cc="-xtarget=native -xarch=v9 -xO4"
+ ;;
+ sparc64-*-linux*)
+ # Need to think more about the options passed here. This isn't good for
+ # some sparc64 linux distros, since we end up not optimizing when all the
+ # options below fail.
+ os_64bit=yes
+ gmp_cflags64_gcc="$gmp_cflags64_gcc -m64 -mptr64 -Wa,-xarch=v9 -mcpu=v9"
+ gmp_cflags_gcc="$gmp_cflags_gcc -m32"
+ gmp_xoptflags_gcc="-mcpu=ultrasparc -mvis"
+ ;;
+ ultrasparc*-*-* | sparcv9-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -Wa,-xarch=v8plus"
+ gmp_xoptcflags_gcc="-mcpu=v9 -mcpu=v8 -mv8"
+ gmp_cflags_cc="-xtarget=native -xarch=v8 -xO4"
+ ;;
+ sparcv8*-*-solaris2.* | microsparc*-*-solaris2.*)
+ gmp_cflags_gcc="$gmp_cflags_gcc"
+ gmp_xoptcflags_gcc="-mcpu=v8 -mv8"
+ gmp_cflags_cc="-xtarget=native -xarch=v8 -xO4"
+ ;;
+ sparcv8*-*-* | microsparc*-*-*) # SunOS, Linux, *BSD
+ cclist="gcc acc cc"
+ gmp_cflags_gcc="$gmp_cflags_gcc"
+ gmp_xoptcflags_gcc="-mcpu=v8 -mv8"
+ gmp_cflags_acc="-g -O2 -cg92"
+ gmp_cflags_cc="-O2" # FIXME: Flag for v8?
+ ;;
+ supersparc*-*-solaris2.*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -DSUPERSPARC"
+ gmp_xoptcflags_gcc="-mcpu=v8 -mv8"
+ gmp_cflags_cc="-xtarget=native -xarch=v8 -xO4 -DSUPERSPARC"
+ ;;
+ supersparc*-*-*) # SunOS, Linux, *BSD
+ cclist="gcc acc cc"
+ gmp_cflags_gcc="$gmp_cflags_gcc -DSUPERSPARC"
+ gmp_xoptcflags_gcc="-mcpu=v8 -mv8"
+ gmp_cflags_acc="-g -O2 -cg92 -DSUPERSPARC"
+ gmp_cflags_cc="-O2 -DSUPERSPARC" # FIXME: Flag for v8?
+ ;;
+ *sparc*-*-*)
+ cclist="gcc acc cc"
+ gmp_cflags_acc="-g -O2"
+ gmp_cflags_cc="-g -O2"
+ ;;
+
+ # POWER/PowerPC
+ powerpc64-*-aix*)
+ cclist="gcc xlc"
+ gmp_cflags_gcc="$gmp_cflags_gcc -maix64 -mpowerpc64"
+ gmp_cflags_xlc="-g -O2 -q64 -qtune=pwr3"
+ ;;
+ powerpc*-*-aix*)
+ cclist="gcc xlc"
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpowerpc"
+ gmp_cflags_xlc="$gmp_cflags_cc -qarch=ppc -O2"
+ ;;
+ power-*-aix*)
+ cclist="gcc xlc"
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpower"
+ gmp_cflags_xlc="$gmp_cflags_cc -qarch=pwr -O2"
+ ;;
+ powerpc64*-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpowerpc64"
+ cat >>confdefs.h <<\EOF
+#define _LONG_LONG_LIMB 1
+EOF
+ ;;
+ powerpc-apple-darwin* | powerpc-apple-macosx*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpowerpc -traditional-cpp"
+ ;;
+ powerpc*-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpowerpc"
+ ;;
+
+ # MIPS
+ mips-sgi-irix6.*)
+ os_64bit=yes
+ gmp_cflags64_gcc="-g -O2 -mabi=n32"
+ gmp_cflags64_cc="$gmp_cflags64_cc -O2 -n32"
+ ;;
+
+ # Motorola 68k family
+ m88110*-*-*)
+ gmp_cflags_gcc="-g -O -m88110" ;;
+ m68*-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -fomit-frame-pointer"
+ ;;
+
+ # HP
+ hppa1.0*-*-*)
+ cclist="gcc c89 cc"
+ gmp_cflags_c89="$gmp_cflags_cc +O2"
+ gmp_cflags_cc="$gmp_cflags_cc +O2"
+ ;;
+ hppa2.0w*-*-*)
+ cclist="c89 cc"
+ gmp_cflags_c89="+DD64 +O3"
+ gmp_cflags_cc="+DD64 +O3"
+ ;;
+ hppa2.0*-*-*)
+ os_64bit=yes
+ cclist="gcc c89 cc"
+ gmp_cflags64_gcc="$gmp_cflags64_gcc -mWHAT -D_LONG_LONG_LIMB"
+ # +O2 to cc triggers bug in mpz/powm.c (1.4)
+ gmp_cflags64_c89="+DA2.0 +e +O3 -D_LONG_LONG_LIMB"
+ gmp_cflags64_cc="+DA2.0 +e +O3 -D_LONG_LONG_LIMB"
+ gmp_cflags_c89="$gmp_cflags_cc +O2"
+ gmp_cflags_cc="$gmp_cflags_cc +O2"
+ ;;
+
+ # VAX
+ vax*-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -fomit-frame-pointer"
+ ;;
+
+ # Fujitsu
+ f30[01]-fujitsu-sysv*)
+ cclist="gcc vcc"
+ gmp_cflags_vcc="-g" # FIXME: flags for vcc?
+ ;;
+esac
+
+case "${target}" in
+ *-*-mingw32) gmp_cflags_gcc="$gmp_cflags_gcc -mno-cygwin";;
+esac
+
+echo $ECHO_N "checking for BSD-compatible nm... $ECHO_C" 1>&6
+echo "configure:1352: checking for BSD-compatible nm" 1>&5
+if test "${ac_cv_path_NM+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$NM"; then
+ # Let the user override the test.
+ ac_cv_path_NM="$NM"
+else
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+ for ac_dir in $PATH /usr/ccs/bin /usr/ucb /bin; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/nm || test -f $ac_dir/nm$ac_exeext ; then
+ # Check to see if the nm accepts a BSD-compat flag.
+ # Adding the `sed 1q' prevents false positives on HP-UX, which says:
+ # nm: unknown option "B" ignored
+ if ($ac_dir/nm -B /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
+ ac_cv_path_NM="$ac_dir/nm -B"
+ break
+ elif ($ac_dir/nm -p /dev/null 2>&1 | sed '1q'; exit 0) | egrep /dev/null >/dev/null; then
+ ac_cv_path_NM="$ac_dir/nm -p"
+ break
+ else
+ ac_cv_path_NM=${ac_cv_path_NM="$ac_dir/nm"} # keep the first match, but
+ continue # so that we can try to find one that supports BSD flags
+ fi
+ fi
+ done
+ IFS="$ac_save_ifs"
+ test -z "$ac_cv_path_NM" && ac_cv_path_NM=nm
+fi
+fi
+
+NM="$ac_cv_path_NM"
+echo "$ECHO_T""$NM" 1>&6
+ # nm on 64-bit AIX needs to know the object file format
+case "$target" in
+ powerpc64*-*-aix*)
+ NM="$NM -X 64"
+ ;;
+esac
+
+# Save CFLAGS given on command line.
+gmp_user_CFLAGS="$CFLAGS"
+
+if test -z "$CC"; then
+ # Find compiler.
+
+if test $host != $build; then
+ ac_tool_prefix=${host_alias}-
+else
+ ac_tool_prefix=
+fi
+
+gmp_cc_list="$cclist"
+gmp_req_64bit_cc="$os_64bit"
+
+CC32=
+CC64=
+for c in $gmp_cc_list; do
+ # Avoid cache hits.
+ unset CC
+ unset ac_cv_prog_CC
+
+# Extract the first word of "${ac_tool_prefix}$c", so it can be a program name with args.
+set dummy ${ac_tool_prefix}$c; ac_word=$2
+echo $ECHO_N "checking for $ac_word... $ECHO_C" 1>&6
+echo "configure:1418: checking for $ac_word" 1>&5
+if test "${ac_cv_prog_CC+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+ for ac_path in `IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ac_dummy="$PATH"
+for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ echo "$ac_dir/$ac_word"
+ fi
+done
+IFS="$ac_save_ifs"
+`; do
+ ac_cv_prog_CC="${ac_tool_prefix}$c"
+ break
+ done
+fi
+fi
+CC="$ac_cv_prog_CC"
+if test -n "$CC"; then
+ echo "$ECHO_T""$CC" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+if test -z "$ac_cv_prog_CC"; then
+ if test -n "$ac_tool_prefix"; then
+ # Extract the first word of "$c", so it can be a program name with args.
+set dummy $c; ac_word=$2
+echo $ECHO_N "checking for $ac_word... $ECHO_C" 1>&6
+echo "configure:1452: checking for $ac_word" 1>&5
+if test "${ac_cv_prog_CC+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+ for ac_path in `IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ac_dummy="$PATH"
+for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ echo "$ac_dir/$ac_word"
+ fi
+done
+IFS="$ac_save_ifs"
+`; do
+ ac_cv_prog_CC="$c"
+ break
+ done
+ test -z "$ac_cv_prog_CC" && ac_cv_prog_CC="$c"
+fi
+fi
+CC="$ac_cv_prog_CC"
+if test -n "$CC"; then
+ echo "$ECHO_T""$CC" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+ else
+ CC="$c"
+ fi
+fi
+
+ if test -n "$CC"; then
+ eval c_flags=\$gmp_cflags_$c
+ ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+ CC="$CC"
+CFLAGS="$c_flags"
+echo $ECHO_N "checking if the C compiler ($CC) works with flags $CFLAGS... $ECHO_C" 1>&6
+echo "configure:1498: checking if the C compiler ($CC) works with flags $CFLAGS" 1>&5
+
+# Simple test for all targets.
+cat >conftest.$ac_ext <<EOF
+
+#line 1503 "configure"
+#include "confdefs.h"
+
+int main(){return(0);}
+EOF
+if { (eval echo configure:1508: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ tmp_works=yes
+ # If we can't run a trivial program, we are probably using a cross compiler.
+ if (./conftest; exit) 2>/dev/null; then
+ tmp_cross=no
+ else
+ tmp_cross=yes
+ fi
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ tmp_works=no
+fi
+rm -fr conftest*
+
+# Target specific tests.
+if test "$tmp_works" = "yes"; then
+ case "$target" in
+ *-*-aix*) # Returning a funcptr.
+ cat >conftest.$ac_ext <<EOF
+#line 1528 "configure"
+#include "confdefs.h"
+
+int
+main ()
+{
+} void *g(); void *f() { return g(); } int bar(){
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:1539: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ tmp_works=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ tmp_works=no
+fi
+rm -f conftest*
+ ;;
+ esac
+fi
+
+if test "$tmp_works" = "yes"; then
+ gmp_prog_cc_works=yes
+else
+ gmp_prog_cc_works=no
+fi
+
+echo "$ECHO_T""$tmp_works" 1>&6
+
+ if test "$gmp_prog_cc_works" != "yes"; then
+ continue
+ fi
+
+ # Save first working compiler, whether 32- or 64-bit capable.
+ if test -z "$CC32"; then
+ CC32="$CC"
+ fi
+ if test "$gmp_req_64bit_cc" = "yes"; then
+ eval c_flags=\$gmp_cflags64_$c
+
+ # Verify that the compiler works in 64-bit mode as well.
+ # /usr/ucb/cc on Solaris 7 can *compile* in 64-bit mode, but not link.
+ ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+ CC="$c"
+CFLAGS="$c_flags"
+echo $ECHO_N "checking if the C compiler ($CC) works with flags $CFLAGS... $ECHO_C" 1>&6
+echo "configure:1583: checking if the C compiler ($CC) works with flags $CFLAGS" 1>&5
+
+# Simple test for all targets.
+cat >conftest.$ac_ext <<EOF
+
+#line 1588 "configure"
+#include "confdefs.h"
+
+int main(){return(0);}
+EOF
+if { (eval echo configure:1593: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ tmp_works=yes
+ # If we can't run a trivial program, we are probably using a cross compiler.
+ if (./conftest; exit) 2>/dev/null; then
+ tmp_cross=no
+ else
+ tmp_cross=yes
+ fi
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ tmp_works=no
+fi
+rm -fr conftest*
+
+# Target specific tests.
+if test "$tmp_works" = "yes"; then
+ case "$target" in
+ *-*-aix*) # Returning a funcptr.
+ cat >conftest.$ac_ext <<EOF
+#line 1613 "configure"
+#include "confdefs.h"
+
+int
+main ()
+{
+} void *g(); void *f() { return g(); } int bar(){
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:1624: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ tmp_works=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ tmp_works=no
+fi
+rm -f conftest*
+ ;;
+ esac
+fi
+
+if test "$tmp_works" = "yes"; then
+ gmp_prog_cc_works=yes
+else
+ gmp_prog_cc_works=no
+fi
+
+echo "$ECHO_T""$tmp_works" 1>&6
+
+ if test "$gmp_prog_cc_works" = "yes"; then
+
+ gmp_tmp_CC_save="$CC"
+ CC="$c"
+ echo $ECHO_N "checking whether the C compiler ($CC) is 64-bit capable... $ECHO_C" 1>&6
+echo "configure:1651: checking whether the C compiler ($CC) is 64-bit capable" 1>&5
+ if test -z "$NM"; then
+ echo; echo "configure: GMP_CHECK_CC_64BIT: fatal: need nm"
+ exit 1
+ fi
+ gmp_tmp_CFLAGS_save="$CFLAGS"
+ CFLAGS="$c_flags"
+
+ case "$target" in
+ hppa2.0*-*-*)
+ # FIXME: If gcc is installed under another name than "gcc", we will
+ # test the wrong thing.
+ if test "$CC" != "gcc"; then
+ echo >conftest.c
+ gmp_tmp_vs=`$CC $CFLAGS -V -c -o conftest.o conftest.c 2>&1 | grep "^ccom:"`
+ rm conftest*
+ gmp_tmp_v1=`echo $gmp_tmp_vs | sed 's/.* .\.\(.*\)\..*\..* HP C.*/\1/'`
+ gmp_tmp_v2=`echo $gmp_tmp_vs | sed 's/.* .\..*\.\(.*\)\..* HP C.*/\1/'`
+ gmp_tmp_v3=`echo $gmp_tmp_vs | sed 's/.* .\..*\..*\.\(.*\) HP C.*/\1/'`
+ gmp_cv_cc_64bit=no
+ test -n "$gmp_tmp_v1" && test "$gmp_tmp_v1" -ge "10" \
+ && test -n "$gmp_tmp_v2" && test "$gmp_tmp_v2" -ge "32" \
+ && test -n "$gmp_tmp_v3" && test "$gmp_tmp_v3" -ge "30" \
+ && gmp_cv_cc_64bit=yes
+ else # gcc
+ # FIXME: Compile a minimal file and determine if the resulting object
+ # file is an ELF file. If so, gcc can produce 64-bit code.
+ # Do we have file(1) for target?
+ gmp_cv_cc_64bit=no
+ fi
+ ;;
+ mips-sgi-irix6.*)
+ # We use `-n32' to cc and `-mabi=n32' to gcc, resulting in 64-bit
+ # arithmetic but not 64-bit pointers, so the general test for sizeof
+ # (void *) is not valid.
+ # Simply try to compile an empty main. If that succeeds return
+ # true.
+ cat >conftest.$ac_ext <<EOF
+#line 1689 "configure"
+#include "confdefs.h"
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:1700: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ gmp_cv_cc_64bit=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ gmp_cv_cc_64bit=no
+fi
+rm -f conftest*
+ ;;
+ *-*-*)
+ # Allocate an array of size sizeof (void *) and use nm to determine its
+ # size. We depend on the first declared variable being put at address 0.
+ cat >conftest.c <<EOF
+char arr[sizeof (void *)]={0};
+char post=0;
+EOF
+ gmp_compile="$CC $CFLAGS -c conftest.c 1>&5"
+ if { (eval echo configure:1719: \"$gmp_compile\") 1>&5; (eval $gmp_compile) 2>&5; }; then
+ gmp_tmp_val=`$NM conftest.o | grep post | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ if test "$gmp_tmp_val" = "8"; then
+ gmp_cv_cc_64bit=yes
+ else
+ gmp_cv_cc_64bit=no
+ fi
+ else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ gmp_cv_cc_64bit=no
+ fi
+ rm -f conftest*
+ ;;
+ esac
+
+ CC="$gmp_tmp_CC_save"
+ CFLAGS="$gmp_tmp_CFLAGS_save"
+ echo "$ECHO_T""$gmp_cv_cc_64bit" 1>&6
+
+ if test "$gmp_cv_cc_64bit" = "yes"; then
+ test -z "$CC64" && CC64="$c"
+ test -z "$CFLAGS64" && CFLAGS64="$c_flags"
+ # We have CC64 so we're done.
+ break
+ fi
+ fi
+ else
+ # We have CC32, and we don't need a 64-bit compiler so we're done.
+ break
+ fi
+ fi
+done
+CC="$CC32"
+
+ # If 64-bit OS and we have a 64-bit compiler, use it.
+ if test -n "$os_64bit" && test -n "$CC64"; then
+ CC=$CC64
+ CFLAGS=$CFLAGS64
+ else
+ eval CFLAGS=\$gmp_cflags_$CC
+ fi
+
+ # Try compiler flags that may work with only some compiler versions.
+ # gmp_optcflags: All or nothing.
+ eval optcflags=\$gmp_optcflags_$CC
+ if test -n "$optcflags"; then
+ CFLAGS_save="$CFLAGS"
+ CFLAGS="$CFLAGS $optcflags"
+ echo $ECHO_N "checking whether $CC accepts $optcflags... $ECHO_C" 1>&6
+echo "configure:1770: checking whether $CC accepts $optcflags" 1>&5
+ ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+ cat >conftest.$ac_ext <<EOF
+
+#line 1780 "configure"
+#include "confdefs.h"
+
+int main(){return(0);}
+EOF
+if { (eval echo configure:1785: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ optok=yes
+ # If we can't run a trivial program, we are probably using a cross compiler.
+ if (./conftest; exit) 2>/dev/null; then
+ cross=no
+ else
+ cross=yes
+ fi
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ optok=no
+fi
+rm -fr conftest*
+ if test "$optok" = "yes"; then
+ echo "$ECHO_T""yes" 1>&6
+ else
+ echo "$ECHO_T""no" 1>&6
+ CFLAGS="$CFLAGS_save"
+ fi
+ fi
+ # gmp_xoptcflags: First is best, one has to work.
+ eval xoptcflags=\$gmp_xoptcflags_$CC
+ if test -n "$xoptcflags"; then
+ gmp_found="no"
+ for xopt in $xoptcflags; do
+ CFLAGS_save="$CFLAGS"
+ CFLAGS="$CFLAGS $xopt"
+ echo $ECHO_N "checking whether $CC accepts $xopt... $ECHO_C" 1>&6
+echo "configure:1814: checking whether $CC accepts $xopt" 1>&5
+ ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+ cat >conftest.$ac_ext <<EOF
+
+#line 1824 "configure"
+#include "confdefs.h"
+
+int main(){return(0);}
+EOF
+if { (eval echo configure:1829: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ optok=yes
+ # If we can't run a trivial program, we are probably using a cross compiler.
+ if (./conftest; exit) 2>/dev/null; then
+ cross=no
+ else
+ cross=yes
+ fi
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ optok=no
+fi
+rm -fr conftest*
+ if test "$optok" = "yes"; then
+ echo "$ECHO_T""yes" 1>&6
+ gmp_found="yes"
+ break
+ else
+ echo "$ECHO_T""no" 1>&6
+ CFLAGS="$CFLAGS_save"
+ fi
+ done
+ if test "$gmp_found" = "no"; then
+ echo "$0: fatal: need a compiler that understands one of $xoptcflags"
+ exit 1
+ fi
+ fi
+fi
+
+# Restore CFLAGS given on command line.
+# FIXME: We've run through quite some unnecessary code looking for a
+# nice compiler and working flags for it, just to spoil that with user
+# supplied flags.
+test -n "$gmp_user_CFLAGS" && CFLAGS="$gmp_user_CFLAGS"
+
+# Select chosen compiler.
+
+echo $ECHO_N "checking whether the C compiler ($CC $CFLAGS $CPPFLAGS $LDFLAGS) works... $ECHO_C" 1>&6
+echo "configure:1868: checking whether the C compiler ($CC $CFLAGS $CPPFLAGS $LDFLAGS) works" 1>&5
+
+ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+cat >conftest.$ac_ext <<EOF
+
+#line 1879 "configure"
+#include "confdefs.h"
+
+int main(){return(0);}
+EOF
+if { (eval echo configure:1884: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ ac_cv_prog_cc_works=yes
+ # If we can't run a trivial program, we are probably using a cross compiler.
+ if (./conftest; exit) 2>/dev/null; then
+ ac_cv_prog_cc_cross=no
+ else
+ ac_cv_prog_cc_cross=yes
+ fi
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ ac_cv_prog_cc_works=no
+fi
+rm -fr conftest*
+ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+echo "$ECHO_T""$ac_cv_prog_cc_works" 1>&6
+if test $ac_cv_prog_cc_works = no; then
+ { echo "configure: error: installation or configuration problem: C compiler cannot create executables." 1>&2; exit 77; }
+fi
+echo $ECHO_N "checking whether the C compiler ($CC $CFLAGS $CPPFLAGS $LDFLAGS) is a cross-compiler... $ECHO_C" 1>&6
+echo "configure:1910: checking whether the C compiler ($CC $CFLAGS $CPPFLAGS $LDFLAGS) is a cross-compiler" 1>&5
+echo "$ECHO_T""$ac_cv_prog_cc_cross" 1>&6
+cross_compiling=$ac_cv_prog_cc_cross
+
+echo $ECHO_N "checking whether we are using GNU C... $ECHO_C" 1>&6
+echo "configure:1915: checking whether we are using GNU C" 1>&5
+if test "${ac_cv_prog_gcc+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftest.c <<EOF
+#ifdef __GNUC__
+ yes;
+#endif
+EOF
+if { ac_try='${CC-cc} -E conftest.c'; { (eval echo configure:1924: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
+ ac_cv_prog_gcc=yes
+else
+ ac_cv_prog_gcc=no
+fi
+fi
+echo "$ECHO_T""$ac_cv_prog_gcc" 1>&6
+
+if test "$ac_cv_prog_gcc" = "yes"; then
+ GCC=yes
+else
+ GCC=
+fi
+
+# Set CFLAGS if not already set.
+if test -z "$CFLAGS"; then
+ CFLAGS="-g"
+ if test "$GCC" = "yes"; then
+ CFLAGS="$CFLAGS -O2"
+ fi
+fi
+
+if test "${ac_cv_prog_CC+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ ac_cv_prog_CC="$CC"
+fi
+
+# How to assemble.
+CCAS="$CC -c"
+
+echo $ECHO_N "checking how to run the C preprocessor... $ECHO_C" 1>&6
+echo "configure:1956: checking how to run the C preprocessor" 1>&5
+# On Suns, sometimes $CPP names a directory.
+if test -n "$CPP" && test -d "$CPP"; then
+ CPP=
+fi
+if test -z "$CPP"; then
+if test "${ac_cv_prog_CPP+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ # This must be in double quotes, not single quotes, because CPP may get
+ # substituted into the Makefile and "${CC-cc}" will confuse make.
+ CPP="${CC-cc} -E"
+ # On the NeXT, cc -E runs the code through the compiler's parser,
+ # not just through cpp.
+
+cat >conftest.$ac_ext <<EOF
+#line 1972 "configure"
+#include "confdefs.h"
+#include <assert.h>
+Syntax Error
+EOF
+ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
+{ (eval echo configure:1978: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ :
+else
+ echo "$ac_err" >&5
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ CPP="${CC-cc} -E -traditional-cpp"
+
+cat >conftest.$ac_ext <<EOF
+#line 1990 "configure"
+#include "confdefs.h"
+#include <assert.h>
+Syntax Error
+EOF
+ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
+{ (eval echo configure:1996: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ :
+else
+ echo "$ac_err" >&5
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ CPP="${CC-cc} -nologo -E"
+
+cat >conftest.$ac_ext <<EOF
+#line 2008 "configure"
+#include "confdefs.h"
+#include <assert.h>
+Syntax Error
+EOF
+ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
+{ (eval echo configure:2014: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ :
+else
+ echo "$ac_err" >&5
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ CPP=/lib/cpp
+fi
+rm -f conftest*
+fi
+rm -f conftest*
+fi
+rm -f conftest*
+ ac_cv_prog_CPP="$CPP"
+fi
+ CPP="$ac_cv_prog_CPP"
+else
+ ac_cv_prog_CPP="$CPP"
+fi
+echo "$ECHO_T""$CPP" 1>&6
+
+# Find a good install program. We prefer a C program (faster),
+# so one script is as good as another. But avoid the broken or
+# incompatible versions:
+# SysV /etc/install, /usr/sbin/install
+# SunOS /usr/etc/install
+# IRIX /sbin/install
+# AIX /bin/install
+# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag
+# AFS /usr/afsws/bin/install, which mishandles nonexistent args
+# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff"
+# ./install, which can be erroneously created by make from ./install.sh.
+echo $ECHO_N "checking for a BSD compatible install... $ECHO_C" 1>&6
+echo "configure:2050: checking for a BSD compatible install" 1>&5
+if test -z "$INSTALL"; then
+if test "${ac_cv_path_install+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ IFS="${IFS= }"; ac_save_IFS="$IFS"; IFS=":"
+ for ac_dir in $PATH; do
+ # Account for people who put trailing slashes in PATH elements.
+ case "$ac_dir/" in
+ /|./|.//|/etc/*|/usr/sbin/*|/usr/etc/*|/sbin/*|/usr/afsws/bin/*|/usr/ucb/*) ;;
+ *)
+ # OSF1 and SCO ODT 3.0 have their own names for install.
+ # Don't use installbsd from OSF since it installs stuff as root
+ # by default.
+ for ac_prog in ginstall scoinst install; do
+ if test -f $ac_dir/$ac_prog; then
+ if test $ac_prog = install &&
+ grep dspmsg $ac_dir/$ac_prog >/dev/null 2>&1; then
+ # AIX install. It has an incompatible calling convention.
+ :
+ elif test $ac_prog = install &&
+ grep pwplus $ac_dir/$ac_prog >/dev/null 2>&1; then
+ # program-specific install script used by HP pwplus--don't use.
+ :
+ else
+ ac_cv_path_install="$ac_dir/$ac_prog -c"
+ break 2
+ fi
+ fi
+ done
+ ;;
+ esac
+ done
+ IFS="$ac_save_IFS"
+
+fi
+ if test "${ac_cv_path_install+set}" = set; then
+ INSTALL="$ac_cv_path_install"
+ else
+ # As a last resort, use the slow shell script. We don't cache a
+ # path for INSTALL within a source directory, because that will
+ # break other packages using the cache if that directory is
+ # removed, or if the path is relative.
+ INSTALL="$ac_install_sh"
+ fi
+fi
+echo "$ECHO_T""$INSTALL" 1>&6
+
+# Use test -z because SunOS4 sh mishandles braces in ${var-val}.
+# It thinks the first close brace ends the variable substitution.
+test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}'
+
+test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL}'
+
+test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644'
+
+echo $ECHO_N "checking whether ln -s works... $ECHO_C" 1>&6
+echo "configure:2107: checking whether ln -s works" 1>&5
+if test "${ac_cv_prog_LN_S+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ rm -f conftestdata
+if ln -s X conftestdata 2>/dev/null
+then
+ rm -f conftestdata
+ ac_cv_prog_LN_S="ln -s"
+else
+ ac_cv_prog_LN_S=ln
+fi
+fi
+LN_S="$ac_cv_prog_LN_S"
+if test "$ac_cv_prog_LN_S" = "ln -s"; then
+ echo "$ECHO_T""yes" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+echo $ECHO_N "checking for suitable m4... $ECHO_C" 1>&6
+echo "configure:2128: checking for suitable m4" 1>&5
+if test "${gmp_cv_prog_m4+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$M4"; then
+ gmp_cv_prog_m4="$M4"
+else
+ cat >conftest.m4 <<\EOF
+define(dollarhash,``$#'')dnl
+ifelse(dollarhash(x),1,`define(t1,Y)',
+``bad: $# not supported (SunOS /usr/bin/m4)
+'')dnl
+ifelse(eval(89),89,`define(t2,Y)',
+`bad: eval() doesnt support 8 or 9 in a constant (OpenBSD 2.6 m4)
+')dnl
+ifelse(t1`'t2,YY,`good
+')dnl
+EOF
+ echo "trying m4" 1>&5
+ gmp_tmp_val="`(m4 conftest.m4) 2>&5`"
+ echo "$gmp_tmp_val" 1>&5
+ if test "$gmp_tmp_val" = good; then
+ gmp_cv_prog_m4="m4"
+ else
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ ac_dummy="$PATH:/usr/5bin"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ echo "trying $ac_dir/m4" 1>&5
+ gmp_tmp_val="`($ac_dir/m4 conftest.m4) 2>&5`"
+ echo "$gmp_tmp_val" 1>&5
+ if test "$gmp_tmp_val" = good; then
+ gmp_cv_prog_m4="$ac_dir/m4"
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+ if test -z "$gmp_cv_prog_m4"; then
+ { echo "configure: error: No usable m4 in \$PATH or /usr/5bin (see config.log for reasons)." 1>&2; exit 1; }
+ fi
+ fi
+ rm -f conftest.m4
+fi
+fi
+echo "$ECHO_T""$gmp_cv_prog_m4" 1>&6
+M4="$gmp_cv_prog_m4"
+
+# Extract the first word of "ar", so it can be a program name with args.
+set dummy ar; ac_word=$2
+echo $ECHO_N "checking for $ac_word... $ECHO_C" 1>&6
+echo "configure:2178: checking for $ac_word" 1>&5
+if test "${ac_cv_prog_AR+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$AR"; then
+ ac_cv_prog_AR="$AR" # Let the user override the test.
+else
+ for ac_path in `IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ac_dummy="$PATH"
+for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ echo "$ac_dir/$ac_word"
+ fi
+done
+IFS="$ac_save_ifs"
+`; do
+ ac_cv_prog_AR="ar"
+ break
+ done
+fi
+fi
+AR="$ac_cv_prog_AR"
+if test -n "$AR"; then
+ echo "$ECHO_T""$AR" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+# ar on AIX needs to know the object file format
+case "$target" in
+ powerpc64*-*-aix*)
+ AR="$AR -X 64"
+ ;;
+esac
+
+if test "$gmp_no_asm_syntax_testing" != "yes"; then
+ echo $ECHO_N "checking how to switch to text section... $ECHO_C" 1>&6
+echo "configure:2216: checking how to switch to text section" 1>&5
+if test "${gmp_cv_check_asm_text+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ case "$target" in
+ *-*-aix*)
+
+ gmp_cv_check_asm_text=".csect .text[PR]"
+
+ ;;
+ *-*-hpux*) gmp_cv_check_asm_text=".code" ;;
+ *) gmp_cv_check_asm_text=".text" ;;
+esac
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_text" 1>&6
+echo "define(<TEXT>, <$gmp_cv_check_asm_text>)" >> $gmp_tmpconfigm4
+
+ echo $ECHO_N "checking how to switch to data section... $ECHO_C" 1>&6
+echo "configure:2235: checking how to switch to data section" 1>&5
+if test "${gmp_cv_check_asm_data+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ case "$target" in
+ *-*-aix*)
+
+ gmp_cv_check_asm_data=".csect .data[RW]"
+
+ ;;
+ *) gmp_cv_check_asm_data=".data" ;;
+esac
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_data" 1>&6
+echo "define(<DATA>, <$gmp_cv_check_asm_data>)" >> $gmp_tmpconfigm4
+
+ echo $ECHO_N "checking how to export a symbol... $ECHO_C" 1>&6
+echo "configure:2253: checking how to export a symbol" 1>&5
+if test "${gmp_cv_check_asm_globl+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ case "$target" in
+ *-*-hpux*) gmp_cv_check_asm_globl=".export" ;;
+ *) gmp_cv_check_asm_globl=".globl" ;;
+esac
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_globl" 1>&6
+echo "define(<GLOBL>, <$gmp_cv_check_asm_globl>)" >> $gmp_tmpconfigm4
+
+ echo $ECHO_N "checking what assembly label suffix to use... $ECHO_C" 1>&6
+echo "configure:2267: checking what assembly label suffix to use" 1>&5
+if test "${gmp_cv_check_asm_label_suffix+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ case "$target" in
+ *-*-hpux*) gmp_cv_check_asm_label_suffix="" ;;
+ *) gmp_cv_check_asm_label_suffix=":" ;;
+esac
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_label_suffix" 1>&6
+echo "define(<LABEL_SUFFIX>, <\$1$gmp_cv_check_asm_label_suffix>)" >> $gmp_tmpconfigm4
+
+ echo $ECHO_N "checking how the .type assembly directive should be used... $ECHO_C" 1>&6
+echo "configure:2281: checking how the .type assembly directive should be used" 1>&5
+if test "${gmp_cv_check_asm_type+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ ac_assemble="$CCAS $CFLAGS conftest.s 1>&5"
+for gmp_tmp_prefix in @ \# %; do
+ echo " .type sym,${gmp_tmp_prefix}function" > conftest.s
+ if { (eval echo configure:2288: \"$ac_assemble\") 1>&5; (eval $ac_assemble) 2>&5; }; then
+ gmp_cv_check_asm_type=".type \$1,${gmp_tmp_prefix}\$2"
+ break
+ fi
+done
+if test -z "$gmp_cv_check_asm_type"; then
+ gmp_cv_check_asm_type="dnl"
+fi
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_type" 1>&6
+echo "define(<TYPE>, <$gmp_cv_check_asm_type>)" >> $gmp_tmpconfigm4
+
+ echo $ECHO_N "checking if the .size assembly directive works... $ECHO_C" 1>&6
+echo "configure:2302: checking if the .size assembly directive works" 1>&5
+if test "${gmp_cv_check_asm_size+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ ac_assemble="$CCAS $CFLAGS conftest.s 1>&5"
+echo ' .size sym,1' > conftest.s
+if { (eval echo configure:2308: \"$ac_assemble\") 1>&5; (eval $ac_assemble) 2>&5; }; then
+ gmp_cv_check_asm_size=".size \$1,\$2"
+else
+ gmp_cv_check_asm_size="dnl"
+fi
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_size" 1>&6
+echo "define(<SIZE>, <$gmp_cv_check_asm_size>)" >> $gmp_tmpconfigm4
+
+echo $ECHO_N "checking what prefix to use for a local label... $ECHO_C" 1>&6
+echo "configure:2319: checking what prefix to use for a local label" 1>&5
+if test "${gmp_cv_check_asm_lsym_prefix+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -z "$NM"; then
+ echo; echo "GMP_CHECK_ASM_LSYM_PREFIX: fatal: need nm"
+ exit 1
+fi
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&5"
+gmp_cv_check_asm_lsym_prefix="L"
+for gmp_tmp_pre in L .L $ L$; do
+ cat > conftest.s <<EOF
+dummy${gmp_cv_check_asm_label_suffix}
+${gmp_tmp_pre}gurkmacka${gmp_cv_check_asm_label_suffix}
+ .byte 0
+EOF
+ if { (eval echo configure:2335: \"$ac_assemble\") 1>&5; (eval $ac_assemble) 2>&5; }; then
+ $NM conftest.o >/dev/null 2>&1
+ gmp_rc=$?
+ if test "$gmp_rc" != "0"; then
+ echo "configure: $NM failure, using default"
+ break
+ fi
+ if $NM conftest.o | grep gurkmacka >/dev/null; then true; else
+ gmp_cv_check_asm_lsym_prefix="$gmp_tmp_pre"
+ break
+ fi
+ else
+ echo "configure: failed program was:" >&5
+ cat conftest.s >&5
+ # Use default.
+ fi
+done
+rm -f conftest*
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_lsym_prefix" 1>&6
+echo "define(<LSYM_PREFIX>, <${gmp_cv_check_asm_lsym_prefix}>)" >> $gmp_tmpconfigm4
+
+echo $ECHO_N "checking how to define a 32-bit word... $ECHO_C" 1>&6
+echo "configure:2359: checking how to [define] a 32-bit word" 1>&5
+if test "${gmp_cv_check_asm_w32+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -z "$NM"; then
+ echo; echo "configure: GMP_CHECK_ASM_W32: fatal: need nm"
+ exit 1
+fi
+
+# FIXME: HPUX puts first symbol at 0x40000000, breaking our assumption
+# that it's at 0x0. We'll have to declare another symbol before the
+# .long/.word and look at the distance between the two symbols. The
+# only problem is that the sed expression(s) barfs (on Solaris, for
+# example) for the symbol with value 0. For now, HPUX uses .word.
+
+case "$target" in
+ *-*-hpux*)
+ gmp_cv_check_asm_w32=".word"
+ ;;
+ *-*-*)
+ ac_assemble="$CCAS $CFLAGS conftest.s 1>&5"
+ for gmp_tmp_op in .long .word; do
+ cat > conftest.s <<EOF
+ $gmp_cv_check_asm_data
+ $gmp_cv_check_asm_globl foo
+ $gmp_tmp_op 0
+foo${gmp_cv_check_asm_label_suffix}
+ .byte 0
+EOF
+ if { (eval echo configure:2388: \"$ac_assemble\") 1>&5; (eval $ac_assemble) 2>&5; }; then
+
+ gmp_tmp_val=`$NM conftest.o | grep foo | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ if test "$gmp_tmp_val" = "4"; then
+ gmp_cv_check_asm_w32="$gmp_tmp_op"
+ break
+ fi
+ fi
+ done
+ ;;
+esac
+
+if test -z "$gmp_cv_check_asm_w32"; then
+ echo; echo "configure: GMP_CHECK_ASM_W32: fatal: do not know how to define a 32-bit word"
+ exit 1
+fi
+rm -f conftest*
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_w32" 1>&6
+echo "define(<W32>, <$gmp_cv_check_asm_w32>)" >> $gmp_tmpconfigm4
+
+ echo $ECHO_N "checking if symbols are prefixed by underscore... $ECHO_C" 1>&6
+echo "configure:2412: checking if symbols are prefixed by underscore" 1>&5
+if test "${gmp_cv_check_asm_underscore+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat > conftest.$ac_ext <<EOF
+#line 2417 "configure"
+#include "confdefs.h"
+int underscore_test() {
+return; }
+EOF
+if { (eval echo configure:2422: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ if grep _underscore_test conftest* >/dev/null; then
+ gmp_cv_check_asm_underscore=yes
+ else
+ gmp_cv_check_asm_underscore=no
+ fi
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+fi
+rm -f conftest*
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_underscore" 1>&6
+if test "$gmp_cv_check_asm_underscore" = "yes"; then
+
+echo 'define(<GSYM_PREFIX>, <_>)' >> $gmp_tmpconfigm4
+
+ underscore=yes
+else
+
+echo 'define(<GSYM_PREFIX>, <>)' >> $gmp_tmpconfigm4
+
+ underscore=no
+fi
+
+echo $ECHO_N "checking if .align assembly directive is logarithmic... $ECHO_C" 1>&6
+echo "configure:2449: checking if .align assembly directive is logarithmic" 1>&5
+if test "${gmp_cv_check_asm_align_log+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -z "$NM"; then
+ echo; echo "configure: GMP_CHECK_ASM_ALIGN_LOG: fatal: need nm"
+ exit 1
+fi
+cat > conftest.s <<EOF
+ $gmp_cv_check_asm_data
+ .align 4
+ $gmp_cv_check_asm_globl foo
+ .byte 1
+ .align 4
+foo$gmp_cv_check_asm_label_suffix
+ .byte 2
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&5"
+if { (eval echo configure:2467: \"$ac_assemble\") 1>&5; (eval $ac_assemble) 2>&5; }; then
+
+ gmp_tmp_val=`$NM conftest.o | grep foo | sed -e 's;[[][0-9][]]\(.*\);\1;' \
+ -e 's;[^1-9]*\([0-9]*\).*;\1;'`
+ if test "$gmp_tmp_val" = "10" || test "$gmp_tmp_val" = "16"; then
+ gmp_cv_check_asm_align_log=yes
+ else
+ gmp_cv_check_asm_align_log=no
+ fi
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.s >&5
+fi
+rm -f conftest*
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_align_log" 1>&6
+
+echo "define(<ALIGN_LOGARITHMIC>,<$gmp_cv_check_asm_align_log>)" >> $gmp_tmpconfigm4
+
+if test "$gmp_cv_check_asm_align_log" = "yes"; then
+ asm_align=log
+else
+ asm_align=nolog
+fi
+
+fi
+
+family=generic
+
+case ${target} in
+ arm*-*-*)
+ path="arm"
+ ;;
+ sparcv9*-*-solaris2.[789]* | sparc64*-*-solaris2.[789]* | ultrasparc*-*-solaris2.[789]*)
+ if test -n "$CC64"
+ then path="sparc64"
+ else path="sparc32/v9 sparc32/v8 sparc32"
+ fi
+ ;;
+ sparc64-*-linux*)
+ if test -n "$CC64"
+ then path="sparc64"
+ else path="sparc32/v9 sparc32/v8 sparc32"
+ fi
+ ;;
+ sparcv8*-*-* | microsparc*-*-*)
+ path="sparc32/v8 sparc32"
+ if test x${floating_point} = xno
+ then extra_functions="udiv_nfp"
+ else extra_functions="udiv_fp"
+ fi
+ ;;
+ sparcv9*-*-* | ultrasparc*-*-*)
+ path="sparc32/v9 sparc32/v8 sparc32"
+ extra_functions="udiv_fp"
+ ;;
+ supersparc*-*-*)
+ path="sparc32/v8/supersparc sparc32/v8 sparc32"
+ extra_functions="udiv"
+ ;;
+ sparc*-*-*) path="sparc32"
+ if test x${floating_point} = xno
+ then extra_functions="udiv_nfp"
+ else extra_functions="udiv_fp"
+ fi
+ ;;
+ hppa7000*-*-*)
+ path="hppa/hppa1_1 hppa"
+ extra_functions="udiv_qrnnd"
+ ;;
+ hppa1.0*-*-*)
+ path="hppa"
+ extra_functions="udiv_qrnnd"
+ ;;
+ hppa2.0w-*-*)
+ path="pa64w"
+ extra_functions="umul_ppmm udiv_qrnnd"
+ ;;
+ hppa2.0*-*-*)
+ if test -n "$CC64"; then
+ path="pa64"
+ extra_functions="umul_ppmm udiv_qrnnd"
+ # We need to use the system compiler, or actually the system assembler,
+ # since GAS has not been ported to understand the 2.0 instructions.
+ CCAS="$CC64 -c"
+ else
+ # FIXME: path should be "hppa/hppa2_0 hppa/hppa1_1 hppa"
+ path="hppa/hppa1_1 hppa"
+ extra_functions="udiv_qrnnd"
+ fi
+ ;;
+ hppa*-*-*) #assume pa7100
+ path="hppa/hppa1_1/pa7100 hppa/hppa1_1 hppa"
+ extra_functions="udiv_qrnnd";;
+ f30[01]-fujitsu-sysv*)
+ path=fujitsu;;
+ alphaev6*-*-*) path="alpha/ev6 alpha"; extra_functions="invert_limb cntlz";;
+ alphaev5*-*-*) path="alpha/ev5 alpha"; extra_functions="invert_limb cntlz";;
+ alpha*-*-*) path="alpha"; extra_functions="invert_limb cntlz";;
+ # Cray vector machines. This must come after alpha* so that we can
+ # recognize present and future vector processors with a wildcard.
+ *-cray-unicos*)
+ path="cray"
+ extra_functions="mulww";;
+ am29000*-*-*) path="a29k";;
+ a29k*-*-*) path="a29k";;
+
+ # AMD and Intel x86 configurations
+
+ i?86*-*-* | k[5-8]*-*-* | pentium*-*-* | athlon-*-*)
+ gmp_m4postinc="x86/x86-defs.m4"
+ extra_functions="udiv umul"
+ CALLING_CONVENTIONS_OBJS="x86call.o x86check.o"
+
+echo $ECHO_N "checking if the assembler takes cl with shldl... $ECHO_C" 1>&6
+echo "configure:2583: checking if the assembler takes cl with shldl" 1>&5
+if test "${gmp_cv_check_asm_shldl_cl+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ shldl %cl, %eax, %ebx
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&5"
+if { (eval echo configure:2592: \"$ac_assemble\") 1>&5; (eval $ac_assemble) 2>&5; }; then
+ gmp_cv_check_asm_shldl_cl=yes
+else
+ gmp_cv_check_asm_shldl_cl=no
+fi
+rm -f conftest*
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_shldl_cl" 1>&6
+if test "$gmp_cv_check_asm_shldl_cl" = "yes"; then
+
+echo 'define(<WANT_SHLDL_CL>, <1>)' >> $gmp_tmpconfigm4
+
+else
+
+echo 'define(<WANT_SHLDL_CL>, <0>)' >> $gmp_tmpconfigm4
+
+fi
+
+ echo $ECHO_N "checking if the .align directive accepts an 0x90 fill in .text... $ECHO_C" 1>&6
+echo "configure:2612: checking if the .align directive accepts an 0x90 fill in .text" 1>&5
+if test "${gmp_cv_check_asm_align_fill_0x90+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+
+cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ .align 4, 0x90
+ .byte 0
+ .align 4, 0x90
+EOF
+gmp_tmp_val="`$CCAS $CFLAGS conftest.s 2>&1`"
+if test $? = 0; then
+ echo "$gmp_tmp_val" 1>&5
+ if echo "$gmp_tmp_val" | grep "Warning: Fill parameter ignored for executable section"; then
+ echo "Supressing this warning by omitting 0x90" 1>&5
+ gmp_cv_check_asm_align_fill_0x90=no
+ else
+ gmp_cv_check_asm_align_fill_0x90=yes
+ fi
+else
+ echo "Non-zero exit code" 1>&5
+ echo "$gmp_tmp_val" 1>&5
+ gmp_cv_check_asm_align_fill_0x90=no
+fi
+rm -f conftest*
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_align_fill_0x90" 1>&6
+
+echo "define(<ALIGN_FILL_0x90>,<$gmp_cv_check_asm_align_fill_0x90>)" >> $gmp_tmpconfigm4
+
+ # the CPUs below wanting to know about mmx
+ case ${target} in
+ pentiummmx-*-* | pentium[23]-*-* | k6*-*-* | athlon-*-*)
+
+echo $ECHO_N "checking if the assembler knows about MMX instructions... $ECHO_C" 1>&6
+echo "configure:2649: checking if the assembler knows about MMX instructions" 1>&5
+if test "${gmp_cv_check_asm_mmx+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat > conftest.s <<EOF
+ $gmp_cv_check_asm_text
+ por %mm0, %mm0
+EOF
+ac_assemble="$CCAS $CFLAGS conftest.s 1>&5"
+if { (eval echo configure:2658: \"$ac_assemble\") 1>&5; (eval $ac_assemble) 2>&5; }; then
+ gmp_cv_check_asm_mmx=yes
+else
+ gmp_cv_check_asm_mmx=no
+fi
+rm -f conftest*
+
+fi
+echo "$ECHO_T""$gmp_cv_check_asm_mmx" 1>&6
+if test "$gmp_cv_check_asm_mmx" = "yes"; then
+ tmp_mmx=yes
+else
+ echo "configure: warning: +----------------------------------------------------------" 1>&2
+ echo "configure: warning: | WARNING WARNING WARNING" 1>&2
+ echo "configure: warning: | Target CPU has MMX code, but it can't be assembled by" 1>&2
+ echo "configure: warning: | $CCAS $CFLAGS" 1>&2
+ echo "configure: warning: | Non-MMX replacements will be used." 1>&2
+ echo "configure: warning: | This will be an inferior build." 1>&2
+ echo "configure: warning: +----------------------------------------------------------" 1>&2
+ tmp_mmx=no
+fi
+
+ ;;
+ esac
+
+ # default for anything not otherwise mentioned
+ path="x86"
+
+ case ${target} in
+ i[34]86*-*-*)
+ path="x86"
+ ;;
+ k5*-*-*)
+ # don't know what best suits k5
+ path="x86"
+ ;;
+ i586*-*-* | pentium-*-*)
+ path="x86/pentium x86"
+ ;;
+ pentiummmx-*-*)
+ path="x86/pentium x86"
+ if test "$tmp_mmx" = yes; then
+ path="x86/pentium/mmx $path"
+ fi
+ ;;
+ i686*-*-* | pentiumpro-*-*)
+ path="x86/p6 x86"
+ ;;
+ pentium2-*-*)
+ path="x86/p6 x86"
+ # The pentium/mmx lshift and rshift are good on p6 and can be used
+ # until there's something specific for p6.
+ if test "$tmp_mmx" = yes; then
+ path="x86/p6/mmx x86/pentium/mmx $path"
+ fi
+ ;;
+ pentium3-*-*)
+ path="x86/p6 x86"
+ # The pentium/mmx lshift and rshift are good on p6 and can be used
+ # until there's something specific for p6.
+ if test "$tmp_mmx" = yes; then
+ path="x86/p6/p3mmx x86/p6/mmx x86/pentium/mmx $path"
+ fi
+ ;;
+ k6[23]*-*-*)
+ path="x86/k6 x86"
+ if test "$tmp_mmx" = yes; then
+ path="x86/k6/k62mmx x86/k6/mmx $path"
+ fi
+ ;;
+ k6*-*-*)
+ path="x86/k6 x86"
+ if test "$tmp_mmx" = yes; then
+ path="x86/k6/mmx $path"
+ fi
+ ;;
+ athlon-*-*)
+ path="x86/k7 x86"
+ if test "$tmp_mmx" = yes; then
+ path="x86/k7/mmx $path"
+ fi
+ ;;
+ esac
+ ;;
+
+ i960*-*-*) path="i960";;
+
+ ia64*-*-*) path="ia64";;
+
+# Motorola 68k configurations. Let m68k mean 68020-68040.
+ m680[234]0*-*-* | m68k*-*-* | \
+ m68*-next-nextstep*) # Nexts are at least '020
+ path="m68k/mc68020 m68k"
+ family=m68k
+ ;;
+ m68000*-*-*)
+ path="m68k"
+ family=m68k
+ ;;
+
+ m88k*-*-* | m88k*-*-*) path="m88k";;
+ m88110*-*-*) path="m88k/mc88110 m88k";;
+ ns32k*-*-*) path="ns32k";;
+
+ pyramid-*-*) path="pyr";;
+
+ ppc601-*-*) path="power powerpc32";;
+ powerpc64*-*-*) path="powerpc64";;
+ powerpc*-*-*) path="powerpc32";;
+ rs6000-*-* | power-*-* | power2-*-*)
+ path="power"
+ extra_functions="udiv_w_sdiv"
+ ;;
+
+ sh-*-*) path="sh";;
+ sh2-*-*) path="sh/sh2 sh";;
+
+ mips[34]*-*-*) path="mips3";;
+ mips*-*-irix6*) path="mips3";;
+ mips*-*-*) path="mips2";;
+
+ vax*-*-*) path="vax"; extra_functions="udiv_w_sdiv";;
+
+ z8000x*-*-*) path="z8000x"; extra_functions="udiv_w_sdiv";;
+ z8000*-*-*) path="z8000"; extra_functions="udiv_w_sdiv";;
+
+ clipper*-*-*) path="clipper";;
+esac
+
+if test -n "$CALLING_CONVENTIONS_OBJS"; then
+ cat >>confdefs.h <<\EOF
+#define HAVE_CALLING_CONVENTIONS 1
+EOF
+
+fi
+
+case ${target} in
+ i[5-8]86*-*-* | k[5-8]*-*-* | pentium*-*-* | athlon-*-*)
+ # rdtsc is in pentium and up, not in i386 and i486
+ SPEED_CYCLECOUNTER_OBJS=pentium.lo
+ ;;
+ alpha*-*-*)
+ SPEED_CYCLECOUNTER_OBJS=alpha.lo
+ ;;
+ sparcv9*-*-* | ultrasparc*-*-* | sparc64*-*-*)
+ SPEED_CYCLECOUNTER_OBJS=sparcv9.lo
+ ;;
+ hppa2*-*-*)
+ SPEED_CYCLECOUNTER_OBJS=hppa2.lo
+ ;;
+ hppa*-*-*)
+ SPEED_CYCLECOUNTER_OBJS=hppa.lo
+ ;;
+esac
+
+if test -n "$SPEED_CYCLECOUNTER_OBJS"
+then
+ cat >>confdefs.h <<\EOF
+#define HAVE_SPEED_CYCLECOUNTER 1
+EOF
+
+fi
+
+echo $ECHO_N "checking for Cygwin environment... $ECHO_C" 1>&6
+echo "configure:2822: checking for Cygwin environment" 1>&5
+if test "${ac_cv_cygwin+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftest.$ac_ext <<EOF
+#line 2827 "configure"
+#include "confdefs.h"
+
+int
+main ()
+{
+#ifndef __CYGWIN__
+# define __CYGWIN__ __CYGWIN32__
+#endif
+return __CYGWIN__;
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:2841: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ ac_cv_cygwin=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_cygwin=no
+fi
+rm -f conftest*
+fi
+echo "$ECHO_T""$ac_cv_cygwin" 1>&6
+CYGWIN=
+test "$ac_cv_cygwin" = yes && CYGWIN=yes
+echo $ECHO_N "checking for mingw32 environment... $ECHO_C" 1>&6
+echo "configure:2856: checking for mingw32 environment" 1>&5
+if test "${ac_cv_mingw32+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftest.$ac_ext <<EOF
+#line 2861 "configure"
+#include "confdefs.h"
+
+int
+main ()
+{
+return __MINGW32__;
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:2872: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ ac_cv_mingw32=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_mingw32=no
+fi
+rm -f conftest*
+fi
+echo "$ECHO_T""$ac_cv_mingw32" 1>&6
+MINGW32=
+test "$ac_cv_mingw32" = yes && MINGW32=yes
+echo $ECHO_N "checking for EMX OS/2 environment... $ECHO_C" 1>&6
+echo "configure:2887: checking for EMX OS/2 environment" 1>&5
+if test "${ac_cv_emxos2+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftest.$ac_ext <<EOF
+#line 2892 "configure"
+#include "confdefs.h"
+
+int
+main ()
+{
+return __EMX__;
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:2903: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ ac_cv_emxos2=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_emxos2=no
+fi
+rm -f conftest*
+fi
+echo "$ECHO_T""$ac_cv_emxos2" 1>&6
+EMXOS2=
+test "$ac_cv_emxos2" = yes && EMXOS2=yes
+
+echo $ECHO_N "checking for executable suffix... $ECHO_C" 1>&6
+echo "configure:2919: checking for executable suffix" 1>&5
+if test "${ac_cv_exeext+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test "$CYGWIN" = yes || test "$MINGW32" = yes || test "$EMXOS2" = yes; then
+ ac_cv_exeext=.exe
+else
+ rm -f conftest*
+ echo 'int main () { return 0; }' >conftest.$ac_ext
+ ac_cv_exeext=
+ if { (eval echo configure:2929: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; }; then
+ for ac_file in conftest.*; do
+ case $ac_file in
+ *.c | *.C | *.o | *.obj | *.xcoff) ;;
+ *) ac_cv_exeext=`echo $ac_file | sed -e s/conftest//` ;;
+ esac
+ done
+ else
+ { echo "configure: error: installation or configuration problem: compiler cannot create executables." 1>&2; exit 1; }
+ fi
+ rm -f conftest*
+ test x"${ac_cv_exeext}" = x && ac_cv_exeext=no
+fi
+fi
+
+EXEEXT=""
+test x"${ac_cv_exeext}" != xno && EXEEXT=${ac_cv_exeext}
+echo "$ECHO_T""${ac_cv_exeext}" 1>&6
+ac_exeext=$EXEEXT
+
+echo $ECHO_N "checking for object suffix... $ECHO_C" 1>&6
+echo "configure:2950: checking for object suffix" 1>&5
+if test "${ac_cv_objext+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ rm -f conftest*
+echo 'int i = 1;' >conftest.$ac_ext
+if { (eval echo configure:2956: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ for ac_file in conftest.*; do
+ case $ac_file in
+ *.c) ;;
+ *) ac_cv_objext=`echo $ac_file | sed -e s/conftest.//` ;;
+ esac
+ done
+else
+ { echo "configure: error: installation or configuration problem; compiler does not work" 1>&2; exit 1; }
+fi
+rm -f conftest*
+fi
+
+echo "$ECHO_T""$ac_cv_objext" 1>&6
+OBJEXT=$ac_cv_objext
+ac_objext=$ac_cv_objext
+
+case "$target" in
+ *-*-aix4.[3-9]*) enable_shared=no ;;
+esac
+# Check whether --enable-shared or --disable-shared was given.
+if test "${enable_shared+set}" = set; then
+ enableval="$enable_shared"
+ p=${PACKAGE-default}
+case "$enableval" in
+yes) enable_shared=yes ;;
+no) enable_shared=no ;;
+*)
+ enable_shared=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_shared=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac
+else
+ enable_shared=yes
+fi
+# Check whether --enable-static or --disable-static was given.
+if test "${enable_static+set}" = set; then
+ enableval="$enable_static"
+ p=${PACKAGE-default}
+case "$enableval" in
+yes) enable_static=yes ;;
+no) enable_static=no ;;
+*)
+ enable_static=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_static=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac
+else
+ enable_static=yes
+fi
+# Check whether --enable-fast-install or --disable-fast-install was given.
+if test "${enable_fast_install+set}" = set; then
+ enableval="$enable_fast_install"
+ p=${PACKAGE-default}
+case "$enableval" in
+yes) enable_fast_install=yes ;;
+no) enable_fast_install=no ;;
+*)
+ enable_fast_install=no
+ # Look at the argument we got. We use all the common list separators.
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}:,"
+ for pkg in $enableval; do
+ if test "X$pkg" = "X$p"; then
+ enable_fast_install=yes
+ fi
+ done
+ IFS="$ac_save_ifs"
+ ;;
+esac
+else
+ enable_fast_install=yes
+fi
+
+echo $ECHO_N "checking build system type... $ECHO_C" 1>&6
+echo "configure:3044: checking build system type" 1>&5
+if test "x$ac_cv_build" = "x" || (test "x$build" != "xNONE" && test "x$build" != "x$ac_cv_build_alias"); then
+
+ # Make sure we can run config.sub.
+ if $ac_config_sub sun4 >/dev/null 2>&1; then :; else
+ { echo "configure: error: cannot run $ac_config_sub" 1>&2; exit 1; }
+ fi
+
+ ac_cv_build_alias=$build
+ case "$ac_cv_build_alias" in
+ NONE)
+ case $nonopt in
+ NONE)
+ ac_cv_build_alias=$host_alias ;;
+ *) ac_cv_build_alias=$nonopt ;;
+ esac ;;
+ esac
+
+ ac_cv_build=`$ac_config_sub $ac_cv_build_alias` || exit 1
+ ac_cv_build_cpu=`echo $ac_cv_build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+ ac_cv_build_vendor=`echo $ac_cv_build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+ ac_cv_build_os=`echo $ac_cv_build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+else
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+fi
+
+echo "$ECHO_T""$ac_cv_build" 1>&6
+
+build=$ac_cv_build
+build_alias=$ac_cv_build_alias
+build_cpu=$ac_cv_build_cpu
+build_vendor=$ac_cv_build_vendor
+build_os=$ac_cv_build_os
+
+# Check whether --with-gnu-ld or --without-gnu-ld was given.
+if test "${with_gnu_ld+set}" = set; then
+ withval="$with_gnu_ld"
+ test "$withval" = no || with_gnu_ld=yes
+else
+ with_gnu_ld=no
+fi
+
+ac_prog=ld
+if test "$ac_cv_prog_gcc" = yes; then
+ # Check if gcc -print-prog-name=ld gives a path.
+ echo $ECHO_N "checking for ld used by GCC... $ECHO_C" 1>&6
+echo "configure:3090: checking for ld used by GCC" 1>&5
+ case $target in
+ *-*-mingw*)
+ # gcc leaves a trailing carriage return which upsets mingw
+ ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
+ *)
+ ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
+ esac
+ case "$ac_prog" in
+ # Accept absolute paths.
+ [\\/]* | [A-Za-z]:[\\/]*)
+ re_direlt='/[^/][^/]*/\.\./'
+ # Canonicalize the path of ld
+ ac_prog=`echo $ac_prog| sed 's%\\\\%/%g'`
+ while echo $ac_prog | grep "$re_direlt" > /dev/null 2>&1; do
+ ac_prog=`echo $ac_prog| sed "s%$re_direlt%/%"`
+ done
+ test -z "$LD" && LD="$ac_prog"
+ ;;
+ "")
+ # If it fails, then pretend we aren't using GCC.
+ ac_prog=ld
+ ;;
+ *)
+ # If it is relative, then search for the first ld in PATH.
+ with_gnu_ld=unknown
+ ;;
+ esac
+elif test "$with_gnu_ld" = yes; then
+ echo $ECHO_N "checking for GNU ld... $ECHO_C" 1>&6
+echo "configure:3120: checking for GNU ld" 1>&5
+else
+ echo $ECHO_N "checking for non-GNU ld... $ECHO_C" 1>&6
+echo "configure:3123: checking for non-GNU ld" 1>&5
+fi
+if test "${ac_cv_path_LD+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -z "$LD"; then
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR-:}"
+ for ac_dir in $PATH; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
+ ac_cv_path_LD="$ac_dir/$ac_prog"
+ # Check to see if the program is GNU ld. I'd rather use --version,
+ # but apparently some GNU ld's only accept -v.
+ # Break only if it was the GNU/non-GNU ld that we prefer.
+ if "$ac_cv_path_LD" -v 2>&1 < /dev/null | egrep '(GNU|with BFD)' > /dev/null; then
+ test "$with_gnu_ld" != no && break
+ else
+ test "$with_gnu_ld" != yes && break
+ fi
+ fi
+ done
+ IFS="$ac_save_ifs"
+else
+ ac_cv_path_LD="$LD" # Let the user override the test with a path.
+fi
+fi
+
+LD="$ac_cv_path_LD"
+if test -n "$LD"; then
+ echo "$ECHO_T""$LD" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+test -z "$LD" && { echo "configure: error: no acceptable ld found in \$PATH" 1>&2; exit 1; }
+echo $ECHO_N "checking if the linker ($LD) is GNU ld... $ECHO_C" 1>&6
+echo "configure:3158: checking if the linker ($LD) is GNU ld" 1>&5
+if test "${ac_cv_prog_gnu_ld+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ # I'd rather use --version here, but apparently some GNU ld's only accept -v.
+if $LD -v 2>&1 </dev/null | egrep '(GNU|with BFD)' 1>&5; then
+ ac_cv_prog_gnu_ld=yes
+else
+ ac_cv_prog_gnu_ld=no
+fi
+fi
+echo "$ECHO_T""$ac_cv_prog_gnu_ld" 1>&6
+with_gnu_ld=$ac_cv_prog_gnu_ld
+
+echo $ECHO_N "checking for $LD option to reload object files... $ECHO_C" 1>&6
+echo "configure:3173: checking for $LD option to reload object files" 1>&5
+if test "${lt_cv_ld_reload_flag+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ lt_cv_ld_reload_flag='-r'
+fi
+echo "$ECHO_T""$lt_cv_ld_reload_flag" 1>&6
+reload_flag=$lt_cv_ld_reload_flag
+test -n "$reload_flag" && reload_flag=" $reload_flag"
+
+echo $ECHO_N "checking how to recognise dependant libraries... $ECHO_C" 1>&6
+echo "configure:3184: checking how to recognise dependant libraries" 1>&5
+if test "${lt_cv_deplibs_check_method+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ lt_cv_file_magic_cmd='${MAGIC}'
+lt_cv_file_magic_test_file=
+lt_cv_deplibs_check_method='unknown'
+# Need to set the preceding variable on all platforms that support
+# interlibrary dependencies.
+# 'none' -- dependencies not supported.
+# `unknown' -- same as none, but documents that we really don't know.
+# 'pass_all' -- all dependencies passed with no checks.
+# 'test_compile' -- check by making test program.
+# 'file_magic [regex]' -- check by looking for files in library path
+# which responds to the $file_magic_cmd with a given egrep regex.
+# If you have `file' or equivalent on your system and you're not sure
+# whether `pass_all' will *always* work, you probably want this one.
+
+case "$host_os" in
+aix4* | beos*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+bsdi4*)
+ lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib)'
+ lt_cv_file_magic_test_file=/shlib/libc.so
+ ;;
+
+cygwin* | mingw*)
+ lt_cv_deplibs_check_method='file_magic file format pei*-i386(.*architecture: i386)?'
+ lt_cv_file_magic_cmd='${OBJDUMP} -f'
+ ;;
+
+freebsd*)
+ case "$version_type" in
+ freebsd-elf*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+ esac
+ ;;
+
+gnu*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+irix5* | irix6*)
+ case "$host_os" in
+ irix5*)
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method="file_magic ELF 32-bit MSB dynamic lib MIPS - version 1"
+ ;;
+ *)
+ case "$LD" in
+ *-32|*"-32 ") libmagic=32-bit;;
+ *-n32|*"-n32 ") libmagic=N32;;
+ *-64|*"-64 ") libmagic=64-bit;;
+ *) libmagic=never-match;;
+ esac
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method="file_magic ELF ${libmagic} MSB mips-[1234] dynamic lib MIPS - version 1"
+ ;;
+ esac
+ lt_cv_file_magic_test_file=`echo /lib${libsuff}/libc.so*`
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+ case "$host_cpu" in
+ alpha* | i*86 | powerpc* | sparc* )
+ lt_cv_deplibs_check_method=pass_all ;;
+ *)
+ # glibc up to 2.1.1 does not perform some relocations on ARM
+ lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [LM]SB (shared object|dynamic lib )' ;;
+ esac
+ lt_cv_file_magic_test_file=`echo /lib/libc.so* /lib/libc-*.so`
+ ;;
+
+osf3* | osf4* | osf5*)
+ # this will be overridden with pass_all, but let us keep it just in case
+ lt_cv_deplibs_check_method='file_magic COFF format alpha shared library'
+ lt_cv_file_magic_test_file=/shlib/libc.so
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+sco3.2v5*)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+
+solaris*)
+ lt_cv_deplibs_check_method=pass_all
+ lt_cv_file_magic_test_file=/lib/libc.so
+ ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+ case "$host_vendor" in
+ ncr)
+ lt_cv_deplibs_check_method=pass_all
+ ;;
+ motorola)
+ lt_cv_deplibs_check_method='file_magic ELF [0-9][0-9]*-bit [ML]SB (shared object|dynamic lib) M[0-9][0-9]* Version [0-9]'
+ lt_cv_file_magic_test_file=`echo /usr/lib/libc.so*`
+ ;;
+ esac
+ ;;
+esac
+
+fi
+echo "$ECHO_T""$lt_cv_deplibs_check_method" 1>&6
+file_magic_cmd=$lt_cv_file_magic_cmd
+deplibs_check_method=$lt_cv_deplibs_check_method
+
+if test $host != $build; then
+ ac_tool_prefix=${host_alias}-
+else
+ ac_tool_prefix=
+fi
+
+# Only perform the check for file, if the check method requires it
+case "$deplibs_check_method" in
+file_magic*)
+ if test "$file_magic_cmd" = '${MAGIC}'; then
+
+echo $ECHO_N "checking for ${ac_tool_prefix}file... $ECHO_C" 1>&6
+echo "configure:3308: checking for ${ac_tool_prefix}file" 1>&5
+if test "${lt_cv_path_MAGIC+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ case "$MAGIC" in
+ /*)
+ lt_cv_path_MAGIC="$MAGIC" # Let the user override the test with a path.
+ ;;
+ ?:/*)
+ ac_cv_path_MAGIC="$MAGIC" # Let the user override the test with a dos path.
+ ;;
+ *)
+ ac_save_MAGIC="$MAGIC"
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ ac_dummy="/usr/bin:$PATH"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/${ac_tool_prefix}file; then
+ lt_cv_path_MAGIC="$ac_dir/${ac_tool_prefix}file"
+ if test -n "$file_magic_test_file"; then
+ case "$deplibs_check_method" in
+ "file_magic "*)
+ file_magic_regex="`expr \"$deplibs_check_method\" : \"file_magic \(.*\)\"`"
+ MAGIC="$lt_cv_path_MAGIC"
+ if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+ egrep "$file_magic_regex" > /dev/null; then
+ :
+ else
+ cat <<EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such. This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem. Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+EOF
+ fi ;;
+ esac
+ fi
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+ MAGIC="$ac_save_MAGIC"
+ ;;
+esac
+fi
+
+MAGIC="$lt_cv_path_MAGIC"
+if test -n "$MAGIC"; then
+ echo "$ECHO_T""$MAGIC" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+if test -z "$lt_cv_path_MAGIC"; then
+ if test -n "$ac_tool_prefix"; then
+ echo $ECHO_N "checking for file... $ECHO_C" 1>&6
+echo "configure:3370: checking for file" 1>&5
+if test "${lt_cv_path_MAGIC+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ case "$MAGIC" in
+ /*)
+ lt_cv_path_MAGIC="$MAGIC" # Let the user override the test with a path.
+ ;;
+ ?:/*)
+ ac_cv_path_MAGIC="$MAGIC" # Let the user override the test with a dos path.
+ ;;
+ *)
+ ac_save_MAGIC="$MAGIC"
+ IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ ac_dummy="/usr/bin:$PATH"
+ for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/file; then
+ lt_cv_path_MAGIC="$ac_dir/file"
+ if test -n "$file_magic_test_file"; then
+ case "$deplibs_check_method" in
+ "file_magic "*)
+ file_magic_regex="`expr \"$deplibs_check_method\" : \"file_magic \(.*\)\"`"
+ MAGIC="$lt_cv_path_MAGIC"
+ if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
+ egrep "$file_magic_regex" > /dev/null; then
+ :
+ else
+ cat <<EOF 1>&2
+
+*** Warning: the command libtool uses to detect shared libraries,
+*** $file_magic_cmd, produces output that libtool cannot recognize.
+*** The result is that libtool may fail to recognize shared libraries
+*** as such. This will affect the creation of libtool libraries that
+*** depend on shared libraries, but programs linked with such libtool
+*** libraries will work regardless of this problem. Nevertheless, you
+*** may want to report the problem to your system manager and/or to
+*** bug-libtool@gnu.org
+
+EOF
+ fi ;;
+ esac
+ fi
+ break
+ fi
+ done
+ IFS="$ac_save_ifs"
+ MAGIC="$ac_save_MAGIC"
+ ;;
+esac
+fi
+
+MAGIC="$lt_cv_path_MAGIC"
+if test -n "$MAGIC"; then
+ echo "$ECHO_T""$MAGIC" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+ else
+ MAGIC=:
+ fi
+fi
+
+ fi
+ ;;
+esac
+
+case "$target" in
+NONE) lt_target="$host" ;;
+*) lt_target="$target" ;;
+esac
+
+# Extract the first word of "${ac_tool_prefix}ranlib", so it can be a program name with args.
+set dummy ${ac_tool_prefix}ranlib; ac_word=$2
+echo $ECHO_N "checking for $ac_word... $ECHO_C" 1>&6
+echo "configure:3446: checking for $ac_word" 1>&5
+if test "${ac_cv_prog_RANLIB+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$RANLIB"; then
+ ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
+else
+ for ac_path in `IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ac_dummy="$PATH"
+for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ echo "$ac_dir/$ac_word"
+ fi
+done
+IFS="$ac_save_ifs"
+`; do
+ ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib"
+ break
+ done
+fi
+fi
+RANLIB="$ac_cv_prog_RANLIB"
+if test -n "$RANLIB"; then
+ echo "$ECHO_T""$RANLIB" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+if test -z "$ac_cv_prog_RANLIB"; then
+ if test -n "$ac_tool_prefix"; then
+ # Extract the first word of "ranlib", so it can be a program name with args.
+set dummy ranlib; ac_word=$2
+echo $ECHO_N "checking for $ac_word... $ECHO_C" 1>&6
+echo "configure:3480: checking for $ac_word" 1>&5
+if test "${ac_cv_prog_RANLIB+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$RANLIB"; then
+ ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
+else
+ for ac_path in `IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ac_dummy="$PATH"
+for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ echo "$ac_dir/$ac_word"
+ fi
+done
+IFS="$ac_save_ifs"
+`; do
+ ac_cv_prog_RANLIB="ranlib"
+ break
+ done
+ test -z "$ac_cv_prog_RANLIB" && ac_cv_prog_RANLIB=":"
+fi
+fi
+RANLIB="$ac_cv_prog_RANLIB"
+if test -n "$RANLIB"; then
+ echo "$ECHO_T""$RANLIB" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+ else
+ RANLIB=":"
+ fi
+fi
+
+# Extract the first word of "${ac_tool_prefix}strip", so it can be a program name with args.
+set dummy ${ac_tool_prefix}strip; ac_word=$2
+echo $ECHO_N "checking for $ac_word... $ECHO_C" 1>&6
+echo "configure:3518: checking for $ac_word" 1>&5
+if test "${ac_cv_prog_STRIP+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$STRIP"; then
+ ac_cv_prog_STRIP="$STRIP" # Let the user override the test.
+else
+ for ac_path in `IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ac_dummy="$PATH"
+for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ echo "$ac_dir/$ac_word"
+ fi
+done
+IFS="$ac_save_ifs"
+`; do
+ ac_cv_prog_STRIP="${ac_tool_prefix}strip"
+ break
+ done
+fi
+fi
+STRIP="$ac_cv_prog_STRIP"
+if test -n "$STRIP"; then
+ echo "$ECHO_T""$STRIP" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+if test -z "$ac_cv_prog_STRIP"; then
+ if test -n "$ac_tool_prefix"; then
+ # Extract the first word of "strip", so it can be a program name with args.
+set dummy strip; ac_word=$2
+echo $ECHO_N "checking for $ac_word... $ECHO_C" 1>&6
+echo "configure:3552: checking for $ac_word" 1>&5
+if test "${ac_cv_prog_STRIP+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ if test -n "$STRIP"; then
+ ac_cv_prog_STRIP="$STRIP" # Let the user override the test.
+else
+ for ac_path in `IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
+ac_dummy="$PATH"
+for ac_dir in $ac_dummy; do
+ test -z "$ac_dir" && ac_dir=.
+ if test -f $ac_dir/$ac_word; then
+ echo "$ac_dir/$ac_word"
+ fi
+done
+IFS="$ac_save_ifs"
+`; do
+ ac_cv_prog_STRIP="strip"
+ break
+ done
+ test -z "$ac_cv_prog_STRIP" && ac_cv_prog_STRIP=":"
+fi
+fi
+STRIP="$ac_cv_prog_STRIP"
+if test -n "$STRIP"; then
+ echo "$ECHO_T""$STRIP" 1>&6
+else
+ echo "$ECHO_T""no" 1>&6
+fi
+
+ else
+ STRIP=":"
+ fi
+fi
+
+# Check for any special flags to pass to ltconfig.
+libtool_flags="--cache-file=$cache_file"
+test "$enable_shared" = no && libtool_flags="$libtool_flags --disable-shared"
+test "$enable_static" = no && libtool_flags="$libtool_flags --disable-static"
+test "$enable_fast_install" = no && libtool_flags="$libtool_flags --disable-fast-install"
+test "$ac_cv_prog_gcc" = yes && libtool_flags="$libtool_flags --with-gcc"
+test "$ac_cv_prog_gnu_ld" = yes && libtool_flags="$libtool_flags --with-gnu-ld"
+
+# Check whether --enable-libtool-lock or --disable-libtool-lock was given.
+if test "${enable_libtool_lock+set}" = set; then
+ enableval="$enable_libtool_lock"
+
+fi
+test "x$enable_libtool_lock" = xno && libtool_flags="$libtool_flags --disable-lock"
+test x"$silent" = xyes && libtool_flags="$libtool_flags --silent"
+
+# Check whether --with-pic or --without-pic was given.
+if test "${with_pic+set}" = set; then
+ withval="$with_pic"
+ pic_mode="$withval"
+else
+ pic_mode=default
+fi
+test x"$pic_mode" = xyes && libtool_flags="$libtool_flags --prefer-pic"
+test x"$pic_mode" = xno && libtool_flags="$libtool_flags --prefer-non-pic"
+
+# Some flags need to be propagated to the compiler or linker for good
+# libtool support.
+case "$lt_target" in
+*-*-irix6*)
+ # Find out which ABI we are using.
+ echo '#line 3618 "configure"' > conftest.$ac_ext
+ if { (eval echo configure:3619: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ case "`/usr/bin/file conftest.o`" in
+ *32-bit*)
+ LD="${LD-ld} -32"
+ ;;
+ *N32*)
+ LD="${LD-ld} -n32"
+ ;;
+ *64-bit*)
+ LD="${LD-ld} -64"
+ ;;
+ esac
+ fi
+ rm -rf conftest*
+ ;;
+
+*-*-sco3.2v5*)
+ # On SCO OpenServer 5, we need -belf to get full-featured binaries.
+ SAVE_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS -belf"
+ echo $ECHO_N "checking whether the C compiler needs -belf... $ECHO_C" 1>&6
+echo "configure:3640: checking whether the C compiler needs -belf" 1>&5
+if test "${lt_cv_cc_needs_belf+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+
+ ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+ cat >conftest.$ac_ext <<EOF
+#line 3653 "configure"
+#include "confdefs.h"
+
+int
+main()
+{
+
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:3664: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ lt_cv_cc_needs_belf=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ lt_cv_cc_needs_belf=no
+fi
+rm -f conftest*
+
+ ac_ext=c
+# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+cross_compiling=$ac_cv_prog_cc_cross
+
+fi
+echo "$ECHO_T""$lt_cv_cc_needs_belf" 1>&6
+ if test x"$lt_cv_cc_needs_belf" != x"yes"; then
+ # this is probably gcc 2.8.0, egcs 1.0 or newer; no need for -belf
+ CFLAGS="$SAVE_CFLAGS"
+ fi
+ ;;
+
+esac
+
+# Save cache, so that ltconfig can load it
+cat >confcache <<\EOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs. It is not useful on other systems.
+# If it contains results you don't want to keep, you may remove or edit it.
+#
+# By default, configure uses ./config.cache as the cache file,
+# creating it if it does not exist already. You can give configure
+# the --cache-file=FILE option to use a different cache file; that is
+# what configure does when it calls configure scripts in
+# subdirectories, so they share the cache.
+# Giving --cache-file=/dev/null disables caching, for debugging configure.
+# config.status only pays attention to the cache file if you give it the
+# --recheck option to rerun configure.
+#
+EOF
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, don't put newlines in cache variables' values.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(set) 2>&1 |
+ case `(ac_space=' '; set | grep ac_space) 2>&1` in
+ *ac_space=\ *)
+ # `set' does not quote correctly, so add quotes (double-quote substitution
+ # turns \\\\ into \\, and sed turns \\ into \).
+ sed -n \
+ -e "s/'/'\\\\''/g" \
+ -e "s/^\\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\\)=\\(.*\\)/\\1=\${\\1='\\2'}/p"
+ ;;
+ *)
+ # `set' quotes correctly as required by POSIX, so do not add quotes.
+ sed -n -e 's/^\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\)=\(.*\)/\1=${\1=\2}/p'
+ ;;
+ esac >>confcache
+if cmp -s $cache_file confcache; then :; else
+ if test -w $cache_file; then
+ echo "updating cache $cache_file"
+ cat confcache >$cache_file
+ else
+ echo "not updating unwritable cache $cache_file"
+ fi
+fi
+rm -f confcache
+
+# Actually configure libtool. ac_aux_dir is where install-sh is found.
+AR="$AR" CC="$CC" CFLAGS="$CFLAGS" CPPFLAGS="$CPPFLAGS" \
+MAGIC="$MAGIC" LD="$LD" LDFLAGS="$LDFLAGS" LIBS="$LIBS" \
+LN_S="$LN_S" NM="$NM" RANLIB="$RANLIB" STRIP="$STRIP" \
+AS="$AS" DLLTOOL="$DLLTOOL" OBJDUMP="$OBJDUMP" \
+objext="$OBJEXT" exeext="$EXEEXT" reload_flag="$reload_flag" \
+deplibs_check_method="$deplibs_check_method" file_magic_cmd="$file_magic_cmd" \
+${CONFIG_SHELL-/bin/sh} $ac_aux_dir/ltconfig --no-reexec \
+$libtool_flags --no-verify --build="$build" $ac_aux_dir/ltmain.sh $lt_target \
+|| { echo "configure: error: libtool configure failed" 1>&2; exit 1; }
+
+# Reload cache, that may have been modified by ltconfig
+if test -r "$cache_file"; then
+ echo "loading cache $cache_file"
+ test -f "$cache_file" && . $cache_file
+else
+ echo "creating cache $cache_file"
+ >$cache_file
+fi
+
+# This can be used to rebuild libtool when needed
+LIBTOOL_DEPS="$ac_aux_dir/ltconfig $ac_aux_dir/ltmain.sh"
+
+# Always use our own libtool.
+LIBTOOL='$(SHELL) $(top_builddir)/libtool'
+
+# Redirect the config.log output again, so that the ltconfig log is not
+# clobbered by the next message.
+exec 5>>./config.log
+
+echo $ECHO_N "checking whether optarg is declared... $ECHO_C" 1>&6
+echo "configure:3769: checking whether optarg is declared" 1>&5
+if test "${ac_cv_have_decl_optarg+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftest.$ac_ext <<EOF
+#line 3774 "configure"
+#include "confdefs.h"
+$ac_includes_default
+int
+main ()
+{
+#ifndef optarg
+ char *p = (char *) optarg;
+#endif
+
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:3788: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ ac_cv_have_decl_optarg=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_have_decl_optarg=no
+fi
+rm -f conftest*
+fi
+echo "$ECHO_T""$ac_cv_have_decl_optarg" 1>&6
+if test $ac_cv_have_decl_optarg = yes; then
+ cat >>confdefs.h <<EOF
+#define HAVE_DECL_OPTARG 1
+EOF
+
+else
+ cat >>confdefs.h <<EOF
+#define HAVE_DECL_OPTARG 0
+EOF
+
+fi
+
+echo $ECHO_N "checking for ANSI C header files... $ECHO_C" 1>&6
+echo "configure:3813: checking for ANSI C header files" 1>&5
+if test "${ac_cv_header_stdc+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+
+cat >conftest.$ac_ext <<EOF
+#line 3819 "configure"
+#include "confdefs.h"
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <float.h>
+
+EOF
+ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
+{ (eval echo configure:3828: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ rm -rf conftest*
+ ac_cv_header_stdc=yes
+else
+ echo "$ac_err" >&5
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+if test $ac_cv_header_stdc = yes; then
+ # SunOS 4.x string.h does not declare mem*, contrary to ANSI.
+
+cat >conftest.$ac_ext <<EOF
+#line 3846 "configure"
+#include "confdefs.h"
+#include <string.h>
+
+EOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+ egrep "memchr" >/dev/null 2>&1; then
+ :
+else
+ rm -rf conftest*
+ ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+ # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
+
+cat >conftest.$ac_ext <<EOF
+#line 3866 "configure"
+#include "confdefs.h"
+#include <stdlib.h>
+
+EOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+ egrep "free" >/dev/null 2>&1; then
+ :
+else
+ rm -rf conftest*
+ ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+ # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi.
+if test "$cross_compiling" = yes; then
+ :
+else
+ cat >conftest.$ac_ext <<EOF
+#line 3888 "configure"
+#include "confdefs.h"
+#include <ctype.h>
+#if ((' ' & 0x0FF) == 0x020)
+# define ISLOWER(c) ('a' <= (c) && (c) <= 'z')
+# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c))
+#else
+# define ISLOWER(c) (('a' <= (c) && (c) <= 'i') \
+ || ('j' <= (c) && (c) <= 'r') \
+ || ('s' <= (c) && (c) <= 'z'))
+# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c))
+#endif
+
+#define XOR(e, f) (((e) && !(f)) || (!(e) && (f)))
+int
+main ()
+{
+ int i;
+ for (i = 0; i < 256; i++)
+ if (XOR (islower (i), ISLOWER (i))
+ || toupper (i) != TOUPPER (i))
+ exit(2);
+ exit (0);
+}
+EOF
+if { (eval echo configure:3913: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
+then
+ :
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -fr conftest*
+ ac_cv_header_stdc=no
+fi
+rm -fr conftest*
+
+fi
+
+fi
+fi
+echo "$ECHO_T""$ac_cv_header_stdc" 1>&6
+if test $ac_cv_header_stdc = yes; then
+ cat >>confdefs.h <<\EOF
+#define STDC_HEADERS 1
+EOF
+
+fi
+
+for ac_header in getopt.h unistd.h sys/sysctl.h sys/time.h
+do
+ac_ac_Header=`echo "ac_cv_header_$ac_header" | $ac_tr_sh`
+echo $ECHO_N "checking for $ac_header... $ECHO_C" 1>&6
+echo "configure:3940: checking for $ac_header" 1>&5
+if eval "test \"\${$ac_ac_Header+set}\" = set"; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+
+cat >conftest.$ac_ext <<EOF
+#line 3946 "configure"
+#include "confdefs.h"
+#include <$ac_header>
+
+EOF
+ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
+{ (eval echo configure:3952: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ rm -rf conftest*
+ eval "$ac_ac_Header=yes"
+else
+ echo "$ac_err" >&5
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ eval "$ac_ac_Header=no"
+fi
+rm -f conftest*
+fi
+echo "$ECHO_T""`eval echo '${'$ac_ac_Header'}'`" 1>&6
+if test `eval echo '${'$ac_ac_Header'}'` = yes; then
+ cat >>confdefs.h <<EOF
+#define `echo "HAVE_$ac_header" | $ac_tr_cpp` 1
+EOF
+
+fi
+done
+
+echo $ECHO_N "checking for void... $ECHO_C" 1>&6
+echo "configure:3976: checking for void" 1>&5
+if test "${ac_cv_type_void+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftest.$ac_ext <<EOF
+#line 3981 "configure"
+#include "confdefs.h"
+$ac_includes_default
+int
+main ()
+{
+if ((void *) 0)
+ return 0;
+if (sizeof (void))
+ return 0;
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:3995: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ rm -rf conftest*
+ ac_cv_type_void=yes
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_type_void=no
+fi
+rm -f conftest*
+fi
+echo "$ECHO_T""$ac_cv_type_void" 1>&6
+if test $ac_cv_type_void = yes; then
+ cat >>confdefs.h <<EOF
+#define HAVE_VOID 1
+EOF
+
+fi
+
+echo $ECHO_N "checking for preprocessor stringizing operator... $ECHO_C" 1>&6
+echo "configure:4015: checking for preprocessor stringizing operator" 1>&5
+if test "${ac_cv_c_stringize+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+
+cat >conftest.$ac_ext <<EOF
+#line 4021 "configure"
+#include "confdefs.h"
+
+#define x(y) #y
+
+char *s = x(teststring);
+
+EOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+ egrep "#teststring" >/dev/null 2>&1; then
+ rm -rf conftest*
+ ac_cv_c_stringize=no
+else
+ rm -rf conftest*
+ ac_cv_c_stringize=yes
+fi
+rm -f conftest*
+
+fi
+
+if test "${ac_cv_c_stringize}" = yes; then
+ cat >>confdefs.h <<\EOF
+#define HAVE_STRINGIZE 1
+EOF
+
+fi
+echo "$ECHO_T""${ac_cv_c_stringize}" 1>&6
+
+for ac_func in getopt_long getpagesize popen processor_info strtoul sysconf sysctlbyname
+do
+ac_ac_var=`echo "ac_cv_func_$ac_func" | $ac_tr_sh`
+echo $ECHO_N "checking for $ac_func... $ECHO_C" 1>&6
+echo "configure:4053: checking for $ac_func" 1>&5
+if eval "test \"\${$ac_ac_var+set}\" = set"; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftest.$ac_ext <<EOF
+#line 4058 "configure"
+#include "confdefs.h"
+/* System header to define __stub macros and hopefully few prototypes,
+ which can conflict with char $ac_func(); below. */
+#include <assert.h>
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+char $ac_func();
+char (*f)();
+
+int
+main()
+{
+
+/* The GNU C library defines this for functions which it implements
+ to always fail with ENOSYS. Some functions are actually named
+ something starting with __ and the normal name is an alias. */
+#if defined (__stub_$ac_func) || defined (__stub___$ac_func)
+choke me
+#else
+f = $ac_func;
+#endif
+
+ ;
+ return 0;
+}
+EOF
+if { (eval echo configure:4086: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ eval "$ac_ac_var=yes"
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ eval "$ac_ac_var=no"
+fi
+rm -f conftest*
+
+fi
+echo "$ECHO_T""`eval echo '${'$ac_ac_var'}'`" 1>&6
+if test `eval echo '${'$ac_ac_var'}'` = yes; then
+ cat >>confdefs.h <<EOF
+#define `echo "HAVE_$ac_func" | $ac_tr_cpp` 1
+EOF
+
+fi
+done
+
+echo $ECHO_N "checking if ansi2knr should be used... $ECHO_C" 1>&6
+echo "configure:4108: checking if ansi2knr should be used" 1>&5
+if test "${gmp_cv_c_ansi2knr+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+ cat >conftest.c <<EOF
+int main (int argc, char *argv) { return 0; }
+EOF
+if { (eval echo configure:4115: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
+ gmp_cv_c_ansi2knr=no
+else
+ gmp_cv_c_ansi2knr=yes
+fi
+rm -f conftest.*
+
+fi
+echo "$ECHO_T""$gmp_cv_c_ansi2knr" 1>&6
+if test $gmp_cv_c_ansi2knr = no; then
+ U= ANSI2KNR=
+else
+ U=_ ANSI2KNR=./ansi2knr
+ # Ensure some checks needed by ansi2knr itself.
+
+echo $ECHO_N "checking for ANSI C header files... $ECHO_C" 1>&6
+echo "configure:4131: checking for ANSI C header files" 1>&5
+if test "${ac_cv_header_stdc+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+
+cat >conftest.$ac_ext <<EOF
+#line 4137 "configure"
+#include "confdefs.h"
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <float.h>
+
+EOF
+ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
+{ (eval echo configure:4146: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ rm -rf conftest*
+ ac_cv_header_stdc=yes
+else
+ echo "$ac_err" >&5
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+if test $ac_cv_header_stdc = yes; then
+ # SunOS 4.x string.h does not declare mem*, contrary to ANSI.
+
+cat >conftest.$ac_ext <<EOF
+#line 4164 "configure"
+#include "confdefs.h"
+#include <string.h>
+
+EOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+ egrep "memchr" >/dev/null 2>&1; then
+ :
+else
+ rm -rf conftest*
+ ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+ # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
+
+cat >conftest.$ac_ext <<EOF
+#line 4184 "configure"
+#include "confdefs.h"
+#include <stdlib.h>
+
+EOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+ egrep "free" >/dev/null 2>&1; then
+ :
+else
+ rm -rf conftest*
+ ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+ # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi.
+if test "$cross_compiling" = yes; then
+ :
+else
+ cat >conftest.$ac_ext <<EOF
+#line 4206 "configure"
+#include "confdefs.h"
+#include <ctype.h>
+#if ((' ' & 0x0FF) == 0x020)
+# define ISLOWER(c) ('a' <= (c) && (c) <= 'z')
+# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c))
+#else
+# define ISLOWER(c) (('a' <= (c) && (c) <= 'i') \
+ || ('j' <= (c) && (c) <= 'r') \
+ || ('s' <= (c) && (c) <= 'z'))
+# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c))
+#endif
+
+#define XOR(e, f) (((e) && !(f)) || (!(e) && (f)))
+int
+main ()
+{
+ int i;
+ for (i = 0; i < 256; i++)
+ if (XOR (islower (i), ISLOWER (i))
+ || toupper (i) != TOUPPER (i))
+ exit(2);
+ exit (0);
+}
+EOF
+if { (eval echo configure:4231: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
+then
+ :
+else
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -fr conftest*
+ ac_cv_header_stdc=no
+fi
+rm -fr conftest*
+
+fi
+
+fi
+fi
+echo "$ECHO_T""$ac_cv_header_stdc" 1>&6
+if test $ac_cv_header_stdc = yes; then
+ cat >>confdefs.h <<\EOF
+#define STDC_HEADERS 1
+EOF
+
+fi
+
+ for ac_header in string.h
+do
+ac_ac_Header=`echo "ac_cv_header_$ac_header" | $ac_tr_sh`
+echo $ECHO_N "checking for $ac_header... $ECHO_C" 1>&6
+echo "configure:4258: checking for $ac_header" 1>&5
+if eval "test \"\${$ac_ac_Header+set}\" = set"; then
+ echo $ECHO_N "(cached) $ECHO_C" 1>&6
+else
+
+cat >conftest.$ac_ext <<EOF
+#line 4264 "configure"
+#include "confdefs.h"
+#include <$ac_header>
+
+EOF
+ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
+{ (eval echo configure:4270: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ rm -rf conftest*
+ eval "$ac_ac_Header=yes"
+else
+ echo "$ac_err" >&5
+ echo "configure: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ eval "$ac_ac_Header=no"
+fi
+rm -f conftest*
+fi
+echo "$ECHO_T""`eval echo '${'$ac_ac_Header'}'`" 1>&6
+if test `eval echo '${'$ac_ac_Header'}'` = yes; then
+ cat >>confdefs.h <<EOF
+#define `echo "HAVE_$ac_header" | $ac_tr_cpp` 1
+EOF
+
+fi
+done
+
+fi
+
+syntax=
+# For now, we use the old switch for setting syntax.
+# FIXME: Remove when conversion to .asm is completed.
+case "${target}" in
+ m680[234]0*-*-linuxaout* | m68k*-*-linuxaout* | \
+ m68k-next-nextstep* | \
+ m68000*-*-*)
+ syntax=mit
+ ;;
+ m680[234]0*-*-linux* | m68k*-*-linux*)
+ syntax=elf
+ ;;
+ m680[234]0*-*-* | m68k*-*-*)
+ syntax=mit
+ ;;
+esac
+
+# Now build an asm-syntax.h file for targets that include that from the
+# assembly files.
+# FIXME: Remove when conversion to .asm is completed.
+case "${family}-${underscore}-${asm_align}-${syntax}" in
+ m68k-yes-log-mit)
+ echo '#define MIT_SYNTAX' >asm-syntax.h
+ cat $srcdir/mpn/underscore.h >>asm-syntax.h
+ echo '#include "'$srcdir'/mpn/m68k/syntax.h"' >>asm-syntax.h;;
+ m68k-no-nolog-elf)
+ echo '#define ELF_SYNTAX' >asm-syntax.h
+ echo '#define C_SYMBOL_NAME(name) name' >>asm-syntax.h
+ echo '#include "'$srcdir'/mpn/m68k/syntax.h"' >>asm-syntax.h;;
+esac
+
+# The pattern here tests for an absolute path the same way as
+# _AC_OUTPUT_FILES in autoconf acgeneral.m4.
+
+echo "dnl CONFIG_TOP_SRCDIR is a path from the mpn builddir to the top srcdir" >> $gmp_tmpconfigm4
+
+case "$srcdir" in
+[\\/]* | ?:[\\/]* )
+
+echo "define(<CONFIG_TOP_SRCDIR>,<\`$srcdir'>)" >> $gmp_tmpconfigm4
+ ;;
+*)
+
+echo "define(<CONFIG_TOP_SRCDIR>,<\`../$srcdir'>)" >> $gmp_tmpconfigm4
+ ;;
+esac
+
+echo "include(CONFIG_TOP_SRCDIR\`/mpn/asm-defs.m4')" >> $gmp_tmpconfigm4p
+
+# Must be after asm-defs.m4
+
+echo "define_not_for_expansion(\`HAVE_TARGET_CPU_$target_cpu')" >> $gmp_tmpconfigm4p
+
+case "$target" in
+ alpha*-cray-unicos*)
+ gmp_m4postinc="alpha/unicos.m4"
+ ;;
+ alpha*-*-*)
+ gmp_m4postinc="alpha/default.m4"
+ ;;
+ power*-*-*)
+ case "$target" in
+ *-*-mach* | *-*-rhapsody* | *-*-nextstep* | *-*-darwin* | *-*-macosx*)
+ ;; # these use non-conventional assembly syntax.
+ powerpc64-*-aix*)
+ gmp_m4postinc="powerpc32/regmap.m4 powerpc64/aix.m4"
+ ;;
+ *-*-aix*)
+ gmp_m4postinc="powerpc32/regmap.m4 powerpc32/aix.m4"
+ ;;
+ *)
+ gmp_m4postinc="powerpc32/regmap.m4"
+ ;;
+ esac
+ ;;
+esac
+
+for tmp_f in $gmp_m4postinc; do
+
+echo "include_mpn(\`$tmp_f')" >> $gmp_tmpconfigm4p
+
+done
+
+# Set up `gmp_links'. It's a list of link:file pairs that configure will
+# process to create link -> file.
+gmp_links=
+
+# If the user specified `MPN_PATH', use that instead of the path we've
+# come up with.
+if test -z "$MPN_PATH"; then
+ path="$path generic"
+else
+ path="$MPN_PATH"
+fi
+
+# Pick the correct source files in $path and link them to mpn/.
+# $gmp_mpn_functions lists all functions we need.
+#
+# The rule is to find a file with the function name and a .asm, .S,
+# .s, or .c extension. Certain multi-function files with special names
+# can provide some functions too. (mpn/Makefile.am passes
+# -DOPERATION_<func> to get them to generate the right code.)
+
+# FIXME: udiv and umul aren't in $gmp_mpn_functions_optional yet since
+# there's some versions of those files which should be checked for bit
+# rot first. Put them in $extra_functions for each target for now,
+# change to standard optionals when all are ready.
+
+# Note: The following lines defining $gmp_mpn_functions_optional
+# and $gmp_mpn_functions are parsed by the "macos/configure"
+# Perl script. So if you change the lines in a major way
+# make sure to run and examine the output from
+#
+# % (cd macos; perl configure)
+
+gmp_mpn_functions_optional="copyi copyd com_n \
+ and_n andn_n nand_n ior_n iorn_n nior_n xor_n xnor_n"
+
+gmp_mpn_functions="${extra_functions} inlines add_n sub_n mul_1 addmul_1 \
+ submul_1 lshift rshift diveby3 divrem divrem_1 divrem_2 \
+ mod_1 mod_1_rs pre_mod_1 dump \
+ mul mul_fft mul_n mul_basecase sqr_basecase random \
+ random2 sqrtrem get_str set_str scan0 scan1 popcount hamdist cmp perfsqr \
+ bdivmod gcd_1 gcd gcdext tdiv_qr bz_divrem_n sb_divrem_mn jacbase \
+ $gmp_mpn_functions_optional"
+
+# the list of all object files used by mpn/Makefile.in and the
+# top-level Makefile.in, respectively
+mpn_objects=
+mpn_objs_in_libgmp="mpn/mp_bases.lo"
+
+# SLPJ trace
+echo "Peering at file structure (takes a while)..." 1>&6
+
+for tmp_fn in ${gmp_mpn_functions} ; do
+# SLPJ trace
+ echo "...$tmp_fn..." 1>&6
+
+# This line was
+# rm -f mpn/${tmp_fn}.[Ssc] mpn/${tmp_fn}.asm
+# but I found that on my NT workstation the command
+# would unpredictably hang. rm wasn't an active process,
+# but absolutely nothing was happening.
+# I *think* that expanding the [Ssc] cures the problem
+# SLPJ May 01
+ rm -f mpn/${tmp_fn}.S mpn/${tmp_fn}.s mpn/${tmp_fn}.c mpn/${tmp_fn}.asm
+
+ echo "...$tmp_fn (done rm)..." 1>&6
+
+ # functions that can be provided by multi-function files
+ tmp_mulfunc=
+ case $tmp_fn in
+ add_n|sub_n) tmp_mulfunc="aors_n" ;;
+ addmul_1|submul_1) tmp_mulfunc="aorsmul_1" ;;
+ popcount|hamdist) tmp_mulfunc="popham" ;;
+ and_n|andn_n|nand_n | ior_n|iorn_n|nior_n | xor_n|xnor_n)
+ tmp_mulfunc="logops_n" ;;
+ esac
+
+ found=no
+ for tmp_dir in $path; do
+
+# SLPJ trace
+# We get stuck sometimes
+ echo " ...dir $tmp_dir..." 1>&6
+ for tmp_base in $tmp_fn $tmp_mulfunc; do
+
+# SLPJ trace
+# We get stuck sometimes
+ echo " ...base $tmp_base..." 1>&6
+ for tmp_ext in asm S s c; do
+ tmp_file=$srcdir/mpn/$tmp_dir/$tmp_base.$tmp_ext
+
+# SLPJ trace
+# We get stuck sometimes
+ echo " ...$tmp_file..." 1>&6
+
+ if test -f $tmp_file; then
+ found=yes
+
+ mpn_objects="$mpn_objects ${tmp_fn}.lo"
+ mpn_objs_in_libgmp="$mpn_objs_in_libgmp mpn/${tmp_fn}.lo"
+ gmp_links="$gmp_links mpn/$tmp_fn.$tmp_ext:mpn/$tmp_dir/$tmp_base.$tmp_ext"
+
+ # duplicate AC_DEFINEs are harmless, so it doesn't matter
+ # that multi-function files get grepped here repeatedly
+ gmp_ep="`
+ sed -n 's/^[ ]*MULFUNC_PROLOGUE(\(.*\))/\1/p' $tmp_file ;
+ sed -n 's/^[ ]*PROLOGUE.*(\(.*\))/\1/p' $tmp_file
+ `"
+ for gmp_tmp in $gmp_ep; do
+ cat >>confdefs.h <<EOF
+#define HAVE_NATIVE_${gmp_tmp} 1
+EOF
+
+ done
+
+ break
+ fi
+ done
+ if test $found = yes; then break ; fi
+ done
+ if test $found = yes; then break ; fi
+ done
+
+ if test $found = no; then
+ for tmp_optional in $gmp_mpn_functions_optional; do
+ if test $tmp_optional = $tmp_fn; then
+ found=yes
+ fi
+ done
+ if test $found = no; then
+ { echo "configure: error: no version of $tmp_fn found in path: $path" 1>&2; exit 1; }
+ fi
+ fi
+done
+
+
+# Create link for gmp-mparam.h.
+
+# SLPJ trace
+echo "Creating link for gmp-mparam.h..." 1>&6
+
+for tmp_dir in $path ; do
+ rm -f gmp-mparam.h
+ if test -f $srcdir/mpn/${tmp_dir}/gmp-mparam.h ; then
+ gmp_links="$gmp_links gmp-mparam.h:mpn/${tmp_dir}/gmp-mparam.h"
+
+ # Copy any KARATSUBA_SQR_THRESHOLD in gmp-mparam.h to config.m4.
+ # Some versions of sqr_basecase.asm use this.
+ tmp_gmp_karatsuba_sqr_threshold="`sed -n 's/^#define KARATSUBA_SQR_THRESHOLD[ ]*\([0-9][0-9]*\).*$/\1/p' $srcdir/mpn/${tmp_dir}/gmp-mparam.h`"
+ if test -n "$tmp_gmp_karatsuba_sqr_threshold"; then
+
+echo "define(<KARATSUBA_SQR_THRESHOLD>,<$tmp_gmp_karatsuba_sqr_threshold>)" >> $gmp_tmpconfigm4
+
+ fi
+
+ break
+ fi
+done
+
+# SLPJ trace
+echo "Digging out links to include in DISTCLEANFILES..." 1>&6
+
+# Dig out the links from `gmp_links' for inclusion in DISTCLEANFILES.
+gmp_srclinks=
+for f in $gmp_links; do
+ gmp_srclinks="$gmp_srclinks `echo $f | sed 's/\(.*\):.*/\1/'`"
+done
+
+echo "creating $gmp_configm4"
+echo "dnl $gmp_configm4. Generated automatically by configure." > $gmp_configm4
+if test -f $gmp_tmpconfigm4; then
+ echo "changequote(<,>)dnl" >> $gmp_configm4
+ echo "ifdef(<__CONFIG_M4_INCLUDED__>,,<" >> $gmp_configm4
+ cat $gmp_tmpconfigm4 >> $gmp_configm4
+ echo ">)" >> $gmp_configm4
+ echo "changequote(\`,')dnl" >> $gmp_configm4
+ rm $gmp_tmpconfigm4
+fi
+echo "ifdef(\`__CONFIG_M4_INCLUDED__',,\`" >> $gmp_configm4
+if test -f $gmp_tmpconfigm4i; then
+ cat $gmp_tmpconfigm4i >> $gmp_configm4
+ rm $gmp_tmpconfigm4i
+fi
+if test -f $gmp_tmpconfigm4p; then
+ cat $gmp_tmpconfigm4p >> $gmp_configm4
+ rm $gmp_tmpconfigm4p
+fi
+echo "')" >> $gmp_configm4
+echo "define(\`__CONFIG_M4_INCLUDED__')" >> $gmp_configm4
+
+trap '' 1 2 15
+cat >confcache <<\EOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs. It is not useful on other systems.
+# If it contains results you don't want to keep, you may remove or edit it.
+#
+# By default, configure uses ./config.cache as the cache file,
+# creating it if it does not exist already. You can give configure
+# the --cache-file=FILE option to use a different cache file; that is
+# what configure does when it calls configure scripts in
+# subdirectories, so they share the cache.
+# Giving --cache-file=/dev/null disables caching, for debugging configure.
+# config.status only pays attention to the cache file if you give it the
+# --recheck option to rerun configure.
+#
+EOF
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, don't put newlines in cache variables' values.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(set) 2>&1 |
+ case `(ac_space=' '; set | grep ac_space) 2>&1` in
+ *ac_space=\ *)
+ # `set' does not quote correctly, so add quotes (double-quote substitution
+ # turns \\\\ into \\, and sed turns \\ into \).
+ sed -n \
+ -e "s/'/'\\\\''/g" \
+ -e "s/^\\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\\)=\\(.*\\)/\\1=\${\\1='\\2'}/p"
+ ;;
+ *)
+ # `set' quotes correctly as required by POSIX, so do not add quotes.
+ sed -n -e 's/^\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\)=\(.*\)/\1=${\1=\2}/p'
+ ;;
+ esac >>confcache
+if cmp -s $cache_file confcache; then :; else
+ if test -w $cache_file; then
+ echo "updating cache $cache_file"
+ cat confcache >$cache_file
+ else
+ echo "not updating unwritable cache $cache_file"
+ fi
+fi
+rm -f confcache
+
+trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15
+
+test "x$prefix" = xNONE && prefix=$ac_default_prefix
+# Let make expand exec_prefix.
+test "x$exec_prefix" = xNONE && exec_prefix='${prefix}'
+
+# Any assignment to VPATH causes Sun make to only execute
+# the first set of double-colon rules, so remove it if not needed.
+# If there is a colon in the path, we need to keep it.
+if test "x$srcdir" = x.; then
+ ac_vpsub='/^[ ]*VPATH[ ]*=[^:]*$/d'
+fi
+
+DEFS=-DHAVE_CONFIG_H
+
+: ${CONFIG_STATUS=./config.status}
+trap 'rm -f $CONFIG_STATUS conftest*; exit 1' 1 2 15
+echo creating $CONFIG_STATUS
+cat >$CONFIG_STATUS <<EOF
+#! /bin/sh
+# Generated automatically by configure.
+# Run this file to recreate the current configuration.
+# This directory was configured as follows,
+# on host `(hostname || uname -n) 2>/dev/null | sed 1q`:
+#
+# $0 $ac_configure_args
+#
+# Compiler output produced by configure, useful for debugging
+# configure, is in ./config.log if it exists.
+
+# Files that config.status was made for.
+config_files="\\
+ Makefile mpn/Makefile mpz/Makefile"
+config_headers="\\
+ config.h:config.in"
+config_links="\\
+ $gmp_links"
+config_commands="\\
+ default-1"
+
+ac_cs_usage="\\
+\\\`$CONFIG_STATUS' instantiates files from templates according to the
+current configuration.
+
+Usage: $CONFIG_STATUS [OPTIONS] FILE...
+
+ --recheck Update $CONFIG_STATUS by reconfiguring in the same conditions
+ --version Print the version of Autoconf and exit
+ --help Display this help and exit
+ --file=FILE[:TEMPLATE]
+ Instantiate the configuration file FILE
+ --header=FILE[:TEMPLATE]
+ Instantiate the configuration header FILE
+
+Configuration files:
+\$config_files
+
+Configuration headers:
+\$config_headers
+
+Configuration links:
+\$config_links
+
+Configuration commands:
+\$config_commands
+
+Report bugs to <bug-autoconf@gnu.org>."
+
+ac_cs_version="\\
+$CONFIG_STATUS generated by autoconf version 2.14a.
+Configured on host `(hostname || uname -n) 2>/dev/null | sed 1q` by
+ `echo "$0 $ac_configure_args" | sed 's/[\\"\`\$]/\\\\&/g'`"
+
+# Root of the tmp file names. Use pid to allow concurrent executions.
+ac_cs_root=cs\$\$
+ac_given_srcdir=$srcdir
+ac_given_INSTALL="$INSTALL"
+
+# If no files are specified by the user, then we need to provide default
+# values. But we need to know if files were specified by the user.
+ac_need_defaults=:
+while test \$# != 0
+do
+ case "\$1" in
+ --*=*)
+ ac_option=\`echo "\$1" | sed -e 's/=.*//'\`
+ ac_optarg=\`echo "\$1" | sed -e 's/[^=]*=//'\`
+ shift
+ set dummy "\$ac_option" "\$ac_optarg" \${1+"\$@"}
+ shift
+ ;;
+ -*);;
+ *) # This is not an option, so the user has probably given explicit
+ # arguments.
+ ac_need_defaults=false;;
+ esac
+
+ case "\$1" in
+
+ # Handling of the options.
+ -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r)
+ echo "running \${CONFIG_SHELL-/bin/sh} $0 `echo "$ac_configure_args" | sed 's/[\\"\`\$]/\\\\&/g'` --no-create --no-recursion"
+ exec \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion ;;
+ -version | --version | --versio | --versi | --vers | --ver | --ve | --v)
+ echo "\$ac_cs_version"; exit 0 ;;
+ --he | --h)
+ # Conflict between --help and --header
+ echo "$CONFIG_STATUS: ambiguous option: \$ac_option
+Try \\\`$CONFIG_STATUS --help' for more information."; exit 1 ;;
+ -help | --help | --hel )
+ echo "\$ac_cs_usage"; exit 0 ;;
+ --file | --fil | --fi | --f )
+ shift
+ CONFIG_FILES="\$CONFIG_FILES \$1"
+ ac_need_defaults=false;;
+ --header | --heade | --head | --hea )
+ shift
+ CONFIG_HEADERS="\$CONFIG_FILES \$1"
+ ac_need_defaults=false;;
+
+ # Handling of arguments.
+ 'Makefile' ) CONFIG_FILES="\$CONFIG_FILES Makefile" ;;
+ 'mpz/Makefile' ) CONFIG_FILES="\$CONFIG_FILES mpz/Makefile" ;;
+ 'mpn/Makefile' ) CONFIG_FILES="\$CONFIG_FILES mpn/Makefile" ;;
+ '$gmp_links' ) CONFIG_LINKS="\$CONFIG_LINKS $gmp_links" ;;
+ 'default-1' ) CONFIG_COMMANDS="\$CONFIG_COMMANDS default-1" ;;
+ 'config.h' ) CONFIG_HEADERS="\$CONFIG_HEADERS config.h:config.in" ;;
+
+ # This is an error.
+ -*) echo "$CONFIG_STATUS: unrecognized option: \$1
+Try \\\`$CONFIG_STATUS --help' for more information."; exit 1 ;;
+ *) echo "$CONFIG_STATUS: invalid argument: \$1"; exit 1 ;;
+ esac
+ shift
+done
+
+EOF
+
+cat >>$CONFIG_STATUS <<\EOF
+# If the user did not use the arguments to specify the items to instantiate,
+# then the envvar interface is used. Set only those that are not.
+if $ac_need_defaults; then
+ : ${CONFIG_FILES=$config_files}
+ : ${CONFIG_HEADERS=$config_headers}
+ : ${CONFIG_LINKS=$config_links}
+ : ${CONFIG_COMMANDS=$config_commands}
+fi
+
+# Trap to remove the temp files.
+trap 'rm -fr $ac_cs_root*; exit 1' 1 2 15
+
+EOF
+
+cat >>$CONFIG_STATUS <<EOF
+#
+# INIT-COMMANDS section.
+#
+
+EOF
+
+cat >>$CONFIG_STATUS <<EOF
+
+#
+# CONFIG_FILES section.
+#
+
+# No need to generate the scripts if there are no CONFIG_FILES.
+# This happens for instance when ./config.status config.h
+if test -n "\$CONFIG_FILES"; then
+ # Protect against being on the right side of a sed subst in config.status.
+ sed 's/%@/@@/; s/@%/@@/; s/%;t t\$/@;t t/; /@;t t\$/s/[\\\\&%]/\\\\&/g;
+ s/@@/%@/; s/@@/@%/; s/@;t t\$/%;t t/' >\$ac_cs_root.subs <<\\CEOF
+s%@exec_prefix@%$exec_prefix%;t t
+s%@prefix@%$prefix%;t t
+s%@program_transform_name@%$program_transform_name%;t t
+s%@bindir@%$bindir%;t t
+s%@sbindir@%$sbindir%;t t
+s%@libexecdir@%$libexecdir%;t t
+s%@datadir@%$datadir%;t t
+s%@sysconfdir@%$sysconfdir%;t t
+s%@sharedstatedir@%$sharedstatedir%;t t
+s%@localstatedir@%$localstatedir%;t t
+s%@libdir@%$libdir%;t t
+s%@includedir@%$includedir%;t t
+s%@oldincludedir@%$oldincludedir%;t t
+s%@infodir@%$infodir%;t t
+s%@mandir@%$mandir%;t t
+s%@SHELL@%$SHELL%;t t
+s%@ECHO_C@%$ECHO_C%;t t
+s%@ECHO_N@%$ECHO_N%;t t
+s%@ECHO_T@%$ECHO_T%;t t
+s%@CFLAGS@%$CFLAGS%;t t
+s%@CPPFLAGS@%$CPPFLAGS%;t t
+s%@CXXFLAGS@%$CXXFLAGS%;t t
+s%@FFLAGS@%$FFLAGS%;t t
+s%@DEFS@%$DEFS%;t t
+s%@LDFLAGS@%$LDFLAGS%;t t
+s%@LIBS@%$LIBS%;t t
+s%@host@%$host%;t t
+s%@host_alias@%$host_alias%;t t
+s%@host_cpu@%$host_cpu%;t t
+s%@host_vendor@%$host_vendor%;t t
+s%@host_os@%$host_os%;t t
+s%@target@%$target%;t t
+s%@target_alias@%$target_alias%;t t
+s%@target_cpu@%$target_cpu%;t t
+s%@target_vendor@%$target_vendor%;t t
+s%@target_os@%$target_os%;t t
+s%@build@%$build%;t t
+s%@build_alias@%$build_alias%;t t
+s%@build_cpu@%$build_cpu%;t t
+s%@build_vendor@%$build_vendor%;t t
+s%@build_os@%$build_os%;t t
+s%@INSTALL_PROGRAM@%$INSTALL_PROGRAM%;t t
+s%@INSTALL_SCRIPT@%$INSTALL_SCRIPT%;t t
+s%@INSTALL_DATA@%$INSTALL_DATA%;t t
+s%@PACKAGE@%$PACKAGE%;t t
+s%@VERSION@%$VERSION%;t t
+s%@ACLOCAL@%$ACLOCAL%;t t
+s%@AUTOCONF@%$AUTOCONF%;t t
+s%@AUTOMAKE@%$AUTOMAKE%;t t
+s%@AUTOHEADER@%$AUTOHEADER%;t t
+s%@MAKEINFO@%$MAKEINFO%;t t
+s%@AMTAR@%$AMTAR%;t t
+s%@install_sh@%$install_sh%;t t
+s%@AWK@%$AWK%;t t
+s%@SET_MAKE@%$SET_MAKE%;t t
+s%@AMDEP@%$AMDEP%;t t
+s%@AMDEPBACKSLASH@%$AMDEPBACKSLASH%;t t
+s%@DEPDIR@%$DEPDIR%;t t
+s%@MAINTAINER_MODE_TRUE@%$MAINTAINER_MODE_TRUE%;t t
+s%@MAINTAINER_MODE_FALSE@%$MAINTAINER_MODE_FALSE%;t t
+s%@MAINT@%$MAINT%;t t
+s%@WANT_MPBSD_TRUE@%$WANT_MPBSD_TRUE%;t t
+s%@WANT_MPBSD_FALSE@%$WANT_MPBSD_FALSE%;t t
+s%@WANT_MPFR_TRUE@%$WANT_MPFR_TRUE%;t t
+s%@WANT_MPFR_FALSE@%$WANT_MPFR_FALSE%;t t
+s%@CC@%$CC%;t t
+s%@CCAS@%$CCAS%;t t
+s%@CPP@%$CPP%;t t
+s%@LN_S@%$LN_S%;t t
+s%@M4@%$M4%;t t
+s%@AR@%$AR%;t t
+s%@CALLING_CONVENTIONS_OBJS@%$CALLING_CONVENTIONS_OBJS%;t t
+s%@SPEED_CYCLECOUNTER_OBJS@%$SPEED_CYCLECOUNTER_OBJS%;t t
+s%@EXEEXT@%$EXEEXT%;t t
+s%@OBJEXT@%$OBJEXT%;t t
+s%@RANLIB@%$RANLIB%;t t
+s%@STRIP@%$STRIP%;t t
+s%@LIBTOOL@%$LIBTOOL%;t t
+s%@U@%$U%;t t
+s%@ANSI2KNR@%$ANSI2KNR%;t t
+s%@mpn_objects@%$mpn_objects%;t t
+s%@mpn_objs_in_libgmp@%$mpn_objs_in_libgmp%;t t
+s%@gmp_srclinks@%$gmp_srclinks%;t t
+CEOF
+
+EOF
+
+ cat >>$CONFIG_STATUS <<\EOF
+ # Split the substitutions into bite-sized pieces for seds with
+ # small command number limits, like on Digital OSF/1 and HP-UX.
+ ac_max_sed_lines=48
+ ac_sed_frag=1 # Number of current file.
+ ac_beg=1 # First line for current file.
+ ac_end=$ac_max_sed_lines # Line after last line for current file.
+ ac_more_lines=:
+ ac_sed_cmds=""
+ while $ac_more_lines; do
+ if test $ac_beg -gt 1; then
+ sed "1,${ac_beg}d; ${ac_end}q" $ac_cs_root.subs >$ac_cs_root.sfrag
+ else
+ sed "${ac_end}q" $ac_cs_root.subs >$ac_cs_root.sfrag
+ fi
+ if test ! -s $ac_cs_root.sfrag; then
+ ac_more_lines=false
+ rm -f $ac_cs_root.sfrag
+ else
+ # The purpose of the label and of the branching condition is to
+ # speed up the sed processing (if there are no `@' at all, there
+ # is no need to browse any of the substitutions).
+ # These are the two extra sed commands mentioned above.
+ (echo ':t
+ /@[a-zA-Z_][a-zA-Z_0-9]*@/!b' && cat $ac_cs_root.sfrag) >$ac_cs_root.s$ac_sed_frag
+ if test -z "$ac_sed_cmds"; then
+ ac_sed_cmds="sed -f $ac_cs_root.s$ac_sed_frag"
+ else
+ ac_sed_cmds="$ac_sed_cmds | sed -f $ac_cs_root.s$ac_sed_frag"
+ fi
+ ac_sed_frag=`expr $ac_sed_frag + 1`
+ ac_beg=$ac_end
+ ac_end=`expr $ac_end + $ac_max_sed_lines`
+ fi
+ done
+ if test -z "$ac_sed_cmds"; then
+ ac_sed_cmds=cat
+ fi
+fi # test -n "$CONFIG_FILES"
+
+EOF
+cat >>$CONFIG_STATUS <<\EOF
+for ac_file in .. $CONFIG_FILES; do if test "x$ac_file" != x..; then
+ # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in".
+ case "$ac_file" in
+ *:*) ac_file_in=`echo "$ac_file" | sed 's%[^:]*:%%'`
+ ac_file=`echo "$ac_file" | sed 's%:.*%%'` ;;
+ *) ac_file_in="${ac_file}.in" ;;
+ esac
+
+ # Adjust a relative srcdir, top_srcdir, and INSTALL for subdirectories.
+
+ # Remove last slash and all that follows it. Not all systems have dirname.
+ ac_dir=`echo "$ac_file" | sed 's%/[^/][^/]*$%%'`
+ if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then
+ # The file is in a subdirectory.
+ test ! -d "$ac_dir" && mkdir "$ac_dir"
+ ac_dir_suffix="/`echo $ac_dir|sed 's%^\./%%'`"
+ # A "../" for each directory in $ac_dir_suffix.
+ ac_dots=`echo "$ac_dir_suffix" | sed 's%/[^/]*%../%g'`
+ else
+ ac_dir_suffix= ac_dots=
+ fi
+
+ case "$ac_given_srcdir" in
+ .) srcdir=.
+ if test -z "$ac_dots"; then top_srcdir=.
+ else top_srcdir=`echo $ac_dots | sed 's%/$%%'`; fi ;;
+ [\\/]* | ?:[\\/]* )
+ srcdir="$ac_given_srcdir$ac_dir_suffix";
+ top_srcdir=$ac_given_srcdir ;;
+ *) # Relative path.
+ srcdir="$ac_dots$ac_given_srcdir$ac_dir_suffix"
+ top_srcdir="$ac_dots$ac_given_srcdir" ;;
+ esac
+
+ case "$ac_given_INSTALL" in
+ [\\/$]* | ?:[\\/]* ) INSTALL="$ac_given_INSTALL" ;;
+ *) INSTALL="$ac_dots$ac_given_INSTALL" ;;
+ esac
+
+ echo creating "$ac_file"
+ rm -f "$ac_file"
+ configure_input="Generated automatically from `echo $ac_file_in |
+ sed 's%.*/%%'` by configure."
+ case "$ac_file" in
+ *[Mm]akefile*) ac_comsub="1i\\
+# $configure_input" ;;
+ *) ac_comsub= ;;
+ esac
+
+ # Don't redirect the output to AC_FILE directly: use `mv' so that updating
+ # is atomic, and doesn't need trapping.
+ ac_file_inputs=`echo "$ac_file_in" |
+ sed -e "s%:% $ac_given_srcdir/%g;s%^%$ac_given_srcdir/%"`
+ for ac_file_input in $ac_file_inputs;
+ do
+ test -f "$ac_file_input" ||
+ { echo "configure: error: cannot find input file \`$ac_file_input'" 1>&2; exit 1; }
+ done
+EOF
+cat >>$CONFIG_STATUS <<EOF
+ sed -e "$ac_comsub
+$ac_vpsub
+$extrasub
+EOF
+cat >>$CONFIG_STATUS <<\EOF
+:t
+/@[a-zA-Z_][a-zA-Z_0-9]*@/!b
+s%@configure_input@%$configure_input%;t t
+s%@srcdir@%$srcdir%;t t
+s%@top_srcdir@%$top_srcdir%;t t
+s%@INSTALL@%$INSTALL%;t t
+" $ac_file_inputs | (eval "$ac_sed_cmds") >$ac_cs_root.out
+ mv $ac_cs_root.out $ac_file
+
+fi; done
+rm -f $ac_cs_root.s*
+EOF
+cat >>$CONFIG_STATUS <<\EOF
+
+#
+# CONFIG_HEADER section.
+#
+
+# These sed commands are passed to sed as "A NAME B NAME C VALUE D", where
+# NAME is the cpp macro being defined and VALUE is the value it is being given.
+#
+# ac_d sets the value in "#define NAME VALUE" lines.
+ac_dA='s%^\([ ]*\)#\([ ]*define[ ][ ]*\)'
+ac_dB='[ ].*$%\1#\2'
+ac_dC=' '
+ac_dD='%;t'
+# ac_u turns "#undef NAME" without trailing blanks into "#define NAME VALUE".
+ac_uA='s%^\([ ]*\)#\([ ]*\)undef\([ ][ ]*\)'
+ac_uB='$%\1#\2define\3'
+ac_uC=' '
+ac_uD='%;t'
+
+for ac_file in .. $CONFIG_HEADERS; do if test "x$ac_file" != x..; then
+ # Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in".
+ case "$ac_file" in
+ *:*) ac_file_in=`echo "$ac_file" | sed 's%[^:]*:%%'`
+ ac_file=`echo "$ac_file" | sed 's%:.*%%'` ;;
+ *) ac_file_in="${ac_file}.in" ;;
+ esac
+
+ echo creating $ac_file
+
+ rm -f $ac_cs_root.frag $ac_cs_root.in $ac_cs_root.out
+ ac_file_inputs=`echo "$ac_file_in" |
+ sed -e "s%:% $ac_given_srcdir/%g;s%^%$ac_given_srcdir/%"`
+ for ac_file_input in $ac_file_inputs;
+ do
+ test -f "$ac_file_input" ||
+ { echo "configure: error: cannot find input file \`$ac_file_input'" 1>&2; exit 1; }
+ done
+ # Remove the trailing spaces.
+ sed -e 's/[ ]*$//' $ac_file_inputs >$ac_cs_root.in
+
+EOF
+
+# Transform confdefs.h into two sed scripts, `conftest.defines' and
+# `conftest.undefs', that substitutes the proper values into
+# config.h.in to produce config.h. The first handles `#define'
+# templates, and the second `#undef' templates.
+# And first: Protect against being on the right side of a sed subst in
+# config.status. Protect against being in an unquoted here document
+# in config.status.
+rm -f conftest.defines conftest.undefs
+ac_cs_root=conftest
+cat >$ac_cs_root.hdr <<\EOF
+s/[\\&%]/\\&/g
+s%[\\$`]%\\&%g
+t clear
+: clear
+s%^[ ]*#[ ]*define[ ][ ]*\(\([^ (][^ (]*\)([^)]*)\)[ ]*\(.*\)$%${ac_dA}\2${ac_dB}\1${ac_dC}\3${ac_dD}%gp
+t cleanup
+s%^[ ]*#[ ]*define[ ][ ]*\([^ ][^ ]*\)[ ]*\(.*\)$%${ac_dA}\1${ac_dB}\1${ac_dC}\2${ac_dD}%gp
+: cleanup
+EOF
+# If some macros were called several times there might be several times
+# the same #defines, which is useless. Nevertheless, we may not want to
+# sort them, since we want the *last* AC_DEFINE to be honored.
+uniq confdefs.h | sed -n -f $ac_cs_root.hdr >conftest.defines
+sed -e 's/ac_d/ac_u/g' conftest.defines >conftest.undefs
+rm -f $ac_cs_root.hdr
+
+# This sed command replaces #undef with comments. This is necessary, for
+# example, in the case of _POSIX_SOURCE, which is predefined and required
+# on some systems where configure will not decide to define it.
+cat >>conftest.undefs <<\EOF
+s%^[ ]*#[ ]*undef[ ][ ]*[a-zA-Z_][a-zA-Z_0-9]*%/* & */%
+EOF
+
+# Break up conftest.defines because some shells have a limit on the size
+# of here documents, and old seds have small limits too (100 cmds).
+echo ' # Handle all the #define templates only if necessary.' >>$CONFIG_STATUS
+echo ' if egrep "^[ ]*#[ ]*define" $ac_cs_root.in >/dev/null; then' >>$CONFIG_STATUS
+echo ' # If there are no defines, we may have an empty if/fi' >>$CONFIG_STATUS
+echo ' :' >>$CONFIG_STATUS
+rm -f conftest.tail
+while grep . conftest.defines >/dev/null
+do
+ # Write a limited-size here document to $ac_cs_root.frag.
+ echo ' cat >$ac_cs_root.frag <<CEOF' >>$CONFIG_STATUS
+ echo '/^[ ]*#[ ]*define/!b' >>$CONFIG_STATUS
+ sed ${ac_max_here_lines}q conftest.defines >>$CONFIG_STATUS
+ echo 'CEOF
+ sed -f $ac_cs_root.frag $ac_cs_root.in >$ac_cs_root.out
+ rm -f $ac_cs_root.in
+ mv $ac_cs_root.out $ac_cs_root.in
+' >>$CONFIG_STATUS
+ sed 1,${ac_max_here_lines}d conftest.defines >conftest.tail
+ rm -f conftest.defines
+ mv conftest.tail conftest.defines
+done
+rm -f conftest.defines
+echo ' fi # egrep' >>$CONFIG_STATUS
+echo >>$CONFIG_STATUS
+
+# Break up conftest.undefs because some shells have a limit on the size
+# of here documents, and old seds have small limits too (100 cmds).
+echo ' # Handle all the #undef templates' >>$CONFIG_STATUS
+rm -f conftest.tail
+while grep . conftest.undefs >/dev/null
+do
+ # Write a limited-size here document to $ac_cs_root.frag.
+ echo ' cat >$ac_cs_root.frag <<CEOF' >>$CONFIG_STATUS
+ echo '/^[ ]*#[ ]*undef/!b' >>$CONFIG_STATUS
+ sed ${ac_max_here_lines}q conftest.undefs >>$CONFIG_STATUS
+ echo 'CEOF
+ sed -f $ac_cs_root.frag $ac_cs_root.in >$ac_cs_root.out
+ rm -f $ac_cs_root.in
+ mv $ac_cs_root.out $ac_cs_root.in
+' >>$CONFIG_STATUS
+ sed 1,${ac_max_here_lines}d conftest.undefs >conftest.tail
+ rm -f conftest.undefs
+ mv conftest.tail conftest.undefs
+done
+rm -f conftest.undefs
+
+cat >>$CONFIG_STATUS <<\EOF
+ rm -f $ac_cs_root.frag $ac_cs_root.h
+ echo "/* $ac_file. Generated automatically by configure. */" >$ac_cs_root.h
+ cat $ac_cs_root.in >>$ac_cs_root.h
+ rm -f $ac_cs_root.in
+ if cmp -s $ac_file $ac_cs_root.h 2>/dev/null; then
+ echo "$ac_file is unchanged"
+ rm -f $ac_cs_root.h
+ else
+ # Remove last slash and all that follows it. Not all systems have dirname.
+ ac_dir=`echo "$ac_file" | sed 's%/[^/][^/]*$%%'`
+ if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then
+ # The file is in a subdirectory.
+ test ! -d "$ac_dir" && mkdir "$ac_dir"
+ fi
+ rm -f $ac_file
+ mv $ac_cs_root.h $ac_file
+ fi
+fi; done
+EOF
+cat >>$CONFIG_STATUS <<\EOF
+
+#
+# CONFIG_LINKS section.
+#
+srcdir=$ac_given_srcdir
+
+for ac_file in : $CONFIG_LINKS; do if test "x$ac_file" != x:; then
+ ac_dest=`echo "$ac_file" | sed 's%:.*%%'`
+ ac_source=`echo "$ac_file" | sed 's%[^:]*:%%'`
+
+ echo "copying $srcdir/$ac_source to $ac_dest"
+
+ if test ! -r $srcdir/$ac_source; then
+ { echo "configure: error: $srcdir/$ac_source: File not found" 1>&2; exit 1; }
+ fi
+ rm -f $ac_dest
+
+ # Make relative symlinks.
+ # Remove last slash and all that follows it. Not all systems have dirname.
+ ac_dest_dir=`echo $ac_dest | sed 's%/[^/][^/]*$%%'`
+ if test "$ac_dest_dir" != "$ac_dest" && test "$ac_dest_dir" != .; then
+ # The dest file is in a subdirectory.
+ test ! -d "$ac_dest_dir" && mkdir "$ac_dest_dir"
+ ac_dest_dir_suffix="/`echo $ac_dest_dir|sed 's%^\./%%'`"
+ # A "../" for each directory in $ac_dest_dir_suffix.
+ ac_dots=`echo $ac_dest_dir_suffix|sed 's%/[^/]*%../%g'`
+ else
+ ac_dest_dir_suffix= ac_dots=
+ fi
+
+ case "$srcdir" in
+ [\\/$]* | ?:[\\/]* ) ac_rel_source="$srcdir/$ac_source" ;;
+ *) ac_rel_source="$ac_dots$srcdir/$ac_source" ;;
+ esac
+
+ # Note: Dodgy local mods to 'make things work' in an environment (cygwin)
+ # that supports symlinks (through silly hack) using tools that don't
+ # understand them (mingw). The end sometimes justifies the means, son.
+ #
+ # Make a symlink if possible; otherwise try a hard link.
+ #if ln -s $ac_rel_source $ac_dest 2>/dev/null ||
+ # ln $srcdir/$ac_source $ac_dest; then :
+ #
+ # Note: If the -p offends your 'cp', just drop it; no harm done, you'll just
+ # get more recompilations.
+ #
+ if cp -p $srcdir/$ac_source $ac_dest; then :
+ else
+ { echo "configure: error: cannot copy $ac_dest to $srcdir/$ac_source" 1>&2; exit 1; }
+ fi
+fi; done
+EOF
+cat >>$CONFIG_STATUS <<\EOF
+
+#
+# CONFIG_COMMANDS section.
+#
+for ac_file in .. $CONFIG_COMMANDS; do if test "x$ac_file" != x..; then
+ ac_dest=`echo "$ac_file" | sed 's%:.*%%'`
+ ac_source=`echo "$ac_file" | sed 's%[^:]*:%%'`
+
+ case "$ac_dest" in
+ default-1 ) test -z "$CONFIG_HEADERS" || echo timestamp > stamp-h ;;
+ esac
+fi;done
+EOF
+
+cat >>$CONFIG_STATUS <<\EOF
+
+exit 0
+EOF
+chmod +x $CONFIG_STATUS
+rm -fr confdefs* $ac_clean_files
+trap 'exit 1' 1 2 15
+
+test "$no_create" = yes || $SHELL $CONFIG_STATUS || exit 1
diff --git a/rts/gmp/configure.in b/rts/gmp/configure.in
new file mode 100644
index 0000000000..18f610fe29
--- /dev/null
+++ b/rts/gmp/configure.in
@@ -0,0 +1,950 @@
+dnl Process this file with autoconf to produce a configure script.
+
+
+dnl Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+AC_REVISION($Revision: 1.8 $)dnl
+AC_PREREQ(2.14)dnl
+AC_INIT(gmp-impl.h)
+
+dnl Check system.
+AC_CANONICAL_SYSTEM
+
+dnl Automake
+AM_INIT_AUTOMAKE(gmp, GMP_VERSION)
+AM_CONFIG_HEADER(config.h:config.in)
+AM_MAINTAINER_MODE
+
+dnl GMP specific
+GMP_INIT(config.m4)
+
+
+AC_ARG_ENABLE(assert,
+AC_HELP_STRING([--enable-assert],[enable ASSERT checking [default=no]]),
+[case "${enableval}" in
+yes|no) ;;
+*) AC_MSG_ERROR([bad value ${enableval} for --enable-assert, need yes or no]) ;;
+esac],
+[enable_assert=no])
+
+if test "$enable_assert" = "yes"; then
+ AC_DEFINE(WANT_ASSERT,1,
+ [./configure --enable-assert option, to enable some ASSERT()s])
+fi
+
+
+AC_ARG_ENABLE(alloca,
+AC_HELP_STRING([--enable-alloca],[use alloca for temp space [default=yes]]),
+[case "${enableval}" in
+yes|no) ;;
+*) AC_MSG_ERROR([bad value ${enableval} for --enable-alloca, need yes or no]) ;;
+esac],
+[enable_alloca=yes])
+
+if test "$enable_alloca" = "no"; then
+ AC_DEFINE(USE_STACK_ALLOC,1,
+ [./configure --disable-alloca option, to use stack-alloc.c, not alloca])
+fi
+
+
+AC_ARG_ENABLE(fft,
+AC_HELP_STRING([--enable-fft],[enable FFTs for multiplication [default=no]]),
+[case "${enableval}" in
+yes|no) ;;
+*) AC_MSG_ERROR([bad value ${enableval} for --enable-fft, need yes or no]) ;;
+esac],
+[enable_fft=no])
+
+if test "$enable_fft" = "yes"; then
+ AC_DEFINE(WANT_FFT,1,
+ [./configure --enable-fft option, to enable FFTs for multiplication])
+fi
+
+
+AC_ARG_ENABLE(mpbsd,
+AC_HELP_STRING([--enable-mpbsd],[build Berkley MP compatibility library [default=no]]),
+[case "${enableval}" in
+yes|no) ;;
+*) AC_MSG_ERROR([bad value ${enableval} for --enable-mpbsd, need yes or no]) ;;
+esac],
+[enable_mpbsd=no])
+AM_CONDITIONAL(WANT_MPBSD, test "$enable_mpbsd" = "yes")
+
+
+AC_ARG_ENABLE(mpfr,
+AC_HELP_STRING([--enable-mpfr],[build MPFR [default=no]]),
+[case "${enableval}" in
+yes|no) ;;
+*) AC_MSG_ERROR([bad value ${enableval} for --enable-mpfr, need yes or no]) ;;
+esac],
+[enable_mpfr=no])
+AM_CONDITIONAL(WANT_MPFR, test "$enable_mpfr" = "yes")
+
+
+dnl Switch on OS and determine what compiler to use.
+dnl
+dnl os_64bit Set to "yes" if OS is 64-bit capable.
+dnl FIXME: Rename to `check_64bit_compiler'!
+dnl cclist List of compilers, best first.
+dnl gmp_cflags_{cc} Flags for compiler named {cc}.
+dnl gmp_cflags64_{cc} Flags for compiler named {cc} for 64-bit code.
+dnl gmp_optcflags_{cc} Optional compiler flags.
+dnl gmp_xoptcflags_{cc} Exclusive optional compiler flags.
+dnl
+os_64bit="no"
+cclist="gcc cc" # FIXME: Prefer c89 to cc.
+gmp_cflags_gcc="-g -O2"
+gmp_cflags64_gcc="-g -O2"
+gmp_cflags_cc="-g"
+gmp_cflags64_cc="-g"
+
+case "$target" in
+ # Alpha
+ alpha*-cray-unicos*)
+ # Don't perform any assembly syntax tests on this beast.
+ gmp_no_asm_syntax_testing=yes
+ cclist=cc
+ gmp_cflags_cc="$gmp_cflags_cc -O"
+ ;;
+ alpha*-*-osf*)
+ flavour=`echo $target_cpu | sed 's/^alpha//g'`
+ if test -n "$flavour"; then
+ case $flavour in # compilers don't seem to understand `ev67' and such.
+ ev6? | ev7*) flavour=ev6;;
+ esac
+ gmp_optcflags_gcc="-mcpu=$flavour"
+ # FIXME: We shouldn't fail fatally if none of these work, but that's
+ # how xoptcflags work and we don't have any other mechanism right now.
+ # Why do we need this here and not for alpha*-*-* below?
+ gmp_xoptcflags_gcc="-Wa,-arch,${flavour} -Wa,-m${flavour}"
+ gmp_optcflags_cc="-arch $flavour -tune $flavour"
+ fi
+ ;;
+ alpha*-*-*)
+ cclist="gcc"
+ flavour=`echo $target_cpu | sed 's/^alpha//g'`
+ if test -n "$flavour"; then
+ case $flavour in
+ ev6? | ev7*) flavour=ev6;;
+ esac
+ gmp_optcflags_gcc="-mcpu=$flavour"
+ fi
+ ;;
+ # Cray vector machines. This must come after alpha* so that we can
+ # recognize present and future vector processors with a wildcard.
+ *-cray-unicos*)
+ # Don't perform any assembly syntax tests on this beast.
+ gmp_no_asm_syntax_testing=yes
+ cclist=cc
+ # Don't inherit default gmp_cflags_cc value; it comes with -g which
+ # disables all optimization on Cray vector systems
+ gmp_cflags_cc="-O"
+ ;;
+
+ # AMD and Intel x86 configurations
+ [i?86*-*-* | k[5-8]*-*-* | pentium*-*-* | athlon-*-*])
+ # Rumour has it -O2 used to give worse register allocation than just -O.
+ gmp_cflags_gcc="-g -O -fomit-frame-pointer"
+
+ case "${target}" in
+ i386*-*-*) gmp_optcflags_gcc="-mcpu=i386 -march=i386";;
+ i486*-*-*) gmp_optcflags_gcc="-mcpu=i486 -march=i486";;
+ i586*-*-* | pentium-*-* | pentiummmx-*-*)
+ gmp_optcflags_gcc="-mcpu=pentium -march=pentium";;
+
+ # -march=pentiumpro not used because mpz/powm.c (swox cvs rev 1.4)
+ # tickles a bug in gcc 2.95.2 (believed fixed in 2.96).
+ [i686*-*-* | pentiumpro-*-* | pentium[23]-*-*])
+ gmp_optcflags_gcc="-mcpu=pentiumpro";;
+
+ k6*-*-*) gmp_optcflags_gcc="-mcpu=k6 -march=k6";;
+
+ # Athlon instruction costs are close to p6: 3 cycle load latency, 4-6
+ # cycle mul, 40 cycle div, pairable adc, ...
+ # FIXME: Change this when gcc gets something specific for Athlon.
+ # -march=pentiumpro not used, per i686 above.
+ athlon-*-*) gmp_optcflags_gcc="-mcpu=pentiumpro";;
+ esac
+ ;;
+
+ # Sparc
+ [ultrasparc*-*-solaris2.[7-9] | sparcv9-*-solaris2.[7-9]])
+ os_64bit=yes
+ gmp_cflags_gcc="$gmp_cflags_gcc -Wa,-xarch=v8plus"
+ gmp_xoptcflags_gcc="-mcpu=v9 -mcpu=v8 -mv8"
+ gmp_cflags64_gcc="$gmp_cflags64_gcc -m64 -mptr64 -Wa,-xarch=v9 -mcpu=v9"
+ gmp_cflags_cc="-xtarget=native -xarch=v8 -xO4"
+ gmp_cflags64_cc="-xtarget=native -xarch=v9 -xO4"
+ ;;
+ sparc64-*-linux*)
+ # Need to think more about the options passed here. This isn't good for
+ # some sparc64 linux distros, since we end up not optimizing when all the
+ # options below fail.
+ os_64bit=yes
+ gmp_cflags64_gcc="$gmp_cflags64_gcc -m64 -mptr64 -Wa,-xarch=v9 -mcpu=v9"
+ gmp_cflags_gcc="$gmp_cflags_gcc -m32"
+ gmp_xoptflags_gcc="-mcpu=ultrasparc -mvis"
+ ;;
+ ultrasparc*-*-* | sparcv9-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -Wa,-xarch=v8plus"
+ gmp_xoptcflags_gcc="-mcpu=v9 -mcpu=v8 -mv8"
+ gmp_cflags_cc="-xtarget=native -xarch=v8 -xO4"
+ ;;
+ sparcv8*-*-solaris2.* | microsparc*-*-solaris2.*)
+ gmp_cflags_gcc="$gmp_cflags_gcc"
+ gmp_xoptcflags_gcc="-mcpu=v8 -mv8"
+ gmp_cflags_cc="-xtarget=native -xarch=v8 -xO4"
+ ;;
+ sparcv8*-*-* | microsparc*-*-*) # SunOS, Linux, *BSD
+ cclist="gcc acc cc"
+ gmp_cflags_gcc="$gmp_cflags_gcc"
+ gmp_xoptcflags_gcc="-mcpu=v8 -mv8"
+ gmp_cflags_acc="-g -O2 -cg92"
+ gmp_cflags_cc="-O2" # FIXME: Flag for v8?
+ ;;
+ supersparc*-*-solaris2.*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -DSUPERSPARC"
+ gmp_xoptcflags_gcc="-mcpu=v8 -mv8"
+ gmp_cflags_cc="-xtarget=native -xarch=v8 -xO4 -DSUPERSPARC"
+ ;;
+ supersparc*-*-*) # SunOS, Linux, *BSD
+ cclist="gcc acc cc"
+ gmp_cflags_gcc="$gmp_cflags_gcc -DSUPERSPARC"
+ gmp_xoptcflags_gcc="-mcpu=v8 -mv8"
+ gmp_cflags_acc="-g -O2 -cg92 -DSUPERSPARC"
+ gmp_cflags_cc="-O2 -DSUPERSPARC" # FIXME: Flag for v8?
+ ;;
+ *sparc*-*-*)
+ cclist="gcc acc cc"
+ gmp_cflags_acc="-g -O2"
+ gmp_cflags_cc="-g -O2"
+ ;;
+
+ # POWER/PowerPC
+ powerpc64-*-aix*)
+ cclist="gcc xlc"
+ gmp_cflags_gcc="$gmp_cflags_gcc -maix64 -mpowerpc64"
+ gmp_cflags_xlc="-g -O2 -q64 -qtune=pwr3"
+ ;;
+ powerpc*-*-aix*)
+ cclist="gcc xlc"
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpowerpc"
+ gmp_cflags_xlc="$gmp_cflags_cc -qarch=ppc -O2"
+ ;;
+ power-*-aix*)
+ cclist="gcc xlc"
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpower"
+ gmp_cflags_xlc="$gmp_cflags_cc -qarch=pwr -O2"
+ ;;
+ powerpc64*-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpowerpc64"
+ AC_DEFINE(_LONG_LONG_LIMB) dnl FIXME: Remove.
+ ;;
+ powerpc-apple-darwin* | powerpc-apple-macosx*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpowerpc -traditional-cpp"
+ ;;
+ powerpc*-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -mpowerpc"
+ ;;
+
+ # MIPS
+ mips-sgi-irix6.*)
+ os_64bit=yes
+ gmp_cflags64_gcc="-g -O2 -mabi=n32"
+ gmp_cflags64_cc="$gmp_cflags64_cc -O2 -n32"
+ ;;
+
+ # Motorola 68k family
+ m88110*-*-*)
+ gmp_cflags_gcc="-g -O -m88110" dnl FIXME: Use `-O2'?
+ ;;
+ m68*-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -fomit-frame-pointer"
+ ;;
+
+ # HP
+ hppa1.0*-*-*)
+ cclist="gcc c89 cc"
+ gmp_cflags_c89="$gmp_cflags_cc +O2"
+ gmp_cflags_cc="$gmp_cflags_cc +O2"
+ ;;
+ hppa2.0w*-*-*)
+ cclist="c89 cc"
+ gmp_cflags_c89="+DD64 +O3"
+ gmp_cflags_cc="+DD64 +O3"
+ ;;
+ hppa2.0*-*-*)
+ os_64bit=yes
+ cclist="gcc c89 cc"
+ gmp_cflags64_gcc="$gmp_cflags64_gcc -mWHAT -D_LONG_LONG_LIMB"
+ # +O2 to cc triggers bug in mpz/powm.c (1.4)
+ gmp_cflags64_c89="+DA2.0 +e +O3 -D_LONG_LONG_LIMB"
+ gmp_cflags64_cc="+DA2.0 +e +O3 -D_LONG_LONG_LIMB"
+ gmp_cflags_c89="$gmp_cflags_cc +O2"
+ gmp_cflags_cc="$gmp_cflags_cc +O2"
+ ;;
+
+ # VAX
+ vax*-*-*)
+ gmp_cflags_gcc="$gmp_cflags_gcc -fomit-frame-pointer"
+ ;;
+
+ # Fujitsu
+ [f30[01]-fujitsu-sysv*])
+ cclist="gcc vcc"
+ gmp_cflags_vcc="-g" # FIXME: flags for vcc?
+ ;;
+esac
+
+case "${target}" in
+ *-*-mingw32) gmp_cflags_gcc="$gmp_cflags_gcc -mno-cygwin";;
+esac
+
+dnl Check for programs needed by macros for finding compiler.
+dnl More programs are checked for below, when a compiler is found.
+AC_PROG_NM dnl Macro from Libtool.
+# nm on 64-bit AIX needs to know the object file format
+case "$target" in
+ powerpc64*-*-aix*)
+ NM="$NM -X 64"
+ ;;
+esac
+
+# Save CFLAGS given on command line.
+gmp_user_CFLAGS="$CFLAGS"
+
+if test -z "$CC"; then
+ # Find compiler.
+ GMP_PROG_CC_FIND($cclist, $os_64bit)
+
+ # If 64-bit OS and we have a 64-bit compiler, use it.
+ if test -n "$os_64bit" && test -n "$CC64"; then
+ CC=$CC64
+ CFLAGS=$CFLAGS64
+ else
+ eval CFLAGS=\$gmp_cflags_$CC
+ fi
+
+ # Try compiler flags that may work with only some compiler versions.
+ # gmp_optcflags: All or nothing.
+ eval optcflags=\$gmp_optcflags_$CC
+ if test -n "$optcflags"; then
+ CFLAGS_save="$CFLAGS"
+ CFLAGS="$CFLAGS $optcflags"
+ AC_MSG_CHECKING([whether $CC accepts $optcflags])
+ AC_LANG_C
+ AC_TRY_COMPILER([int main(){return(0);}], optok, cross)
+ if test "$optok" = "yes"; then
+ AC_MSG_RESULT([yes])
+ else
+ AC_MSG_RESULT([no])
+ CFLAGS="$CFLAGS_save"
+ fi
+ fi
+ # gmp_xoptcflags: First is best, one has to work.
+ eval xoptcflags=\$gmp_xoptcflags_$CC
+ if test -n "$xoptcflags"; then
+ gmp_found="no"
+ for xopt in $xoptcflags; do
+ CFLAGS_save="$CFLAGS"
+ CFLAGS="$CFLAGS $xopt"
+ AC_MSG_CHECKING([whether $CC accepts $xopt])
+ AC_LANG_C
+ AC_TRY_COMPILER([int main(){return(0);}], optok, cross)
+ if test "$optok" = "yes"; then
+ AC_MSG_RESULT([yes])
+ gmp_found="yes"
+ break
+ else
+ AC_MSG_RESULT([no])
+ CFLAGS="$CFLAGS_save"
+ fi
+ done
+ if test "$gmp_found" = "no"; then
+ echo ["$0: fatal: need a compiler that understands one of $xoptcflags"]
+ exit 1
+ fi
+ fi
+fi
+
+# Restore CFLAGS given on command line.
+# FIXME: We've run through quite some unnecessary code looking for a
+# nice compiler and working flags for it, just to spoil that with user
+# supplied flags.
+test -n "$gmp_user_CFLAGS" && CFLAGS="$gmp_user_CFLAGS"
+
+# Select chosen compiler.
+GMP_PROG_CC_SELECT
+
+# How to assemble.
+CCAS="$CC -c"
+AC_SUBST(CCAS)
+
+dnl Checks for programs.
+dnl --------------------
+AC_PROG_CPP
+AC_PROG_INSTALL
+AC_PROG_LN_S
+GMP_PROG_M4
+AC_CHECK_PROG(AR, ar, ar)
+# ar on AIX needs to know the object file format
+case "$target" in
+ powerpc64*-*-aix*)
+ AR="$AR -X 64"
+ ;;
+esac
+dnl FIXME: Find good ld? /usr/ucb/ld on Solaris won't work.
+
+dnl Checks for assembly syntax.
+if test "$gmp_no_asm_syntax_testing" != "yes"; then
+ GMP_CHECK_ASM_TEXT
+ GMP_CHECK_ASM_DATA
+ GMP_CHECK_ASM_GLOBL
+ GMP_CHECK_ASM_LABEL_SUFFIX
+ GMP_CHECK_ASM_TYPE
+ GMP_CHECK_ASM_SIZE
+ GMP_CHECK_ASM_LSYM_PREFIX
+ GMP_CHECK_ASM_W32
+ GMP_CHECK_ASM_UNDERSCORE(underscore=yes, underscore=no)
+ GMP_CHECK_ASM_ALIGN_LOG(asm_align=log, asm_align=nolog)
+fi
+
+dnl FIXME: Check for FPU and set `floating_point' appropriately.
+
+dnl ========================================
+dnl Configuring mpn.
+dnl ----------------------------------------
+dnl Set the following target specific variables:
+dnl path where to search for source files
+dnl family processor family (Needed for building
+dnl asm-syntax.h for now. FIXME: Remove.)
+dnl extra_functions extra functions
+
+family=generic
+
+case ${target} in
+ arm*-*-*)
+ path="arm"
+ ;;
+ [sparcv9*-*-solaris2.[789]* | sparc64*-*-solaris2.[789]* | ultrasparc*-*-solaris2.[789]*])
+ if test -n "$CC64"
+ then path="sparc64"
+ else path="sparc32/v9 sparc32/v8 sparc32"
+ fi
+ ;;
+ sparc64-*-linux*)
+ if test -n "$CC64"
+ then path="sparc64"
+ else path="sparc32/v9 sparc32/v8 sparc32"
+ fi
+ ;;
+ sparcv8*-*-* | microsparc*-*-*)
+ path="sparc32/v8 sparc32"
+ if test x${floating_point} = xno
+ then extra_functions="udiv_nfp"
+ else extra_functions="udiv_fp"
+ fi
+ ;;
+ sparcv9*-*-* | ultrasparc*-*-*)
+ path="sparc32/v9 sparc32/v8 sparc32"
+ extra_functions="udiv_fp"
+ ;;
+ supersparc*-*-*)
+ path="sparc32/v8/supersparc sparc32/v8 sparc32"
+ extra_functions="udiv"
+ ;;
+ sparc*-*-*) path="sparc32"
+ if test x${floating_point} = xno
+ then extra_functions="udiv_nfp"
+ else extra_functions="udiv_fp"
+ fi
+ ;;
+ hppa7000*-*-*)
+ path="hppa/hppa1_1 hppa"
+ extra_functions="udiv_qrnnd"
+ ;;
+ hppa1.0*-*-*)
+ path="hppa"
+ extra_functions="udiv_qrnnd"
+ ;;
+ hppa2.0w-*-*)
+ path="pa64w"
+ extra_functions="umul_ppmm udiv_qrnnd"
+ ;;
+ hppa2.0*-*-*)
+ if test -n "$CC64"; then
+ path="pa64"
+ extra_functions="umul_ppmm udiv_qrnnd"
+ # We need to use the system compiler, or actually the system assembler,
+ # since GAS has not been ported to understand the 2.0 instructions.
+ CCAS="$CC64 -c"
+ else
+ # FIXME: path should be "hppa/hppa2_0 hppa/hppa1_1 hppa"
+ path="hppa/hppa1_1 hppa"
+ extra_functions="udiv_qrnnd"
+ fi
+ ;;
+ hppa*-*-*) #assume pa7100
+ path="hppa/hppa1_1/pa7100 hppa/hppa1_1 hppa"
+ extra_functions="udiv_qrnnd";;
+ [f30[01]-fujitsu-sysv*])
+ path=fujitsu;;
+ alphaev6*-*-*) path="alpha/ev6 alpha"; extra_functions="invert_limb cntlz";;
+ alphaev5*-*-*) path="alpha/ev5 alpha"; extra_functions="invert_limb cntlz";;
+ alpha*-*-*) path="alpha"; extra_functions="invert_limb cntlz";;
+ # Cray vector machines. This must come after alpha* so that we can
+ # recognize present and future vector processors with a wildcard.
+ *-cray-unicos*)
+ path="cray"
+ extra_functions="mulww";;
+ am29000*-*-*) path="a29k";;
+ a29k*-*-*) path="a29k";;
+
+ # AMD and Intel x86 configurations
+
+ [i?86*-*-* | k[5-8]*-*-* | pentium*-*-* | athlon-*-*])
+ gmp_m4postinc="x86/x86-defs.m4"
+ extra_functions="udiv umul"
+ CALLING_CONVENTIONS_OBJS="x86call.o x86check.o"
+
+ GMP_CHECK_ASM_SHLDL_CL(
+ [GMP_DEFINE(WANT_SHLDL_CL,1)],
+ [GMP_DEFINE(WANT_SHLDL_CL,0)])
+ GMP_CHECK_ASM_ALIGN_FILL_0x90
+
+ # the CPUs below wanting to know about mmx
+ case ${target} in
+ [pentiummmx-*-* | pentium[23]-*-* | k6*-*-* | athlon-*-*])
+ GMP_CHECK_ASM_MMX(tmp_mmx=yes, tmp_mmx=no)
+ ;;
+ esac
+
+ # default for anything not otherwise mentioned
+ path="x86"
+
+ case ${target} in
+ [i[34]86*-*-*])
+ path="x86"
+ ;;
+ k5*-*-*)
+ # don't know what best suits k5
+ path="x86"
+ ;;
+ i586*-*-* | pentium-*-*)
+ path="x86/pentium x86"
+ ;;
+ pentiummmx-*-*)
+ path="x86/pentium x86"
+ if test "$tmp_mmx" = yes; then
+ path="x86/pentium/mmx $path"
+ fi
+ ;;
+ i686*-*-* | pentiumpro-*-*)
+ path="x86/p6 x86"
+ ;;
+ pentium2-*-*)
+ path="x86/p6 x86"
+ # The pentium/mmx lshift and rshift are good on p6 and can be used
+ # until there's something specific for p6.
+ if test "$tmp_mmx" = yes; then
+ path="x86/p6/mmx x86/pentium/mmx $path"
+ fi
+ ;;
+ pentium3-*-*)
+ path="x86/p6 x86"
+ # The pentium/mmx lshift and rshift are good on p6 and can be used
+ # until there's something specific for p6.
+ if test "$tmp_mmx" = yes; then
+ path="x86/p6/p3mmx x86/p6/mmx x86/pentium/mmx $path"
+ fi
+ ;;
+ [k6[23]*-*-*])
+ path="x86/k6 x86"
+ if test "$tmp_mmx" = yes; then
+ path="x86/k6/k62mmx x86/k6/mmx $path"
+ fi
+ ;;
+ k6*-*-*)
+ path="x86/k6 x86"
+ if test "$tmp_mmx" = yes; then
+ path="x86/k6/mmx $path"
+ fi
+ ;;
+ athlon-*-*)
+ path="x86/k7 x86"
+ if test "$tmp_mmx" = yes; then
+ path="x86/k7/mmx $path"
+ fi
+ ;;
+ esac
+ ;;
+
+
+ i960*-*-*) path="i960";;
+
+ ia64*-*-*) path="ia64";;
+
+# Motorola 68k configurations. Let m68k mean 68020-68040.
+ [m680[234]0*-*-* | m68k*-*-* | \
+ m68*-next-nextstep*]) # Nexts are at least '020
+ path="m68k/mc68020 m68k"
+ family=m68k
+ ;;
+ m68000*-*-*)
+ path="m68k"
+ family=m68k
+ ;;
+
+ m88k*-*-* | m88k*-*-*) path="m88k";;
+ m88110*-*-*) path="m88k/mc88110 m88k";;
+ ns32k*-*-*) path="ns32k";;
+
+ pyramid-*-*) path="pyr";;
+
+ ppc601-*-*) path="power powerpc32";;
+ powerpc64*-*-*) path="powerpc64";;
+ powerpc*-*-*) path="powerpc32";;
+ rs6000-*-* | power-*-* | power2-*-*)
+ path="power"
+ extra_functions="udiv_w_sdiv"
+ ;;
+
+ sh-*-*) path="sh";;
+ sh2-*-*) path="sh/sh2 sh";;
+
+ [mips[34]*-*-*]) path="mips3";;
+ mips*-*-irix6*) path="mips3";;
+ mips*-*-*) path="mips2";;
+
+ vax*-*-*) path="vax"; extra_functions="udiv_w_sdiv";;
+
+ z8000x*-*-*) path="z8000x"; extra_functions="udiv_w_sdiv";;
+ z8000*-*-*) path="z8000"; extra_functions="udiv_w_sdiv";;
+
+ clipper*-*-*) path="clipper";;
+esac
+
+AC_SUBST(CALLING_CONVENTIONS_OBJS)
+if test -n "$CALLING_CONVENTIONS_OBJS"; then
+ AC_DEFINE(HAVE_CALLING_CONVENTIONS,1,
+ [Define if mpn/tests has calling conventions checking for the CPU])
+fi
+
+
+case ${target} in
+ [i[5-8]86*-*-* | k[5-8]*-*-* | pentium*-*-* | athlon-*-*])
+ # rdtsc is in pentium and up, not in i386 and i486
+ SPEED_CYCLECOUNTER_OBJS=pentium.lo
+ ;;
+ alpha*-*-*)
+ SPEED_CYCLECOUNTER_OBJS=alpha.lo
+ ;;
+ sparcv9*-*-* | ultrasparc*-*-* | sparc64*-*-*)
+ SPEED_CYCLECOUNTER_OBJS=sparcv9.lo
+ ;;
+ hppa2*-*-*)
+ SPEED_CYCLECOUNTER_OBJS=hppa2.lo
+ ;;
+ hppa*-*-*)
+ SPEED_CYCLECOUNTER_OBJS=hppa.lo
+ ;;
+esac
+
+AC_SUBST(SPEED_CYCLECOUNTER_OBJS)
+
+if test -n "$SPEED_CYCLECOUNTER_OBJS"
+then
+ AC_DEFINE(HAVE_SPEED_CYCLECOUNTER, 1,
+ [Define if a speed_cyclecounter exists (for the tune programs)])
+fi
+
+
+dnl Extensions for executable and object files.
+dnl -------------------------------------------
+AC_EXEEXT
+AC_OBJEXT
+
+dnl Use Libtool.
+dnl ------------
+dnl FIXME: Shared libs seem to fail on aix4.3.
+dnl FIXME: Should invoke [AC_DISABLE_SHARED], but m4 recurses to death.
+case "$target" in
+ [*-*-aix4.[3-9]*]) enable_shared=no ;;
+esac
+AC_PROG_LIBTOOL
+
+dnl Checks for libraries.
+dnl ---------------------
+AC_CHECK_DECLS((optarg))
+
+dnl Checks for header files.
+dnl ------------------------
+AC_HEADER_STDC
+AC_CHECK_HEADERS(getopt.h unistd.h sys/sysctl.h sys/time.h)
+
+dnl Checks for typedefs, structures, and compiler characteristics.
+dnl --------------------------------------------------------------
+AC_CHECK_TYPES((void))
+AC_C_STRINGIZE
+
+dnl Checks for library functions.
+dnl -----------------------------
+dnl Most of these are only for the benefit of supplementary programs. The
+dnl library itself doesn't use anything weird.
+dnl AC_FUNC_MEMCMP
+dnl AC_TYPE_SIGNAL
+dnl AC_CHECK_FUNCS(strtol)
+AC_CHECK_FUNCS(getopt_long getpagesize popen processor_info strtoul sysconf sysctlbyname)
+
+dnl Trick automake into thinking we've run AM_C_PROTOTYPES which it wants
+dnl for ansi2knr, and instead use our own test. (It's only a warning
+dnl automake prints, but it's good to suppress it.)
+ifelse(0,1,[
+AM_C_PROTOTYPES
+])
+GMP_C_ANSI2KNR
+
+
+dnl Set `syntax' to one of <blank>, "mit", "elf", "aix", "macho".
+syntax=
+# For now, we use the old switch for setting syntax.
+# FIXME: Remove when conversion to .asm is completed.
+changequote(,)dnl
+case "${target}" in
+ m680[234]0*-*-linuxaout* | m68k*-*-linuxaout* | \
+ m68k-next-nextstep* | \
+ m68000*-*-*)
+ syntax=mit
+ ;;
+ m680[234]0*-*-linux* | m68k*-*-linux*)
+ syntax=elf
+ ;;
+ m680[234]0*-*-* | m68k*-*-*)
+ syntax=mit
+ ;;
+esac
+changequote([,])dnl
+
+dnl ----------------------------------------
+# Now build an asm-syntax.h file for targets that include that from the
+# assembly files.
+# FIXME: Remove when conversion to .asm is completed.
+case "${family}-${underscore}-${asm_align}-${syntax}" in
+ m68k-yes-log-mit)
+ echo '#define MIT_SYNTAX' >asm-syntax.h
+ cat $srcdir/mpn/underscore.h >>asm-syntax.h
+ echo '#include "'$srcdir'/mpn/m68k/syntax.h"' >>asm-syntax.h;;
+ m68k-no-nolog-elf)
+ echo '#define ELF_SYNTAX' >asm-syntax.h
+ echo '#define C_SYMBOL_NAME(name) name' >>asm-syntax.h
+ echo '#include "'$srcdir'/mpn/m68k/syntax.h"' >>asm-syntax.h;;
+esac
+
+
+# The pattern here tests for an absolute path the same way as
+# _AC_OUTPUT_FILES in autoconf acgeneral.m4.
+GMP_DEFINE_RAW(["dnl CONFIG_TOP_SRCDIR is a path from the mpn builddir to the top srcdir"])
+case "$srcdir" in
+[[\\/]]* | ?:[[\\/]]* )
+ GMP_DEFINE_RAW(["define(<CONFIG_TOP_SRCDIR>,<\`$srcdir'>)"]) ;;
+*) GMP_DEFINE_RAW(["define(<CONFIG_TOP_SRCDIR>,<\`../$srcdir'>)"]) ;;
+esac
+
+GMP_DEFINE_RAW(["include(CONFIG_TOP_SRCDIR\`/mpn/asm-defs.m4')"], POST)
+
+# Must be after asm-defs.m4
+GMP_DEFINE_RAW("define_not_for_expansion(\`HAVE_TARGET_CPU_$target_cpu')", POST)
+
+
+dnl config.m4 post-includes
+dnl -----------------------
+dnl (Note x86 post include set with $path above.)
+changequote(,)dnl
+case "$target" in
+ alpha*-cray-unicos*)
+ gmp_m4postinc="alpha/unicos.m4"
+ ;;
+ alpha*-*-*)
+ gmp_m4postinc="alpha/default.m4"
+ ;;
+ power*-*-*)
+ case "$target" in
+ *-*-mach* | *-*-rhapsody* | *-*-nextstep* | *-*-darwin* | *-*-macosx*)
+ ;; # these use non-conventional assembly syntax.
+ powerpc64-*-aix*)
+ gmp_m4postinc="powerpc32/regmap.m4 powerpc64/aix.m4"
+ ;;
+ *-*-aix*)
+ gmp_m4postinc="powerpc32/regmap.m4 powerpc32/aix.m4"
+ ;;
+ *)
+ gmp_m4postinc="powerpc32/regmap.m4"
+ ;;
+ esac
+ ;;
+esac
+changequote([, ])dnl
+
+for tmp_f in $gmp_m4postinc; do
+ GMP_DEFINE_RAW(["include_mpn(\`$tmp_f')"], POST)
+done
+
+
+# Set up `gmp_links'. It's a list of link:file pairs that configure will
+# process to create link -> file.
+gmp_links=
+
+# If the user specified `MPN_PATH', use that instead of the path we've
+# come up with.
+if test -z "$MPN_PATH"; then
+ path="$path generic"
+else
+ path="$MPN_PATH"
+fi
+
+# Pick the correct source files in $path and link them to mpn/.
+# $gmp_mpn_functions lists all functions we need.
+#
+# The rule is to find a file with the function name and a .asm, .S,
+# .s, or .c extension. Certain multi-function files with special names
+# can provide some functions too. (mpn/Makefile.am passes
+# -DOPERATION_<func> to get them to generate the right code.)
+
+# FIXME: udiv and umul aren't in $gmp_mpn_functions_optional yet since
+# there's some versions of those files which should be checked for bit
+# rot first. Put them in $extra_functions for each target for now,
+# change to standard optionals when all are ready.
+
+# Note: The following lines defining $gmp_mpn_functions_optional
+# and $gmp_mpn_functions are parsed by the "macos/configure"
+# Perl script. So if you change the lines in a major way
+# make sure to run and examine the output from
+#
+# % (cd macos; perl configure)
+
+gmp_mpn_functions_optional="copyi copyd com_n \
+ and_n andn_n nand_n ior_n iorn_n nior_n xor_n xnor_n"
+
+gmp_mpn_functions="${extra_functions} inlines add_n sub_n mul_1 addmul_1 \
+ submul_1 lshift rshift diveby3 divrem divrem_1 divrem_2 \
+ mod_1 mod_1_rs pre_mod_1 dump \
+ mul mul_fft mul_n mul_basecase sqr_basecase random \
+ random2 sqrtrem get_str set_str scan0 scan1 popcount hamdist cmp perfsqr \
+ bdivmod gcd_1 gcd gcdext tdiv_qr bz_divrem_n sb_divrem_mn jacbase \
+ $gmp_mpn_functions_optional"
+
+# the list of all object files used by mpn/Makefile.in and the
+# top-level Makefile.in, respectively
+mpn_objects=
+mpn_objs_in_libgmp="mpn/mp_bases.lo"
+
+for tmp_fn in ${gmp_mpn_functions} ; do
+ [rm -f mpn/${tmp_fn}.[Ssc] mpn/${tmp_fn}.asm]
+
+ # functions that can be provided by multi-function files
+ tmp_mulfunc=
+ case $tmp_fn in
+ add_n|sub_n) tmp_mulfunc="aors_n" ;;
+ addmul_1|submul_1) tmp_mulfunc="aorsmul_1" ;;
+ popcount|hamdist) tmp_mulfunc="popham" ;;
+ and_n|andn_n|nand_n | ior_n|iorn_n|nior_n | xor_n|xnor_n)
+ tmp_mulfunc="logops_n" ;;
+ esac
+
+ found=no
+ for tmp_dir in $path; do
+ for tmp_base in $tmp_fn $tmp_mulfunc; do
+ for tmp_ext in asm S s c; do
+ tmp_file=$srcdir/mpn/$tmp_dir/$tmp_base.$tmp_ext
+ if test -f $tmp_file; then
+ found=yes
+
+ mpn_objects="$mpn_objects ${tmp_fn}.lo"
+ mpn_objs_in_libgmp="$mpn_objs_in_libgmp mpn/${tmp_fn}.lo"
+ gmp_links="$gmp_links mpn/$tmp_fn.$tmp_ext:mpn/$tmp_dir/$tmp_base.$tmp_ext"
+
+ # duplicate AC_DEFINEs are harmless, so it doesn't matter
+ # that multi-function files get grepped here repeatedly
+ gmp_ep=["`
+ sed -n 's/^[ ]*MULFUNC_PROLOGUE(\(.*\))/\1/p' $tmp_file ;
+ sed -n 's/^[ ]*PROLOGUE.*(\(.*\))/\1/p' $tmp_file
+ `"]
+ for gmp_tmp in $gmp_ep; do
+ AC_DEFINE_UNQUOTED(HAVE_NATIVE_${gmp_tmp})
+ done
+
+ break
+ fi
+ done
+ if test $found = yes; then break ; fi
+ done
+ if test $found = yes; then break ; fi
+ done
+
+ if test $found = no; then
+ for tmp_optional in $gmp_mpn_functions_optional; do
+ if test $tmp_optional = $tmp_fn; then
+ found=yes
+ fi
+ done
+ if test $found = no; then
+ AC_MSG_ERROR([no version of $tmp_fn found in path: $path])
+ fi
+ fi
+done
+
+# Create link for gmp-mparam.h.
+for tmp_dir in $path ; do
+ rm -f gmp-mparam.h
+ if test -f $srcdir/mpn/${tmp_dir}/gmp-mparam.h ; then
+ gmp_links="$gmp_links gmp-mparam.h:mpn/${tmp_dir}/gmp-mparam.h"
+
+ # Copy any KARATSUBA_SQR_THRESHOLD in gmp-mparam.h to config.m4.
+ # Some versions of sqr_basecase.asm use this.
+ tmp_gmp_karatsuba_sqr_threshold="`sed -n 's/^#define KARATSUBA_SQR_THRESHOLD[ ]*\([0-9][0-9]*\).*$/\1/p' $srcdir/mpn/${tmp_dir}/gmp-mparam.h`"
+ if test -n "$tmp_gmp_karatsuba_sqr_threshold"; then
+ GMP_DEFINE_RAW(["define(<KARATSUBA_SQR_THRESHOLD>,<$tmp_gmp_karatsuba_sqr_threshold>)"])
+ fi
+
+ break
+ fi
+done
+
+# Dig out the links from `gmp_links' for inclusion in DISTCLEANFILES.
+gmp_srclinks=
+for f in $gmp_links; do
+ gmp_srclinks="$gmp_srclinks `echo $f | sed 's/\(.*\):.*/\1/'`"
+done
+
+AC_SUBST(mpn_objects)
+AC_SUBST(mpn_objs_in_libgmp)
+AC_SUBST(gmp_srclinks)
+
+dnl ----------------------------------------
+dnl Make links.
+AC_CONFIG_LINKS($gmp_links)
+
+dnl Create config.m4.
+GMP_FINISH
+
+dnl Create Makefiles
+dnl FIXME: Upcoming version of autoconf/automake may not like broken lines.
+AC_OUTPUT(Makefile mpz/Makefile mpn/Makefile)
diff --git a/rts/gmp/depcomp b/rts/gmp/depcomp
new file mode 100644
index 0000000000..7906096738
--- /dev/null
+++ b/rts/gmp/depcomp
@@ -0,0 +1,269 @@
+#! /bin/sh
+
+# depcomp - compile a program generating dependencies as side-effects
+# Copyright (C) 1999 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+
+# Originally written by Alexandre Oliva <oliva@dcc.unicamp.br>.
+
+if test -z "$depmode" || test -z "$source" || test -z "$object"; then
+ echo "depcomp: Variables source, object and depmode must be set" 1>&2
+ exit 1
+fi
+# `libtool' can also be set to `yes' or `no'.
+
+depfile=${depfile-`echo "$object" | sed 's,\([^/]*\)$,.deps/\1,;s/\.\([^.]*\)$/.P\1/'`}
+tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`}
+
+rm -f "$tmpdepfile"
+
+# Some modes work just like other modes, but use different flags. We
+# parameterize here, but still list the modes in the big case below,
+# to make depend.m4 easier to write. Note that we *cannot* use a case
+# here, because this file can only contain one case statement.
+if test "$depmode" = hp; then
+ # HP compiler uses -M and no extra arg.
+ gccflag=-M
+ depmode=gcc
+fi
+
+if test "$depmode" = dashXmstdout; then
+ # This is just like dashmstdout with a different argument.
+ dashmflag=-xM
+ depmode=dashmstdout
+fi
+
+case "$depmode" in
+gcc)
+## There are various ways to get dependency output from gcc. Here's
+## why we pick this rather obscure method:
+## - Don't want to use -MD because we'd like the dependencies to end
+## up in a subdir. Having to rename by hand is ugly.
+## (We might end up doing this anyway to support other compilers.)
+## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like
+## -MM, not -M (despite what the docs say).
+## - Using -M directly means running the compiler twice (even worse
+## than renaming).
+ if test -z "$gccflag"; then
+ gccflag=-MD,
+ fi
+ if "$@" -Wp,"$gccflag$tmpdepfile"; then :
+ else
+ stat=$?
+ rm -f "$tmpdepfile"
+ exit $stat
+ fi
+ rm -f "$depfile"
+ echo "$object : \\" > "$depfile"
+ sed 's/^[^:]*: / /' < "$tmpdepfile" >> "$depfile"
+## This next piece of magic avoids the `deleted header file' problem.
+## The problem is that when a header file which appears in a .P file
+## is deleted, the dependency causes make to die (because there is
+## typically no way to rebuild the header). We avoid this by adding
+## dummy dependencies for each header file. Too bad gcc doesn't do
+## this for us directly.
+ tr ' ' '
+' < "$tmpdepfile" |
+## Some versions of gcc put a space before the `:'. On the theory
+## that the space means something, we add a space to the output as
+## well.
+## Some versions of the HPUX 10.20 sed can't process this invocation
+## correctly. Breaking it into two sed invocations is a workaround.
+ sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+ rm -f "$tmpdepfile"
+ ;;
+
+hp)
+ # This case exists only to let depend.m4 do its work. It works by
+ # looking at the text of this script. This case will never be run,
+ # since it is checked for above.
+ exit 1
+ ;;
+
+dashmd)
+ # The Java front end to gcc doesn't run cpp, so we can't use the -Wp
+ # trick. Instead we must use -M and then rename the resulting .d
+ # file. This is also the case for older versions of gcc, which
+ # don't implement -Wp.
+ if "$@" -MD; then :
+ else
+ stat=$?
+ rm -f FIXME
+ exit $stat
+ fi
+ FIXME: rewrite the file
+ ;;
+
+sgi)
+ if test "$libtool" = yes; then
+ "$@" "-Wc,-MDupdate,$tmpdepfile"
+ else
+ "$@" -MDupdate "$tmpdepfile"
+ fi
+ stat=$?
+ if test $stat -eq 0; then :
+ else
+ stat=$?
+ rm -f "$tmpdepfile"
+ exit $stat
+ fi
+ rm -f "$depfile"
+ echo "$object : \\" > "$depfile"
+ sed 's/^[^:]*: / /' < "$tmpdepfile" >> "$depfile"
+ tr ' ' '
+' < "$tmpdepfile" | \
+## Some versions of the HPUX 10.20 sed can't process this invocation
+## correctly. Breaking it into two sed invocations is a workaround.
+ sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+ rm -f "$tmpdepfile"
+ ;;
+
+#nosideeffect)
+ # This comment above is used by automake to tell side-effect
+ # dependency tracking mechanisms from slower ones.
+
+dashmstdout)
+ # Important note: in order to support this mode, a compiler *must*
+ # always write the proprocessed file to stdout, regardless of -o,
+ # because we must use -o when running libtool.
+ test -z "$dashmflag" && dashmflag=-M
+ ( IFS=" "
+ case " $* " in
+ *" --mode=compile "*) # this is libtool, let us make it quiet
+ for arg
+ do # cycle over the arguments
+ case "$arg" in
+ "--mode=compile")
+ # insert --quiet before "--mode=compile"
+ set fnord "$@" --quiet
+ shift # fnord
+ ;;
+ esac
+ set fnord "$@" "$arg"
+ shift # fnord
+ shift # "$arg"
+ done
+ ;;
+ esac
+ "$@" $dashmflag | sed 's:^[^:]*\:[ ]*:'"$object"'\: :' > "$tmpdepfile"
+ ) &
+ proc=$!
+ "$@"
+ stat=$?
+ wait "$proc"
+ if test "$stat" != 0; then exit $stat; fi
+ rm -f "$depfile"
+ cat < "$tmpdepfile" > "$depfile"
+ tr ' ' '
+' < "$tmpdepfile" | \
+## Some versions of the HPUX 10.20 sed can't process this invocation
+## correctly. Breaking it into two sed invocations is a workaround.
+ sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+ rm -f "$tmpdepfile"
+ ;;
+
+dashXmstdout)
+ # This case only exists to satisfy depend.m4. It is never actually
+ # run, as this mode is specially recognized in the preamble.
+ exit 1
+ ;;
+
+makedepend)
+ # X makedepend
+ (
+ shift
+ cleared=no
+ for arg in "$@"; do
+ case $cleared in no)
+ set ""; shift
+ cleared=yes
+ esac
+ case "$arg" in
+ -D*|-I*)
+ set fnord "$@" "$arg"; shift;;
+ -*)
+ ;;
+ *)
+ set fnord "$@" "$arg"; shift;;
+ esac
+ done
+ obj_suffix="`echo $object | sed 's/^.*\././'`"
+ touch "$tmpdepfile"
+ ${MAKEDEPEND-makedepend} 2>/dev/null -o"$obj_suffix" -f"$tmpdepfile" "$@"
+ ) &
+ proc=$!
+ "$@"
+ stat=$?
+ wait "$proc"
+ if test "$stat" != 0; then exit $stat; fi
+ rm -f "$depfile"
+ cat < "$tmpdepfile" > "$depfile"
+ tail +3 "$tmpdepfile" | tr ' ' '
+' | \
+## Some versions of the HPUX 10.20 sed can't process this invocation
+## correctly. Breaking it into two sed invocations is a workaround.
+ sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+ rm -f "$tmpdepfile" "$tmpdepfile".bak
+ ;;
+
+cpp)
+ # Important note: in order to support this mode, a compiler *must*
+ # always write the proprocessed file to stdout, regardless of -o,
+ # because we must use -o when running libtool.
+ ( IFS=" "
+ case " $* " in
+ *" --mode=compile "*)
+ for arg
+ do # cycle over the arguments
+ case "$arg" in
+ "--mode=compile")
+ # insert --quiet before "--mode=compile"
+ set fnord "$@" --quiet
+ shift # fnord
+ ;;
+ esac
+ set fnord "$@" "$arg"
+ shift # fnord
+ shift # "$arg"
+ done
+ ;;
+ esac
+ "$@" -E |
+ sed -n '/^# [0-9][0-9]* "\([^"]*\)"/ s::'"$object"'\: \1:p' > "$tmpdepfile"
+ ) &
+ proc=$!
+ "$@"
+ stat=$?
+ wait "$proc"
+ if test "$stat" != 0; then exit $stat; fi
+ rm -f "$depfile"
+ cat < "$tmpdepfile" > "$depfile"
+ sed < "$tmpdepfile" -e 's/^[^:]*: //' -e 's/$/ :/' >> "$depfile"
+ rm -f "$tmpdepfile"
+ ;;
+
+none)
+ exec "$@"
+ ;;
+
+*)
+ echo "Unknown depmode $depmode" 1>&2
+ exit 1
+ ;;
+esac
+
+exit 0
diff --git a/rts/gmp/errno.c b/rts/gmp/errno.c
new file mode 100644
index 0000000000..7dd223c19c
--- /dev/null
+++ b/rts/gmp/errno.c
@@ -0,0 +1,26 @@
+/* gmp_errno -- The largest and most complex file in GMP.
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Global error flag.  The GMP_ERROR() macro in gmp-impl.h ORs
+   GMP_ERROR_* codes into this (e.g. on division by zero), then forces a
+   runtime fault; it is never reset by the library itself.  */
+int gmp_errno = 0;
diff --git a/rts/gmp/extract-dbl.c b/rts/gmp/extract-dbl.c
new file mode 100644
index 0000000000..2d70d9a3b2
--- /dev/null
+++ b/rts/gmp/extract-dbl.c
@@ -0,0 +1,187 @@
+/* __gmp_extract_double -- convert from double to array of mp_limb_t.
+
+Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#ifdef XDEBUG
+#undef _GMP_IEEE_FLOATS
+#endif
+
+#ifndef _GMP_IEEE_FLOATS
+#define _GMP_IEEE_FLOATS 0
+#endif
+
+/* Extract a non-negative double in d.
+
+   Split D into limbs at RP -- rp[1..0] when BITS_PER_MP_LIMB == 64,
+   rp[2..0] when it is 32 (most significant limb highest) -- and return
+   the exponent measured in limbs.  An exact bit-field extraction is used
+   when _GMP_IEEE_FLOATS; otherwise a portable scale-by-powers-of-two
+   fallback is taken.  */
+
+int
+#if __STDC__
+__gmp_extract_double (mp_ptr rp, double d)
+#else
+__gmp_extract_double (rp, d)
+ mp_ptr rp;
+ double d;
+#endif
+{
+ long exp;
+ unsigned sc;
+ mp_limb_t manh, manl;
+
+ /* BUGS
+
+ 1. Should handle Inf and NaN in IEEE specific code.
+ 2. Handle Inf and NaN also in default code, to avoid hangs.
+ 3. Generalize to handle all BITS_PER_MP_LIMB >= 32.
+ 4. This lits is incomplete and misspelled.
+ */
+
+ /* Zero has no leading 1 bit to normalize on: clear the limbs and
+ return exponent 0. */
+ if (d == 0.0)
+ {
+ rp[0] = 0;
+ rp[1] = 0;
+#if BITS_PER_MP_LIMB == 32
+ rp[2] = 0;
+#endif
+ return 0;
+ }
+
+#if _GMP_IEEE_FLOATS
+ {
+#if defined (__alpha) && __GNUC__ == 2 && __GNUC_MINOR__ == 8
+ /* Work around alpha-specific bug in GCC 2.8.x. */
+ volatile
+#endif
+ union ieee_double_extract x;
+ x.d = d;
+ exp = x.s.exp; /* biased 11-bit exponent field */
+#if BITS_PER_MP_LIMB == 64
+ /* Left-justify the mantissa in a single limb, restoring the implicit
+ leading 1 bit of a normalized IEEE double. */
+ manl = (((mp_limb_t) 1 << 63)
+ | ((mp_limb_t) x.s.manh << 43) | ((mp_limb_t) x.s.manl << 11));
+ if (exp == 0)
+ {
+ /* Denormalized number. Don't try to be clever about this,
+ since it is not an important case to make fast. */
+ exp = 1;
+ do
+ {
+ manl = manl << 1;
+ exp--;
+ }
+ while ((mp_limb_signed_t) manl >= 0);
+ }
+#else
+ /* 32-bit limbs: mantissa spans two limbs, implicit leading 1 bit at
+ the top of manh. */
+ manh = ((mp_limb_t) 1 << 31) | (x.s.manh << 11) | (x.s.manl >> 21);
+ manl = x.s.manl << 11;
+ if (exp == 0)
+ {
+ /* Denormalized number. Don't try to be clever about this,
+ since it is not an important case to make fast. */
+ exp = 1;
+ do
+ {
+ manh = (manh << 1) | (manl >> 31);
+ manl = manl << 1;
+ exp--;
+ }
+ while ((mp_limb_signed_t) manh >= 0);
+ }
+#endif
+ exp -= 1022; /* Remove IEEE bias. */
+ }
+#else
+ {
+ /* Unknown (or known to be non-IEEE) double format. */
+ exp = 0;
+ if (d >= 1.0)
+ {
+ /* A finite d >= 1 always changes when halved; d*0.5 == d means an
+ infinity, which would make the loops below spin forever. */
+ if (d * 0.5 == d)
+ abort ();
+
+ while (d >= 32768.0)
+ {
+ d *= (1.0 / 65536.0);
+ exp += 16;
+ }
+ while (d >= 1.0)
+ {
+ d *= 0.5;
+ exp += 1;
+ }
+ }
+ else if (d < 0.5)
+ {
+ while (d < (1.0 / 65536.0))
+ {
+ d *= 65536.0;
+ exp -= 16;
+ }
+ while (d < 0.5)
+ {
+ d *= 2.0;
+ exp -= 1;
+ }
+ }
+
+ /* Here 0.5 <= d < 1; peel off one limb's worth of bits at a time. */
+ d *= MP_BASE_AS_DOUBLE;
+#if BITS_PER_MP_LIMB == 64
+ manl = d;
+#else
+ manh = d;
+ manl = (d - manh) * MP_BASE_AS_DOUBLE;
+#endif
+ }
+#endif
+
+ sc = (unsigned) exp % BITS_PER_MP_LIMB;
+
+ /* We add something here to get rounding right.  2048 is a multiple of
+ BITS_PER_MP_LIMB for both limb sizes, so the biased division rounds
+ toward minus infinity even when exp is negative. */
+ exp = (exp + 2048) / BITS_PER_MP_LIMB - 2048 / BITS_PER_MP_LIMB + 1;
+
+#if BITS_PER_MP_LIMB == 64
+ if (sc != 0)
+ {
+ rp[1] = manl >> (BITS_PER_MP_LIMB - sc);
+ rp[0] = manl << sc;
+ }
+ else
+ {
+ rp[1] = manl;
+ rp[0] = 0;
+ exp--;
+ }
+#else
+ if (sc != 0)
+ {
+ rp[2] = manh >> (BITS_PER_MP_LIMB - sc);
+ rp[1] = (manl >> (BITS_PER_MP_LIMB - sc)) | (manh << sc);
+ rp[0] = manl << sc;
+ }
+ else
+ {
+ rp[2] = manh;
+ rp[1] = manl;
+ rp[0] = 0;
+ exp--;
+ }
+#endif
+
+ return exp;
+}
diff --git a/rts/gmp/gmp-impl.h b/rts/gmp/gmp-impl.h
new file mode 100644
index 0000000000..3c7ac26e7d
--- /dev/null
+++ b/rts/gmp/gmp-impl.h
@@ -0,0 +1,1072 @@
+/* Include file for internal GNU MP types and definitions.
+
+ THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND ARE ALMOST CERTAIN TO
+ BE SUBJECT TO INCOMPATIBLE CHANGES IN FUTURE GNU MP RELEASES.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "config.h"
+#include "gmp-mparam.h"
+/* #include "longlong.h" */
+
+/* When using gcc, make sure to use its builtin alloca. */
+#if ! defined (alloca) && defined (__GNUC__)
+#define alloca __builtin_alloca
+#define HAVE_ALLOCA 1
+#endif
+
+/* When using cc, do whatever necessary to allow use of alloca. For many
+ machines, this means including alloca.h. IBM's compilers need a #pragma
+ in "each module that needs to use alloca". */
+#if ! defined (alloca)
+/* We need lots of variants for MIPS, to cover all versions and perversions
+ of OSes for MIPS. */
+#if defined (__mips) || defined (MIPSEL) || defined (MIPSEB) \
+ || defined (_MIPSEL) || defined (_MIPSEB) || defined (__sgi) \
+ || defined (__alpha) || defined (__sparc) || defined (sparc) \
+ || defined (__ksr__)
+#include <alloca.h>
+#define HAVE_ALLOCA
+#endif
+#if defined (_IBMR2)
+#pragma alloca
+#define HAVE_ALLOCA
+#endif
+#if defined (__DECC)
+#define alloca(x) __ALLOCA(x)
+#define HAVE_ALLOCA
+#endif
+#endif
+
+#if defined (alloca)
+# ifndef HAVE_ALLOCA
+#define HAVE_ALLOCA
+# endif
+#endif
+
+#if ! defined (HAVE_ALLOCA) || USE_STACK_ALLOC
+#include "stack-alloc.h"
+#else
+#define TMP_DECL(m)
+#define TMP_ALLOC(x) alloca(x)
+#define TMP_MARK(m)
+#define TMP_FREE(m)
+#endif
+
+/* Allocating various types. */
+#define TMP_ALLOC_TYPE(n,type) ((type *) TMP_ALLOC ((n) * sizeof (type)))
+#define TMP_ALLOC_LIMBS(n) TMP_ALLOC_TYPE(n,mp_limb_t)
+#define TMP_ALLOC_MP_PTRS(n) TMP_ALLOC_TYPE(n,mp_ptr)
+
+
+#if ! defined (__GNUC__) /* FIXME: Test for C++ compilers here,
+ __DECC understands __inline */
+#define inline /* Empty */
+#endif
+
+#define ABS(x) (x >= 0 ? x : -x)
+#define MIN(l,o) ((l) < (o) ? (l) : (o))
+#define MAX(h,i) ((h) > (i) ? (h) : (i))
+#define numberof(x) (sizeof (x) / sizeof ((x)[0]))
+
+/* Field access macros. */
+#define SIZ(x) ((x)->_mp_size)
+#define ABSIZ(x) ABS (SIZ (x))
+#define PTR(x) ((x)->_mp_d)
+#define LIMBS(x) ((x)->_mp_d)
+#define EXP(x) ((x)->_mp_exp)
+#define PREC(x) ((x)->_mp_prec)
+#define ALLOC(x) ((x)->_mp_alloc)
+
+/* Extra casts because shorts are promoted to ints by "~" and "<<". "-1"
+ rather than "1" in SIGNED_TYPE_MIN avoids warnings from some compilers
+ about arithmetic overflow. */
+#define UNSIGNED_TYPE_MAX(type) ((type) ~ (type) 0)
+#define UNSIGNED_TYPE_HIGHBIT(type) ((type) ~ (UNSIGNED_TYPE_MAX(type) >> 1))
+#define SIGNED_TYPE_MIN(type) (((type) -1) << (8*sizeof(type)-1))
+#define SIGNED_TYPE_MAX(type) ((type) ~ SIGNED_TYPE_MIN(type))
+#define SIGNED_TYPE_HIGHBIT(type) SIGNED_TYPE_MIN(type)
+
+#define MP_LIMB_T_MAX UNSIGNED_TYPE_MAX (mp_limb_t)
+#define MP_LIMB_T_HIGHBIT UNSIGNED_TYPE_HIGHBIT (mp_limb_t)
+
+#define MP_SIZE_T_MAX SIGNED_TYPE_MAX (mp_size_t)
+
+#ifndef ULONG_MAX
+#define ULONG_MAX UNSIGNED_TYPE_MAX (unsigned long)
+#endif
+#define ULONG_HIGHBIT UNSIGNED_TYPE_HIGHBIT (unsigned long)
+#define LONG_HIGHBIT SIGNED_TYPE_HIGHBIT (long)
+#ifndef LONG_MAX
+#define LONG_MAX SIGNED_TYPE_MAX (long)
+#endif
+
+#ifndef USHORT_MAX
+#define USHORT_MAX UNSIGNED_TYPE_MAX (unsigned short)
+#endif
+#define USHORT_HIGHBIT UNSIGNED_TYPE_HIGHBIT (unsigned short)
+#define SHORT_HIGHBIT SIGNED_TYPE_HIGHBIT (short)
+#ifndef SHORT_MAX
+#define SHORT_MAX SIGNED_TYPE_MAX (short)
+#endif
+
+
+/* Swap macros. */
+
+#define MP_LIMB_T_SWAP(x, y) \
+ do { \
+ mp_limb_t __mp_limb_t_swap__tmp = (x); \
+ (x) = (y); \
+ (y) = __mp_limb_t_swap__tmp; \
+ } while (0)
+#define MP_SIZE_T_SWAP(x, y) \
+ do { \
+ mp_size_t __mp_size_t_swap__tmp = (x); \
+ (x) = (y); \
+ (y) = __mp_size_t_swap__tmp; \
+ } while (0)
+
+#define MP_PTR_SWAP(x, y) \
+ do { \
+ mp_ptr __mp_ptr_swap__tmp = (x); \
+ (x) = (y); \
+ (y) = __mp_ptr_swap__tmp; \
+ } while (0)
+#define MP_SRCPTR_SWAP(x, y) \
+ do { \
+ mp_srcptr __mp_srcptr_swap__tmp = (x); \
+ (x) = (y); \
+ (y) = __mp_srcptr_swap__tmp; \
+ } while (0)
+
+#define MPN_PTR_SWAP(xp,xs, yp,ys) \
+ do { \
+ MP_PTR_SWAP (xp, yp); \
+ MP_SIZE_T_SWAP (xs, ys); \
+ } while(0)
+#define MPN_SRCPTR_SWAP(xp,xs, yp,ys) \
+ do { \
+ MP_SRCPTR_SWAP (xp, yp); \
+ MP_SIZE_T_SWAP (xs, ys); \
+ } while(0)
+
+#define MPZ_PTR_SWAP(x, y) \
+ do { \
+ mpz_ptr __mpz_ptr_swap__tmp = (x); \
+ (x) = (y); \
+ (y) = __mpz_ptr_swap__tmp; \
+ } while (0)
+#define MPZ_SRCPTR_SWAP(x, y) \
+ do { \
+ mpz_srcptr __mpz_srcptr_swap__tmp = (x); \
+ (x) = (y); \
+ (y) = __mpz_srcptr_swap__tmp; \
+ } while (0)
+
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* FIXME: These are purely internal, so do a search and replace to change
+ them to __gmp forms, rather than using these macros. */
+#define _mp_allocate_func __gmp_allocate_func
+#define _mp_reallocate_func __gmp_reallocate_func
+#define _mp_free_func __gmp_free_func
+#define _mp_default_allocate __gmp_default_allocate
+#define _mp_default_reallocate __gmp_default_reallocate
+#define _mp_default_free __gmp_default_free
+
+extern void * (*_mp_allocate_func) _PROTO ((size_t));
+extern void * (*_mp_reallocate_func) _PROTO ((void *, size_t, size_t));
+extern void (*_mp_free_func) _PROTO ((void *, size_t));
+
+void *_mp_default_allocate _PROTO ((size_t));
+void *_mp_default_reallocate _PROTO ((void *, size_t, size_t));
+void _mp_default_free _PROTO ((void *, size_t));
+
+#define _MP_ALLOCATE_FUNC_TYPE(n,type) \
+ ((type *) (*_mp_allocate_func) ((n) * sizeof (type)))
+#define _MP_ALLOCATE_FUNC_LIMBS(n) _MP_ALLOCATE_FUNC_TYPE(n,mp_limb_t)
+
+#define _MP_FREE_FUNC_TYPE(p,n,type) (*_mp_free_func) (p, (n) * sizeof (type))
+#define _MP_FREE_FUNC_LIMBS(p,n) _MP_FREE_FUNC_TYPE(p,n,mp_limb_t)
+
+
+#if (__STDC__-0) || defined (__cplusplus)
+
+#else
+
+#define const /* Empty */
+#define signed /* Empty */
+
+#endif
+
+#if defined (__GNUC__) && defined (__i386__)
+#if 0 /* check that these actually improve things */
+#define MPN_COPY_INCR(DST, SRC, N) \
+ __asm__ ("cld\n\trep\n\tmovsl" : : \
+ "D" (DST), "S" (SRC), "c" (N) : \
+ "cx", "di", "si", "memory")
+#define MPN_COPY_DECR(DST, SRC, N) \
+ __asm__ ("std\n\trep\n\tmovsl" : : \
+ "D" ((DST) + (N) - 1), "S" ((SRC) + (N) - 1), "c" (N) : \
+ "cx", "di", "si", "memory")
+#define MPN_NORMALIZE_NOT_ZERO(P, N) \
+ do { \
+ __asm__ ("std\n\trepe\n\tscasl" : "=c" (N) : \
+ "a" (0), "D" ((P) + (N) - 1), "0" (N) : \
+ "cx", "di"); \
+ (N)++; \
+ } while (0)
+#endif
+#endif
+
+#if HAVE_NATIVE_mpn_copyi
+#define mpn_copyi __MPN(copyi)
+void mpn_copyi _PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+#endif
+
+/* Remap names of internal mpn functions. */
+#define __clz_tab __MPN(clz_tab)
+#define mpn_udiv_w_sdiv __MPN(udiv_w_sdiv)
+#define mpn_reciprocal __MPN(reciprocal)
+
+#define mpn_sb_divrem_mn __MPN(sb_divrem_mn)
+#define mpn_bz_divrem_n __MPN(bz_divrem_n)
+/* #define mpn_tdiv_q __MPN(tdiv_q) */
+
+#define mpn_kara_mul_n __MPN(kara_mul_n)
+void mpn_kara_mul_n _PROTO((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_ptr));
+
+#define mpn_kara_sqr_n __MPN(kara_sqr_n)
+void mpn_kara_sqr_n _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_ptr));
+
+#define mpn_toom3_mul_n __MPN(toom3_mul_n)
+void mpn_toom3_mul_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t,mp_ptr));
+
+#define mpn_toom3_sqr_n __MPN(toom3_sqr_n)
+void mpn_toom3_sqr_n _PROTO((mp_ptr, mp_srcptr, mp_size_t, mp_ptr));
+
+#define mpn_fft_best_k __MPN(fft_best_k)
+int mpn_fft_best_k _PROTO ((mp_size_t n, int sqr));
+
+#define mpn_mul_fft __MPN(mul_fft)
+void mpn_mul_fft _PROTO ((mp_ptr op, mp_size_t pl,
+ mp_srcptr n, mp_size_t nl,
+ mp_srcptr m, mp_size_t ml,
+ int k));
+
+#define mpn_mul_fft_full __MPN(mul_fft_full)
+void mpn_mul_fft_full _PROTO ((mp_ptr op,
+ mp_srcptr n, mp_size_t nl,
+ mp_srcptr m, mp_size_t ml));
+
+#define mpn_fft_next_size __MPN(fft_next_size)
+mp_size_t mpn_fft_next_size _PROTO ((mp_size_t pl, int k));
+
+mp_limb_t mpn_sb_divrem_mn _PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t));
+mp_limb_t mpn_bz_divrem_n _PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t));
+/* void mpn_tdiv_q _PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t)); */
+
+/* Copy NLIMBS *limbs* from SRC to DST, NLIMBS==0 allowed. */
+#ifndef MPN_COPY_INCR
+#if HAVE_NATIVE_mpn_copyi
+#define MPN_COPY_INCR(DST, SRC, NLIMBS) mpn_copyi (DST, SRC, NLIMBS)
+#else
+#define MPN_COPY_INCR(DST, SRC, NLIMBS) \
+ do { \
+ mp_size_t __i; \
+ for (__i = 0; __i < (NLIMBS); __i++) \
+ (DST)[__i] = (SRC)[__i]; \
+ } while (0)
+#endif
+#endif
+
+#if HAVE_NATIVE_mpn_copyd
+#define mpn_copyd __MPN(copyd)
+void mpn_copyd _PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+#endif
+
+/* NLIMBS==0 allowed */
+#ifndef MPN_COPY_DECR
+#if HAVE_NATIVE_mpn_copyd
+#define MPN_COPY_DECR(DST, SRC, NLIMBS) mpn_copyd (DST, SRC, NLIMBS)
+#else
+#define MPN_COPY_DECR(DST, SRC, NLIMBS) \
+ do { \
+ mp_size_t __i; \
+ for (__i = (NLIMBS) - 1; __i >= 0; __i--) \
+ (DST)[__i] = (SRC)[__i]; \
+ } while (0)
+#endif
+#endif
+
+/* Define MPN_COPY for vector computers. Since #pragma cannot be in a macro,
+ rely on function inlining. */
+#if defined (_CRAY) || defined (__uxp__)
+/* Copy n limbs from s to d, element by element.  A function (not a macro)
+   so the vectorization pragmas below can take effect at each inlined call
+   site; K&R-style parameter list for the old vendor compilers. */
+static inline void
+_MPN_COPY (d, s, n) mp_ptr d; mp_srcptr s; mp_size_t n;
+{
+ int i; /* Faster for Cray with plain int */
+#pragma _CRI ivdep /* Cray PVP systems */
+#pragma loop noalias d,s /* Fujitsu VPP systems */
+ for (i = 0; i < n; i++)
+ d[i] = s[i];
+}
+#define MPN_COPY _MPN_COPY
+#endif
+
+#ifndef MPN_COPY
+#define MPN_COPY MPN_COPY_INCR
+#endif
+
+/* Zero NLIMBS *limbs* AT DST. */
+#ifndef MPN_ZERO
+#define MPN_ZERO(DST, NLIMBS) \
+ do { \
+ mp_size_t __i; \
+ for (__i = 0; __i < (NLIMBS); __i++) \
+ (DST)[__i] = 0; \
+ } while (0)
+#endif
+
+#ifndef MPN_NORMALIZE
+#define MPN_NORMALIZE(DST, NLIMBS) \
+ do { \
+ while (NLIMBS > 0) \
+ { \
+ if ((DST)[(NLIMBS) - 1] != 0) \
+ break; \
+ NLIMBS--; \
+ } \
+ } while (0)
+#endif
+#ifndef MPN_NORMALIZE_NOT_ZERO
+#define MPN_NORMALIZE_NOT_ZERO(DST, NLIMBS) \
+ do { \
+ while (1) \
+ { \
+ if ((DST)[(NLIMBS) - 1] != 0) \
+ break; \
+ NLIMBS--; \
+ } \
+ } while (0)
+#endif
+
+/* Strip least significant zero limbs from ptr,size by incrementing ptr and
+ decrementing size. The number in ptr,size must be non-zero, ie. size!=0
+ and somewhere a non-zero limb. */
+#define MPN_STRIP_LOW_ZEROS_NOT_ZERO(ptr, size) \
+ do \
+ { \
+ ASSERT ((size) != 0); \
+ while ((ptr)[0] == 0) \
+ { \
+ (ptr)++; \
+ (size)--; \
+ ASSERT (size >= 0); \
+ } \
+ } \
+ while (0)
+
+/* Initialize X of type mpz_t with space for NLIMBS limbs. X should be a
+ temporary variable; it will be automatically cleared out at function
+ return. We use __x here to make it possible to accept both mpz_ptr and
+ mpz_t arguments. */
+#define MPZ_TMP_INIT(X, NLIMBS) \
+ do { \
+ mpz_ptr __x = (X); \
+ __x->_mp_alloc = (NLIMBS); \
+ __x->_mp_d = (mp_ptr) TMP_ALLOC ((NLIMBS) * BYTES_PER_MP_LIMB); \
+ } while (0)
+
+/* Realloc for an mpz_t WHAT if it has fewer than NEEDED limbs. */
+#define MPZ_REALLOC(what,needed) \
+ do { \
+ if ((needed) > ALLOC (what)) \
+ _mpz_realloc (what, needed); \
+ } while (0)
+
+/* If KARATSUBA_MUL_THRESHOLD is not already defined, define it to a
+ value which is good on most machines. */
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 32
+#endif
+
+/* If TOOM3_MUL_THRESHOLD is not already defined, define it to a
+ value which is good on most machines. */
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 256
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD (2*KARATSUBA_MUL_THRESHOLD)
+#endif
+
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD (2*TOOM3_MUL_THRESHOLD)
+#endif
+
+/* First k to use for an FFT modF multiply. A modF FFT is an order
+ log(2^k)/log(2^(k-1)) algorithm, so k=3 is merely 1.5 like karatsuba,
+ whereas k=4 is 1.33 which is faster than toom3 at 1.485. */
+#define FFT_FIRST_K 4
+
+/* Threshold at which FFT should be used to do a modF NxN -> N multiply. */
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD (TOOM3_MUL_THRESHOLD * 3)
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD (TOOM3_SQR_THRESHOLD * 3)
+#endif
+
+/* Threshold at which FFT should be used to do an NxN -> 2N multiply. This
+ will be a size where FFT is using k=7 or k=8, since an FFT-k used for an
+ NxN->2N multiply and not recursing into itself is an order
+ log(2^k)/log(2^(k-2)) algorithm, so it'll be at least k=7 at 1.39 which
+ is the first better than toom3. */
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD (FFT_MODF_MUL_THRESHOLD * 10)
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD (FFT_MODF_SQR_THRESHOLD * 10)
+#endif
+
+/* Table of thresholds for successive modF FFT "k"s. The first entry is
+ where FFT_FIRST_K+1 should be used, the second FFT_FIRST_K+2,
+ etc. See mpn_fft_best_k(). */
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE \
+ { TOOM3_MUL_THRESHOLD * 4, /* k=5 */ \
+ TOOM3_MUL_THRESHOLD * 8, /* k=6 */ \
+ TOOM3_MUL_THRESHOLD * 16, /* k=7 */ \
+ TOOM3_MUL_THRESHOLD * 32, /* k=8 */ \
+ TOOM3_MUL_THRESHOLD * 96, /* k=9 */ \
+ TOOM3_MUL_THRESHOLD * 288, /* k=10 */ \
+ 0 }
+#endif
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE \
+ { TOOM3_SQR_THRESHOLD * 4, /* k=5 */ \
+ TOOM3_SQR_THRESHOLD * 8, /* k=6 */ \
+ TOOM3_SQR_THRESHOLD * 16, /* k=7 */ \
+ TOOM3_SQR_THRESHOLD * 32, /* k=8 */ \
+ TOOM3_SQR_THRESHOLD * 96, /* k=9 */ \
+ TOOM3_SQR_THRESHOLD * 288, /* k=10 */ \
+ 0 }
+#endif
+
+#ifndef FFT_TABLE_ATTRS
+#define FFT_TABLE_ATTRS static const
+#endif
+
+#define MPN_FFT_TABLE_SIZE 16
+
+
+/* Return non-zero if xp,xsize and yp,ysize overlap.
+ If xp+xsize<=yp there's no overlap, or if yp+ysize<=xp there's no
+ overlap. If both these are false, there's an overlap. */
+#define MPN_OVERLAP_P(xp, xsize, yp, ysize) \
+ ((xp) + (xsize) > (yp) && (yp) + (ysize) > (xp))
+
+
+/* ASSERT() is a private assertion checking scheme, similar to <assert.h>.
+ ASSERT() does the check only if WANT_ASSERT is selected, ASSERT_ALWAYS()
+ does it always. Generally assertions are meant for development, but
+ might help when looking for a problem later too.
+
+ ASSERT_NOCARRY() uses ASSERT() to check the expression is zero, but if
+ assertion checking is disabled, the expression is still evaluated. This
+ is meant for use with routines like mpn_add_n() where the return value
+ represents a carry or whatever that shouldn't occur. For example,
+ ASSERT_NOCARRY (mpn_add_n (rp, s1p, s2p, size)); */
+
+#ifdef __LINE__
+#define ASSERT_LINE __LINE__
+#else
+#define ASSERT_LINE -1
+#endif
+
+#ifdef __FILE__
+#define ASSERT_FILE __FILE__
+#else
+#define ASSERT_FILE ""
+#endif
+
+int __gmp_assert_fail _PROTO((const char *filename, int linenum,
+ const char *expr));
+
+#if HAVE_STRINGIZE
+#define ASSERT_FAIL(expr) __gmp_assert_fail (ASSERT_FILE, ASSERT_LINE, #expr)
+#else
+#define ASSERT_FAIL(expr) __gmp_assert_fail (ASSERT_FILE, ASSERT_LINE, "expr")
+#endif
+
+#if HAVE_VOID
+#define CAST_TO_VOID (void)
+#else
+#define CAST_TO_VOID
+#endif
+
+#define ASSERT_ALWAYS(expr) ((expr) ? 0 : ASSERT_FAIL (expr))
+
+#if WANT_ASSERT
+#define ASSERT(expr) ASSERT_ALWAYS (expr)
+#define ASSERT_NOCARRY(expr) ASSERT_ALWAYS ((expr) == 0)
+
+#else
+#define ASSERT(expr) (CAST_TO_VOID 0)
+#define ASSERT_NOCARRY(expr) (expr)
+#endif
+
+
+#if HAVE_NATIVE_mpn_com_n
+#define mpn_com_n __MPN(com_n)
+void mpn_com_n _PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_com_n(d,s,n) \
+ do \
+ { \
+ mp_ptr __d = (d); \
+ mp_srcptr __s = (s); \
+ mp_size_t __n = (n); \
+ do \
+ *__d++ = ~ *__s++; \
+ while (--__n); \
+ } \
+ while (0)
+#endif
+
+#define MPN_LOGOPS_N_INLINE(d,s1,s2,n,dop,op,s2op) \
+ do \
+ { \
+ mp_ptr __d = (d); \
+ mp_srcptr __s1 = (s1); \
+ mp_srcptr __s2 = (s2); \
+ mp_size_t __n = (n); \
+ do \
+ *__d++ = dop (*__s1++ op s2op *__s2++); \
+ while (--__n); \
+ } \
+ while (0)
+
+#if HAVE_NATIVE_mpn_and_n
+#define mpn_and_n __MPN(and_n)
+void mpn_and_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_and_n(d,s1,s2,n) MPN_LOGOPS_N_INLINE(d,s1,s2,n, ,&, )
+#endif
+
+#if HAVE_NATIVE_mpn_andn_n
+#define mpn_andn_n __MPN(andn_n)
+void mpn_andn_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_andn_n(d,s1,s2,n) MPN_LOGOPS_N_INLINE(d,s1,s2,n, ,&,~)
+#endif
+
+#if HAVE_NATIVE_mpn_nand_n
+#define mpn_nand_n __MPN(nand_n)
+void mpn_nand_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_nand_n(d,s1,s2,n) MPN_LOGOPS_N_INLINE(d,s1,s2,n,~,&, )
+#endif
+
+#if HAVE_NATIVE_mpn_ior_n
+#define mpn_ior_n __MPN(ior_n)
+void mpn_ior_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_ior_n(d,s1,s2,n) MPN_LOGOPS_N_INLINE(d,s1,s2,n, ,|, )
+#endif
+
+#if HAVE_NATIVE_mpn_iorn_n
+#define mpn_iorn_n __MPN(iorn_n)
+void mpn_iorn_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_iorn_n(d,s1,s2,n) MPN_LOGOPS_N_INLINE(d,s1,s2,n, ,|,~)
+#endif
+
+#if HAVE_NATIVE_mpn_nior_n
+#define mpn_nior_n __MPN(nior_n)
+void mpn_nior_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_nior_n(d,s1,s2,n) MPN_LOGOPS_N_INLINE(d,s1,s2,n,~,|, )
+#endif
+
+#if HAVE_NATIVE_mpn_xor_n
+#define mpn_xor_n __MPN(xor_n)
+void mpn_xor_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_xor_n(d,s1,s2,n) MPN_LOGOPS_N_INLINE(d,s1,s2,n, ,^, )
+#endif
+
+#if HAVE_NATIVE_mpn_xnor_n
+#define mpn_xnor_n __MPN(xnor_n)
+void mpn_xnor_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+#else
+#define mpn_xnor_n(d,s1,s2,n) MPN_LOGOPS_N_INLINE(d,s1,s2,n,~,^, )
+#endif
+
+/* Structure for conversion between internal binary format and
+ strings in base 2..36. */
+struct bases
+{
+ /* Number of digits in the conversion base that always fits in an mp_limb_t.
+ For example, for base 10 on a machine where a mp_limb_t has 32 bits this
+ is 9, since 10**9 is the largest number that fits into a mp_limb_t. */
+ int chars_per_limb;
+
+ /* log(2)/log(conversion_base) */
+ double chars_per_bit_exactly;
+
+ /* base**chars_per_limb, i.e. the biggest number that fits a word, built by
+ factors of base. Exception: For 2, 4, 8, etc, big_base is log2(base),
+ i.e. the number of bits used to represent each digit in the base. */
+ mp_limb_t big_base;
+
+ /* A BITS_PER_MP_LIMB bit approximation to 1/big_base, represented as a
+ fixed-point number. Instead of dividing by big_base an application can
+ choose to multiply by big_base_inverted. */
+ mp_limb_t big_base_inverted;
+};
+
+#define __mp_bases __MPN(mp_bases)
+extern const struct bases __mp_bases[];
+extern mp_size_t __gmp_default_fp_limb_precision;
+
+#if defined (__i386__)
+#define TARGET_REGISTER_STARVED 1
+#else
+#define TARGET_REGISTER_STARVED 0
+#endif
+
+/* Use a library function for invert_limb, if available. */
+#if ! defined (invert_limb) && HAVE_NATIVE_mpn_invert_limb
+#define mpn_invert_limb __MPN(invert_limb)
+mp_limb_t mpn_invert_limb _PROTO ((mp_limb_t));
+#define invert_limb(invxl,xl) (invxl = __MPN(invert_limb) (xl))
+#endif
+
+#ifndef invert_limb
+#define invert_limb(invxl,xl) \
+ do { \
+ mp_limb_t dummy; \
+ if (xl << 1 == 0) \
+ invxl = ~(mp_limb_t) 0; \
+ else \
+ udiv_qrnnd (invxl, dummy, -xl, 0, xl); \
+ } while (0)
+#endif
+
+/* Divide the two-limb number in (NH,,NL) by D, with DI being the largest
+ limb not larger than (2**(2*BITS_PER_MP_LIMB))/D - (2**BITS_PER_MP_LIMB).
+ If this would yield overflow, DI should be the largest possible number
+ (i.e., only ones). For correct operation, the most significant bit of D
+ has to be set. Put the quotient in Q and the remainder in R. */
+#define udiv_qrnnd_preinv(q, r, nh, nl, d, di) \
+ do { \
+ mp_limb_t _q, _ql, _r; \
+ mp_limb_t _xh, _xl; \
+ umul_ppmm (_q, _ql, (nh), (di)); \
+ _q += (nh); /* DI is 2**BITS_PER_MP_LIMB too small */\
+ umul_ppmm (_xh, _xl, _q, (d)); \
+ sub_ddmmss (_xh, _r, (nh), (nl), _xh, _xl); \
+ if (_xh != 0) \
+ { \
+ sub_ddmmss (_xh, _r, _xh, _r, 0, (d)); \
+ _q += 1; \
+ if (_xh != 0) \
+ { \
+ sub_ddmmss (_xh, _r, _xh, _r, 0, (d)); \
+ _q += 1; \
+ } \
+ } \
+ if (_r >= (d)) \
+ { \
+ _r -= (d); \
+ _q += 1; \
+ } \
+ (r) = _r; \
+ (q) = _q; \
+ } while (0)
+/* Like udiv_qrnnd_preinv, but for any value D. DNORM is D shifted left
+ so that its most significant bit is set. LGUP is ceil(log2(D)). */
+/* NOTE (review): the first shift originally read "(l - 1)", but "l" is
+   not a parameter of this macro and would be an undeclared identifier in
+   every expansion; the intended operand is LGUP.  The double shift
+   ">> 1 >> (lgup - 1)" avoids an undefined full-width shift when
+   lgup == BITS_PER_MP_LIMB. */
+#define udiv_qrnnd_preinv2gen(q, r, nh, nl, d, di, dnorm, lgup) \
+  do { \
+    mp_limb_t _n2, _n10, _n1, _nadj, _q1; \
+    mp_limb_t _xh, _xl; \
+    _n2 = ((nh) << (BITS_PER_MP_LIMB - (lgup))) \
+          + (((nl) >> 1) >> ((lgup) - 1)); \
+    _n10 = (nl) << (BITS_PER_MP_LIMB - (lgup)); \
+    _n1 = ((mp_limb_signed_t) _n10 >> (BITS_PER_MP_LIMB - 1)); \
+    _nadj = _n10 + (_n1 & (dnorm)); \
+    umul_ppmm (_xh, _xl, di, _n2 - _n1); \
+    add_ssaaaa (_xh, _xl, _xh, _xl, 0, _nadj); \
+    _q1 = ~(_n2 + _xh); \
+    umul_ppmm (_xh, _xl, _q1, d); \
+    add_ssaaaa (_xh, _xl, _xh, _xl, nh, nl); \
+    _xh -= (d); \
+    (r) = _xl + ((d) & _xh); \
+    (q) = _xh - _q1; \
+  } while (0)
+/* Exactly like udiv_qrnnd_preinv, but branch-free. It is not clear which
+ version to use. */
+#define udiv_qrnnd_preinv2norm(q, r, nh, nl, d, di) \
+ do { \
+ mp_limb_t _n2, _n10, _n1, _nadj, _q1; \
+ mp_limb_t _xh, _xl; \
+ _n2 = (nh); \
+ _n10 = (nl); \
+ _n1 = ((mp_limb_signed_t) _n10 >> (BITS_PER_MP_LIMB - 1)); \
+ _nadj = _n10 + (_n1 & (d)); \
+ umul_ppmm (_xh, _xl, di, _n2 - _n1); \
+ add_ssaaaa (_xh, _xl, _xh, _xl, 0, _nadj); \
+ _q1 = ~(_n2 + _xh); \
+ umul_ppmm (_xh, _xl, _q1, d); \
+ add_ssaaaa (_xh, _xl, _xh, _xl, nh, nl); \
+ _xh -= (d); \
+ (r) = _xl + ((d) & _xh); \
+ (q) = _xh - _q1; \
+ } while (0)
+
+
+/* modlimb_invert() sets "inv" to the multiplicative inverse of "n" modulo
+ 2^BITS_PER_MP_LIMB, ie. so that inv*n == 1 mod 2^BITS_PER_MP_LIMB.
+ "n" must be odd (otherwise such an inverse doesn't exist).
+
+ This is not to be confused with invert_limb(), which is completely
+ different.
+
+ The table lookup gives an inverse with the low 8 bits valid, and each
+ multiply step doubles the number of bits. See Jebelean's exact division
+ paper, end of section 4 (reference in gmp.texi). */
+
+#define modlimb_invert_table __gmp_modlimb_invert_table
+extern const unsigned char modlimb_invert_table[128];
+
+#if BITS_PER_MP_LIMB <= 32
+#define modlimb_invert(inv,n) \
+ do { \
+ mp_limb_t __n = (n); \
+ mp_limb_t __inv; \
+ ASSERT ((__n & 1) == 1); \
+ __inv = modlimb_invert_table[(__n&0xFF)/2]; /* 8 */ \
+ __inv = 2 * __inv - __inv * __inv * __n; /* 16 */ \
+ __inv = 2 * __inv - __inv * __inv * __n; /* 32 */ \
+ ASSERT (__inv * __n == 1); \
+ (inv) = __inv; \
+ } while (0)
+#endif
+
+#if BITS_PER_MP_LIMB > 32 && BITS_PER_MP_LIMB <= 64
+#define modlimb_invert(inv,n) \
+ do { \
+ mp_limb_t __n = (n); \
+ mp_limb_t __inv; \
+ ASSERT ((__n & 1) == 1); \
+ __inv = modlimb_invert_table[(__n&0xFF)/2]; /* 8 */ \
+ __inv = 2 * __inv - __inv * __inv * __n; /* 16 */ \
+ __inv = 2 * __inv - __inv * __inv * __n; /* 32 */ \
+ __inv = 2 * __inv - __inv * __inv * __n; /* 64 */ \
+ ASSERT (__inv * __n == 1); \
+ (inv) = __inv; \
+ } while (0)
+#endif
+
+
+/* The `mode' attribute was introduced in GCC 2.2, but we can only distinguish
+ between GCC 2 releases from 2.5, since __GNUC_MINOR__ wasn't introduced
+ until then. */
+#if (__GNUC__ - 0 > 2 || defined (__GNUC_MINOR__)) && ! defined (__APPLE_CC__)
+/* Define stuff for longlong.h. */
+typedef unsigned int UQItype __attribute__ ((mode (QI)));
+typedef int SItype __attribute__ ((mode (SI)));
+typedef unsigned int USItype __attribute__ ((mode (SI)));
+typedef int DItype __attribute__ ((mode (DI)));
+typedef unsigned int UDItype __attribute__ ((mode (DI)));
+#else
+typedef unsigned char UQItype;
+typedef long SItype;
+typedef unsigned long USItype;
+#if defined _LONGLONG || defined _LONG_LONG_LIMB
+typedef long long int DItype;
+typedef unsigned long long int UDItype;
+#else /* Assume `long' gives us a wide enough type. Needed for hppa2.0w. */
+typedef long int DItype;
+typedef unsigned long int UDItype;
+#endif
+#endif
+
+typedef mp_limb_t UWtype;
+typedef unsigned int UHWtype;
+#define W_TYPE_SIZE BITS_PER_MP_LIMB
+
+/* Define ieee_double_extract and _GMP_IEEE_FLOATS. */
+
+#if (defined (__arm__) && (defined (__ARMWEL__) || defined (__linux__)))
+/* Special case for little endian ARM since floats remain in big-endian. */
+#define _GMP_IEEE_FLOATS 1
+union ieee_double_extract
+{
+ struct
+ {
+ unsigned int manh:20;
+ unsigned int exp:11;
+ unsigned int sig:1;
+ unsigned int manl:32;
+ } s;
+ double d;
+};
+#else
+#if defined (_LITTLE_ENDIAN) || defined (__LITTLE_ENDIAN__) \
+ || defined (__alpha) \
+ || defined (__clipper__) \
+ || defined (__cris) \
+ || defined (__i386__) \
+ || defined (__i860__) \
+ || defined (__i960__) \
+ || defined (MIPSEL) || defined (_MIPSEL) \
+ || defined (__ns32000__) \
+ || defined (__WINNT) || defined (_WIN32)
+#define _GMP_IEEE_FLOATS 1
+union ieee_double_extract
+{
+ struct
+ {
+ unsigned int manl:32;
+ unsigned int manh:20;
+ unsigned int exp:11;
+ unsigned int sig:1;
+ } s;
+ double d;
+};
+#else /* Need this as an #else since the tests aren't made exclusive. */
+#if defined (_BIG_ENDIAN) || defined (__BIG_ENDIAN__) \
+ || defined (__a29k__) || defined (_AM29K) \
+ || defined (__arm__) \
+ || (defined (__convex__) && defined (_IEEE_FLOAT_)) \
+ || defined (_CRAYMPP) \
+ || defined (__i370__) || defined (__mvs__) \
+ || defined (__mc68000__) || defined (__mc68020__) || defined (__m68k__)\
+ || defined(mc68020) \
+ || defined (__m88000__) \
+ || defined (MIPSEB) || defined (_MIPSEB) \
+ || defined (__hppa) || defined (__hppa__) \
+ || defined (__pyr__) \
+ || defined (__ibm032__) \
+ || defined (_IBMR2) || defined (_ARCH_PPC) \
+ || defined (__sh__) \
+ || defined (__sparc) || defined (sparc) \
+ || defined (__we32k__)
+#define _GMP_IEEE_FLOATS 1
+union ieee_double_extract
+{
+ struct
+ {
+ unsigned int sig:1;
+ unsigned int exp:11;
+ unsigned int manh:20;
+ unsigned int manl:32;
+ } s;
+ double d;
+};
+#endif
+#endif
+#endif
+
+/* Using "(2.0 * ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1)))" doesn't work on
+ SunOS 4.1.4 native /usr/ucb/cc (K&R), it comes out as -4294967296.0,
+ presumably due to treating the mp_limb_t constant as signed rather than
+ unsigned. */
+#define MP_BASE_AS_DOUBLE (4.0 * ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 2)))
+#if BITS_PER_MP_LIMB == 64
+#define LIMBS_PER_DOUBLE 2
+#else
+#define LIMBS_PER_DOUBLE 3
+#endif
+
+double __gmp_scale2 _PROTO ((double, int));
+int __gmp_extract_double _PROTO ((mp_ptr, double));
+
+extern int __gmp_junk;
+extern const int __gmp_0;
+#define GMP_ERROR(code) (gmp_errno |= (code), __gmp_junk = 10/__gmp_0)
+#define DIVIDE_BY_ZERO GMP_ERROR(GMP_ERROR_DIVISION_BY_ZERO)
+#define SQRT_OF_NEGATIVE GMP_ERROR(GMP_ERROR_SQRT_OF_NEGATIVE)
+
+#if defined _LONG_LONG_LIMB
+#if defined (__STDC__)
+#define CNST_LIMB(C) C##LL
+#else
+#define CNST_LIMB(C) C/**/LL
+#endif
+#else /* not _LONG_LONG_LIMB */
+#if defined (__STDC__)
+#define CNST_LIMB(C) C##L
+#else
+#define CNST_LIMB(C) C/**/L
+#endif
+#endif /* _LONG_LONG_LIMB */
+
+/*** Stuff used by mpn/generic/prefsqr.c and mpn/generic/next_prime.c ***/
+#if BITS_PER_MP_LIMB == 32
+#define PP 0xC0CFD797L /* 3 x 5 x 7 x 11 x 13 x ... x 29 */
+#define PP_INVERTED 0x53E5645CL
+#define PP_MAXPRIME 29
+#define PP_MASK 0x208A28A8L
+#endif
+
+#if BITS_PER_MP_LIMB == 64
+#define PP CNST_LIMB(0xE221F97C30E94E1D) /* 3 x 5 x 7 x 11 x 13 x ... x 53 */
+#define PP_INVERTED CNST_LIMB(0x21CFE6CFC938B36B)
+#define PP_MAXPRIME 53
+#define PP_MASK CNST_LIMB(0x208A20A08A28A8)
+#endif
+
+
+/* BIT1 means a result value in bit 1 (second least significant bit), with a
+ zero bit representing +1 and a one bit representing -1. Bits other than
+ bit 1 are garbage.
+
+ JACOBI_TWOS_U_BIT1 and JACOBI_RECIP_UU_BIT1 are used in mpn_jacobi_base
+ and their speed is important. Expressions are used rather than
+ conditionals to accumulate sign changes, which effectively means XORs
+ instead of conditional JUMPs. */
+
+/* (a/0), with a signed; is 1 if a=+/-1, 0 otherwise */
+#define JACOBI_S0(a) \
+ (((a) == 1) | ((a) == -1))
+
+/* (a/0), with a unsigned; is 1 if a=+/-1, 0 otherwise */
+#define JACOBI_U0(a) \
+ ((a) == 1)
+
+/* (a/0), with a an mpz_t; is 1 if a=+/-1, 0 otherwise
+ An mpz_t always has at least one limb of allocated space, so the fetch of
+ the low limb is valid. */
+#define JACOBI_Z0(a) \
+ (((SIZ(a) == 1) | (SIZ(a) == -1)) & (PTR(a)[0] == 1))
+
+/* Convert a bit1 to +1 or -1. */
+#define JACOBI_BIT1_TO_PN(result_bit1) \
+ (1 - ((result_bit1) & 2))
+
+/* (2/b), with b unsigned and odd;
+ is (-1)^((b^2-1)/8) which is 1 if b==1,7mod8 or -1 if b==3,5mod8 and
+ hence obtained from (b>>1)^b */
+#define JACOBI_TWO_U_BIT1(b) \
+ (ASSERT (b & 1), (((b) >> 1) ^ (b)))
+
+/* (2/b)^twos, with b unsigned and odd */
+#define JACOBI_TWOS_U_BIT1(twos, b) \
+ (((twos) << 1) & JACOBI_TWO_U_BIT1 (b))
+
+/* (2/b)^twos, with b unsigned and odd */
+#define JACOBI_TWOS_U(twos, b) \
+ (JACOBI_BIT1_TO_PN (JACOBI_TWOS_U_BIT1 (twos, b)))
+
+/* (a/b) effect due to sign of a: signed/unsigned, b odd;
+ is (-1)^((b-1)/2) if a<0, or +1 if a>=0 */
+#define JACOBI_ASGN_SU_BIT1(a, b) \
+ ((((a) < 0) << 1) & (b))
+
+/* (a/b) effect due to sign of b: signed/mpz;
+ is -1 if a and b both negative, +1 otherwise */
+#define JACOBI_BSGN_SZ_BIT1(a, b) \
+ ((((a) < 0) & (SIZ(b) < 0)) << 1)
+
+/* (a/b) effect due to sign of b: mpz/signed */
+#define JACOBI_BSGN_ZS_BIT1(a, b) \
+ JACOBI_BSGN_SZ_BIT1(b, a)
+
+/* (a/b) reciprocity to switch to (b/a), a,b both unsigned and odd.
+ Is (-1)^((a-1)*(b-1)/4), which means +1 if either a,b==1mod4 or -1 if
+ both a,b==3mod4, achieved in bit 1 by a&b. No ASSERT()s about a,b odd
+ because this is used in a couple of places with only bit 1 of a or b
+ valid. */
+#define JACOBI_RECIP_UU_BIT1(a, b) \
+ ((a) & (b))
+
+
+/* For testing and debugging. */
+#define MPZ_CHECK_FORMAT(z) \
+ (ASSERT_ALWAYS (SIZ(z) == 0 || PTR(z)[ABSIZ(z) - 1] != 0), \
+ ASSERT_ALWAYS (ALLOC(z) >= ABSIZ(z)))
+#define MPZ_PROVOKE_REALLOC(z) \
+ do { ALLOC(z) = ABSIZ(z); } while (0)
+
+
+#if TUNE_PROGRAM_BUILD
+/* Some extras wanted when recompiling some .c files for use by the tune
+ program. Not part of a normal build. */
+
+extern mp_size_t mul_threshold[];
+extern mp_size_t fft_modf_mul_threshold;
+extern mp_size_t sqr_threshold[];
+extern mp_size_t fft_modf_sqr_threshold;
+extern mp_size_t bz_threshold[];
+extern mp_size_t fib_threshold[];
+extern mp_size_t powm_threshold[];
+extern mp_size_t gcd_accel_threshold[];
+extern mp_size_t gcdext_threshold[];
+
+#undef KARATSUBA_MUL_THRESHOLD
+#undef TOOM3_MUL_THRESHOLD
+#undef FFT_MUL_TABLE
+#undef FFT_MUL_THRESHOLD
+#undef FFT_MODF_MUL_THRESHOLD
+#undef KARATSUBA_SQR_THRESHOLD
+#undef TOOM3_SQR_THRESHOLD
+#undef FFT_SQR_TABLE
+#undef FFT_SQR_THRESHOLD
+#undef FFT_MODF_SQR_THRESHOLD
+#undef BZ_THRESHOLD
+#undef FIB_THRESHOLD
+#undef POWM_THRESHOLD
+#undef GCD_ACCEL_THRESHOLD
+#undef GCDEXT_THRESHOLD
+
+#define KARATSUBA_MUL_THRESHOLD mul_threshold[0]
+#define TOOM3_MUL_THRESHOLD mul_threshold[1]
+#define FFT_MUL_TABLE 0
+#define FFT_MUL_THRESHOLD mul_threshold[2]
+#define FFT_MODF_MUL_THRESHOLD fft_modf_mul_threshold
+#define KARATSUBA_SQR_THRESHOLD sqr_threshold[0]
+#define TOOM3_SQR_THRESHOLD sqr_threshold[1]
+#define FFT_SQR_TABLE 0
+#define FFT_SQR_THRESHOLD sqr_threshold[2]
+#define FFT_MODF_SQR_THRESHOLD fft_modf_sqr_threshold
+#define BZ_THRESHOLD bz_threshold[0]
+#define FIB_THRESHOLD fib_threshold[0]
+#define POWM_THRESHOLD powm_threshold[0]
+#define GCD_ACCEL_THRESHOLD gcd_accel_threshold[0]
+#define GCDEXT_THRESHOLD gcdext_threshold[0]
+
+#define TOOM3_MUL_THRESHOLD_LIMIT 700
+
+#undef FFT_TABLE_ATTRS
+#define FFT_TABLE_ATTRS
+extern mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE];
+
+#endif /* TUNE_PROGRAM_BUILD */
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/rts/gmp/gmp.h b/rts/gmp/gmp.h
new file mode 100644
index 0000000000..0f1b9510e9
--- /dev/null
+++ b/rts/gmp/gmp.h
@@ -0,0 +1,1083 @@
+/* gmp.h -- Definitions for GNU multiple precision functions.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#ifndef __GMP_H__
+
+#ifndef __GNU_MP__ /* to allow inclusion of both gmp.h and mp.h */
+#define __GNU_MP__ 2
+#define __need_size_t
+#include <stddef.h>
+#undef __need_size_t
+
+#ifndef STG_H
+/* Get DLL_IMPORT */
+#include "../../includes/ghcconfig.h"
+#include "../../includes/StgDLL.h"
+#endif
+
+#if defined (__mips) && defined (_ABIN32)
+/* Force the use of 64-bit limbs for all 64-bit MIPS CPUs if ABI permits. */
+#define _LONG_LONG_LIMB
+#endif
+
+#if (__STDC__-0) || defined (__cplusplus)
+#define __gmp_const const
+#define __gmp_signed signed
+#else
+#define __gmp_const
+#define __gmp_signed
+#endif
+
+#if defined (__GNUC__)
+#define __gmp_inline __inline__
+#else
+#define __gmp_inline
+#endif
+
+#ifndef _EXTERN_INLINE
+#ifdef __GNUC__
+#define _EXTERN_INLINE extern __inline__
+#else
+#define _EXTERN_INLINE static
+#endif
+#endif
+
+#ifdef _SHORT_LIMB
+typedef unsigned int mp_limb_t;
+typedef int mp_limb_signed_t;
+#else
+#ifdef _LONG_LONG_LIMB
+typedef unsigned long long int mp_limb_t;
+typedef long long int mp_limb_signed_t;
+#else
+typedef unsigned long int mp_limb_t;
+typedef long int mp_limb_signed_t;
+#endif
+#endif
+
+typedef mp_limb_t * mp_ptr;
+typedef __gmp_const mp_limb_t * mp_srcptr;
+#if defined (_CRAY) && ! defined (_CRAYMPP)
+/* plain `int' is much faster (48 bits) */
+typedef int mp_size_t;
+typedef int mp_exp_t;
+#else
+typedef long int mp_size_t;
+typedef long int mp_exp_t;
+#endif
+
+typedef struct
+{
+ int _mp_alloc; /* Number of *limbs* allocated and pointed
+ to by the _mp_d field. */
+ int _mp_size; /* abs(_mp_size) is the number of limbs the
+ last field points to. If _mp_size is
+ negative this is a negative number. */
+ mp_limb_t *_mp_d; /* Pointer to the limbs. */
+} __mpz_struct;
+#endif /* __GNU_MP__ */
+
+typedef __mpz_struct MP_INT;
+typedef __mpz_struct mpz_t[1];
+
+typedef struct
+{
+ __mpz_struct _mp_num;
+ __mpz_struct _mp_den;
+} __mpq_struct;
+
+typedef __mpq_struct MP_RAT;
+typedef __mpq_struct mpq_t[1];
+
+typedef struct
+{
+ int _mp_prec; /* Max precision, in number of `mp_limb_t's.
+ Set by mpf_init and modified by
+ mpf_set_prec. The area pointed to by the
+ _mp_d field contains `prec' + 1 limbs. */
+ int _mp_size; /* abs(_mp_size) is the number of limbs the
+ last field points to. If _mp_size is
+ negative this is a negative number. */
+ mp_exp_t _mp_exp; /* Exponent, in the base of `mp_limb_t'. */
+ mp_limb_t *_mp_d; /* Pointer to the limbs. */
+} __mpf_struct;
+
+/* typedef __mpf_struct MP_FLOAT; */
+typedef __mpf_struct mpf_t[1];
+
+/* Available random number generation algorithms. */
+typedef enum
+{
+ GMP_RAND_ALG_DEFAULT = 0,
+ GMP_RAND_ALG_LC = GMP_RAND_ALG_DEFAULT /* Linear congruential. */
+} gmp_randalg_t;
+
+/* Linear congruential data struct. */
+typedef struct {
+ mpz_t a; /* Multiplier. */
+ unsigned long int c; /* Adder. */
+ mpz_t m; /* Modulus (valid only if m2exp == 0). */
+ unsigned long int m2exp; /* If != 0, modulus is 2 ^ m2exp. */
+} __gmp_randata_lc;
+
+/* Random state struct. */
+typedef struct
+{
+ mpz_t seed; /* Current seed. */
+ gmp_randalg_t alg; /* Algorithm used. */
+ union { /* Algorithm specific data. */
+ __gmp_randata_lc *lc; /* Linear congruential. */
+ } algdata;
+} __gmp_randstate_struct;
+typedef __gmp_randstate_struct gmp_randstate_t[1];
+
+/* Types for function declarations in gmp files. */
+/* ??? Should not pollute user name space with these ??? */
+typedef __gmp_const __mpz_struct *mpz_srcptr;
+typedef __mpz_struct *mpz_ptr;
+typedef __gmp_const __mpf_struct *mpf_srcptr;
+typedef __mpf_struct *mpf_ptr;
+typedef __gmp_const __mpq_struct *mpq_srcptr;
+typedef __mpq_struct *mpq_ptr;
+
+#ifndef _PROTO
+#if (__STDC__-0) || defined (__cplusplus)
+#define _PROTO(x) x
+#else
+#define _PROTO(x) ()
+#endif
+#endif
+
+#ifndef __MPN
+/* Really use `defined (__STDC__)' here; we want it to be true for Sun C */
+#if defined (__STDC__) || defined (__cplusplus)
+#define __MPN(x) __gmpn_##x
+#else
+#define __MPN(x) __gmpn_/**/x
+#endif
+#endif
+
+#if defined (FILE) || defined (H_STDIO) || defined (_H_STDIO) \
+ || defined (_STDIO_H) || defined (_STDIO_H_) || defined (__STDIO_H__) \
+ || defined (_STDIO_INCLUDED) || defined (__dj_include_stdio_h_)
+#define _GMP_H_HAVE_FILE 1
+#endif
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#define mp_set_memory_functions __gmp_set_memory_functions
+DLL_IMPORT void mp_set_memory_functions _PROTO ((void *(*) (size_t),
+ void *(*) (void *, size_t, size_t),
+ void (*) (void *, size_t)));
+
+#define mp_bits_per_limb __gmp_bits_per_limb
+DLL_IMPORT extern __gmp_const int mp_bits_per_limb;
+
+#if defined (__cplusplus)
+}
+#endif
+
+
+/**************** Random number routines. ****************/
+
+#define _gmp_rand __gmp_rand
+#define gmp_randinit __gmp_randinit
+#define gmp_randinit_lc __gmp_randinit_lc
+#define gmp_randinit_lc_2exp __gmp_randinit_lc_2exp
+#define gmp_randseed __gmp_randseed
+#define gmp_randseed_ui __gmp_randseed_ui
+#define gmp_randclear __gmp_randclear
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+DLL_IMPORT void _gmp_rand _PROTO ((mp_ptr, gmp_randstate_t, unsigned long int));
+DLL_IMPORT void gmp_randinit _PROTO ((gmp_randstate_t, gmp_randalg_t, ...));
+DLL_IMPORT void gmp_randinit_lc _PROTO ((gmp_randstate_t, mpz_t, unsigned long int,
+ mpz_t));
+DLL_IMPORT void gmp_randinit_lc_2exp _PROTO ((gmp_randstate_t, mpz_t, unsigned long int,
+ unsigned long int));
+DLL_IMPORT void gmp_randseed _PROTO ((gmp_randstate_t, mpz_t));
+DLL_IMPORT void gmp_randseed_ui _PROTO ((gmp_randstate_t, unsigned long int));
+DLL_IMPORT void gmp_randclear _PROTO ((gmp_randstate_t));
+
+#if defined (__cplusplus)
+}
+#endif
+
+/**************** Integer (i.e. Z) routines. ****************/
+
+#define _mpz_realloc __gmpz_realloc
+#define mpz_realloc __gmpz_realloc
+#define mpz_abs __gmpz_abs
+#define mpz_add __gmpz_add
+#define mpz_add_ui __gmpz_add_ui
+#define mpz_addmul_ui __gmpz_addmul_ui
+#define mpz_and __gmpz_and
+#define mpz_array_init __gmpz_array_init
+#define mpz_bin_ui __gmpz_bin_ui
+#define mpz_bin_uiui __gmpz_bin_uiui
+#define mpz_cdiv_q __gmpz_cdiv_q
+#define mpz_cdiv_q_ui __gmpz_cdiv_q_ui
+#define mpz_cdiv_qr __gmpz_cdiv_qr
+#define mpz_cdiv_qr_ui __gmpz_cdiv_qr_ui
+#define mpz_cdiv_r __gmpz_cdiv_r
+#define mpz_cdiv_r_ui __gmpz_cdiv_r_ui
+#define mpz_cdiv_ui __gmpz_cdiv_ui
+#define mpz_clear __gmpz_clear
+#define mpz_clrbit __gmpz_clrbit
+#define mpz_cmp __gmpz_cmp
+#define _mpz_cmp_si __gmpz_cmp_si
+#define _mpz_cmp_ui __gmpz_cmp_ui
+#define mpz_cmpabs __gmpz_cmpabs
+#define mpz_cmpabs_ui __gmpz_cmpabs_ui
+#define mpz_com __gmpz_com
+#define mpz_divexact __gmpz_divexact
+#define mpz_dump __gmpz_dump
+#define mpz_fac_ui __gmpz_fac_ui
+#define mpz_fdiv_q __gmpz_fdiv_q
+#define mpz_fdiv_q_2exp __gmpz_fdiv_q_2exp
+#define mpz_fdiv_q_ui __gmpz_fdiv_q_ui
+#define mpz_fdiv_qr __gmpz_fdiv_qr
+#define mpz_fdiv_qr_ui __gmpz_fdiv_qr_ui
+#define mpz_fdiv_r __gmpz_fdiv_r
+#define mpz_fdiv_r_2exp __gmpz_fdiv_r_2exp
+#define mpz_fdiv_r_ui __gmpz_fdiv_r_ui
+#define mpz_fdiv_ui __gmpz_fdiv_ui
+#define mpz_fib_ui __gmpz_fib_ui
+#define mpz_fits_sint_p __gmpz_fits_sint_p
+#define mpz_fits_slong_p __gmpz_fits_slong_p
+#define mpz_fits_sshort_p __gmpz_fits_sshort_p
+#define mpz_fits_uint_p __gmpz_fits_uint_p
+#define mpz_fits_ulong_p __gmpz_fits_ulong_p
+#define mpz_fits_ushort_p __gmpz_fits_ushort_p
+#define mpz_gcd __gmpz_gcd
+#define mpz_gcd_ui __gmpz_gcd_ui
+#define mpz_gcdext __gmpz_gcdext
+#define mpz_get_d __gmpz_get_d
+#define mpz_get_si __gmpz_get_si
+#define mpz_get_str __gmpz_get_str
+#define mpz_get_ui __gmpz_get_ui
+#define mpz_getlimbn __gmpz_getlimbn
+#define mpz_hamdist __gmpz_hamdist
+#define mpz_init __gmpz_init
+#define mpz_inp_binary __gmpz_inp_binary
+#define mpz_inp_raw __gmpz_inp_raw
+#define mpz_inp_str __gmpz_inp_str
+#define mpz_init_set __gmpz_init_set
+#define mpz_init_set_d __gmpz_init_set_d
+#define mpz_init_set_si __gmpz_init_set_si
+#define mpz_init_set_str __gmpz_init_set_str
+#define mpz_init_set_ui __gmpz_init_set_ui
+#define mpz_invert __gmpz_invert
+#define mpz_ior __gmpz_ior
+#define mpz_jacobi __gmpz_jacobi
+#define mpz_lcm __gmpz_lcm
+#define mpz_legendre __gmpz_legendre
+#define mpz_mod __gmpz_mod
+#define mpz_mul __gmpz_mul
+#define mpz_mul_2exp __gmpz_mul_2exp
+#define mpz_neg __gmpz_neg
+#define mpz_nextprime __gmpz_nextprime
+#define mpz_out_binary __gmpz_out_binary
+#define mpz_out_raw __gmpz_out_raw
+#define mpz_out_str __gmpz_out_str
+#define mpz_perfect_power_p __gmpz_perfect_power_p
+#define mpz_perfect_square_p __gmpz_perfect_square_p
+#define mpz_popcount __gmpz_popcount
+#define mpz_pow_ui __gmpz_pow_ui
+#define mpz_powm __gmpz_powm
+#define mpz_powm_ui __gmpz_powm_ui
+#define mpz_probab_prime_p __gmpz_probab_prime_p
+#define mpz_random __gmpz_random
+#define mpz_random2 __gmpz_random2
+#define mpz_remove __gmpz_remove
+#define mpz_root __gmpz_root
+#define mpz_rrandomb __gmpz_rrandomb
+#define mpz_scan0 __gmpz_scan0
+#define mpz_scan1 __gmpz_scan1
+#define mpz_set __gmpz_set
+#define mpz_set_d __gmpz_set_d
+#define mpz_set_f __gmpz_set_f
+#define mpz_set_q __gmpz_set_q
+#define mpz_set_si __gmpz_set_si
+#define mpz_set_str __gmpz_set_str
+#define mpz_set_ui __gmpz_set_ui
+#define mpz_setbit __gmpz_setbit
+#define mpz_size __gmpz_size
+#define mpz_sizeinbase __gmpz_sizeinbase
+#define mpz_sqrt __gmpz_sqrt
+#define mpz_sqrtrem __gmpz_sqrtrem
+#define mpz_sub __gmpz_sub
+#define mpz_sub_ui __gmpz_sub_ui
+#define mpz_swap __gmpz_swap
+#define mpz_tdiv_ui __gmpz_tdiv_ui
+#define mpz_tdiv_q __gmpz_tdiv_q
+#define mpz_tdiv_q_2exp __gmpz_tdiv_q_2exp
+#define mpz_tdiv_q_ui __gmpz_tdiv_q_ui
+#define mpz_tdiv_qr __gmpz_tdiv_qr
+#define mpz_tdiv_qr_ui __gmpz_tdiv_qr_ui
+#define mpz_tdiv_r __gmpz_tdiv_r
+#define mpz_tdiv_r_2exp __gmpz_tdiv_r_2exp
+#define mpz_tdiv_r_ui __gmpz_tdiv_r_ui
+#define mpz_tstbit __gmpz_tstbit
+#define mpz_ui_pow_ui __gmpz_ui_pow_ui
+#define mpz_urandomb __gmpz_urandomb
+#define mpz_urandomm __gmpz_urandomm
+#define mpz_xor __gmpz_xor
+#define mpz_eor __gmpz_xor
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+DLL_IMPORT void *_mpz_realloc _PROTO ((mpz_ptr, mp_size_t));
+
+DLL_IMPORT void mpz_abs _PROTO ((mpz_ptr, mpz_srcptr));
+DLL_IMPORT void mpz_add _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_add_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_addmul_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_and _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_array_init _PROTO ((mpz_ptr, mp_size_t, mp_size_t));
+DLL_IMPORT void mpz_bin_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_bin_uiui _PROTO ((mpz_ptr, unsigned long int, unsigned long int));
+DLL_IMPORT void mpz_cdiv_q _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT unsigned long int mpz_cdiv_q_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_cdiv_qr _PROTO ((mpz_ptr, mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT unsigned long int mpz_cdiv_qr_ui _PROTO ((mpz_ptr, mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_cdiv_r _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT unsigned long int mpz_cdiv_r_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpz_cdiv_ui _PROTO ((mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_clear _PROTO ((mpz_ptr));
+DLL_IMPORT void mpz_clrbit _PROTO ((mpz_ptr, unsigned long int));
+DLL_IMPORT int mpz_cmp _PROTO ((mpz_srcptr, mpz_srcptr));
+DLL_IMPORT int _mpz_cmp_si _PROTO ((mpz_srcptr, signed long int));
+DLL_IMPORT int _mpz_cmp_ui _PROTO ((mpz_srcptr, unsigned long int));
+DLL_IMPORT int mpz_cmpabs _PROTO ((mpz_srcptr, mpz_srcptr));
+DLL_IMPORT int mpz_cmpabs_ui _PROTO ((mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_com _PROTO ((mpz_ptr, mpz_srcptr));
+DLL_IMPORT void mpz_divexact _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_dump _PROTO ((mpz_srcptr));
+DLL_IMPORT void mpz_fac_ui _PROTO ((mpz_ptr, unsigned long int));
+DLL_IMPORT void mpz_fdiv_q _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_fdiv_q_2exp _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpz_fdiv_q_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_fdiv_qr _PROTO ((mpz_ptr, mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT unsigned long int mpz_fdiv_qr_ui _PROTO ((mpz_ptr, mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_fdiv_r _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_fdiv_r_2exp _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpz_fdiv_r_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpz_fdiv_ui _PROTO ((mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_fib_ui _PROTO ((mpz_ptr, unsigned long int));
+DLL_IMPORT int mpz_fits_sint_p _PROTO ((mpz_srcptr));
+DLL_IMPORT int mpz_fits_slong_p _PROTO ((mpz_srcptr));
+DLL_IMPORT int mpz_fits_sshort_p _PROTO ((mpz_srcptr));
+DLL_IMPORT int mpz_fits_uint_p _PROTO ((mpz_srcptr));
+DLL_IMPORT int mpz_fits_ulong_p _PROTO ((mpz_srcptr));
+DLL_IMPORT int mpz_fits_ushort_p _PROTO ((mpz_srcptr));
+DLL_IMPORT void mpz_gcd _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT unsigned long int mpz_gcd_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_gcdext _PROTO ((mpz_ptr, mpz_ptr, mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT double mpz_get_d _PROTO ((mpz_srcptr));
+/* signed */ long int mpz_get_si _PROTO ((mpz_srcptr));
+DLL_IMPORT char *mpz_get_str _PROTO ((char *, int, mpz_srcptr));
+DLL_IMPORT unsigned long int mpz_get_ui _PROTO ((mpz_srcptr));
+DLL_IMPORT mp_limb_t mpz_getlimbn _PROTO ((mpz_srcptr, mp_size_t));
+DLL_IMPORT unsigned long int mpz_hamdist _PROTO ((mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_init _PROTO ((mpz_ptr));
+#ifdef _GMP_H_HAVE_FILE
+DLL_IMPORT size_t mpz_inp_binary _PROTO ((mpz_ptr, FILE *));
+DLL_IMPORT size_t mpz_inp_raw _PROTO ((mpz_ptr, FILE *));
+DLL_IMPORT size_t mpz_inp_str _PROTO ((mpz_ptr, FILE *, int));
+#endif
+DLL_IMPORT void mpz_init_set _PROTO ((mpz_ptr, mpz_srcptr));
+DLL_IMPORT void mpz_init_set_d _PROTO ((mpz_ptr, double));
+DLL_IMPORT void mpz_init_set_si _PROTO ((mpz_ptr, signed long int));
+DLL_IMPORT int mpz_init_set_str _PROTO ((mpz_ptr, __gmp_const char *, int));
+DLL_IMPORT void mpz_init_set_ui _PROTO ((mpz_ptr, unsigned long int));
+DLL_IMPORT int mpz_invert _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_ior _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT int mpz_jacobi _PROTO ((mpz_srcptr, mpz_srcptr));
+
+#define mpz_kronecker_si __gmpz_kronecker_si
+DLL_IMPORT int mpz_kronecker_si _PROTO ((mpz_srcptr, long));
+
+#define mpz_kronecker_ui __gmpz_kronecker_ui
+DLL_IMPORT int mpz_kronecker_ui _PROTO ((mpz_srcptr, unsigned long));
+
+#define mpz_si_kronecker __gmpz_si_kronecker
+DLL_IMPORT int mpz_si_kronecker _PROTO ((long, mpz_srcptr));
+
+#define mpz_ui_kronecker __gmpz_ui_kronecker
+DLL_IMPORT int mpz_ui_kronecker _PROTO ((unsigned long, mpz_srcptr));
+
+DLL_IMPORT void mpz_lcm _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT int mpz_legendre _PROTO ((mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_mod _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_mul _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_mul_2exp _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+
+#define mpz_mul_si __gmpz_mul_si
+DLL_IMPORT void mpz_mul_si _PROTO ((mpz_ptr, mpz_srcptr, long int));
+
+#define mpz_mul_ui __gmpz_mul_ui
+DLL_IMPORT void mpz_mul_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+
+DLL_IMPORT void mpz_neg _PROTO ((mpz_ptr, mpz_srcptr));
+DLL_IMPORT void mpz_nextprime _PROTO ((mpz_ptr, mpz_srcptr));
+#ifdef _GMP_H_HAVE_FILE
+DLL_IMPORT size_t mpz_out_binary _PROTO ((FILE *, mpz_srcptr));
+DLL_IMPORT size_t mpz_out_raw _PROTO ((FILE *, mpz_srcptr));
+DLL_IMPORT size_t mpz_out_str _PROTO ((FILE *, int, mpz_srcptr));
+#endif
+DLL_IMPORT int mpz_perfect_power_p _PROTO ((mpz_srcptr));
+DLL_IMPORT int mpz_perfect_square_p _PROTO ((mpz_srcptr));
+DLL_IMPORT unsigned long int mpz_popcount _PROTO ((mpz_srcptr));
+DLL_IMPORT void mpz_pow_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_powm _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_powm_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int, mpz_srcptr));
+DLL_IMPORT int mpz_probab_prime_p _PROTO ((mpz_srcptr, int));
+DLL_IMPORT void mpz_random _PROTO ((mpz_ptr, mp_size_t));
+DLL_IMPORT void mpz_random2 _PROTO ((mpz_ptr, mp_size_t));
+DLL_IMPORT unsigned long int mpz_remove _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT int mpz_root _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_rrandomb _PROTO ((mpz_ptr, gmp_randstate_t, unsigned long int));
+DLL_IMPORT unsigned long int mpz_scan0 _PROTO ((mpz_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpz_scan1 _PROTO ((mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_set _PROTO ((mpz_ptr, mpz_srcptr));
+DLL_IMPORT void mpz_set_d _PROTO ((mpz_ptr, double));
+DLL_IMPORT void mpz_set_f _PROTO ((mpz_ptr, mpf_srcptr));
+DLL_IMPORT void mpz_set_q _PROTO ((mpz_ptr, mpq_srcptr));
+DLL_IMPORT void mpz_set_si _PROTO ((mpz_ptr, signed long int));
+DLL_IMPORT int mpz_set_str _PROTO ((mpz_ptr, __gmp_const char *, int));
+DLL_IMPORT void mpz_set_ui _PROTO ((mpz_ptr, unsigned long int));
+DLL_IMPORT void mpz_setbit _PROTO ((mpz_ptr, unsigned long int));
+DLL_IMPORT size_t mpz_size _PROTO ((mpz_srcptr));
+DLL_IMPORT size_t mpz_sizeinbase _PROTO ((mpz_srcptr, int));
+DLL_IMPORT void mpz_sqrt _PROTO ((mpz_ptr, mpz_srcptr));
+DLL_IMPORT void mpz_sqrtrem _PROTO ((mpz_ptr, mpz_ptr, mpz_srcptr));
+DLL_IMPORT void mpz_sub _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_sub_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_swap _PROTO ((mpz_ptr, mpz_ptr));
+DLL_IMPORT void mpz_tdiv_q _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_tdiv_q_2exp _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpz_tdiv_ui _PROTO ((mpz_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpz_tdiv_q_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_tdiv_qr _PROTO ((mpz_ptr, mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT unsigned long int mpz_tdiv_qr_ui _PROTO ((mpz_ptr, mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_tdiv_r _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+DLL_IMPORT void mpz_tdiv_r_2exp _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpz_tdiv_r_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int));
+DLL_IMPORT int mpz_tstbit _PROTO ((mpz_srcptr, unsigned long int));
+DLL_IMPORT void mpz_ui_pow_ui _PROTO ((mpz_ptr, unsigned long int, unsigned long int));
+DLL_IMPORT void mpz_urandomb _PROTO ((mpz_t, gmp_randstate_t, unsigned long int));
+DLL_IMPORT void mpz_urandomm _PROTO ((mpz_t, gmp_randstate_t, mpz_t));
+DLL_IMPORT void mpz_xor _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr));
+#if defined (__cplusplus)
+}
+#endif
+
+/**************** Rational (i.e. Q) routines. ****************/
+
+#define mpq_init __gmpq_init
+#define mpq_clear __gmpq_clear
+#define mpq_set __gmpq_set
+#define mpq_set_ui __gmpq_set_ui
+#define mpq_set_si __gmpq_set_si
+#define mpq_set_z __gmpq_set_z
+#define mpq_add __gmpq_add
+#define mpq_sub __gmpq_sub
+#define mpq_mul __gmpq_mul
+#define mpq_div __gmpq_div
+#define mpq_neg __gmpq_neg
+#define mpq_cmp __gmpq_cmp
+#define _mpq_cmp_ui __gmpq_cmp_ui
+#define mpq_equal __gmpq_equal
+#define mpq_inv __gmpq_inv
+#define mpq_set_num __gmpq_set_num
+#define mpq_set_den __gmpq_set_den
+#define mpq_get_num __gmpq_get_num
+#define mpq_get_den __gmpq_get_den
+#define mpq_get_d __gmpq_get_d
+#define mpq_set_d __gmpq_set_d
+#define mpq_canonicalize __gmpq_canonicalize
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+DLL_IMPORT void mpq_init _PROTO ((mpq_ptr));
+DLL_IMPORT void mpq_clear _PROTO ((mpq_ptr));
+DLL_IMPORT void mpq_set _PROTO ((mpq_ptr, mpq_srcptr));
+DLL_IMPORT void mpq_set_ui _PROTO ((mpq_ptr, unsigned long int, unsigned long int));
+DLL_IMPORT void mpq_set_si _PROTO ((mpq_ptr, signed long int, unsigned long int));
+DLL_IMPORT void mpq_set_z _PROTO ((mpq_ptr, mpz_srcptr));
+DLL_IMPORT void mpq_add _PROTO ((mpq_ptr, mpq_srcptr, mpq_srcptr));
+DLL_IMPORT void mpq_sub _PROTO ((mpq_ptr, mpq_srcptr, mpq_srcptr));
+DLL_IMPORT void mpq_mul _PROTO ((mpq_ptr, mpq_srcptr, mpq_srcptr));
+DLL_IMPORT void mpq_div _PROTO ((mpq_ptr, mpq_srcptr, mpq_srcptr));
+DLL_IMPORT void mpq_neg _PROTO ((mpq_ptr, mpq_srcptr));
+DLL_IMPORT int mpq_cmp _PROTO ((mpq_srcptr, mpq_srcptr));
+DLL_IMPORT int _mpq_cmp_ui _PROTO ((mpq_srcptr, unsigned long int, unsigned long int));
+DLL_IMPORT int mpq_equal _PROTO ((mpq_srcptr, mpq_srcptr));
+DLL_IMPORT void mpq_inv _PROTO ((mpq_ptr, mpq_srcptr));
+DLL_IMPORT void mpq_set_num _PROTO ((mpq_ptr, mpz_srcptr));
+DLL_IMPORT void mpq_set_den _PROTO ((mpq_ptr, mpz_srcptr));
+DLL_IMPORT void mpq_get_num _PROTO ((mpz_ptr, mpq_srcptr));
+DLL_IMPORT void mpq_get_den _PROTO ((mpz_ptr, mpq_srcptr));
+DLL_IMPORT double mpq_get_d _PROTO ((mpq_srcptr));
+DLL_IMPORT void mpq_set_d _PROTO ((mpq_ptr, double));
+DLL_IMPORT void mpq_canonicalize _PROTO ((mpq_ptr));
+
+#define mpq_swap __gmpq_swap
+DLL_IMPORT void mpq_swap _PROTO ((mpq_ptr, mpq_ptr));
+
+#ifdef _GMP_H_HAVE_FILE
+#define mpq_out_str __gmpq_out_str
+DLL_IMPORT size_t mpq_out_str _PROTO ((FILE *, int, mpq_srcptr));
+#endif
+
+#if defined (__cplusplus)
+}
+#endif
+
+/**************** Float (i.e. F) routines. ****************/
+
+#define mpf_abs __gmpf_abs
+#define mpf_add __gmpf_add
+#define mpf_add_ui __gmpf_add_ui
+#define mpf_ceil __gmpf_ceil
+#define mpf_clear __gmpf_clear
+#define mpf_cmp __gmpf_cmp
+#define mpf_cmp_si __gmpf_cmp_si
+#define mpf_cmp_ui __gmpf_cmp_ui
+#define mpf_div __gmpf_div
+#define mpf_div_2exp __gmpf_div_2exp
+#define mpf_div_ui __gmpf_div_ui
+#define mpf_dump __gmpf_dump
+#define mpf_floor __gmpf_floor
+#define mpf_eq __gmpf_eq
+#define mpf_get_d __gmpf_get_d
+#define mpf_get_prec __gmpf_get_prec
+#define mpf_get_str __gmpf_get_str
+#define mpf_init __gmpf_init
+#define mpf_init2 __gmpf_init2
+#define mpf_inp_str __gmpf_inp_str
+#define mpf_init_set __gmpf_init_set
+#define mpf_init_set_d __gmpf_init_set_d
+#define mpf_init_set_si __gmpf_init_set_si
+#define mpf_init_set_str __gmpf_init_set_str
+#define mpf_init_set_ui __gmpf_init_set_ui
+#define mpf_mul __gmpf_mul
+#define mpf_mul_2exp __gmpf_mul_2exp
+#define mpf_mul_ui __gmpf_mul_ui
+#define mpf_neg __gmpf_neg
+#define mpf_out_str __gmpf_out_str
+#define mpf_pow_ui __gmpf_pow_ui
+#define mpf_random2 __gmpf_random2
+#define mpf_reldiff __gmpf_reldiff
+#define mpf_set __gmpf_set
+#define mpf_set_d __gmpf_set_d
+#define mpf_set_default_prec __gmpf_set_default_prec
+#define mpf_set_prec __gmpf_set_prec
+#define mpf_set_prec_raw __gmpf_set_prec_raw
+#define mpf_set_q __gmpf_set_q
+#define mpf_set_si __gmpf_set_si
+#define mpf_set_str __gmpf_set_str
+#define mpf_set_ui __gmpf_set_ui
+#define mpf_set_z __gmpf_set_z
+#define mpf_size __gmpf_size
+#define mpf_sqrt __gmpf_sqrt
+#define mpf_sqrt_ui __gmpf_sqrt_ui
+#define mpf_sub __gmpf_sub
+#define mpf_sub_ui __gmpf_sub_ui
+#define mpf_trunc __gmpf_trunc
+#define mpf_ui_div __gmpf_ui_div
+#define mpf_ui_sub __gmpf_ui_sub
+#define mpf_urandomb __gmpf_urandomb
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+DLL_IMPORT void mpf_abs _PROTO ((mpf_ptr, mpf_srcptr));
+DLL_IMPORT void mpf_add _PROTO ((mpf_ptr, mpf_srcptr, mpf_srcptr));
+DLL_IMPORT void mpf_add_ui _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_ceil _PROTO ((mpf_ptr, mpf_srcptr));
+DLL_IMPORT void mpf_clear _PROTO ((mpf_ptr));
+DLL_IMPORT int mpf_cmp _PROTO ((mpf_srcptr, mpf_srcptr));
+DLL_IMPORT int mpf_cmp_si _PROTO ((mpf_srcptr, signed long int));
+DLL_IMPORT int mpf_cmp_ui _PROTO ((mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_div _PROTO ((mpf_ptr, mpf_srcptr, mpf_srcptr));
+DLL_IMPORT void mpf_div_2exp _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_div_ui _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_dump _PROTO ((mpf_srcptr));
+DLL_IMPORT int mpf_eq _PROTO ((mpf_srcptr, mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_floor _PROTO ((mpf_ptr, mpf_srcptr));
+DLL_IMPORT double mpf_get_d _PROTO ((mpf_srcptr));
+DLL_IMPORT unsigned long int mpf_get_prec _PROTO ((mpf_srcptr));
+char *mpf_get_str _PROTO ((char *, mp_exp_t *, int, size_t, mpf_srcptr));
+DLL_IMPORT void mpf_init _PROTO ((mpf_ptr));
+DLL_IMPORT void mpf_init2 _PROTO ((mpf_ptr, unsigned long int));
+#ifdef _GMP_H_HAVE_FILE
+DLL_IMPORT size_t mpf_inp_str _PROTO ((mpf_ptr, FILE *, int));
+#endif
+DLL_IMPORT void mpf_init_set _PROTO ((mpf_ptr, mpf_srcptr));
+DLL_IMPORT void mpf_init_set_d _PROTO ((mpf_ptr, double));
+DLL_IMPORT void mpf_init_set_si _PROTO ((mpf_ptr, signed long int));
+DLL_IMPORT int mpf_init_set_str _PROTO ((mpf_ptr, __gmp_const char *, int));
+DLL_IMPORT void mpf_init_set_ui _PROTO ((mpf_ptr, unsigned long int));
+DLL_IMPORT void mpf_mul _PROTO ((mpf_ptr, mpf_srcptr, mpf_srcptr));
+DLL_IMPORT void mpf_mul_2exp _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_mul_ui _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_neg _PROTO ((mpf_ptr, mpf_srcptr));
+#ifdef _GMP_H_HAVE_FILE
+DLL_IMPORT size_t mpf_out_str _PROTO ((FILE *, int, size_t, mpf_srcptr));
+#endif
+DLL_IMPORT void mpf_pow_ui _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_random2 _PROTO ((mpf_ptr, mp_size_t, mp_exp_t));
+DLL_IMPORT void mpf_reldiff _PROTO ((mpf_ptr, mpf_srcptr, mpf_srcptr));
+DLL_IMPORT void mpf_set _PROTO ((mpf_ptr, mpf_srcptr));
+DLL_IMPORT void mpf_set_d _PROTO ((mpf_ptr, double));
+DLL_IMPORT void mpf_set_default_prec _PROTO ((unsigned long int));
+DLL_IMPORT void mpf_set_prec _PROTO ((mpf_ptr, unsigned long int));
+DLL_IMPORT void mpf_set_prec_raw _PROTO ((mpf_ptr, unsigned long int));
+DLL_IMPORT void mpf_set_q _PROTO ((mpf_ptr, mpq_srcptr));
+DLL_IMPORT void mpf_set_si _PROTO ((mpf_ptr, signed long int));
+DLL_IMPORT int mpf_set_str _PROTO ((mpf_ptr, __gmp_const char *, int));
+DLL_IMPORT void mpf_set_ui _PROTO ((mpf_ptr, unsigned long int));
+DLL_IMPORT void mpf_set_z _PROTO ((mpf_ptr, mpz_srcptr));
+DLL_IMPORT size_t mpf_size _PROTO ((mpf_srcptr));
+DLL_IMPORT void mpf_sqrt _PROTO ((mpf_ptr, mpf_srcptr));
+DLL_IMPORT void mpf_sqrt_ui _PROTO ((mpf_ptr, unsigned long int));
+DLL_IMPORT void mpf_sub _PROTO ((mpf_ptr, mpf_srcptr, mpf_srcptr));
+DLL_IMPORT void mpf_sub_ui _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int));
+DLL_IMPORT void mpf_trunc _PROTO ((mpf_ptr, mpf_srcptr));
+DLL_IMPORT void mpf_ui_div _PROTO ((mpf_ptr, unsigned long int, mpf_srcptr));
+DLL_IMPORT void mpf_ui_sub _PROTO ((mpf_ptr, unsigned long int, mpf_srcptr));
+DLL_IMPORT void mpf_urandomb _PROTO ((mpf_t, gmp_randstate_t, unsigned long int));
+
+#define mpf_swap __gmpf_swap
+DLL_IMPORT void mpf_swap _PROTO ((mpf_ptr, mpf_ptr));
+
+#if defined (__cplusplus)
+}
+#endif
+/************ Low level positive-integer (i.e. N) routines. ************/
+
+/* This is ugly, but we need to make user calls reach the prefixed function. */
+#define mpn_add __MPN(add)
+#define mpn_add_1 __MPN(add_1)
+#define mpn_add_n __MPN(add_n)
+#define mpn_add_nc __MPN(add_nc)
+#define mpn_addmul_1 __MPN(addmul_1)
+#define mpn_addsub_n __MPN(addsub_n)
+#define mpn_addsub_nc __MPN(addsub_nc)
+/* #define mpn_and_n __MPN(and_n) */
+/* #define mpn_andn_n __MPN(andn_n) */
+#define mpn_bdivmod __MPN(bdivmod)
+#define mpn_cmp __MPN(cmp)
+/* #define mpn_com_n __MPN(com_n) */
+#define mpn_copyd __MPN(copyd)
+#define mpn_copyi __MPN(copyi)
+#define mpn_divrem __MPN(divrem)
+#define mpn_divrem_1 __MPN(divrem_1)
+#define mpn_divrem_2 __MPN(divrem_2)
+#define mpn_dump __MPN(dump)
+#define mpn_gcd __MPN(gcd)
+#define mpn_gcd_1 __MPN(gcd_1)
+#define mpn_gcdext __MPN(gcdext)
+#define mpn_get_str __MPN(get_str)
+#define mpn_hamdist __MPN(hamdist)
+#define mpn_invert_limb __MPN(invert_limb)
+/* #define mpn_ior_n __MPN(ior_n) */
+/* #define mpn_iorn_n __MPN(iorn_n) */
+/* #define mpn_kara_mul_n __MPN(kara_mul_n) internal */
+/* #define mpn_kara_sqr_n __MPN(kara_sqr_n) internal */
+#define mpn_lshift __MPN(lshift)
+#define mpn_lshiftc __MPN(lshiftc)
+#define mpn_mod_1 __MPN(mod_1)
+#define mpn_mul __MPN(mul)
+#define mpn_mul_1 __MPN(mul_1)
+#define mpn_mul_basecase __MPN(mul_basecase)
+#define mpn_mul_n __MPN(mul_n)
+#define mpn_perfect_square_p __MPN(perfect_square_p)
+#define mpn_popcount __MPN(popcount)
+#define mpn_preinv_mod_1 __MPN(preinv_mod_1)
+/* #define mpn_nand_n __MPN(nand_n) */
+/* #define mpn_nior_n __MPN(nior_n) */
+#define mpn_random __MPN(random)
+#define mpn_random2 __MPN(random2)
+#define mpn_rshift __MPN(rshift)
+#define mpn_rshiftc __MPN(rshiftc)
+#define mpn_scan0 __MPN(scan0)
+#define mpn_scan1 __MPN(scan1)
+#define mpn_set_str __MPN(set_str)
+#define mpn_sqr_basecase __MPN(sqr_basecase)
+#define mpn_sqr_n __MPN(sqr_n)
+#define mpn_sqrtrem __MPN(sqrtrem)
+#define mpn_sub __MPN(sub)
+#define mpn_sub_1 __MPN(sub_1)
+#define mpn_sub_n __MPN(sub_n)
+#define mpn_sub_nc __MPN(sub_nc)
+#define mpn_submul_1 __MPN(submul_1)
+/* #define mpn_toom3_mul_n __MPN(toom3_mul_n) internal */
+/* #define mpn_toom3_sqr_n __MPN(toom3_sqr_n) internal */
+/* #define mpn_xnor_n __MPN(xnor_n) */
+/* #define mpn_xor_n __MPN(xor_n) */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+DLL_IMPORT mp_limb_t mpn_add _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr,mp_size_t));
+DLL_IMPORT mp_limb_t mpn_add_1 _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
+DLL_IMPORT mp_limb_t mpn_add_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+DLL_IMPORT mp_limb_t mpn_add_nc _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t));
+
+DLL_IMPORT mp_limb_t mpn_addmul_1 _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
+
+#define mpn_addmul_1c __MPN(addmul_1c)
+DLL_IMPORT mp_limb_t mpn_addmul_1c _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t));
+
+DLL_IMPORT mp_limb_t mpn_addsub_n _PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+DLL_IMPORT mp_limb_t mpn_bdivmod _PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t, unsigned long int));
+DLL_IMPORT int mpn_cmp _PROTO ((mp_srcptr, mp_srcptr, mp_size_t));
+
+#define mpn_divexact_by3(dst, src, size) mpn_divexact_by3c (dst, src, size, 0)
+
+#define mpn_divexact_by3c __MPN(divexact_by3c)
+DLL_IMPORT mp_limb_t mpn_divexact_by3c _PROTO ((mp_ptr dst, mp_srcptr src,
+ mp_size_t size, mp_limb_t carry));
+
+#define mpn_divmod_1(qp,np,nsize,dlimb) mpn_divrem_1 (qp,0,np,nsize,dlimb)
+
+DLL_IMPORT mp_limb_t mpn_divrem _PROTO((mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr, mp_size_t));
+
+DLL_IMPORT mp_limb_t mpn_divrem_1 _PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t));
+
+#define mpn_divrem_1c __MPN(divrem_1c)
+DLL_IMPORT mp_limb_t mpn_divrem_1c _PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t,
+ mp_limb_t, mp_limb_t));
+
+DLL_IMPORT mp_limb_t mpn_divrem_2 _PROTO ((mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr));
+DLL_IMPORT void mpn_dump _PROTO ((mp_srcptr, mp_size_t));
+mp_size_t mpn_gcd _PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
+DLL_IMPORT mp_limb_t mpn_gcd_1 _PROTO ((mp_srcptr, mp_size_t, mp_limb_t));
+mp_size_t mpn_gcdext _PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
+DLL_IMPORT size_t mpn_get_str _PROTO ((unsigned char *, int, mp_ptr, mp_size_t));
+DLL_IMPORT unsigned long int mpn_hamdist _PROTO ((mp_srcptr, mp_srcptr, mp_size_t));
+
+#define mpn_jacobi_base __MPN(jacobi_base)
+DLL_IMPORT int mpn_jacobi_base _PROTO ((mp_limb_t a, mp_limb_t b, int result_bit1));
+
+DLL_IMPORT mp_limb_t mpn_lshift _PROTO ((mp_ptr, mp_srcptr, mp_size_t, unsigned int));
+DLL_IMPORT mp_limb_t mpn_mod_1 _PROTO ((mp_srcptr, mp_size_t, mp_limb_t));
+
+#define mpn_mod_1c __MPN(mod_1c)
+DLL_IMPORT mp_limb_t mpn_mod_1c _PROTO ((mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t));
+
+#define mpn_mod_1_rshift __MPN(mod_1_rshift)
+DLL_IMPORT mp_limb_t mpn_mod_1_rshift _PROTO ((mp_srcptr, mp_size_t, unsigned,mp_limb_t));
+
+DLL_IMPORT mp_limb_t mpn_mul _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t));
+DLL_IMPORT mp_limb_t mpn_mul_1 _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
+
+#define mpn_mul_1c __MPN(mul_1c)
+DLL_IMPORT mp_limb_t mpn_mul_1c _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t));
+
+DLL_IMPORT void mpn_mul_basecase _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t));
+DLL_IMPORT void mpn_mul_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+DLL_IMPORT int mpn_perfect_square_p _PROTO ((mp_srcptr, mp_size_t));
+DLL_IMPORT unsigned long int mpn_popcount _PROTO ((mp_srcptr, mp_size_t));
+DLL_IMPORT mp_limb_t mpn_preinv_mod_1 _PROTO ((mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t));
+DLL_IMPORT void mpn_random _PROTO ((mp_ptr, mp_size_t));
+DLL_IMPORT void mpn_random2 _PROTO ((mp_ptr, mp_size_t));
+DLL_IMPORT mp_limb_t mpn_rshift _PROTO ((mp_ptr, mp_srcptr, mp_size_t, unsigned int));
+DLL_IMPORT unsigned long int mpn_scan0 _PROTO ((mp_srcptr, unsigned long int));
+DLL_IMPORT unsigned long int mpn_scan1 _PROTO ((mp_srcptr, unsigned long int));
+mp_size_t mpn_set_str _PROTO ((mp_ptr, __gmp_const unsigned char *, size_t, int));
+DLL_IMPORT void mpn_sqr_n _PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+DLL_IMPORT void mpn_sqr_basecase _PROTO ((mp_ptr, mp_srcptr, mp_size_t));
+mp_size_t mpn_sqrtrem _PROTO ((mp_ptr, mp_ptr, mp_srcptr, mp_size_t));
+DLL_IMPORT mp_limb_t mpn_sub _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_srcptr,mp_size_t));
+DLL_IMPORT mp_limb_t mpn_sub_1 _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
+DLL_IMPORT mp_limb_t mpn_sub_n _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+DLL_IMPORT mp_limb_t mpn_sub_nc _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_limb_t));
+DLL_IMPORT mp_limb_t mpn_submul_1 _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t));
+
+#define mpn_submul_1c __MPN(submul_1c)
+DLL_IMPORT mp_limb_t mpn_submul_1c _PROTO ((mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t));
+
+#define mpn_tdiv_qr __MPN(tdiv_qr)
+DLL_IMPORT void mpn_tdiv_qr _PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t));
+
+#if defined (__cplusplus)
+}
+#endif
+
+/* mpn_incr_u(p,incr): add the single limb value "incr" to the limb array at
+   p in place, propagating the carry into higher limbs.  The carry loop is
+   unbounded, so the caller must guarantee the result fits (no carry off the
+   most significant limb), or it walks past the array. */
+#define mpn_incr_u(p,incr) \
+ do { mp_limb_t __x; mp_ptr __p = p; \
+ __x = *__p + incr; \
+ *__p = __x; \
+ if (__x < incr) \
+ while (++(*(++__p)) == 0) \
+ ; \
+ } while (0)
+
+/* mpn_decr_u(p,incr): subtract the single limb value "incr" from the limb
+   array at p in place, propagating the borrow into higher limbs.  As above,
+   the borrow loop is unbounded: the caller must guarantee {p,...} >= incr. */
+#define mpn_decr_u(p,incr) \
+ do { mp_limb_t __x; mp_ptr __p = p; \
+ __x = *__p; \
+ *__p = __x - incr; \
+ if (__x < incr) \
+ while ((*(++__p))-- == 0) \
+ ; \
+ } while (0)
+
+#if defined (__GNUC__) || defined (_FORCE_INLINES)
+_EXTERN_INLINE mp_limb_t
+#if (__STDC__-0) || defined (__cplusplus)
+mpn_add_1 (register mp_ptr res_ptr,
+ register mp_srcptr s1_ptr,
+ register mp_size_t s1_size,
+ register mp_limb_t s2_limb)
+#else
+mpn_add_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ register mp_size_t s1_size;
+ register mp_limb_t s2_limb;
+#endif
+{ /* {res_ptr,s1_size} = {s1_ptr,s1_size} + s2_limb; returns carry (0 or 1).
+     assumes s1_size >= 1 (low limb is read unconditionally) -- TODO confirm */
+ register mp_limb_t x;
+
+ x = *s1_ptr++;
+ s2_limb = x + s2_limb;
+ *res_ptr++ = s2_limb;
+ if (s2_limb < x) /* unsigned wrap-around means the low add carried */
+ {
+ while (--s1_size != 0)
+ {
+ x = *s1_ptr++ + 1; /* propagate the carry upward */
+ *res_ptr++ = x;
+ if (x != 0)
+ goto fin; /* carry absorbed; just copy the rest */
+ }
+
+ return 1; /* carry propagated out of the most significant limb */
+ }
+
+ fin:
+ if (res_ptr != s1_ptr) /* not operating in place: copy untouched limbs */
+ {
+ mp_size_t i;
+ for (i = 0; i < s1_size - 1; i++)
+ res_ptr[i] = s1_ptr[i];
+ }
+ return 0;
+}
+
+_EXTERN_INLINE mp_limb_t
+#if (__STDC__-0) || defined (__cplusplus)
+mpn_add (register mp_ptr res_ptr,
+ register mp_srcptr s1_ptr,
+ register mp_size_t s1_size,
+ register mp_srcptr s2_ptr,
+ register mp_size_t s2_size)
+#else
+mpn_add (res_ptr, s1_ptr, s1_size, s2_ptr, s2_size)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ register mp_size_t s1_size;
+ register mp_srcptr s2_ptr;
+ register mp_size_t s2_size;
+#endif
+{ /* {res_ptr,s1_size} = {s1_ptr,s1_size} + {s2_ptr,s2_size}; returns carry.
+     The subtraction s1_size - s2_size below is only meaningful when
+     s1_size >= s2_size (standard mpn convention) -- TODO confirm callers */
+ mp_limb_t cy_limb = 0;
+
+ if (s2_size != 0) /* add the overlapping low limbs */
+ cy_limb = mpn_add_n (res_ptr, s1_ptr, s2_ptr, s2_size);
+
+ if (s1_size - s2_size != 0) /* propagate carry through s1's high limbs */
+ cy_limb = mpn_add_1 (res_ptr + s2_size,
+ s1_ptr + s2_size,
+ s1_size - s2_size,
+ cy_limb);
+ return cy_limb;
+}
+
+_EXTERN_INLINE mp_limb_t
+#if (__STDC__-0) || defined (__cplusplus)
+mpn_sub_1 (register mp_ptr res_ptr,
+ register mp_srcptr s1_ptr,
+ register mp_size_t s1_size,
+ register mp_limb_t s2_limb)
+#else
+mpn_sub_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ register mp_size_t s1_size;
+ register mp_limb_t s2_limb;
+#endif
+{ /* {res_ptr,s1_size} = {s1_ptr,s1_size} - s2_limb; returns borrow (0 or 1).
+     assumes s1_size >= 1 (low limb is read unconditionally) -- TODO confirm */
+ register mp_limb_t x;
+
+ x = *s1_ptr++;
+ s2_limb = x - s2_limb;
+ *res_ptr++ = s2_limb;
+ if (s2_limb > x) /* unsigned wrap-around means the low subtract borrowed */
+ {
+ while (--s1_size != 0)
+ {
+ x = *s1_ptr++;
+ *res_ptr++ = x - 1; /* propagate the borrow upward */
+ if (x != 0)
+ goto fin; /* borrow absorbed; just copy the rest */
+ }
+
+ return 1; /* borrow propagated out of the most significant limb */
+ }
+
+ fin:
+ if (res_ptr != s1_ptr) /* not operating in place: copy untouched limbs */
+ {
+ mp_size_t i;
+ for (i = 0; i < s1_size - 1; i++)
+ res_ptr[i] = s1_ptr[i];
+ }
+ return 0;
+}
+
+_EXTERN_INLINE mp_limb_t
+#if (__STDC__-0) || defined (__cplusplus)
+mpn_sub (register mp_ptr res_ptr,
+ register mp_srcptr s1_ptr,
+ register mp_size_t s1_size,
+ register mp_srcptr s2_ptr,
+ register mp_size_t s2_size)
+#else
+mpn_sub (res_ptr, s1_ptr, s1_size, s2_ptr, s2_size)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ register mp_size_t s1_size;
+ register mp_srcptr s2_ptr;
+ register mp_size_t s2_size;
+#endif
+{ /* {res_ptr,s1_size} = {s1_ptr,s1_size} - {s2_ptr,s2_size}; returns borrow.
+     The subtraction s1_size - s2_size below is only meaningful when
+     s1_size >= s2_size (standard mpn convention) -- TODO confirm callers */
+ mp_limb_t cy_limb = 0;
+
+ if (s2_size != 0) /* subtract the overlapping low limbs */
+ cy_limb = mpn_sub_n (res_ptr, s1_ptr, s2_ptr, s2_size);
+
+ if (s1_size - s2_size != 0) /* propagate borrow through s1's high limbs */
+ cy_limb = mpn_sub_1 (res_ptr + s2_size,
+ s1_ptr + s2_size,
+ s1_size - s2_size,
+ cy_limb);
+ return cy_limb;
+}
+#endif /* __GNUC__ */
+
+/* Allow faster testing for negative, zero, and positive. */
+#define mpz_sgn(Z) ((Z)->_mp_size < 0 ? -1 : (Z)->_mp_size > 0)
+#define mpf_sgn(F) ((F)->_mp_size < 0 ? -1 : (F)->_mp_size > 0)
+#define mpq_sgn(Q) ((Q)->_mp_num._mp_size < 0 ? -1 : (Q)->_mp_num._mp_size > 0)
+
+/* When using GCC, optimize certain common comparisons. */
+#if defined (__GNUC__)
+#define mpz_cmp_ui(Z,UI) \
+ (__builtin_constant_p (UI) && (UI) == 0 \
+ ? mpz_sgn (Z) : _mpz_cmp_ui (Z,UI))
+#define mpz_cmp_si(Z,SI) \
+ (__builtin_constant_p (SI) && (SI) == 0 ? mpz_sgn (Z) \
+ : __builtin_constant_p (SI) && (SI) > 0 \
+ ? _mpz_cmp_ui (Z, (unsigned long int) SI) \
+ : _mpz_cmp_si (Z,SI))
+#define mpq_cmp_ui(Q,NUI,DUI) \
+ (__builtin_constant_p (NUI) && (NUI) == 0 \
+ ? mpq_sgn (Q) : _mpq_cmp_ui (Q,NUI,DUI))
+#else
+#define mpz_cmp_ui(Z,UI) _mpz_cmp_ui (Z,UI)
+#define mpz_cmp_si(Z,UI) _mpz_cmp_si (Z,UI)
+#define mpq_cmp_ui(Q,NUI,DUI) _mpq_cmp_ui (Q,NUI,DUI)
+#endif
+
+
+/* Using "&" rather than "&&" means these can come out branch-free. Every
+ mpz_t has at least one limb allocated, so fetching the low limb is always
+ allowed. */
+#define mpz_odd_p(z) ((int) ((z)->_mp_size != 0) & (int) (z)->_mp_d[0])
+#define mpz_even_p(z) (! mpz_odd_p (z))
+
+
+/* Allow direct user access to numerator and denominator of a mpq_t object. */
+#define mpq_numref(Q) (&((Q)->_mp_num))
+#define mpq_denref(Q) (&((Q)->_mp_den))
+
+
+/* Compatibility with GMP 2 and earlier. */
+#define mpn_divmod(qp,np,nsize,dp,dsize) mpn_divrem (qp,0,np,nsize,dp,dsize)
+
+/* Compatibility with GMP 1. */
+#define mpz_mdiv mpz_fdiv_q
+#define mpz_mdivmod mpz_fdiv_qr
+#define mpz_mmod mpz_fdiv_r
+#define mpz_mdiv_ui mpz_fdiv_q_ui
+#define mpz_mdivmod_ui(q,r,n,d) \
+ ((r == 0) ? mpz_fdiv_q_ui (q,n,d) : mpz_fdiv_qr_ui (q,r,n,d))
+#define mpz_mmod_ui(r,n,d) \
+ ((r == 0) ? mpz_fdiv_ui (n,d) : mpz_fdiv_r_ui (r,n,d))
+
+/* Useful synonyms, but not quite compatible with GMP 1. */
+#define mpz_div mpz_fdiv_q
+#define mpz_divmod mpz_fdiv_qr
+#define mpz_div_ui mpz_fdiv_q_ui
+#define mpz_divmod_ui mpz_fdiv_qr_ui
+#define mpz_mod_ui mpz_fdiv_r_ui
+#define mpz_div_2exp mpz_fdiv_q_2exp
+#define mpz_mod_2exp mpz_fdiv_r_2exp
+
+#define gmp_errno __gmp_errno
+extern int gmp_errno;
+
+enum
+{
+ GMP_ERROR_NONE = 0,
+ GMP_ERROR_UNSUPPORTED_ARGUMENT = 1,
+ GMP_ERROR_DIVISION_BY_ZERO = 2,
+ GMP_ERROR_SQRT_OF_NEGATIVE = 4,
+ GMP_ERROR_INVALID_ARGUMENT = 8,
+ GMP_ERROR_ALLOCATE = 16,
+ GMP_ERROR_BAD_STRING = 32,
+ GMP_ERROR_UNUSED_ERROR
+};
+
+/* Note: major version number is in mp.h too */
+#define __GNU_MP_VERSION 3
+#define __GNU_MP_VERSION_MINOR 1
+#define __GNU_MP_VERSION_PATCHLEVEL 1
+
+#define gmp_version __gmp_version
+extern __gmp_const char *gmp_version;
+
+#define __GMP_H__
+#endif /* __GMP_H__ */
diff --git a/rts/gmp/insert-dbl.c b/rts/gmp/insert-dbl.c
new file mode 100644
index 0000000000..dc88a56f62
--- /dev/null
+++ b/rts/gmp/insert-dbl.c
@@ -0,0 +1,98 @@
+/* __gmp_insert_double -- convert from array of mp_limb_t to double.
+
+Copyright (C) 1996, 1997, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#ifdef XDEBUG
+#undef _GMP_IEEE_FLOATS
+#endif
+
+#ifndef _GMP_IEEE_FLOATS
+#define _GMP_IEEE_FLOATS 0
+#endif
+
+double
+#if __STDC__
+__gmp_scale2 (double d, int exp)
+#else
+__gmp_scale2 (d, exp)
+ double d;
+ int exp;
+#endif
+{ /* Return d * 2^exp.  NOTE(review): the file header above names
+     __gmp_insert_double, but this file defines __gmp_scale2 -- verify
+     against the upstream GMP 3.1.1 sources. */
+#if _GMP_IEEE_FLOATS
+ {
+#if defined (__alpha) && __GNUC__ == 2 && __GNUC_MINOR__ == 8
+ /* Work around alpha-specific bug in GCC 2.8.x. */
+ volatile
+#endif
+ union ieee_double_extract x; /* layout comes from gmp-impl.h */
+ x.d = d;
+ exp += x.s.exp; /* add exp directly to the biased IEEE exponent field */
+ x.s.exp = exp;
+ if (exp >= 2047) /* exponent overflow: clamp to +-infinity */
+ {
+ /* Return +-infinity */
+ x.s.exp = 2047;
+ x.s.manl = x.s.manh = 0;
+ }
+ else if (exp < 1) /* exponent underflow: build a denormal */
+ {
+ x.s.exp = 1; /* smallest exponent (biased) */
+ /* Divide result by 2 until we have scaled it to the right IEEE
+ denormalized number, but stop if it becomes zero. */
+ while (exp < 1 && x.d != 0)
+ {
+ x.d *= 0.5;
+ exp++;
+ }
+ }
+ return x.d;
+ }
+#else
+ { /* Generic path: compute r = d * factor^|exp| by square-and-multiply,
+      with factor = 2 for positive exp and 1/2 for negative exp. */
+ double factor, r;
+
+ factor = 2.0;
+ if (exp < 0)
+ {
+ factor = 0.5;
+ exp = -exp;
+ }
+ r = d;
+ if (exp != 0)
+ {
+ if ((exp & 1) != 0)
+ r *= factor;
+ exp >>= 1;
+ while (exp != 0)
+ {
+ factor *= factor; /* factor = 2^(2^k) at iteration k */
+ if ((exp & 1) != 0)
+ r *= factor;
+ exp >>= 1;
+ }
+ }
+ return r;
+ }
+#endif
+}
diff --git a/rts/gmp/install-sh b/rts/gmp/install-sh
new file mode 100644
index 0000000000..e9de23842d
--- /dev/null
+++ b/rts/gmp/install-sh
@@ -0,0 +1,251 @@
+#!/bin/sh
+#
+# install - install a program, script, or datafile
+# This comes from X11R5 (mit/util/scripts/install.sh).
+#
+# Copyright 1991 by the Massachusetts Institute of Technology
+#
+# Permission to use, copy, modify, distribute, and sell this software and its
+# documentation for any purpose is hereby granted without fee, provided that
+# the above copyright notice appear in all copies and that both that
+# copyright notice and this permission notice appear in supporting
+# documentation, and that the name of M.I.T. not be used in advertising or
+# publicity pertaining to distribution of the software without specific,
+# written prior permission. M.I.T. makes no representations about the
+# suitability of this software for any purpose. It is provided "as is"
+# without express or implied warranty.
+#
+# Calling this script install-sh is preferred over install.sh, to prevent
+# `make' implicit rules from creating a file called install from it
+# when there is no Makefile.
+#
+# This script is compatible with the BSD install script, but was written
+# from scratch. It can only install one file at a time, a restriction
+# shared with many OS's install programs.
+
+
+# set DOITPROG to echo to test this script
+
+# Don't use :- since 4.3BSD and earlier shells don't like it.
+doit="${DOITPROG-}"
+
+
+# put in absolute paths if you don't have them in your path; or use env. vars.
+
+mvprog="${MVPROG-mv}"
+cpprog="${CPPROG-cp}"
+chmodprog="${CHMODPROG-chmod}"
+chownprog="${CHOWNPROG-chown}"
+chgrpprog="${CHGRPPROG-chgrp}"
+stripprog="${STRIPPROG-strip}"
+rmprog="${RMPROG-rm}"
+mkdirprog="${MKDIRPROG-mkdir}"
+
+transformbasename=""
+# Bug fix: the -t=* option handler below assigns $transformarg, and the
+# rename logic later tests [ x"$transformarg" = x ]; the original line here
+# initialized the unused name "transform_arg", so $transformarg was never
+# cleared and could inherit a stray value from the environment.
+transformarg=""
+instcmd="$mvprog"
+chmodcmd="$chmodprog 0755"
+chowncmd=""
+chgrpcmd=""
+stripcmd=""
+rmcmd="$rmprog -f"
+mvcmd="$mvprog"
+src=""
+dst=""
+dir_arg=""
+
+while [ x"$1" != x ]; do
+ case $1 in
+ -c) instcmd="$cpprog"
+ shift
+ continue;;
+
+ -d) dir_arg=true
+ shift
+ continue;;
+
+ -m) chmodcmd="$chmodprog $2"
+ shift
+ shift
+ continue;;
+
+ -o) chowncmd="$chownprog $2"
+ shift
+ shift
+ continue;;
+
+ -g) chgrpcmd="$chgrpprog $2"
+ shift
+ shift
+ continue;;
+
+ -s) stripcmd="$stripprog"
+ shift
+ continue;;
+
+ -t=*) transformarg=`echo $1 | sed 's/-t=//'`
+ shift
+ continue;;
+
+ -b=*) transformbasename=`echo $1 | sed 's/-b=//'`
+ shift
+ continue;;
+
+ *) if [ x"$src" = x ]
+ then
+ src=$1
+ else
+ # this colon is to work around a 386BSD /bin/sh bug
+ :
+ dst=$1
+ fi
+ shift
+ continue;;
+ esac
+done
+
+if [ x"$src" = x ]
+then
+ echo "install: no input file specified"
+ exit 1
+else
+ true
+fi
+
+if [ x"$dir_arg" != x ]; then
+ dst=$src
+ src=""
+
+ if [ -d $dst ]; then
+ instcmd=:
+ chmodcmd=""
+ else
+ instcmd=mkdir
+ fi
+else
+
+# Waiting for this to be detected by the "$instcmd $src $dsttmp" command
+# might cause directories to be created, which would be especially bad
+# if $src (and thus $dsttmp) contains '*'.
+
+ if [ -f $src -o -d $src ]
+ then
+ true
+ else
+ echo "install: $src does not exist"
+ exit 1
+ fi
+
+ if [ x"$dst" = x ]
+ then
+ echo "install: no destination specified"
+ exit 1
+ else
+ true
+ fi
+
+# If destination is a directory, append the input filename; if your system
+# does not like double slashes in filenames, you may need to add some logic
+
+ if [ -d $dst ]
+ then
+ dst="$dst"/`basename $src`
+ else
+ true
+ fi
+fi
+
+## this sed command emulates the dirname command
+dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'`
+
+# Make sure that the destination directory exists.
+# this part is taken from Noah Friedman's mkinstalldirs script
+
+# Skip lots of stat calls in the usual case.
+if [ ! -d "$dstdir" ]; then
+defaultIFS='
+'
+IFS="${IFS-${defaultIFS}}"
+
+oIFS="${IFS}"
+# Some sh's can't handle IFS=/ for some reason.
+IFS='%'
+set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'`
+IFS="${oIFS}"
+
+pathcomp=''
+
+while [ $# -ne 0 ] ; do
+ pathcomp="${pathcomp}${1}"
+ shift
+
+ if [ ! -d "${pathcomp}" ] ;
+ then
+ $mkdirprog "${pathcomp}"
+ else
+ true
+ fi
+
+ pathcomp="${pathcomp}/"
+done
+fi
+
+if [ x"$dir_arg" != x ]
+then
+ $doit $instcmd $dst &&
+
+ if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi &&
+ if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi &&
+ if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi &&
+ if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi
+else
+
+# If we're going to rename the final executable, determine the name now.
+
+ if [ x"$transformarg" = x ]
+ then
+ dstfile=`basename $dst`
+ else
+ dstfile=`basename $dst $transformbasename |
+ sed $transformarg`$transformbasename
+ fi
+
+# don't allow the sed command to completely eliminate the filename
+
+ if [ x"$dstfile" = x ]
+ then
+ dstfile=`basename $dst`
+ else
+ true
+ fi
+
+# Make a temp file name in the proper directory.
+
+ dsttmp=$dstdir/#inst.$$#
+
+# Move or copy the file name to the temp name
+
+ $doit $instcmd $src $dsttmp &&
+
+ trap "rm -f ${dsttmp}" 0 &&
+
+# and set any options; do chmod last to preserve setuid bits
+
+# If any of these fail, we abort the whole thing. If we want to
+# ignore errors from any of these, just make sure not to ignore
+# errors from the above "$doit $instcmd $src $dsttmp" command.
+
+ if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi &&
+ if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi &&
+ if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi &&
+ if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi &&
+
+# Now rename the file to the real destination.
+
+ $doit $rmcmd -f $dstdir/$dstfile &&
+ $doit $mvcmd $dsttmp $dstdir/$dstfile
+
+fi &&
+
+
+exit 0
diff --git a/rts/gmp/longlong.h b/rts/gmp/longlong.h
new file mode 100644
index 0000000000..9a12755053
--- /dev/null
+++ b/rts/gmp/longlong.h
@@ -0,0 +1,1347 @@
+/* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this file; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* You have to define the following before including this file:
+
+ UWtype -- An unsigned type, default type for operations (typically a "word")
+ UHWtype -- An unsigned type, at least half the size of UWtype.
+ UDWtype -- An unsigned type, at least twice as large as UWtype
+ W_TYPE_SIZE -- size in bits of UWtype
+
+ SItype, USItype -- Signed and unsigned 32 bit types.
+ DItype, UDItype -- Signed and unsigned 64 bit types.
+
+ On a 32 bit machine UWtype should typically be USItype;
+ on a 64 bit machine, UWtype should typically be UDItype.
+*/
+
+#define __BITS4 (W_TYPE_SIZE / 4)
+#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
+#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
+#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
+
+/* This is used to make sure no undesirable sharing between different libraries
+ that use this file takes place. */
+#ifndef __MPN
+#define __MPN(x) __##x
+#endif
+
+#ifndef _PROTO
+#if (__STDC__-0) || defined (__cplusplus)
+#define _PROTO(x) x
+#else
+#define _PROTO(x) ()
+#endif
+#endif
+
+/* Define auxiliary asm macros.
+
+ 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
+ UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
+ word product in HIGH_PROD and LOW_PROD.
+
+ 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
+ UDWtype product. This is just a variant of umul_ppmm.
+
+ 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
+ denominator) divides a UDWtype, composed by the UWtype integers
+ HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
+ in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less
+ than DENOMINATOR for correct operation. If the macro additionally
+ requires the most significant bit of DENOMINATOR to be 1, then the
+ pre-processor symbol UDIV_NEEDS_NORMALIZATION is defined to 1.
+
+ 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
+ denominator). Like udiv_qrnnd but the numbers are signed. The quotient
+ is rounded towards 0.
+
+ 5) count_leading_zeros(count, x) counts the number of zero-bits from the
+ msb to the first non-zero bit in the UWtype X. This is the number of
+ steps X needs to be shifted left to set the msb. Undefined for X == 0,
+ unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
+
+ 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
+ from the least significant end.
+
+ 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
+ high_addend_2, low_addend_2) adds two two-word UWtype integers, composed by
+ HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
+ respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow
+ (i.e. carry out) is not stored anywhere, and is lost.
+
+ 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
+ high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
+ composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
+ LOW_SUBTRAHEND respectively. The result is placed in HIGH_DIFFERENCE
+ and LOW_DIFFERENCE. Overflow (i.e. carry out) is not stored anywhere,
+ and is lost.
+
+ If any of these macros are left undefined for a particular CPU,
+ C macros are used. */
+
+/* The CPUs come in alphabetical order below.
+
+ Please add support for more CPUs here, or improve the current support
+ for the CPUs below! */
+
+#if defined (__alpha) && W_TYPE_SIZE == 64
+#if defined (__GNUC__)
+#define umul_ppmm(ph, pl, m0, m1) \
+ do { \
+ UDItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("umulh %r1,%2,%0" \
+ : "=r" (ph) \
+ : "%rJ" (m0), "rI" (m1)); \
+ (pl) = __m0 * __m1; \
+ } while (0)
+#define UMUL_TIME 18
+#ifndef LONGLONG_STANDALONE
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ do { UDItype __di; \
+ __di = __MPN(invert_limb) (d); \
+ udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
+ } while (0)
+#define UDIV_NEEDS_NORMALIZATION 1
+#define UDIV_TIME 220
+long __MPN(count_leading_zeros) ();
+#define count_leading_zeros(count, x) \
+ ((count) = __MPN(count_leading_zeros) (x))
+#endif /* LONGLONG_STANDALONE */
+#else /* ! __GNUC__ */
+#include <machine/builtins.h>
+#define umul_ppmm(ph, pl, m0, m1) \
+ do { \
+ UDItype __m0 = (m0), __m1 = (m1); \
+ (ph) = __UMULH (m0, m1); \
+ (pl) = __m0 * __m1; \
+ } while (0)
+#endif
+#endif /* __alpha */
+
+#if defined (__hppa) && W_TYPE_SIZE == 64
+/* We put the result pointer parameter last here, since it makes passing
+ of the other parameters more efficient. */
+#ifndef LONGLONG_STANDALONE
+#define umul_ppmm(wh, wl, u, v) \
+ do { \
+ UDItype __p0; \
+ (wh) = __MPN(umul_ppmm) (u, v, &__p0); \
+ (wl) = __p0; \
+ } while (0)
+extern UDItype __MPN(umul_ppmm) _PROTO ((UDItype, UDItype, UDItype *));
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ do { UDItype __r; \
+ (q) = __MPN(udiv_qrnnd) (n1, n0, d, &__r); \
+ (r) = __r; \
+ } while (0)
+extern UDItype __MPN(udiv_qrnnd) _PROTO ((UDItype, UDItype, UDItype, UDItype *));
+#define UMUL_TIME 8
+#define UDIV_TIME 60
+#endif /* LONGLONG_STANDALONE */
+#endif /* hppa */
+
+#if defined (__ia64) && W_TYPE_SIZE == 64
+#if defined (__GNUC__)
+#define umul_ppmm(ph, pl, m0, m1) \
+ do { \
+ UDItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("xma.hu %0 = %1, %2, f0" \
+ : "=e" (ph) \
+ : "e" (m0), "e" (m1)); \
+ (pl) = __m0 * __m1; \
+ } while (0)
+#endif
+#endif
+
+
+#if defined (__GNUC__) && !defined (NO_ASM)
+
+/* We sometimes need to clobber "cc" with gcc2, but that would not be
+ understood by gcc1. Use cpp to avoid major code duplication. */
+#if __GNUC__ < 2
+#define __CLOBBER_CC
+#define __AND_CLOBBER_CC
+#else /* __GNUC__ >= 2 */
+#define __CLOBBER_CC : "cc"
+#define __AND_CLOBBER_CC , "cc"
+#endif /* __GNUC__ < 2 */
+
+#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "%r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
+#define umul_ppmm(xh, xl, m0, m1) \
+ do { \
+ USItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("multiplu %0,%1,%2" \
+ : "=r" (xl) \
+ : "r" (__m0), "r" (__m1)); \
+ __asm__ ("multmu %0,%1,%2" \
+ : "=r" (xh) \
+ : "r" (__m0), "r" (__m1)); \
+ } while (0)
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ __asm__ ("dividu %0,%3,%4" \
+ : "=r" (q), "=q" (r) \
+ : "1" (n1), "r" (n0), "r" (d))
+#define count_leading_zeros(count, x) \
+ __asm__ ("clz %0,%1" \
+ : "=r" (count) \
+ : "r" (x))
+#define COUNT_LEADING_ZEROS_0 32
+#endif /* __a29k__ */
+
+#if defined (__arm__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "%r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
+#if 1 || defined (__arm_m__) /* `M' series has widening multiply support */
+#define umul_ppmm(xh, xl, a, b) \
+ __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
+#define smul_ppmm(xh, xl, a, b) \
+ __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
+#define UMUL_TIME 5
+#else
+#define umul_ppmm(xh, xl, a, b) \
+ __asm__ ("%@ Inlined umul_ppmm\n" \
+ "mov %|r0, %2, lsr #16\n" \
+ "mov %|r2, %3, lsr #16\n" \
+ "bic %|r1, %2, %|r0, lsl #16\n" \
+ "bic %|r2, %3, %|r2, lsl #16\n" \
+ "mul %1, %|r1, %|r2\n" \
+ "mul %|r2, %|r0, %|r2\n" \
+ "mul %|r1, %0, %|r1\n" \
+ "mul %0, %|r0, %0\n" \
+ "adds %|r1, %|r2, %|r1\n" \
+ "addcs %0, %0, #65536\n" \
+ "adds %1, %1, %|r1, lsl #16\n" \
+ "adc %0, %0, %|r1, lsr #16" \
+ : "=&r" (xh), "=r" (xl) \
+ : "r" (a), "r" (b) \
+ : "r0", "r1", "r2")
+#define UMUL_TIME 20
+#endif
+#define UDIV_TIME 100
+#endif /* __arm__ */
+
+#if defined (__clipper__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+ ({union {UDItype __ll; \
+ struct {USItype __l, __h;} __i; \
+ } __x; \
+ __asm__ ("mulwux %2,%0" \
+ : "=r" (__x.__ll) \
+ : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
+ (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
+#define smul_ppmm(w1, w0, u, v) \
+ ({union {DItype __ll; \
+ struct {SItype __l, __h;} __i; \
+ } __x; \
+ __asm__ ("mulwx %2,%0" \
+ : "=r" (__x.__ll) \
+ : "%0" ((SItype)(u)), "r" ((SItype)(v))); \
+ (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
+#define __umulsidi3(u, v) \
+ ({UDItype __w; \
+ __asm__ ("mulwux %2,%0" \
+ : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
+ __w; })
+#endif /* __clipper__ */
+
+/* Fujitsu vector computers. */
+#if defined (__uxp__) && W_TYPE_SIZE == 32
+#define umul_ppmm(ph, pl, u, v) \
+ do { \
+ union {UDItype __ll; \
+ struct {USItype __h, __l;} __i; \
+ } __x; \
+ __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
+ (ph) = __x.__i.__h; \
+ (pl) = __x.__i.__l; \
+ } while (0)
+#define smul_ppmm(ph, pl, u, v) \
+ do { \
+ union {UDItype __ll; \
+ struct {USItype __h, __l;} __i; \
+ } __x; \
+ __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
+ (ph) = __x.__i.__h; \
+ (pl) = __x.__i.__l; \
+ } while (0)
+#endif
+
+#if defined (__gmicro__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("add.w %5,%1\n\taddx %3,%0" \
+ : "=g" ((USItype)(sh)), "=&g" ((USItype)(sl)) \
+ : "%0" ((USItype)(ah)), "g" ((USItype)(bh)), \
+ "%1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \
+ : "=g" ((USItype)(sh)), "=&g" ((USItype)(sl)) \
+ : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
+ "1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define umul_ppmm(ph, pl, m0, m1) \
+ __asm__ ("mulx %3,%0,%1" \
+ : "=g" ((USItype)(ph)), "=r" ((USItype)(pl)) \
+ : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
+#define udiv_qrnnd(q, r, nh, nl, d) \
+ __asm__ ("divx %4,%0,%1" \
+ : "=g" ((USItype)(q)), "=r" ((USItype)(r)) \
+ : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
+#define count_leading_zeros(count, x) \
+ __asm__ ("bsch/1 %1,%0" \
+ : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
+#endif
+
+#if defined (__hppa) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("add %4,%5,%1\n\taddc %2,%3,%0" \
+ : "=r" (sh), "=&r" (sl) \
+ : "%rM" (ah), "rM" (bh), "%rM" (al), "rM" (bl))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("sub %4,%5,%1\n\tsubb %2,%3,%0" \
+ : "=r" (sh), "=&r" (sl) \
+ : "rM" (ah), "rM" (bh), "rM" (al), "rM" (bl))
+#if defined (_PA_RISC1_1)
+#define umul_ppmm(wh, wl, u, v) \
+ do { \
+ union {UDItype __ll; \
+ struct {USItype __h, __l;} __i; \
+ } __x; \
+ __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
+ (wh) = __x.__i.__h; \
+ (wl) = __x.__i.__l; \
+ } while (0)
+#define UMUL_TIME 8
+#define UDIV_TIME 60
+#else
+#define UMUL_TIME 40
+#define UDIV_TIME 80
+#endif
+#ifndef LONGLONG_STANDALONE
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ do { USItype __r; \
+ (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
+ (r) = __r; \
+ } while (0)
+extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, USItype, USItype, USItype));
+#endif /* LONGLONG_STANDALONE */
+#define count_leading_zeros(count, x) \
+ do { \
+ USItype __tmp; \
+ __asm__ ( \
+ "ldi 2,%0\n" \
+ "extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \
+ "extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \
+ "ldo 16(%0),%0 ; Yes. Perform add.\n" \
+ "extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \
+ "extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \
+ "ldo 8(%0),%0 ; Yes. Perform add.\n" \
+ "extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \
+ "extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \
+ "ldo 4(%0),%0 ; Yes. Perform add.\n" \
+ "extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \
+ "extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \
+ "ldo 2(%0),%0 ; Yes. Perform add.\n" \
+ "extru %1,30,1,%1 ; Extract bit 1.\n" \
+ "sub %0,%1,%0 ; Subtract it.\n" \
+ : "=r" (count), "=r" (__tmp) : "1" (x)); \
+ } while (0)
+#endif /* hppa */
+
+#if (defined (__i370__) || defined (__mvs__)) && W_TYPE_SIZE == 32
+#define smul_ppmm(xh, xl, m0, m1) \
+ do { \
+ union {DItype __ll; \
+ struct {USItype __h, __l;} __i; \
+ } __x; \
+ __asm__ ("mr %0,%3" \
+ : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \
+ : "%1" (m0), "r" (m1)); \
+ (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
+ } while (0)
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+ do { \
+ union {DItype __ll; \
+ struct {USItype __h, __l;} __i; \
+ } __x; \
+ __x.__i.__h = n1; __x.__i.__l = n0; \
+ __asm__ ("dr %0,%2" \
+ : "=r" (__x.__ll) \
+ : "0" (__x.__ll), "r" (d)); \
+ (q) = __x.__i.__l; (r) = __x.__i.__h; \
+ } while (0)
+#endif
+
+#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("addl %5,%1\n\tadcl %3,%0" \
+ : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \
+ : "%0" ((USItype)(ah)), "g" ((USItype)(bh)), \
+ "%1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subl %5,%1\n\tsbbl %3,%0" \
+ : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \
+ : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
+ "1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("mull %3" \
+ : "=a" (w0), "=d" (w1) \
+ : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ __asm__ ("divl %4" \
+ : "=a" (q), "=d" (r) \
+ : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(d)))
+#define count_leading_zeros(count, x) \
+ do { \
+ USItype __cbtmp; \
+ __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
+ (count) = __cbtmp ^ 31; \
+ } while (0)
+#define count_trailing_zeros(count, x) \
+ __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x)))
+#ifndef UMUL_TIME
+#define UMUL_TIME 10
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME 40
+#endif
+#endif /* 80x86 */
+
+#if defined (__i860__) && W_TYPE_SIZE == 32
+#define rshift_rhlc(r,h,l,c) \
+ __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \
+ "=r" (r) : "r" (h), "r" (l), "rn" (c))
+#endif /* i860 */
+
+#if defined (__i960__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \
+ : "=r" (sh), "=&r" (sl) \
+ : "%dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \
+ : "=r" (sh), "=&r" (sl) \
+ : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
+#define umul_ppmm(w1, w0, u, v) \
+ ({union {UDItype __ll; \
+ struct {USItype __l, __h;} __i; \
+ } __x; \
+ __asm__ ("emul %2,%1,%0" \
+ : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \
+ (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
+#define __umulsidi3(u, v) \
+ ({UDItype __w; \
+ __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \
+ __w; })
+#define udiv_qrnnd(q, r, nh, nl, d) \
+ do { \
+ union {UDItype __ll; \
+ struct {USItype __l, __h;} __i; \
+ } __nn; \
+ __nn.__i.__h = (nh); __nn.__i.__l = (nl); \
+ __asm__ ("ediv %d,%n,%0" \
+ : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \
+ (r) = __rq.__i.__l; (q) = __rq.__i.__h; \
+ } while (0)
+#define count_leading_zeros(count, x) \
+ do { \
+ USItype __cbtmp; \
+ __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \
+ (count) = __cbtmp ^ 31; \
+ } while (0)
+#define COUNT_LEADING_ZEROS_0 (-32) /* sic */
+#if defined (__i960mx) /* what is the proper symbol to test??? */
+#define rshift_rhlc(r,h,l,c) \
+ do { \
+ union {UDItype __ll; \
+ struct {USItype __l, __h;} __i; \
+ } __nn; \
+ __nn.__i.__h = (h); __nn.__i.__l = (l); \
+ __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
+ }
+#endif /* i960mx */
+#endif /* i960 */
+
+#if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
+ || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
+ || defined (__mc5307__)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \
+ : "=d" ((USItype)(sh)), "=&d" ((USItype)(sl)) \
+ : "%0" ((USItype)(ah)), "d" ((USItype)(bh)), \
+ "%1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \
+ : "=d" ((USItype)(sh)), "=&d" ((USItype)(sl)) \
+ : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
+ "1" ((USItype)(al)), "g" ((USItype)(bl)))
+/* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. */
+#if defined (__mc68020__) || defined(mc68020) \
+ || defined (__mc68030__) || defined (mc68030) \
+ || defined (__mc68040__) || defined (mc68040) \
+ || defined (__mc68332__) || defined (mc68332) \
+ || defined (__NeXT__)
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("mulu%.l %3,%1:%0" \
+ : "=d" ((USItype)(w0)), "=d" ((USItype)(w1)) \
+ : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
+#define UMUL_TIME 45
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ __asm__ ("divu%.l %4,%1:%0" \
+ : "=d" ((USItype)(q)), "=d" ((USItype)(r)) \
+ : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
+#define UDIV_TIME 90
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+ __asm__ ("divs%.l %4,%1:%0" \
+ : "=d" ((USItype)(q)), "=d" ((USItype)(r)) \
+ : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
+#else /* for other 68k family members use 16x16->32 multiplication */
+#define umul_ppmm(xh, xl, a, b) \
+ do { USItype __umul_tmp1, __umul_tmp2; \
+ __asm__ ("| Inlined umul_ppmm\n" \
+ "move%.l %5,%3\n" \
+ "move%.l %2,%0\n" \
+ "move%.w %3,%1\n" \
+ "swap %3\n" \
+ "swap %0\n" \
+ "mulu%.w %2,%1\n" \
+ "mulu%.w %3,%0\n" \
+ "mulu%.w %2,%3\n" \
+ "swap %2\n" \
+ "mulu%.w %5,%2\n" \
+ "add%.l %3,%2\n" \
+ "jcc 1f\n" \
+ "add%.l %#0x10000,%0\n" \
+"1: move%.l %2,%3\n" \
+ "clr%.w %2\n" \
+ "swap %2\n" \
+ "swap %3\n" \
+ "clr%.w %3\n" \
+ "add%.l %3,%1\n" \
+ "addx%.l %2,%0\n" \
+ "| End inlined umul_ppmm" \
+ : "=&d" ((USItype)(xh)), "=&d" ((USItype)(xl)), \
+ "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \
+ : "%2" ((USItype)(a)), "d" ((USItype)(b))); \
+ } while (0)
+#define UMUL_TIME 100
+#define UDIV_TIME 400
+#endif /* not mc68020 */
+/* The '020, '030, '040 and '060 have bitfield insns. */
+#if defined (__mc68020__) || defined (mc68020) \
+ || defined (__mc68030__) || defined (mc68030) \
+ || defined (__mc68040__) || defined (mc68040) \
+ || defined (__mc68060__) || defined (mc68060) \
+ || defined (__NeXT__)
+#define count_leading_zeros(count, x) \
+ __asm__ ("bfffo %1{%b2:%b2},%0" \
+ : "=d" ((USItype) (count)) \
+ : "od" ((USItype) (x)), "n" (0))
+#define COUNT_LEADING_ZEROS_0 32
+#endif
+#endif /* mc68000 */
+
+#if defined (__m88000__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "%rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
+#define count_leading_zeros(count, x) \
+ do { \
+ USItype __cbtmp; \
+ __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \
+ (count) = __cbtmp ^ 31; \
+ } while (0)
+#define COUNT_LEADING_ZEROS_0 63 /* sic */
+#if defined (__m88110__)
+#define umul_ppmm(wh, wl, u, v) \
+ do { \
+ union {UDItype __ll; \
+ struct {USItype __h, __l;} __i; \
+ } __x; \
+ __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \
+ (wh) = __x.__i.__h; \
+ (wl) = __x.__i.__l; \
+ } while (0)
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ ({union {UDItype __ll; \
+ struct {USItype __h, __l;} __i; \
+ } __x, __q; \
+ __x.__i.__h = (n1); __x.__i.__l = (n0); \
+ __asm__ ("divu.d %0,%1,%2" \
+ : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \
+ (r) = (n0) - __q.__l * (d); (q) = __q.__l; })
+#define UMUL_TIME 5
+#define UDIV_TIME 25
+#else
+#define UMUL_TIME 17
+#define UDIV_TIME 150
+#endif /* __m88110__ */
+#endif /* __m88000__ */
+
+#if defined (__mips) && W_TYPE_SIZE == 32
+#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
+#else
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \
+ : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
+#endif
+#define UMUL_TIME 10
+#define UDIV_TIME 100
+#endif /* __mips */
+
+#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
+#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
+#else
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \
+ : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
+#endif
+#define UMUL_TIME 20
+#define UDIV_TIME 140
+#endif /* __mips */
+
+#if defined (__ns32000__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+ ({union {UDItype __ll; \
+ struct {USItype __l, __h;} __i; \
+ } __x; \
+ __asm__ ("meid %2,%0" \
+ : "=g" (__x.__ll) \
+ : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
+ (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
+#define __umulsidi3(u, v) \
+ ({UDItype __w; \
+ __asm__ ("meid %2,%0" \
+ : "=g" (__w) \
+ : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
+ __w; })
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ ({union {UDItype __ll; \
+ struct {USItype __l, __h;} __i; \
+ } __x; \
+ __x.__i.__h = (n1); __x.__i.__l = (n0); \
+ __asm__ ("deid %2,%0" \
+ : "=g" (__x.__ll) \
+ : "0" (__x.__ll), "g" ((USItype)(d))); \
+ (r) = __x.__i.__l; (q) = __x.__i.__h; })
+#define count_trailing_zeros(count,x) \
+ do { \
+ __asm__ ("ffsd %2,%0" \
+ : "=r" ((USItype) (count)) \
+ : "0" ((USItype) 0), "r" ((USItype) (x))); \
+ } while (0)
+#endif /* __ns32000__ */
+
+/* We should test _IBMR2 here when we add assembly support for the system
+ vendor compilers. */
+#if (defined (_ARCH_PPC) || defined (_ARCH_PWR) || defined (__powerpc__)) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ do { \
+ if (__builtin_constant_p (bh) && (bh) == 0) \
+ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "%r" (ah), "%r" (al), "rI" (bl));\
+ else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
+ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "%r" (ah), "%r" (al), "rI" (bl));\
+ else \
+ __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
+ } while (0)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ do { \
+ if (__builtin_constant_p (ah) && (ah) == 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
+ else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
+ else if (__builtin_constant_p (bh) && (bh) == 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
+ else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
+ else \
+ __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \
+ : "=r" (sh), "=&r" (sl) \
+ : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
+ } while (0)
+#define count_leading_zeros(count, x) \
+ __asm__ ("{cntlz|cntlzw} %0,%1" : "=r" (count) : "r" (x))
+#define COUNT_LEADING_ZEROS_0 32
+#if defined (_ARCH_PPC) || defined (__powerpc__)
+#define umul_ppmm(ph, pl, m0, m1) \
+ do { \
+ USItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
+ (pl) = __m0 * __m1; \
+ } while (0)
+#define UMUL_TIME 15
+#define smul_ppmm(ph, pl, m0, m1) \
+ do { \
+ SItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
+ (pl) = __m0 * __m1; \
+ } while (0)
+#define SMUL_TIME 14
+#define UDIV_TIME 120
+#else
+#define UMUL_TIME 8
+#define smul_ppmm(xh, xl, m0, m1) \
+ __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
+#define SMUL_TIME 4
+#define sdiv_qrnnd(q, r, nh, nl, d) \
+ __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
+#define UDIV_TIME 100
+#endif
+#endif /* 32-bit POWER architecture variants. */
+
+/* We should test _IBMR2 here when we add assembly support for the system
+ vendor compilers. */
+#if (defined (_ARCH_PPC) || defined (__powerpc__)) && W_TYPE_SIZE == 64
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ do { \
+ if (__builtin_constant_p (bh) && (bh) == 0) \
+ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "%r" (ah), "%r" (al), "rI" (bl));\
+ else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
+ __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "%r" (ah), "%r" (al), "rI" (bl));\
+ else \
+ __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \
+ : "=r" (sh), "=&r" (sl) \
+ : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
+ } while (0)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ do { \
+ if (__builtin_constant_p (ah) && (ah) == 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
+ else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
+ else if (__builtin_constant_p (bh) && (bh) == 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
+ else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
+ __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \
+ : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
+ else \
+ __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \
+ : "=r" (sh), "=&r" (sl) \
+ : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
+ } while (0)
+#define count_leading_zeros(count, x) \
+ __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
+#define COUNT_LEADING_ZEROS_0 64
+#define umul_ppmm(ph, pl, m0, m1) \
+ do { \
+ UDItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
+ (pl) = __m0 * __m1; \
+ } while (0)
+#define UMUL_TIME 15
+#define smul_ppmm(ph, pl, m0, m1) \
+ do { \
+ DItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
+ (pl) = __m0 * __m1; \
+ } while (0)
+#define SMUL_TIME 14 /* ??? */
+#define UDIV_TIME 120 /* ??? */
+#endif /* 64-bit PowerPC. */
+
+#if defined (__pyr__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("addw %5,%1\n\taddwc %3,%0" \
+ : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \
+ : "%0" ((USItype)(ah)), "g" ((USItype)(bh)), \
+ "%1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subw %5,%1\n\tsubwb %3,%0" \
+ : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \
+ : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
+ "1" ((USItype)(al)), "g" ((USItype)(bl)))
+/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */
+#define umul_ppmm(w1, w0, u, v) \
+ ({union {UDItype __ll; \
+ struct {USItype __h, __l;} __i; \
+ } __x; \
+ __asm__ ("movw %1,%R0\n\tuemul %2,%0" \
+ : "=&r" (__x.__ll) \
+ : "g" ((USItype) (u)), "g" ((USItype)(v))); \
+ (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
+#endif /* __pyr__ */
+
+#if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("a %1,%5\n\tae %0,%3" \
+ : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \
+ : "%0" ((USItype)(ah)), "r" ((USItype)(bh)), \
+ "%1" ((USItype)(al)), "r" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("s %1,%5\n\tse %0,%3" \
+ : "=r" ((USItype)(sh)), "=&r" ((USItype)(sl)) \
+ : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
+ "1" ((USItype)(al)), "r" ((USItype)(bl)))
+#define smul_ppmm(ph, pl, m0, m1) \
+ __asm__ ( \
+ "s r2,r2\n" \
+ "mts r10,%2\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "m r2,%3\n" \
+ "cas %0,r2,r0\n" \
+ "mfs r10,%1" \
+ : "=r" ((USItype)(ph)), "=r" ((USItype)(pl)) \
+ : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \
+ : "r2"); \
+#define UMUL_TIME 20
+#define UDIV_TIME 200
+#define count_leading_zeros(count, x) \
+ do { \
+ if ((x) >= 0x10000) \
+ __asm__ ("clz %0,%1" \
+ : "=r" ((USItype)(count)) : "r" ((USItype)(x) >> 16)); \
+ else \
+ { \
+ __asm__ ("clz %0,%1" \
+ : "=r" ((USItype)(count)) : "r" ((USItype)(x))); \
+ (count) += 16; \
+ } \
+ } while (0)
+#endif /* RT/ROMP */
+
+#if defined (__sh2__) && W_TYPE_SIZE == 32
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \
+ : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
+#define UMUL_TIME 5
+#endif
+
+#if defined (__sparc__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \
+ : "=r" (sh), "=&r" (sl) \
+ : "%rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \
+ __CLOBBER_CC)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \
+ : "=r" (sh), "=&r" (sl) \
+ : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
+ __CLOBBER_CC)
+#if defined (__sparc_v9__) || defined (__sparcv9)
+/* Perhaps we should use floating-point operations here? */
+#if 0
+/* Triggers a bug making mpz/tests/t-gcd.c fail.
+ Perhaps we simply need explicitly zero-extend the inputs? */
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \
+ "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
+#else
+/* Use v8 umul until above bug is fixed. */
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
+#endif
+/* Use a plain v8 divide for v9. */
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ do { \
+ USItype __q; \
+ __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
+ : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
+ (r) = (n0) - __q * (d); \
+ (q) = __q; \
+ } while (0)
+#else
+#if defined (__sparc_v8__)
+/* Don't match immediate range because, 1) it is not often useful,
+ 2) the 'I' flag thinks of the range as a 13 bit signed interval,
+ while we want to match a 13 bit interval, sign extended to 32 bits,
+ but INTERPRETED AS UNSIGNED. */
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
+#define UMUL_TIME 5
+#ifndef SUPERSPARC /* SuperSPARC's udiv only handles 53 bit dividends */
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ do { \
+ USItype __q; \
+ __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
+ : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
+ (r) = (n0) - __q * (d); \
+ (q) = __q; \
+ } while (0)
+#define UDIV_TIME 25
+#else
+#define UDIV_TIME 60 /* SuperSPARC timing */
+#endif /* SUPERSPARC */
+#else /* ! __sparc_v8__ */
+#if defined (__sparclite__)
+/* This has hardware multiply but not divide. It also has two additional
+ instructions scan (ffs from high bit) and divscc. */
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
+#define UMUL_TIME 5
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ __asm__ ("! Inlined udiv_qrnnd\n" \
+ "wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \
+ "tst %%g0\n" \
+ "divscc %3,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%%g1\n" \
+ "divscc %%g1,%4,%0\n" \
+ "rd %%y,%1\n" \
+ "bl,a 1f\n" \
+ "add %1,%4,%1\n" \
+"1: ! End of inline udiv_qrnnd" \
+ : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \
+ : "%g1" __AND_CLOBBER_CC)
+#define UDIV_TIME 37
+#define count_leading_zeros(count, x) \
+ __asm__ ("scan %1,0,%0" : "=r" (x) : "r" (count))
+/* Early sparclites return 63 for an argument of 0, but they warn that future
+ implementations might change this. Therefore, leave COUNT_LEADING_ZEROS_0
+ undefined. */
+#endif /* __sparclite__ */
+#endif /* __sparc_v8__ */
+#endif /* __sparc_v9__ */
+/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */
+#ifndef umul_ppmm
+#define umul_ppmm(w1, w0, u, v) \
+ __asm__ ("! Inlined umul_ppmm\n" \
+ "wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \
+ "sra %3,31,%%g2 ! Don't move this insn\n" \
+ "and %2,%%g2,%%g2 ! Don't move this insn\n" \
+ "andcc %%g0,0,%%g1 ! Don't move this insn\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,%3,%%g1\n" \
+ "mulscc %%g1,0,%%g1\n" \
+ "add %%g1,%%g2,%0\n" \
+ "rd %%y,%1" \
+ : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \
+ : "%g1", "%g2" __AND_CLOBBER_CC)
+#define UMUL_TIME 39 /* 39 instructions */
+#endif
+#ifndef udiv_qrnnd
+#ifndef LONGLONG_STANDALONE
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ do { USItype __r; \
+ (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
+ (r) = __r; \
+ } while (0)
+extern USItype __MPN(udiv_qrnnd) _PROTO ((USItype *, USItype, USItype, USItype));
+#ifndef UDIV_TIME
+#define UDIV_TIME 140
+#endif
+#endif /* LONGLONG_STANDALONE */
+#endif /* udiv_qrnnd */
+#endif /* __sparc__ */
+
+#if defined (__vax__) && W_TYPE_SIZE == 32
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \
+ : "=g" ((USItype)(sh)), "=&g" ((USItype)(sl)) \
+ : "%0" ((USItype)(ah)), "g" ((USItype)(bh)), \
+ "%1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \
+ : "=g" ((USItype)(sh)), "=&g" ((USItype)(sl)) \
+ : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
+ "1" ((USItype)(al)), "g" ((USItype)(bl)))
+#define smul_ppmm(xh, xl, m0, m1) \
+ do { \
+ union {UDItype __ll; \
+ struct {USItype __l, __h;} __i; \
+ } __x; \
+ USItype __m0 = (m0), __m1 = (m1); \
+ __asm__ ("emul %1,%2,$0,%0" \
+ : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \
+ (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
+ } while (0)
+#define sdiv_qrnnd(q, r, n1, n0, d) \
+ do { \
+ union {DItype __ll; \
+ struct {SItype __l, __h;} __i; \
+ } __x; \
+ __x.__i.__h = n1; __x.__i.__l = n0; \
+ __asm__ ("ediv %3,%2,%0,%1" \
+ : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \
+ } while (0)
+#endif /* __vax__ */
+
+#if defined (__z8000__) && W_TYPE_SIZE == 16
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \
+ : "=r" ((unsigned int)(sh)), "=&r" ((unsigned int)(sl)) \
+ : "%0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
+ "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \
+ : "=r" ((unsigned int)(sh)), "=&r" ((unsigned int)(sl)) \
+ : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
+ "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
+#define umul_ppmm(xh, xl, m0, m1) \
+ do { \
+ union {long int __ll; \
+ struct {unsigned int __h, __l;} __i; \
+ } __x; \
+ unsigned int __m0 = (m0), __m1 = (m1); \
+ __asm__ ("mult %S0,%H3" \
+ : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \
+ : "%1" (m0), "rQR" (m1)); \
+ (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
+ (xh) += ((((signed int) __m0 >> 15) & __m1) \
+ + (((signed int) __m1 >> 15) & __m0)); \
+ } while (0)
+#endif /* __z8000__ */
+
+#endif /* __GNUC__ */
+
+
+#if !defined (umul_ppmm) && defined (__umulsidi3)
+#define umul_ppmm(ph, pl, m0, m1) \
+ { \
+ UDWtype __ll = __umulsidi3 (m0, m1); \
+ ph = (UWtype) (__ll >> W_TYPE_SIZE); \
+ pl = (UWtype) __ll; \
+ }
+#endif
+
+#if !defined (__umulsidi3)
+#define __umulsidi3(u, v) \
+ ({UWtype __hi, __lo; \
+ umul_ppmm (__hi, __lo, u, v); \
+ ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
+#endif
+
+
+/* Note the prototypes are under !define(umul_ppmm) etc too, since the HPPA
+ versions above are different and we don't want to conflict. */
+
+#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm
+#define mpn_umul_ppmm __MPN(umul_ppmm)
+extern mp_limb_t mpn_umul_ppmm _PROTO ((mp_limb_t *, mp_limb_t, mp_limb_t));
+#define umul_ppmm(wh, wl, u, v) \
+ do { \
+ mp_limb_t __umul_ppmm__p0; \
+ (wh) = __MPN(umul_ppmm) (&__umul_ppmm__p0, \
+ (mp_limb_t) (u), (mp_limb_t) (v)); \
+ (wl) = __umul_ppmm__p0; \
+ } while (0)
+#endif
+
+#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd
+#define mpn_udiv_qrnnd __MPN(udiv_qrnnd)
+extern mp_limb_t mpn_udiv_qrnnd _PROTO ((mp_limb_t *,
+ mp_limb_t, mp_limb_t, mp_limb_t));
+#define udiv_qrnnd(q, r, n1, n0, d) \
+ do { \
+ mp_limb_t __udiv_qrnnd__r; \
+ (q) = mpn_udiv_qrnnd (&__udiv_qrnnd__r, \
+ (mp_limb_t) (n1), (mp_limb_t) (n0), (mp_limb_t) d); \
+ (r) = __udiv_qrnnd__r; \
+ } while (0)
+#endif
+
+
+/* If this machine has no inline assembler, use C macros. */
+
+#if !defined (add_ssaaaa)
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+ do { \
+ UWtype __x; \
+ __x = (al) + (bl); \
+ (sh) = (ah) + (bh) + (__x < (al)); \
+ (sl) = __x; \
+ } while (0)
+#endif
+
+#if !defined (sub_ddmmss)
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ do { \
+ UWtype __x; \
+ __x = (al) - (bl); \
+ (sh) = (ah) - (bh) - (__x > (al)); \
+ (sl) = __x; \
+ } while (0)
+#endif
+
+/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
+ smul_ppmm. */
+#if !defined (umul_ppmm) && defined (smul_ppmm)
+#define umul_ppmm(w1, w0, u, v) \
+ do { \
+ UWtype __w1; \
+ UWtype __xm0 = (u), __xm1 = (v); \
+ smul_ppmm (__w1, w0, __xm0, __xm1); \
+ (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
+ + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
+ } while (0)
+#endif
+
+/* If we still don't have umul_ppmm, define it using plain C. */
+#if !defined (umul_ppmm)
+#define umul_ppmm(w1, w0, u, v) \
+ do { \
+ UWtype __x0, __x1, __x2, __x3; \
+ UHWtype __ul, __vl, __uh, __vh; \
+ UWtype __u = (u), __v = (v); \
+ \
+ __ul = __ll_lowpart (__u); \
+ __uh = __ll_highpart (__u); \
+ __vl = __ll_lowpart (__v); \
+ __vh = __ll_highpart (__v); \
+ \
+ __x0 = (UWtype) __ul * __vl; \
+ __x1 = (UWtype) __ul * __vh; \
+ __x2 = (UWtype) __uh * __vl; \
+ __x3 = (UWtype) __uh * __vh; \
+ \
+ __x1 += __ll_highpart (__x0);/* this can't give carry */ \
+ __x1 += __x2; /* but this indeed can */ \
+ if (__x1 < __x2) /* did we get it? */ \
+ __x3 += __ll_B; /* yes, add it in the proper pos. */ \
+ \
+ (w1) = __x3 + __ll_highpart (__x1); \
+ (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
+ } while (0)
+#endif
+
+/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
+ exist in one form or another. */
+#if !defined (smul_ppmm)
+#define smul_ppmm(w1, w0, u, v) \
+ do { \
+ UWtype __w1; \
+ UWtype __xm0 = (u), __xm1 = (v); \
+ umul_ppmm (__w1, w0, __xm0, __xm1); \
+ (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
+ - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
+ } while (0)
+#endif
+
+/* Define this unconditionally, so it can be used for debugging. */
+#define __udiv_qrnnd_c(q, r, n1, n0, d) \
+ do { \
+ UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
+ __d1 = __ll_highpart (d); \
+ __d0 = __ll_lowpart (d); \
+ \
+ __q1 = (n1) / __d1; \
+ __r1 = (n1) - __q1 * __d1; \
+ __m = (UWtype) __q1 * __d0; \
+ __r1 = __r1 * __ll_B | __ll_highpart (n0); \
+ if (__r1 < __m) \
+ { \
+ __q1--, __r1 += (d); \
+ if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
+ if (__r1 < __m) \
+ __q1--, __r1 += (d); \
+ } \
+ __r1 -= __m; \
+ \
+ __q0 = __r1 / __d1; \
+ __r0 = __r1 - __q0 * __d1; \
+ __m = (UWtype) __q0 * __d0; \
+ __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
+ if (__r0 < __m) \
+ { \
+ __q0--, __r0 += (d); \
+ if (__r0 >= (d)) \
+ if (__r0 < __m) \
+ __q0--, __r0 += (d); \
+ } \
+ __r0 -= __m; \
+ \
+ (q) = (UWtype) __q1 * __ll_B | __q0; \
+ (r) = __r0; \
+ } while (0)
+
+/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
+ __udiv_w_sdiv (defined in libgcc or elsewhere). */
+#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
+#define udiv_qrnnd(q, r, nh, nl, d) \
+ do { \
+ UWtype __r; \
+ (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \
+ (r) = __r; \
+ } while (0)
+#endif
+
+/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */
+#if !defined (udiv_qrnnd)
+#define UDIV_NEEDS_NORMALIZATION 1
+#define udiv_qrnnd __udiv_qrnnd_c
+#endif
+
+#if !defined (count_leading_zeros)
+extern
+#if __STDC__
+const
+#endif
+unsigned char __clz_tab[];
+#define count_leading_zeros(count, x) \
+ do { \
+ UWtype __xr = (x); \
+ UWtype __a; \
+ \
+ if (W_TYPE_SIZE <= 32) \
+ { \
+ __a = __xr < ((UWtype) 1 << 2*__BITS4) \
+ ? (__xr < ((UWtype) 1 << __BITS4) ? 0 : __BITS4) \
+ : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 : 3*__BITS4);\
+ } \
+ else \
+ { \
+ for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \
+ if (((__xr >> __a) & 0xff) != 0) \
+ break; \
+ } \
+ \
+ (count) = W_TYPE_SIZE - (__clz_tab[__xr >> __a] + __a); \
+ } while (0)
+/* This version gives a well-defined value for zero. */
+#define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+#endif
+
+#if !defined (count_trailing_zeros)
+/* Define count_trailing_zeros using count_leading_zeros. The latter might be
+ defined in asm, but if it is not, the C version above is good enough. */
+#define count_trailing_zeros(count, x) \
+ do { \
+ UWtype __ctz_x = (x); \
+ UWtype __ctz_c; \
+ count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \
+ (count) = W_TYPE_SIZE - 1 - __ctz_c; \
+ } while (0)
+#endif
+
+#ifndef UDIV_NEEDS_NORMALIZATION
+#define UDIV_NEEDS_NORMALIZATION 0
+#endif
+
+/* Give defaults for UMUL_TIME and UDIV_TIME. */
+#ifndef UMUL_TIME
+#define UMUL_TIME 1
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME UMUL_TIME
+#endif
+
+/* count_trailing_zeros is often on the slow side, so make that the default */
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 15 /* cycles */
+#endif
+
+
diff --git a/rts/gmp/ltconfig b/rts/gmp/ltconfig
new file mode 100644
index 0000000000..6d8cf33e8f
--- /dev/null
+++ b/rts/gmp/ltconfig
@@ -0,0 +1,3109 @@
+#! /bin/sh
+
+# ltconfig - Create a system-specific libtool.
+# Copyright (C) 1996-2000 Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# A lot of this script is taken from autoconf-2.10.
+
+# Check that we are running under the correct shell.
+SHELL=${CONFIG_SHELL-/bin/sh}
+echo=echo
+if test "X$1" = X--no-reexec; then
+ # Discard the --no-reexec flag, and continue.
+ shift
+elif test "X$1" = X--fallback-echo; then
+ # Avoid inline document here, it may be left over
+ :
+elif test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then
+ # Yippee, $echo works!
+ :
+else
+ # Restart under the correct shell.
+ exec "$SHELL" "$0" --no-reexec ${1+"$@"}
+fi
+
+if test "X$1" = X--fallback-echo; then
+ # used as fallback echo
+ shift
+ cat <<EOF
+$*
+EOF
+ exit 0
+fi
+
+# Find the correct PATH separator. Usually this is `:', but
+# DJGPP uses `;' like DOS.
+if test "X${PATH_SEPARATOR+set}" != Xset; then
+ UNAME=${UNAME-`uname 2>/dev/null`}
+ case X$UNAME in
+ *-DOS) PATH_SEPARATOR=';' ;;
+ *) PATH_SEPARATOR=':' ;;
+ esac
+fi
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+if test "X${CDPATH+set}" = Xset; then CDPATH=:; export CDPATH; fi
+
+if test "X${echo_test_string+set}" != Xset; then
+ # find a string as large as possible, as long as the shell can cope with it
+ for cmd in 'sed 50q "$0"' 'sed 20q "$0"' 'sed 10q "$0"' 'sed 2q "$0"' 'echo test'; do
+ # expected sizes: less than 2Kb, 1Kb, 512 bytes, 16 bytes, ...
+ if (echo_test_string="`eval $cmd`") 2>/dev/null &&
+ echo_test_string="`eval $cmd`" &&
+ (test "X$echo_test_string" = "X$echo_test_string") 2>/dev/null; then
+ break
+ fi
+ done
+fi
+
+if test "X`($echo '\t') 2>/dev/null`" = 'X\t' &&
+ echo_testing_string=`($echo "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ :
+else
+ # The Solaris, AIX, and Digital Unix default echo programs unquote
+ # backslashes. This makes it impossible to quote backslashes using
+ # echo "$something" | sed 's/\\/\\\\/g'
+ #
+ # So, first we look for a working echo in the user's PATH.
+
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS="${IFS}${PATH_SEPARATOR}"
+ for dir in $PATH /usr/ucb; do
+ if (test -f $dir/echo || test -f $dir/echo$ac_exeext) &&
+ test "X`($dir/echo '\t') 2>/dev/null`" = 'X\t' &&
+ echo_testing_string=`($dir/echo "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ echo="$dir/echo"
+ break
+ fi
+ done
+ IFS="$save_ifs"
+
+ if test "X$echo" = Xecho; then
+ # We didn't find a better echo, so look for alternatives.
+ if test "X`(print -r '\t') 2>/dev/null`" = 'X\t' &&
+ echo_testing_string=`(print -r "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ # This shell has a builtin print -r that does the trick.
+ echo='print -r'
+ elif (test -f /bin/ksh || test -f /bin/ksh$ac_exeext) &&
+ test "X$CONFIG_SHELL" != X/bin/ksh; then
+ # If we have ksh, try running ltconfig again with it.
+ ORIGINAL_CONFIG_SHELL="${CONFIG_SHELL-/bin/sh}"
+ export ORIGINAL_CONFIG_SHELL
+ CONFIG_SHELL=/bin/ksh
+ export CONFIG_SHELL
+ exec "$CONFIG_SHELL" "$0" --no-reexec ${1+"$@"}
+ else
+ # Try using printf.
+ echo='printf "%s\n"'
+ if test "X`($echo '\t') 2>/dev/null`" = 'X\t' &&
+ echo_testing_string=`($echo "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ # Cool, printf works
+ :
+ elif echo_testing_string=`("$ORIGINAL_CONFIG_SHELL" "$0" --fallback-echo '\t') 2>/dev/null` &&
+ test "X$echo_testing_string" = 'X\t' &&
+ echo_testing_string=`("$ORIGINAL_CONFIG_SHELL" "$0" --fallback-echo "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ CONFIG_SHELL="$ORIGINAL_CONFIG_SHELL"
+ export CONFIG_SHELL
+ SHELL="$CONFIG_SHELL"
+ export SHELL
+ echo="$CONFIG_SHELL $0 --fallback-echo"
+ elif echo_testing_string=`("$CONFIG_SHELL" "$0" --fallback-echo '\t') 2>/dev/null` &&
+ test "X$echo_testing_string" = 'X\t' &&
+ echo_testing_string=`("$CONFIG_SHELL" "$0" --fallback-echo "$echo_test_string") 2>/dev/null` &&
+ test "X$echo_testing_string" = "X$echo_test_string"; then
+ echo="$CONFIG_SHELL $0 --fallback-echo"
+ else
+ # maybe with a smaller string...
+ prev=:
+
+ for cmd in 'echo test' 'sed 2q "$0"' 'sed 10q "$0"' 'sed 20q "$0"' 'sed 50q "$0"'; do
+ if (test "X$echo_test_string" = "X`eval $cmd`") 2>/dev/null; then
+ break
+ fi
+ prev="$cmd"
+ done
+
+ if test "$prev" != 'sed 50q "$0"'; then
+ echo_test_string=`eval $prev`
+ export echo_test_string
+ exec "${ORIGINAL_CONFIG_SHELL}" "$0" ${1+"$@"}
+ else
+ # Oops. We lost completely, so just stick with echo.
+ echo=echo
+ fi
+ fi
+ fi
+ fi
+fi
+
+# Sed substitution that helps us do robust quoting. It backslashifies
+# metacharacters that are still active within double-quoted strings.
+Xsed='sed -e s/^X//'
+sed_quote_subst='s/\([\\"\\`$\\\\]\)/\\\1/g'
+
+# Same as above, but do not quote variable references.
+double_quote_subst='s/\([\\"\\`\\\\]\)/\\\1/g'
+
+# Sed substitution to delay expansion of an escaped shell variable in a
+# double_quote_subst'ed string.
+delay_variable_subst='s/\\\\\\\\\\\$/\\\\\\$/g'
+
+# The name of this program.
+progname=`$echo "X$0" | $Xsed -e 's%^.*/%%'`
+
+# Constants:
+PROGRAM=ltconfig
+PACKAGE=libtool
+VERSION=1.3c
+TIMESTAMP=" (1.696 2000/03/14 20:22:42)"
+ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
+ac_link='${CC-cc} -o conftest $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
+rm="rm -f"
+
+help="Try \`$progname --help' for more information."
+
+# Global variables:
+default_ofile=libtool
+can_build_shared=yes
+enable_shared=yes
+# All known linkers require a `.a' archive for static linking (except M$VC,
+# which needs '.lib').
+enable_static=yes
+enable_fast_install=yes
+enable_dlopen=unknown
+enable_win32_dll=no
+pic_mode=default
+ltmain=
+silent=
+srcdir=
+ac_config_guess=
+ac_config_sub=
+host=
+build=NONE
+nonopt=NONE
+ofile="$default_ofile"
+verify_host=yes
+with_gcc=no
+with_gnu_ld=no
+need_locks=yes
+ac_ext=c
+libext=a
+cache_file=
+
+old_AR="$AR"
+old_CC="$CC"
+old_CFLAGS="$CFLAGS"
+old_CPPFLAGS="$CPPFLAGS"
+old_LDFLAGS="$LDFLAGS"
+old_LIBS="$LIBS"
+old_MAGIC="$MAGIC"
+old_LD="$LD"
+old_LN_S="$LN_S"
+old_NM="$NM"
+old_RANLIB="$RANLIB"
+old_STRIP="$STRIP"
+old_AS="$AS"
+old_DLLTOOL="$DLLTOOL"
+old_OBJDUMP="$OBJDUMP"
+old_OBJEXT="$OBJEXT"
+old_EXEEXT="$EXEEXT"
+old_reload_Flag="$reload_flag"
+old_deplibs_check_method="$deplibs_check_method"
+old_file_magic_cmd="$file_magic_cmd"
+
+# Parse the command line options.
+args=
+prev=
+for option
+do
+ case "$option" in
+ -*=*) optarg=`echo "$option" | sed 's/[-_a-zA-Z0-9]*=//'` ;;
+ *) optarg= ;;
+ esac
+
+ # If the previous option needs an argument, assign it.
+ if test -n "$prev"; then
+ eval "$prev=\$option"
+ prev=
+ continue
+ fi
+
+ case "$option" in
+ --help) cat <<EOM
+Usage: $progname [OPTION]... LTMAIN [HOST]
+
+Generate a system-specific libtool script.
+
+ --build configure for building on BUILD [BUILD=HOST]
+ --debug enable verbose shell tracing
+ --disable-shared do not build shared libraries
+ --disable-static do not build static libraries
+ --disable-fast-install do not optimize for fast installation
+ --enable-dlopen enable dlopen support
+ --enable-win32-dll enable building dlls on win32 hosts
+ --help display this help and exit
+ --no-verify do not verify that HOST is a valid host type
+-o, --output=FILE specify the output file [default=$default_ofile]
+ --quiet same as \`--silent'
+ --silent do not print informational messages
+ --srcdir=DIR find \`config.guess' in DIR
+ --version output version information and exit
+ --with-gcc assume that the GNU C compiler will be used
+ --with-gnu-ld assume that the C compiler uses the GNU linker
+ --prefer-pic try to use only PIC objects
+ --prefer-non-pic try to use only non-PIC objects
+ --disable-lock disable file locking
+ --cache-file=FILE configure cache file
+
+LTMAIN is the \`ltmain.sh' shell script fragment or \`ltmain.c' program
+that provides basic libtool functionality.
+
+HOST is the canonical host system name [default=guessed].
+EOM
+ exit 0
+ ;;
+
+ --build) prev=build ;;
+ --build=*) build="$optarg" ;;
+
+ --debug)
+ echo "$progname: enabling shell trace mode"
+ set -x
+ ;;
+
+ --disable-shared) enable_shared=no ;;
+
+ --disable-static) enable_static=no ;;
+
+ --disable-fast-install) enable_fast_install=no ;;
+
+ --enable-dlopen) enable_dlopen=yes ;;
+
+ --enable-win32-dll) enable_win32_dll=yes ;;
+
+ --quiet | --silent) silent=yes ;;
+
+ --srcdir) prev=srcdir ;;
+ --srcdir=*) srcdir="$optarg" ;;
+
+ --no-verify) verify_host=no ;;
+
+ --output | -o) prev=ofile ;;
+ --output=*) ofile="$optarg" ;;
+
+ --version) echo "$PROGRAM (GNU $PACKAGE) $VERSION$TIMESTAMP"; exit 0 ;;
+
+ --with-gcc) with_gcc=yes ;;
+ --with-gnu-ld) with_gnu_ld=yes ;;
+
+ --prefer-pic) pic_mode=yes ;;
+ --prefer-non-pic) pic_mode=no ;;
+
+ --disable-lock) need_locks=no ;;
+
+ --cache-file=*) cache_file="$optarg" ;;
+
+ -*)
+ echo "$progname: unrecognized option \`$option'" 1>&2
+ echo "$help" 1>&2
+ exit 1
+ ;;
+
+ *)
+ if test -z "$ltmain"; then
+ ltmain="$option"
+ elif test -z "$host"; then
+# This generates an unnecessary warning for sparc-sun-solaris4.1.3_U1
+# if test -n "`echo $option| sed 's/[-a-z0-9.]//g'`"; then
+# echo "$progname: warning \`$option' is not a valid host type" 1>&2
+# fi
+ host="$option"
+ else
+ echo "$progname: too many arguments" 1>&2
+ echo "$help" 1>&2
+ exit 1
+ fi ;;
+ esac
+done
+
+if test -z "$ltmain"; then
+ echo "$progname: you must specify a LTMAIN file" 1>&2
+ echo "$help" 1>&2
+ exit 1
+fi
+
+if test ! -f "$ltmain"; then
+ echo "$progname: \`$ltmain' does not exist" 1>&2
+ echo "$help" 1>&2
+ exit 1
+fi
+
+# Quote any args containing shell metacharacters.
+ltconfig_args=
+for arg
+do
+ case "$arg" in
+ *" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?]*)
+ ltconfig_args="$ltconfig_args '$arg'" ;;
+ *) ltconfig_args="$ltconfig_args $arg" ;;
+ esac
+done
+
+# A relevant subset of AC_INIT.
+
+# File descriptor usage:
+# 0 standard input
+# 1 file creation
+# 2 errors and warnings
+# 3 some systems may open it to /dev/tty
+# 4 used on the Kubota Titan
+# 5 compiler messages saved in config.log
+# 6 checking for... messages and results
+if test "$silent" = yes; then
+ exec 6>/dev/null
+else
+ exec 6>&1
+fi
+exec 5>>./config.log
+
+# NLS nuisances.
+# Only set LANG and LC_ALL to C if already set.
+# These must not be set unconditionally because not all systems understand
+# e.g. LANG=C (notably SCO).
+if test "X${LC_ALL+set}" = Xset; then LC_ALL=C; export LC_ALL; fi
+if test "X${LANG+set}" = Xset; then LANG=C; export LANG; fi
+
+if test -n "$cache_file" && test -r "$cache_file"; then
+ echo "loading cache $cache_file within ltconfig"
+ . $cache_file
+fi
+
+if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then
+ # Stardent Vistra SVR4 grep lacks -e, says ghazi@caip.rutgers.edu.
+ if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then
+ ac_n= ac_c='
+' ac_t=' '
+ else
+ ac_n=-n ac_c= ac_t=
+ fi
+else
+ ac_n= ac_c='\c' ac_t=
+fi
+
+if test -z "$srcdir"; then
+ # Assume the source directory is the same one as the path to LTMAIN.
+ srcdir=`$echo "X$ltmain" | $Xsed -e 's%/[^/]*$%%'`
+ test "$srcdir" = "$ltmain" && srcdir=.
+fi
+
+trap "$rm conftest*; exit 1" 1 2 15
+if test "$verify_host" = yes; then
+ # Check for config.guess and config.sub.
+ ac_aux_dir=
+ for ac_dir in $srcdir $srcdir/.. $srcdir/../..; do
+ if test -f $ac_dir/config.guess; then
+ ac_aux_dir=$ac_dir
+ break
+ fi
+ done
+ if test -z "$ac_aux_dir"; then
+ echo "$progname: cannot find config.guess in $srcdir $srcdir/.. $srcdir/../.." 1>&2
+ echo "$help" 1>&2
+ exit 1
+ fi
+ ac_config_guess=$ac_aux_dir/config.guess
+ ac_config_sub=$ac_aux_dir/config.sub
+
+ # Make sure we can run config.sub.
+ if $SHELL $ac_config_sub sun4 >/dev/null 2>&1; then :
+ else
+ echo "$progname: cannot run $ac_config_sub" 1>&2
+ echo "$help" 1>&2
+ exit 1
+ fi
+
+ echo $ac_n "checking host system type""... $ac_c" 1>&6
+
+ host_alias=$host
+ case "$host_alias" in
+ "")
+ if host_alias=`$SHELL $ac_config_guess`; then :
+ else
+ echo "$progname: cannot guess host type; you must specify one" 1>&2
+ echo "$help" 1>&2
+ exit 1
+ fi ;;
+ esac
+ host=`$SHELL $ac_config_sub $host_alias`
+ echo "$ac_t$host" 1>&6
+
+ # Make sure the host verified.
+ test -z "$host" && exit 1
+
+ # Check for the build system type
+ echo $ac_n "checking build system type... $ac_c" 1>&6
+
+ build_alias=$build
+ case "$build_alias" in
+ NONE)
+ case $nonopt in
+ NONE) build_alias=$host_alias ;;
+ *) build_alias=$nonopt ;;
+ esac ;;
+ esac
+
+ build=`$SHELL $ac_config_sub $build_alias`
+ build_cpu=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+ build_vendor=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+ build_os=`echo $build | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+ echo "$ac_t""$build" 1>&6
+
+elif test -z "$host"; then
+ echo "$progname: you must specify a host type if you use \`--no-verify'" 1>&2
+ echo "$help" 1>&2
+ exit 1
+else
+ host_alias=$host
+ build_alias=$host_alias
+ build=$host
+fi
+
+if test x"$host" != x"$build"; then
+ ac_tool_prefix=${host_alias}-
+else
+ ac_tool_prefix=
+fi
+
+host_cpu=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'`
+host_vendor=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'`
+host_os=`echo $host | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'`
+
+# Transform linux* to *-*-linux-gnu*, to support old configure scripts.
+case "$host_os" in
+linux-gnu*) ;;
+linux*) host=`echo $host | sed 's/^\(.*-.*-linux\)\(.*\)$/\1-gnu\2/'`
+esac
+
+case "$host_os" in
+aix3*)
+ # AIX sometimes has problems with the GCC collect2 program. For some
+ # reason, if we set the COLLECT_NAMES environment variable, the problems
+ # vanish in a puff of smoke.
+ if test "X${COLLECT_NAMES+set}" != Xset; then
+ COLLECT_NAMES=
+ export COLLECT_NAMES
+ fi
+ ;;
+esac
+
+# Determine commands to create old-style static archives.
+old_archive_cmds='$AR cru $oldlib$oldobjs$old_deplibs'
+old_postinstall_cmds='chmod 644 $oldlib'
+old_postuninstall_cmds=
+
+# Set sane defaults for various variables
+test -z "$AR" && AR=ar
+test -z "$AS" && AS=as
+test -z "$CC" && CC=cc
+test -z "$DLLTOOL" && DLLTOOL=dlltool
+test -z "$MAGIC" && MAGIC=file
+test -z "$LD" && LD=ld
+test -z "$LN_S" && LN_S="ln -s"
+test -z "$NM" && NM=nm
+test -z "$OBJDUMP" && OBJDUMP=objdump
+test -z "$RANLIB" && RANLIB=:
+test -z "$STRIP" && STRIP=:
+test -z "$objext" && objext=o
+
+echo $ac_n "checking for objdir... $ac_c" 1>&6
+rm -f .libs 2>/dev/null
+mkdir .libs 2>/dev/null
+if test -d .libs; then
+ objdir=.libs
+else
+ # MS-DOS does not allow filenames that begin with a dot.
+ objdir=_libs
+fi
+rmdir .libs 2>/dev/null
+echo "$ac_t$objdir" 1>&6
+
+# Allow CC to be a program name with arguments.
+set dummy $CC
+compiler="$2"
+
+# We assume here that the value for ac_cv_prog_cc_pic will not be cached
+# in isolation, and that seeing it set (from the cache) indicates that
+# the associated values are set (in the cache) correctly too.
+echo $ac_n "checking for $compiler option to produce PIC... $ac_c" 1>&6
+echo "$progname:563:checking for $compiler option to produce PIC" 1>&5
+if test "X${ac_cv_prog_cc_pic+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ ac_cv_prog_cc_pic=
+ ac_cv_prog_cc_shlib=
+ ac_cv_prog_cc_wl=
+ ac_cv_prog_cc_static=
+ ac_cv_prog_cc_no_builtin=
+ ac_cv_prog_cc_can_build_shared=$can_build_shared
+
+ if test "$with_gcc" = yes; then
+ ac_cv_prog_cc_wl='-Wl,'
+ ac_cv_prog_cc_static='-static'
+
+ case "$host_os" in
+ beos* | irix5* | irix6* | osf3* | osf4* | osf5*)
+ # PIC is the default for these OSes.
+ ;;
+ aix*)
+ # Below there is a dirty hack to force normal static linking with -ldl
+ # The problem is because libdl dynamically linked with both libc and
+ # libC (AIX C++ library), which obviously doesn't included in libraries
+ # list by gcc. This cause undefined symbols with -static flags.
+ # This hack allows C programs to be linked with "-static -ldl", but
+ # we not sure about C++ programs.
+ ac_cv_prog_cc_static="$ac_cv_prog_cc_static ${ac_cv_prog_cc_wl}-lC"
+ ;;
+ cygwin* | mingw* | os2*)
+ # This hack is so that the source file can tell whether it is being
+ # built for inclusion in a dll (and should export symbols for example).
+ ac_cv_prog_cc_pic='-DDLL_EXPORT'
+ ;;
+ amigaos*)
+ # FIXME: we need at least 68020 code to build shared libraries, but
+ # adding the `-m68020' flag to GCC prevents building anything better,
+ # like `-m68040'.
+ ac_cv_prog_cc_pic='-m68020 -resident32 -malways-restore-a4'
+ ;;
+ sysv4*MP*)
+ if test -d /usr/nec; then
+ ac_cv_prog_cc_pic=-Kconform_pic
+ fi
+ ;;
+ *)
+ ac_cv_prog_cc_pic='-fPIC'
+ ;;
+ esac
+ else
+ # PORTME Check for PIC flags for the system compiler.
+ case "$host_os" in
+ aix3* | aix4*)
+ # All AIX code is PIC.
+ ac_cv_prog_cc_static='-bnso -bI:/lib/syscalls.exp'
+ ;;
+
+ hpux9* | hpux10* | hpux11*)
+ # Is there a better ac_cv_prog_cc_static that works with the bundled CC?
+ ac_cv_prog_cc_wl='-Wl,'
+ ac_cv_prog_cc_static="${ac_cv_prog_cc_wl}-a ${ac_cv_prog_cc_wl}archive"
+ ac_cv_prog_cc_pic='+Z'
+ ;;
+
+ irix5* | irix6*)
+ ac_cv_prog_cc_wl='-Wl,'
+ ac_cv_prog_cc_static='-non_shared'
+ # PIC (with -KPIC) is the default.
+ ;;
+
+ cygwin* | mingw* | os2*)
+ # This hack is so that the source file can tell whether it is being
+ # built for inclusion in a dll (and should export symbols for example).
+ ac_cv_prog_cc_pic='-DDLL_EXPORT'
+ ;;
+
+ osf3* | osf4* | osf5*)
+ # All OSF/1 code is PIC.
+ ac_cv_prog_cc_wl='-Wl,'
+ ac_cv_prog_cc_static='-non_shared'
+ ;;
+
+ sco3.2v5*)
+ ac_cv_prog_cc_pic='-Kpic'
+ ac_cv_prog_cc_static='-dn'
+ ac_cv_prog_cc_shlib='-belf'
+ ;;
+
+ solaris*)
+ ac_cv_prog_cc_pic='-KPIC'
+ ac_cv_prog_cc_static='-Bstatic'
+ ac_cv_prog_cc_wl='-Wl,'
+ ;;
+
+ sunos4*)
+ ac_cv_prog_cc_pic='-PIC'
+ ac_cv_prog_cc_static='-Bstatic'
+ ac_cv_prog_cc_wl='-Qoption ld '
+ ;;
+
+ sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+ ac_cv_prog_cc_pic='-KPIC'
+ ac_cv_prog_cc_static='-Bstatic'
+ ac_cv_prog_cc_wl='-Wl,'
+ ;;
+
+ uts4*)
+ ac_cv_prog_cc_pic='-pic'
+ ac_cv_prog_cc_static='-Bstatic'
+ ;;
+
+ sysv4*MP*)
+ if test -d /usr/nec ;then
+ ac_cv_prog_cc_pic='-Kconform_pic'
+ ac_cv_prog_cc_static='-Bstatic'
+ fi
+ ;;
+
+ *)
+ ac_cv_prog_cc_can_build_shared=no
+ ;;
+ esac
+ fi
+fi
+if test -z "$ac_cv_prog_cc_pic"; then
+ echo "$ac_t"none 1>&6
+else
+ echo "$ac_t""$ac_cv_prog_cc_pic" 1>&6
+
+ # Check to make sure the pic_flag actually works.
+ echo $ac_n "checking if $compiler PIC flag $ac_cv_prog_cc_pic works... $ac_c" 1>&6
+ echo "$progname:693:checking that $compiler PIC flag $ac_cv_prog_cc_pic works." 1>&5
+ if test "X${ac_cv_prog_cc_pic_works+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+ else
+ ac_cv_prog_cc_pic_works=yes
+ $rm conftest*
+ echo "int some_variable = 0;" > conftest.c
+ save_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS $ac_cv_prog_cc_pic -DPIC"
+ if { (eval echo $progname:702: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; } && test -s conftest.$objext; then
+ # Append any warnings to the config.log.
+ cat conftest.err 1>&5
+
+ case "$host_os" in
+ hpux9* | hpux10* | hpux11*)
+ # On HP-UX, both CC and GCC only warn that PIC is supported... then
+ # they create non-PIC objects. So, if there were any warnings, we
+ # assume that PIC is not supported.
+ if test -s conftest.err; then
+ ac_cv_prog_cc_pic_works=no
+ ac_cv_prog_cc_can_build_shared=no
+ ac_cv_prog_cc_pic=
+ else
+ ac_cv_prog_cc_pic_works=yes
+ ac_cv_prog_cc_pic=" $ac_cv_prog_cc_pic"
+ fi
+ ;;
+ *)
+ ac_cv_prog_cc_pic_works=yes
+ ac_cv_prog_cc_pic=" $ac_cv_prog_cc_pic"
+ ;;
+ esac
+ else
+ # Append any errors to the config.log.
+ cat conftest.err 1>&5
+ ac_cv_prog_cc_pic_works=no
+ ac_cv_prog_cc_can_build_shared=no
+ ac_cv_prog_cc_pic=
+ fi
+ CFLAGS="$save_CFLAGS"
+ $rm conftest*
+ fi
+ # Belt *and* braces to stop my trousers falling down:
+ if test "X$ac_cv_prog_cc_pic_works" = Xno; then
+ ac_cv_prog_cc_pic=
+ ac_cv_prog_cc_can_build_shared=no
+ fi
+ echo "$ac_t""$ac_cv_prog_cc_pic_works" 1>&6
+fi
+
+# Check for any special shared library compilation flags.
+if test -n "$ac_cv_prog_cc_shlib"; then
+ echo "$progname: warning: \`$CC' requires \`$ac_cv_prog_cc_shlib' to build shared libraries" 1>&2
+ if echo "$old_CC $old_CFLAGS " | egrep -e "[ ]$ac_cv_prog_cc_shlib[ ]" >/dev/null; then :
+ else
+ echo "$progname: add \`$ac_cv_prog_cc_shlib' to the CC or CFLAGS env variable and reconfigure" 1>&2
+ ac_cv_prog_cc_can_build_shared=no
+ fi
+fi
+
+echo $ac_n "checking if $compiler static flag $ac_cv_prog_cc_static works... $ac_c" 1>&6
+echo "$progname:754: checking if $compiler static flag $ac_cv_prog_cc_static works" >&5
+if test "X${ac_cv_prog_cc_static_works+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ $rm conftest*
+ echo 'main(){return(0);}' > conftest.c
+ save_LDFLAGS="$LDFLAGS"
+ LDFLAGS="$LDFLAGS $ac_cv_prog_cc_static"
+ if { (eval echo $progname:762: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
+ ac_cv_prog_cc_static_works=yes
+ else
+ ac_cv_prog_cc_static_works=no
+ ac_cv_prog_cc_static=
+ fi
+ LDFLAGS="$save_LDFLAGS"
+ $rm conftest*
+fi
+# Belt *and* braces to stop my trousers falling down:
+if test "X$ac_cv_prog_cc_static_works" = Xno; then
+ ac_cv_prog_cc_static=
+fi
+echo "$ac_t""$ac_cv_prog_cc_static_works" 1>&6
+pic_flag="$ac_cv_prog_cc_pic"
+special_shlib_compile_flags="$ac_cv_prog_cc_shlib"
+wl="$ac_cv_prog_cc_wl"
+link_static_flag="$ac_cv_prog_cc_static"
+no_builtin_flag="$ac_cv_prog_cc_no_builtin"
+can_build_shared="$ac_cv_prog_cc_can_build_shared"
+
+# Check to see if options -o and -c are simultaneously supported by compiler
+echo $ac_n "checking if $compiler supports -c -o file.o... $ac_c" 1>&6
+$rm -r conftest 2>/dev/null
+mkdir conftest
+cd conftest
+$rm conftest*
+echo "int some_variable = 0;" > conftest.c
+mkdir out
+# According to Tom Tromey, Ian Lance Taylor reported there are C compilers
+# that will create temporary files in the current directory regardless of
+# the output directory. Thus, making CWD read-only will cause this test
+# to fail, enabling locking or at least warning the user not to do parallel
+# builds.
+chmod -w .
+save_CFLAGS="$CFLAGS"
+CFLAGS="$CFLAGS -o out/conftest2.o"
+echo "$progname:799: checking if $compiler supports -c -o file.o" >&5
+if { (eval echo $progname:800: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>out/conftest.err; } && test -s out/conftest2.o; then
+
+ # The compiler can only warn and ignore the option if not recognized
+ # So say no if there are warnings
+ if test -s out/conftest.err; then
+ echo "$ac_t"no 1>&6
+ compiler_c_o=no
+ else
+ echo "$ac_t"yes 1>&6
+ compiler_c_o=yes
+ fi
+else
+ # Append any errors to the config.log.
+ cat out/conftest.err 1>&5
+ compiler_c_o=no
+ echo "$ac_t"no 1>&6
+fi
+CFLAGS="$save_CFLAGS"
+chmod u+w .
+$rm conftest* out/*
+rmdir out
+cd ..
+rmdir conftest
+$rm -r conftest 2>/dev/null
+
+if test x"$compiler_c_o" = x"yes"; then
+ # Check to see if we can write to a .lo
+ echo $ac_n "checking if $compiler supports -c -o file.lo... $ac_c" 1>&6
+ $rm conftest*
+ echo "int some_variable = 0;" > conftest.c
+ save_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS -c -o conftest.lo"
+ echo "$progname:832: checking if $compiler supports -c -o file.lo" >&5
+if { (eval echo $progname:833: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; } && test -s conftest.lo; then
+
+ # The compiler can only warn and ignore the option if not recognized
+ # So say no if there are warnings
+ if test -s conftest.err; then
+ echo "$ac_t"no 1>&6
+ compiler_o_lo=no
+ else
+ echo "$ac_t"yes 1>&6
+ compiler_o_lo=yes
+ fi
+ else
+ # Append any errors to the config.log.
+ cat conftest.err 1>&5
+ compiler_o_lo=no
+ echo "$ac_t"no 1>&6
+ fi
+ CFLAGS="$save_CFLAGS"
+ $rm conftest*
+else
+ compiler_o_lo=no
+fi
+
+# Check to see if we can do hard links to lock some files if needed
+hard_links="nottested"
+if test "$compiler_c_o" = no && test "$need_locks" != no; then
+ # do not overwrite the value of need_locks provided by the user
+ echo $ac_n "checking if we can lock with hard links... $ac_c" 1>&6
+ hard_links=yes
+ $rm conftest*
+ ln conftest.a conftest.b 2>/dev/null && hard_links=no
+ touch conftest.a
+ ln conftest.a conftest.b 2>&5 || hard_links=no
+ ln conftest.a conftest.b 2>/dev/null && hard_links=no
+ echo "$ac_t$hard_links" 1>&6
+ $rm conftest*
+ if test "$hard_links" = no; then
+ echo "*** WARNING: \`$CC' does not support \`-c -o', so \`make -j' may be unsafe" >&2
+ need_locks=warn
+ fi
+else
+ need_locks=no
+fi
+
+if test "$with_gcc" = yes; then
+ # Check to see if options -fno-rtti -fno-exceptions are supported by compiler
+ echo $ac_n "checking if $compiler supports -fno-rtti -fno-exceptions ... $ac_c" 1>&6
+ $rm conftest*
+ echo "int some_variable = 0;" > conftest.c
+ save_CFLAGS="$CFLAGS"
+ CFLAGS="$CFLAGS -fno-rtti -fno-exceptions -c conftest.c"
+ echo "$progname:884: checking if $compiler supports -fno-rtti -fno-exceptions" >&5
+ if { (eval echo $progname:885: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>conftest.err; } && test -s conftest.o; then
+
+ # The compiler can only warn and ignore the option if not recognized
+ # So say no if there are warnings
+ if test -s conftest.err; then
+ echo "$ac_t"no 1>&6
+ compiler_rtti_exceptions=no
+ else
+ echo "$ac_t"yes 1>&6
+ compiler_rtti_exceptions=yes
+ fi
+ else
+ # Append any errors to the config.log.
+ cat conftest.err 1>&5
+ compiler_rtti_exceptions=no
+ echo "$ac_t"no 1>&6
+ fi
+ CFLAGS="$save_CFLAGS"
+ $rm conftest*
+
+ if test "$compiler_rtti_exceptions" = "yes"; then
+ no_builtin_flag=' -fno-builtin -fno-rtti -fno-exceptions'
+ else
+ no_builtin_flag=' -fno-builtin'
+ fi
+
+fi
+
+# See if the linker supports building shared libraries.
+echo $ac_n "checking whether the linker ($LD) supports shared libraries... $ac_c" 1>&6
+
+allow_undefined_flag=
+no_undefined_flag=
+need_lib_prefix=unknown
+need_version=unknown
+# when you set need_version to no, make sure it does not cause -set_version
+# flags to be left without arguments
+archive_cmds=
+archive_expsym_cmds=
+old_archive_from_new_cmds=
+old_archive_from_expsyms_cmds=
+striplib=
+old_striplib=
+export_dynamic_flag_spec=
+whole_archive_flag_spec=
+thread_safe_flag_spec=
+hardcode_into_libs=no
+hardcode_libdir_flag_spec=
+hardcode_libdir_separator=
+hardcode_direct=no
+hardcode_minus_L=no
+hardcode_shlibpath_var=unsupported
+runpath_var=
+link_all_deplibs=unknown
+always_export_symbols=no
+export_symbols_cmds='$NM $libobjs $convenience | $global_symbol_pipe | sed '\''s/.* //'\'' | sort | uniq > $export_symbols'
+# include_expsyms should be a list of space-separated symbols to be *always*
+# included in the symbol list
+include_expsyms=
+# exclude_expsyms can be an egrep regular expression of symbols to exclude
+# it will be wrapped by ` (' and `)$', so one must not match beginning or
+# end of line. Example: `a|bc|.*d.*' will exclude the symbols `a' and `bc',
+# as well as any symbol that contains `d'.
+exclude_expsyms="_GLOBAL_OFFSET_TABLE_"
+# Although _GLOBAL_OFFSET_TABLE_ is a valid symbol C name, most a.out
+# platforms (ab)use it in PIC code, but their linkers get confused if
+# the symbol is explicitly referenced. Since portable code cannot
+# rely on this symbol name, it's probably fine to never include it in
+# preloaded symbol tables.
+extract_expsyms_cmds=
+
+case "$host_os" in
+cygwin* | mingw*)
+ # FIXME: the MSVC++ port hasn't been tested in a loooong time
+ # When not using gcc, we currently assume that we are using
+ # Microsoft Visual C++.
+ if test "$with_gcc" != yes; then
+ with_gnu_ld=no
+ fi
+ ;;
+
+esac
+
+ld_shlibs=yes
+if test "$with_gnu_ld" = yes; then
+ # If archive_cmds runs LD, not CC, wlarc should be empty
+ wlarc='${wl}'
+
+ # See if GNU ld supports shared libraries.
+ case "$host_os" in
+ aix3* | aix4*)
+ # On AIX, the GNU linker is very broken
+ ld_shlibs=no
+ cat <<EOF 1>&2
+
+*** Warning: the GNU linker, at least up to release 2.9.1, is reported
+*** to be unable to reliably create shared libraries on AIX.
+*** Therefore, libtool is disabling shared libraries support. If you
+*** really care for shared libraries, you may want to modify your PATH
+*** so that a non-GNU linker is found, and then restart.
+
+EOF
+ ;;
+
+ amigaos*)
+ archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR cru $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_minus_L=yes
+
+ # Samuel A. Falvo II <kc5tja@dolphin.openprojects.net> reports
+ # that the semantics of dynamic libraries on AmigaOS, at least up
+ # to version 4, is to share data among multiple programs linked
+ # with the same dynamic library. Since this doesn't match the
+ # behavior of shared libraries on other platforms, we can use
+ # them.
+ ld_shlibs=no
+ ;;
+
+ beos*)
+ if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+ allow_undefined_flag=unsupported
+ # Joseph Beckenbach <jrb3@best.com> says some releases of gcc
+ # support --undefined. This deserves some investigation. FIXME
+ archive_cmds='$CC -nostart $libobjs $deplibs $linker_flags ${wl}-soname $wl$soname -o $lib'
+ else
+ ld_shlibs=no
+ fi
+ ;;
+
+ cygwin* | mingw*)
+ # hardcode_libdir_flag_spec is actually meaningless, as there is
+ # no search path for DLLs.
+ hardcode_libdir_flag_spec='-L$libdir'
+ allow_undefined_flag=unsupported
+ always_export_symbols=yes
+
+ extract_expsyms_cmds='test -f $output_objdir/impgen.c || \
+ sed -e "/^# \/\* impgen\.c starts here \*\//,/^# \/\* impgen.c ends here \*\// { s/^# //; p; }" -e d < $0 > $output_objdir/impgen.c~
+ test -f $output_objdir/impgen.exe || (cd $output_objdir && \
+ if test "x$HOST_CC" != "x" ; then $HOST_CC -o impgen impgen.c ; \
+ else $CC -o impgen impgen.c ; fi)~
+ $output_objdir/impgen $dir/$soname > $output_objdir/$soname-def'
+
+ old_archive_from_expsyms_cmds='$DLLTOOL --as=$AS --dllname $soname --def $output_objdir/$soname-def --output-lib $output_objdir/$newlib'
+
+ # cygwin and mingw dlls have different entry points and sets of symbols
+ # to exclude.
+ # FIXME: what about values for MSVC?
+ dll_entry=__cygwin_dll_entry@12
+ dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12~
+ case "$host_os" in
+ mingw*)
+ # mingw values
+ dll_entry=_DllMainCRTStartup@12
+ dll_exclude_symbols=DllMain@12,DllMainCRTStartup@12,DllEntryPoint@12~
+ ;;
+ esac
+
+ # mingw and cygwin differ, and it's simplest to just exclude the union
+ # of the two symbol sets.
+ dll_exclude_symbols=DllMain@12,_cygwin_dll_entry@12,_cygwin_noncygwin_dll_entry@12,DllMainCRTStartup@12,DllEntryPoint@12
+
+ # recent cygwin and mingw systems supply a stub DllMain which the user
+ # can override, but on older systems we have to supply one (in ltdll.c)
+ if test "x$lt_cv_need_dllmain" = "xyes"; then
+ ltdll_obj='$output_objdir/$soname-ltdll.'"$objext "
+ ltdll_cmds='test -f $output_objdir/$soname-ltdll.c || sed -e "/^# \/\* ltdll\.c starts here \*\//,/^# \/\* ltdll.c ends here \*\// { s/^# //; p; }" -e d < $0 > $output_objdir/$soname-ltdll.c~
+ test -f $output_objdir/$soname-ltdll.$objext || (cd $output_objdir && $CC -c $soname-ltdll.c)~'
+ else
+ ltdll_obj=
+ ltdll_cmds=
+ fi
+
+ # Extract the symbol export list from an `--export-all' def file,
+ # then regenerate the def file from the symbol export list, so that
+ # the compiled dll only exports the symbol export list.
+ # Be careful not to strip the DATA tag left be newer dlltools.
+ export_symbols_cmds="$ltdll_cmds"'
+ $DLLTOOL --export-all --exclude-symbols '$dll_exclude_symbols' --output-def $output_objdir/$soname-def '$ltdll_obj'$libobjs $convenience~
+ sed -e "1,/EXPORTS/d" -e "s/ @ [0-9]*//" -e "s/ *;.*$//" < $output_objdir/$soname-def > $export_symbols'
+
+ # If DATA tags from a recent dlltool are present, honour them!
+ archive_expsym_cmds='echo EXPORTS > $output_objdir/$soname-def~
+ _lt_hint=1;
+ cat $export_symbols | while read symbol; do
+ set dummy \$symbol;
+ case \$# in
+ 2) echo " \$2 @ \$_lt_hint ; " >> $output_objdir/$soname-def;;
+ *) echo " \$2 @ \$_lt_hint \$3 ; " >> $output_objdir/$soname-def;;
+ esac;
+ _lt_hint=`expr 1 + \$_lt_hint`;
+ done~
+ '"$ltdll_cmds"'
+ $CC -Wl,--base-file,$output_objdir/$soname-base '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $lib '$ltdll_obj'$libobjs $deplibs $compiler_flags~
+ $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp~
+ $CC -Wl,--base-file,$output_objdir/$soname-base $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $lib '$ltdll_obj'$libobjs $deplibs $compiler_flags~
+ $DLLTOOL --as=$AS --dllname $soname --exclude-symbols '$dll_exclude_symbols' --def $output_objdir/$soname-def --base-file $output_objdir/$soname-base --output-exp $output_objdir/$soname-exp~
+ $CC $output_objdir/$soname-exp '$lt_cv_cc_dll_switch' -Wl,-e,'$dll_entry' -o $lib '$ltdll_obj'$libobjs $deplibs $compiler_flags'
+ ;;
+
+ netbsd*)
+ if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+ archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+ archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+ else
+ archive_cmds='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib'
+ fi
+ ;;
+
+ solaris* | sysv5*)
+ if $LD -v 2>&1 | egrep 'BFD 2\.8' > /dev/null; then
+ ld_shlibs=no
+ cat <<EOF 1>&2
+
+*** Warning: The releases 2.8.* of the GNU linker cannot reliably
+*** create shared libraries on Solaris systems. Therefore, libtool
+*** is disabling shared libraries support. We urge you to upgrade GNU
+*** binutils to release 2.9.1 or newer. Another option is to modify
+*** your PATH or compiler configuration so that the native linker is
+*** used, and then restart.
+
+EOF
+ elif $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+ archive_cmds='$CC -shared $libobjs $deplibs $linker_flags ${wl}-soname $wl$soname -o $lib'
+ archive_expsym_cmds='$CC -shared $libobjs $deplibs $linker_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+ else
+ ld_shlibs=no
+ fi
+ ;;
+
+ sunos4*)
+ archive_cmds='$LD -assert pure-text -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+ wlarc=
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ *)
+ if $LD --help 2>&1 | egrep ': supported targets:.* elf' > /dev/null; then
+ archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
+ archive_expsym_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
+ else
+ ld_shlibs=no
+ fi
+ ;;
+ esac
+
+ if test "$ld_shlibs" = yes; then
+ runpath_var=LD_RUN_PATH
+ hardcode_libdir_flag_spec='${wl}--rpath ${wl}$libdir'
+ export_dynamic_flag_spec='${wl}--export-dynamic'
+ case $host_os in
+ cygwin* | mingw*)
+ # dlltool doesn't understand --whole-archive et. al.
+ whole_archive_flag_spec=
+ ;;
+ *)
+ # ancient GNU ld didn't support --whole-archive et. al.
+ if $LD --help 2>&1 | egrep 'no-whole-archive' > /dev/null; then
+ whole_archive_flag_spec="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
+ else
+ whole_archive_flag_spec=
+ fi
+ ;;
+ esac
+ fi
+else
+ # PORTME fill in a description of your system's linker (not GNU ld)
+ case "$host_os" in
+ aix3*)
+ allow_undefined_flag=unsupported
+ always_export_symbols=yes
+ archive_expsym_cmds='$LD -o $output_objdir/$soname $libobjs $deplibs $linker_flags -bE:$export_symbols -T512 -H512 -bM:SRE~$AR cru $lib $output_objdir/$soname'
+ # Note: this linker hardcodes the directories in LIBPATH if there
+ # are no directories specified by -L.
+ hardcode_minus_L=yes
+ if test "$with_gcc" = yes && test -z "$link_static_flag"; then
+ # Neither direct hardcoding nor static linking is supported with a
+ # broken collect2.
+ hardcode_direct=unsupported
+ fi
+ ;;
+
+ aix4*)
+ hardcode_libdir_flag_spec='${wl}-b ${wl}nolibpath ${wl}-b ${wl}libpath:$libdir:/usr/lib:/lib'
+ hardcode_libdir_separator=':'
+ if test "$with_gcc" = yes; then
+ collect2name=`${CC} -print-prog-name=collect2`
+ if test -f "$collect2name" && \
+ strings "$collect2name" | grep resolve_lib_name >/dev/null
+ then
+ # We have reworked collect2
+ hardcode_direct=yes
+ else
+ # We have old collect2
+ hardcode_direct=unsupported
+ # It fails to find uninstalled libraries when the uninstalled
+ # path is not listed in the libpath. Setting hardcode_minus_L
+ # to unsupported forces relinking
+ hardcode_minus_L=yes
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_libdir_separator=
+ fi
+ shared_flag='-shared'
+ else
+ shared_flag='${wl}-bM:SRE'
+ hardcode_direct=yes
+ fi
+ allow_undefined_flag=' ${wl}-berok'
+ archive_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${wl}-bexpall ${wl}-bnoentry${allow_undefined_flag}'
+ archive_expsym_cmds="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs $compiler_flags ${wl}-bE:$export_symbols ${wl}-bnoentry${allow_undefined_flag}'
+ case "$host_os" in aix4.[01]|aix4.[01].*)
+ # According to Greg Wooledge, -bexpall is only supported from AIX 4.2 on
+ always_export_symbols=yes ;;
+ esac
+ ;;
+
+ amigaos*)
+ archive_cmds='$rm $output_objdir/a2ixlibrary.data~$echo "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$echo "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$echo "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$echo "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR cru $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_minus_L=yes
+ # see comment about different semantics on the GNU ld section
+ ld_shlibs=no
+ ;;
+
+ cygwin* | mingw*)
+ # When not using gcc, we currently assume that we are using
+ # Microsoft Visual C++.
+ # hardcode_libdir_flag_spec is actually meaningless, as there is
+ # no search path for DLLs.
+ hardcode_libdir_flag_spec=' '
+ allow_undefined_flag=unsupported
+ # Tell ltmain to make .lib files, not .a files.
+ libext=lib
+ # FIXME: Setting linknames here is a bad hack.
+ archive_cmds='$CC -o $lib $libobjs $compiler_flags `echo "$deplibs" | sed -e '\''s/ -lc$//'\''` -link -dll~linknames='
+ # The linker will automatically build a .lib file if we build a DLL.
+ old_archive_from_new_cmds='true'
+ # FIXME: Should let the user specify the lib program.
+ old_archive_cmds='lib /OUT:$oldlib$oldobjs$old_deplibs'
+ fix_srcfile_path='`cygpath -w $srcfile`'
+ ;;
+
+ freebsd1*)
+ ld_shlibs=no
+ ;;
+
+ # FreeBSD 2.2.[012] allows us to include c++rt0.o to get C++ constructor
+ # support. Future versions do this automatically, but an explicit c++rt0.o
+ # does not break anything, and helps significantly (at the cost of a little
+ # extra space).
+ freebsd2.2*)
+ archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags /usr/lib/c++rt0.o'
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ # Unfortunately, older versions of FreeBSD 2 do not have this feature.
+ freebsd2*)
+ archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_direct=yes
+ hardcode_minus_L=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ # FreeBSD 3 and greater uses gcc -shared to do shared libraries.
+ freebsd*)
+ archive_cmds='$CC -shared -o $lib $libobjs $deplibs $compiler_flags'
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ hpux9* | hpux10* | hpux11*)
+ case "$host_os" in
+ hpux9*) archive_cmds='$rm $output_objdir/$soname~$LD -b +b $install_libdir -o $output_objdir/$soname $libobjs $deplibs $linker_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib' ;;
+ *) archive_cmds='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags' ;;
+ esac
+ hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir'
+ hardcode_libdir_separator=:
+ hardcode_direct=yes
+ hardcode_minus_L=yes # Not in the search PATH, but as the default
+ # location of the library.
+ export_dynamic_flag_spec='${wl}-E'
+ ;;
+
+ irix5* | irix6*)
+ if test "$with_gcc" = yes; then
+ archive_cmds='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+ else
+ archive_cmds='$LD -shared $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${output_objdir}/so_locations -o $lib'
+ fi
+ hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+ hardcode_libdir_separator=:
+ link_all_deplibs=yes
+ ;;
+
+ netbsd*)
+ if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+ archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags' # a.out
+ else
+ archive_cmds='$LD -shared -o $lib $libobjs $deplibs $linker_flags' # ELF
+ fi
+ hardcode_libdir_flag_spec='${wl}-R$libdir'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ openbsd*)
+ archive_cmds='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_direct=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ os2*)
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_minus_L=yes
+ allow_undefined_flag=unsupported
+ archive_cmds='$echo "LIBRARY $libname INITINSTANCE" > $output_objdir/$libname.def~$echo "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~$echo DATA >> $output_objdir/$libname.def~$echo " SINGLE NONSHARED" >> $output_objdir/$libname.def~$echo EXPORTS >> $output_objdir/$libname.def~emxexp $libobjs >> $output_objdir/$libname.def~$CC -Zdll -Zcrtdll -o $lib $libobjs $deplibs $compiler_flags $output_objdir/$libname.def'
+ old_archive_from_new_cmds='emximp -o $output_objdir/$libname.a $output_objdir/$libname.def'
+ ;;
+
+ osf3*)
+ if test "$with_gcc" = yes; then
+ allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+ archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+ else
+ allow_undefined_flag=' -expect_unresolved \*'
+ archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${output_objdir}/so_locations -o $lib'
+ fi
+ hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+ hardcode_libdir_separator=:
+ ;;
+
+ osf4* | osf5*) # as osf3* with the addition of -msym flag
+ if test "$with_gcc" = yes; then
+ allow_undefined_flag=' ${wl}-expect_unresolved ${wl}\*'
+ archive_cmds='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && echo ${wl}-set_version ${wl}$verstring` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
+ else
+ allow_undefined_flag=' -expect_unresolved \*'
+ archive_cmds='$LD -shared${allow_undefined_flag} $libobjs $deplibs $linker_flags -msym -soname $soname `test -n "$verstring" && echo -set_version $verstring` -update_registry ${output_objdir}/so_locations -o $lib'
+ fi
+ hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir'
+ hardcode_libdir_separator=:
+ ;;
+
+ sco3.2v5*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_shlibpath_var=no
+ runpath_var=LD_RUN_PATH
+ hardcode_runpath_var=yes
+ ;;
+
+ solaris*)
+ no_undefined_flag=' -z text'
+ # $CC -shared without GNU ld will not create a library from C++
+ # object files and a static libstdc++, better avoid it by now
+ archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~
+ $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp'
+ hardcode_libdir_flag_spec='-R$libdir'
+ hardcode_shlibpath_var=no
+ case "$host_os" in
+ solaris2.[0-5] | solaris2.[0-5].*) ;;
+ *) # Supported since Solaris 2.6 (maybe 2.5.1?)
+ whole_archive_flag_spec='-z allextract$convenience -z defaultextract' ;;
+ esac
+ link_all_deplibs=yes
+ ;;
+
+ sunos4*)
+ archive_cmds='$LD -assert pure-text -Bstatic -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_direct=yes
+ hardcode_minus_L=yes
+ hardcode_shlibpath_var=no
+ ;;
+
+ sysv4)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ runpath_var='LD_RUN_PATH'
+ hardcode_shlibpath_var=no
+ hardcode_direct=no #Motorola manual says yes, but my tests say they lie
+ ;;
+
+ sysv4.3*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_shlibpath_var=no
+ export_dynamic_flag_spec='-Bexport'
+ ;;
+
+ sysv5*)
+ no_undefined_flag=' -z text'
+ # $CC -shared without GNU ld will not create a library from C++
+ # object files and a static libstdc++, better avoid it by now
+ archive_cmds='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ archive_expsym_cmds='$echo "{ global:" > $lib.exp~cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $lib.exp~$echo "local: *; };" >> $lib.exp~
+ $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$rm $lib.exp'
+ hardcode_libdir_flag_spec=
+ hardcode_shlibpath_var=no
+ runpath_var='LD_RUN_PATH'
+ ;;
+
+ uts4*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_shlibpath_var=no
+ ;;
+
+ dgux*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_libdir_flag_spec='-L$libdir'
+ hardcode_shlibpath_var=no
+ ;;
+
+ sysv4*MP*)
+ if test -d /usr/nec; then
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_shlibpath_var=no
+ runpath_var=LD_RUN_PATH
+ hardcode_runpath_var=yes
+ ld_shlibs=yes
+ fi
+ ;;
+
+ sysv4.2uw2*)
+ archive_cmds='$LD -G -o $lib $libobjs $deplibs $linker_flags'
+ hardcode_direct=yes
+ hardcode_minus_L=no
+ hardcode_shlibpath_var=no
+ hardcode_runpath_var=yes
+ runpath_var=LD_RUN_PATH
+ ;;
+
+ unixware7*)
+ archive_cmds='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
+ runpath_var='LD_RUN_PATH'
+ hardcode_shlibpath_var=no
+ ;;
+
+ *)
+ ld_shlibs=no
+ ;;
+ esac
+fi
+echo "$ac_t$ld_shlibs" 1>&6
+test "$ld_shlibs" = no && can_build_shared=no
+
+# Check hardcoding attributes.
+echo $ac_n "checking how to hardcode library paths into programs... $ac_c" 1>&6
+hardcode_action=
+if test -n "$hardcode_libdir_flag_spec" || \
+ test -n "$runpath_var"; then
+
+ # We can hardcode non-existant directories.
+ if test "$hardcode_direct" != no &&
+ # If the only mechanism to avoid hardcoding is shlibpath_var, we
+ # have to relink, otherwise we might link with an installed library
+ # when we should be linking with a yet-to-be-installed one
+ ## test "$hardcode_shlibpath_var" != no &&
+ test "$hardcode_minus_L" != no; then
+ # Linking always hardcodes the temporary library directory.
+ hardcode_action=relink
+ else
+ # We can link without hardcoding, and we can hardcode nonexisting dirs.
+ hardcode_action=immediate
+ fi
+else
+ # We cannot hardcode anything, or else we can only hardcode existing
+ # directories.
+ hardcode_action=unsupported
+fi
+echo "$ac_t$hardcode_action" 1>&6
+
+echo $ac_n "checking whether stripping libraries is possible... $ac_c" 1>&6
+if test -n "$STRIP" && $STRIP -V 2>&1 | grep "GNU strip" >/dev/null; then
+ test -z "$old_striplib" && old_striplib="$STRIP --strip-debug"
+ test -z "$striplib" && striplib="$STRIP --strip-unneeded"
+ echo "${ac_t}yes" 1>&6
+else
+ echo "${ac_t}no" 1>&6
+fi
+
+reload_cmds='$LD$reload_flag -o $output$reload_objs'
+test -z "$deplibs_check_method" && deplibs_check_method=unknown
+
+# PORTME Fill in your ld.so characteristics
+library_names_spec=
+libname_spec='lib$name'
+soname_spec=
+postinstall_cmds=
+postuninstall_cmds=
+finish_cmds=
+finish_eval=
+shlibpath_var=
+shlibpath_overrides_runpath=unknown
+version_type=none
+dynamic_linker="$host_os ld.so"
+sys_lib_dlsearch_path_spec="/lib /usr/lib"
+sys_lib_search_path_spec="/lib /usr/lib /usr/local/lib"
+
+echo $ac_n "checking dynamic linker characteristics... $ac_c" 1>&6
+case "$host_os" in
+aix3*)
+ version_type=linux
+ library_names_spec='${libname}${release}.so$versuffix $libname.a'
+ shlibpath_var=LIBPATH
+
+ # AIX has no versioning support, so we append a major version to the name.
+ soname_spec='${libname}${release}.so$major'
+ ;;
+
+aix4*)
+ version_type=linux
+ # AIX has no versioning support, so currently we can not hardcode correct
+ # soname into executable. Probably we can add versioning support to
+ # collect2, so additional links can be useful in future.
+ # We preserve .a as extension for shared libraries though AIX4.2
+ # and later linker supports .so
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.a'
+ shlibpath_var=LIBPATH
+ ;;
+
+amigaos*)
+ library_names_spec='$libname.ixlibrary $libname.a'
+ # Create ${libname}_ixlibrary.a entries in /sys/libs.
+ finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`$echo "X$lib" | $Xsed -e '\''s%^.*/\([^/]*\)\.ixlibrary$%\1%'\''`; test $rm /sys/libs/${libname}_ixlibrary.a; $show "(cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a)"; (cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a) || exit 1; done'
+ ;;
+
+beos*)
+ library_names_spec='${libname}.so'
+ dynamic_linker="$host_os ld.so"
+ shlibpath_var=LIBRARY_PATH
+ lt_cv_dlopen="load_add_on"
+ lt_cv_dlopen_libs=
+ lt_cv_dlopen_self=yes
+ ;;
+
+bsdi4*)
+ version_type=linux
+ need_version=no
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ finish_cmds='PATH="\$PATH:/sbin" ldconfig $libdir'
+ shlibpath_var=LD_LIBRARY_PATH
+ sys_lib_search_path_spec="/shlib /usr/lib /usr/X11/lib /usr/contrib/lib /lib /usr/local/lib"
+ sys_lib_dlsearch_path_spec="/shlib /usr/lib /usr/local/lib"
+ export_dynamic_flag_spec=-rdynamic
+ # the default ld.so.conf also contains /usr/contrib/lib and
+ # /usr/X11R6/lib (/usr/X11 is a link to /usr/X11R6), but let us allow
+ # libtool to hard-code these into programs
+ ;;
+
+cygwin* | mingw*)
+ version_type=windows
+ need_version=no
+ need_lib_prefix=no
+ if test "$with_gcc" = yes; then
+ library_names_spec='${libname}`echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll'
+ else
+ library_names_spec='${libname}`echo ${release} | sed -e 's/[.]/-/g'`${versuffix}.dll $libname.lib'
+ fi
+ dynamic_linker='Win32 ld.exe'
+ # FIXME: first we should search . and the directory the executable is in
+ shlibpath_var=PATH
+ lt_cv_dlopen="LoadLibrary"
+ lt_cv_dlopen_libs=
+ ;;
+
+freebsd1*)
+ dynamic_linker=no
+ ;;
+
+freebsd*)
+ objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo aout`
+ version_type=freebsd-$objformat
+ case "$version_type" in
+ freebsd-elf*)
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so $libname.so'
+ need_version=no
+ need_lib_prefix=no
+ ;;
+ freebsd-*)
+ library_names_spec='${libname}${release}.so$versuffix $libname.so$versuffix'
+ need_version=yes
+ ;;
+ esac
+ shlibpath_var=LD_LIBRARY_PATH
+ case "$host_os" in
+ freebsd2*)
+ shlibpath_overrides_runpath=yes
+ ;;
+ freebsd3.[01]* | freebsdelf3.[01]*)
+ shlibpath_overrides_runpath=yes
+ hardcode_into_libs=yes
+ ;;
+ *) # from 3.2 on
+ shlibpath_overrides_runpath=no
+ hardcode_into_libs=yes
+ ;;
+ esac
+ ;;
+
+gnu*)
+ version_type=linux
+ need_lib_prefix=no
+ need_version=no
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so${major} ${libname}.so'
+ soname_spec='${libname}${release}.so$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ hardcode_into_libs=yes
+ ;;
+
+hpux9* | hpux10* | hpux11*)
+ # Give a soname corresponding to the major version so that dld.sl refuses to
+ # link against other versions.
+ dynamic_linker="$host_os dld.sl"
+ version_type=sunos
+ need_lib_prefix=no
+ need_version=no
+ shlibpath_var=SHLIB_PATH
+ shlibpath_overrides_runpath=no # +s is required to enable SHLIB_PATH
+ library_names_spec='${libname}${release}.sl$versuffix ${libname}${release}.sl$major $libname.sl'
+ soname_spec='${libname}${release}.sl$major'
+ # HP-UX runs *really* slowly unless shared libraries are mode 555.
+ postinstall_cmds='chmod 555 $lib'
+ ;;
+
+irix5* | irix6*)
+ version_type=irix
+ need_lib_prefix=no
+ need_version=no
+ soname_spec='${libname}${release}.so.$major'
+ library_names_spec='${libname}${release}.so.$versuffix ${libname}${release}.so.$major ${libname}${release}.so $libname.so'
+ case "$host_os" in
+ irix5*)
+ libsuff= shlibsuff=
+ ;;
+ *)
+ case "$LD" in # libtool.m4 will add one of these switches to LD
+ *-32|*"-32 ") libsuff= shlibsuff= libmagic=32-bit;;
+ *-n32|*"-n32 ") libsuff=32 shlibsuff=N32 libmagic=N32;;
+ *-64|*"-64 ") libsuff=64 shlibsuff=64 libmagic=64-bit;;
+ *) libsuff= shlibsuff= libmagic=never-match;;
+ esac
+ ;;
+ esac
+ shlibpath_var=LD_LIBRARY${shlibsuff}_PATH
+ shlibpath_overrides_runpath=no
+ sys_lib_search_path_spec="/usr/lib${libsuff} /lib${libsuff} /usr/local/lib${libsuff}"
+ sys_lib_dlsearch_path_spec="/usr/lib${libsuff} /lib${libsuff}"
+ ;;
+
+# No shared lib support for Linux oldld, aout, or coff.
+linux-gnuoldld* | linux-gnuaout* | linux-gnucoff*)
+ dynamic_linker=no
+ ;;
+
+# This must be Linux ELF.
+linux-gnu*)
+ version_type=linux
+ need_lib_prefix=no
+ need_version=no
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ finish_cmds='PATH="\$PATH:/sbin" ldconfig -n $libdir'
+ shlibpath_var=LD_LIBRARY_PATH
+ shlibpath_overrides_runpath=no
+ # This implies no fast_install, which is unacceptable.
+ # Some rework will be needed to allow for fast_install
+ # before this can be enabled.
+ hardcode_into_libs=yes
+
+ if test -f /lib/ld.so.1; then
+ dynamic_linker='GNU ld.so'
+ else
+ # Only the GNU ld.so supports shared libraries on MkLinux.
+ case "$host_cpu" in
+ powerpc*) dynamic_linker=no ;;
+ *) dynamic_linker='Linux ld.so' ;;
+ esac
+ fi
+ ;;
+
+netbsd*)
+ version_type=sunos
+ if echo __ELF__ | $CC -E - | grep __ELF__ >/dev/null; then
+ library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+ finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+ dynamic_linker='NetBSD (a.out) ld.so'
+ else
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major ${libname}${release}.so ${libname}.so'
+ soname_spec='${libname}${release}.so$major'
+ dynamic_linker='NetBSD ld.elf_so'
+ fi
+ shlibpath_var=LD_LIBRARY_PATH
+ ;;
+
+openbsd*)
+ version_type=sunos
+ if test "$with_gnu_ld" = yes; then
+ need_lib_prefix=no
+ need_version=no
+ fi
+ library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+ finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
+ shlibpath_var=LD_LIBRARY_PATH
+ ;;
+
+os2*)
+ libname_spec='$name'
+ need_lib_prefix=no
+ library_names_spec='$libname.dll $libname.a'
+ dynamic_linker='OS/2 ld.exe'
+ shlibpath_var=LIBPATH
+ ;;
+
+osf3* | osf4* | osf5*)
+ version_type=osf
+ need_version=no
+ soname_spec='${libname}${release}.so'
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so $libname.so'
+ shlibpath_var=LD_LIBRARY_PATH
+ sys_lib_search_path_spec="/usr/shlib /usr/ccs/lib /usr/lib/cmplrs/cc /usr/lib /usr/local/lib /var/shlib"
+ sys_lib_dlsearch_path_spec="$sys_lib_search_path_spec"
+ ;;
+
+sco3.2v5*)
+ version_type=osf
+ soname_spec='${libname}${release}.so$major'
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ shlibpath_var=LD_LIBRARY_PATH
+ ;;
+
+solaris*)
+ version_type=linux
+ need_lib_prefix=no
+ need_version=no
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ shlibpath_overrides_runpath=yes
+ hardcode_into_libs=yes
+ # ldd complains unless libraries are executable
+ postinstall_cmds='chmod +x $lib'
+ ;;
+
+sunos4*)
+ version_type=sunos
+ library_names_spec='${libname}${release}.so$versuffix ${libname}.so$versuffix'
+ finish_cmds='PATH="\$PATH:/usr/etc" ldconfig $libdir'
+ shlibpath_var=LD_LIBRARY_PATH
+ shlibpath_overrides_runpath=yes
+ if test "$with_gnu_ld" = yes; then
+ need_lib_prefix=no
+ fi
+ need_version=yes
+ ;;
+
+sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
+ version_type=linux
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ case "$host_vendor" in
+ motorola)
+ need_lib_prefix=no
+ need_version=no
+ shlibpath_overrides_runpath=no
+ sys_lib_search_path_spec='/lib /usr/lib /usr/ccs/lib'
+ ;;
+ esac
+ ;;
+
+uts4*)
+ version_type=linux
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ ;;
+
+dgux*)
+ version_type=linux
+ need_lib_prefix=no
+ need_version=no
+ library_names_spec='${libname}${release}.so$versuffix ${libname}${release}.so$major $libname.so'
+ soname_spec='${libname}${release}.so$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ ;;
+
+sysv4*MP*)
+ if test -d /usr/nec ;then
+ version_type=linux
+ library_names_spec='$libname.so.$versuffix $libname.so.$major $libname.so'
+ soname_spec='$libname.so.$major'
+ shlibpath_var=LD_LIBRARY_PATH
+ fi
+ ;;
+
+*)
+ dynamic_linker=no
+ ;;
+esac
+echo "$ac_t$dynamic_linker" 1>&6
+test "$dynamic_linker" = no && can_build_shared=no
+
+# Check for command to grab the raw symbol name followed by C symbol from nm.
+echo $ac_n "checking command to parse $NM output... $ac_c" 1>&6
+
+# These are sane defaults that work on at least a few old systems.
+# [They come from Ultrix. What could be older than Ultrix?!! ;)]
+
+# Character class describing NM global symbol codes.
+symcode='[BCDEGRST]'
+
+# Regexp to match symbols that can be accessed directly from C.
+sympat='\([_A-Za-z][_A-Za-z0-9]*\)'
+
+# Transform the above into a raw symbol and a C symbol.
+symxfrm='\1 \2\3 \3'
+
+# Transform an extracted symbol line into a proper C declaration
+global_symbol_to_cdecl="sed -n -e 's/^. .* \(.*\)$/extern char \1;/p'"
+
+# Define system-specific variables.
+case "$host_os" in
+aix*)
+ symcode='[BCDT]'
+ ;;
+cygwin* | mingw*)
+ symcode='[ABCDGISTW]'
+ ;;
+hpux*) # Its linker distinguishes data from code symbols
+ global_symbol_to_cdecl="sed -n -e 's/^T .* \(.*\)$/extern char \1();/p' -e 's/^. .* \(.*\)$/extern char \1;/p'"
+ ;;
+irix*)
+ symcode='[BCDEGRST]'
+ ;;
+solaris* | sysv5*)
+ symcode='[BDT]'
+ ;;
+sysv4)
+ symcode='[DFNSTU]'
+ ;;
+esac
+
+# Handle CRLF in mingw too chain
+opt_cr=
+case "$host_os" in
+mingw*)
+ opt_cr=`echo 'x\{0,1\}' | tr x '\015'` # option cr in regexp
+ ;;
+esac
+
+# If we're using GNU nm, then use its standard symbol codes.
+if $NM -V 2>&1 | egrep '(GNU|with BFD)' > /dev/null; then
+ symcode='[ABCDGISTW]'
+fi
+
+# Try without a prefix undercore, then with it.
+for ac_symprfx in "" "_"; do
+
+ # Write the raw and C identifiers.
+global_symbol_pipe="sed -n -e 's/^.*[ ]\($symcode\)[ ][ ]*\($ac_symprfx\)$sympat$opt_cr$/$symxfrm/p'"
+
+ # Check to see that the pipe works correctly.
+ pipe_works=no
+ $rm conftest*
+ cat > conftest.c <<EOF
+#ifdef __cplusplus
+extern "C" {
+#endif
+char nm_test_var;
+void nm_test_func(){}
+#ifdef __cplusplus
+}
+#endif
+main(){nm_test_var='a';nm_test_func();return(0);}
+EOF
+
+ echo "$progname:1867: checking if global_symbol_pipe works" >&5
+ if { (eval echo $progname:1868: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; } && test -s conftest.$objext; then
+ # Now try to grab the symbols.
+ nlist=conftest.nm
+ if { echo "$progname:1871: eval \"$NM conftest.$objext | $global_symbol_pipe > $nlist\"" >&5; eval "$NM conftest.$objext | $global_symbol_pipe > $nlist 2>&5"; } && test -s "$nlist"; then
+
+ # Try sorting and uniquifying the output.
+ if sort "$nlist" | uniq > "$nlist"T; then
+ mv -f "$nlist"T "$nlist"
+ else
+ rm -f "$nlist"T
+ fi
+
+ # Make sure that we snagged all the symbols we need.
+ if egrep ' nm_test_var$' "$nlist" >/dev/null; then
+ if egrep ' nm_test_func$' "$nlist" >/dev/null; then
+ cat <<EOF > conftest.c
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+EOF
+ # Now generate the symbol file.
+ eval "$global_symbol_to_cdecl"' < "$nlist" >> conftest.c'
+
+ cat <<EOF >> conftest.c
+#if defined (__STDC__) && __STDC__
+# define lt_ptr_t void *
+#else
+# define lt_ptr_t char *
+# define const
+#endif
+
+/* The mapping between symbol names and symbols. */
+const struct {
+ const char *name;
+ lt_ptr_t address;
+}
+lt_preloaded_symbols[] =
+{
+EOF
+ sed 's/^. \(.*\) \(.*\)$/ {"\2", (lt_ptr_t) \&\2},/' < "$nlist" >> conftest.c
+ cat <<\EOF >> conftest.c
+ {0, (lt_ptr_t) 0}
+};
+
+#ifdef __cplusplus
+}
+#endif
+EOF
+ # Now try linking the two files.
+ mv conftest.$objext conftstm.$objext
+ save_LIBS="$LIBS"
+ save_CFLAGS="$CFLAGS"
+ LIBS="conftstm.$objext"
+ CFLAGS="$CFLAGS$no_builtin_flag"
+ if { (eval echo $progname:1923: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest; then
+ pipe_works=yes
+ else
+ echo "$progname: failed program was:" >&5
+ cat conftest.c >&5
+ fi
+ LIBS="$save_LIBS"
+ else
+ echo "cannot find nm_test_func in $nlist" >&5
+ fi
+ else
+ echo "cannot find nm_test_var in $nlist" >&5
+ fi
+ else
+ echo "cannot run $global_symbol_pipe" >&5
+ fi
+ else
+ echo "$progname: failed program was:" >&5
+ cat conftest.c >&5
+ fi
+ $rm conftest* conftst*
+
+ # Do not use the global_symbol_pipe unless it works.
+ if test "$pipe_works" = yes; then
+ break
+ else
+ global_symbol_pipe=
+ fi
+done
+if test "$pipe_works" = yes; then
+ echo "${ac_t}ok" 1>&6
+else
+ echo "${ac_t}failed" 1>&6
+fi
+
+if test -z "$global_symbol_pipe"; then
+ global_symbol_to_cdecl=
+fi
+
+# Report the final consequences.
+echo "checking if libtool supports shared libraries... $can_build_shared" 1>&6
+
+# Only try to build win32 dlls if AC_LIBTOOL_WIN32_DLL was used in
+# configure.in, otherwise build static only libraries.
+case "$host_os" in
+cygwin* | mingw* | os2*)
+ if test x$can_build_shared = xyes; then
+ test x$enable_win32_dll = xno && can_build_shared=no
+ echo "checking if package supports dlls... $can_build_shared" 1>&6
+ fi
+;;
+esac
+
+echo $ac_n "checking whether to build shared libraries... $ac_c" 1>&6
+test "$can_build_shared" = "no" && enable_shared=no
+
+# On AIX, shared libraries and static libraries use the same namespace, and
+# are all built from PIC.
+case "$host_os" in
+aix3*)
+ test "$enable_shared" = yes && enable_static=no
+ if test -n "$RANLIB"; then
+ archive_cmds="$archive_cmds~\$RANLIB \$lib"
+ postinstall_cmds='$RANLIB $lib'
+ fi
+ ;;
+
+aix4*)
+ test "$enable_shared" = yes && enable_static=no
+ ;;
+esac
+
+echo "$ac_t$enable_shared" 1>&6
+
+# Make sure either enable_shared or enable_static is yes.
+test "$enable_shared" = yes || enable_static=yes
+
+echo "checking whether to build static libraries... $enable_static" 1>&6
+
+if test "$hardcode_action" = relink || test "$hardcode_into_libs" = all; then
+ # Fast installation is not supported
+ enable_fast_install=no
+elif test "$shlibpath_overrides_runpath" = yes ||
+ test "$enable_shared" = no; then
+ # Fast installation is not necessary
+ enable_fast_install=needless
+fi
+
+# Check whether we must set pic_mode to default
+test -z "$pic_flag" && pic_mode=default
+# On Cygwin there's no "real" PIC flag so we must build both object types
+case "$host_os" in
+cygwin* | mingw* | os2*)
+ pic_mode=default
+ ;;
+esac
+if test $pic_mode = no && test "$deplibs_check_method" != pass_all; then
+ # non-PIC code in shared libraries is not supported
+ pic_mode=default
+fi
+
+if test "x$enable_dlopen" != xyes; then
+ enable_dlopen=unknown
+ enable_dlopen_self=unknown
+ enable_dlopen_self_static=unknown
+else
+if test "X${lt_cv_dlopen+set}" != Xset; then
+ lt_cv_dlopen=no lt_cv_dlopen_libs=
+echo $ac_n "checking for dlopen in -ldl""... $ac_c" 1>&6
+echo "$progname:2032: checking for dlopen in -ldl" >&5
+if test "X${ac_cv_lib_dl_dlopen+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ ac_save_LIBS="$LIBS"
+LIBS="-ldl $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 2039 "ltconfig"
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dlopen();
+
+int main() {
+dlopen()
+; return 0; }
+EOF
+if { (eval echo $progname:2052: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ ac_cv_lib_dl_dlopen=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_lib_dl_dlopen=no
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if test "X$ac_cv_lib_dl_dlopen" = Xyes; then
+ echo "$ac_t""yes" 1>&6
+ lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-ldl"
+else
+ echo "$ac_t""no" 1>&6
+echo $ac_n "checking for dlopen""... $ac_c" 1>&6
+echo "$progname:2071: checking for dlopen" >&5
+if test "X${ac_cv_func_dlopen+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ cat > conftest.$ac_ext <<EOF
+#line 2076 "ltconfig"
+/* System header to define __stub macros and hopefully few prototypes,
+ which can conflict with char dlopen(); below. */
+#include <assert.h>
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dlopen();
+
+int main() {
+
+/* The GNU C library defines this for functions which it implements
+ to always fail with ENOSYS. Some functions are actually named
+ something starting with __ and the normal name is an alias. */
+#if defined (__stub_dlopen) || defined (__stub___dlopen)
+choke me
+#else
+dlopen();
+#endif
+
+; return 0; }
+EOF
+if { (eval echo $progname:2101: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ ac_cv_func_dlopen=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_func_dlopen=no
+fi
+rm -f conftest*
+fi
+if test "X$ac_cv_func_dlopen" = Xyes; then
+ echo "$ac_t""yes" 1>&6
+ lt_cv_dlopen="dlopen"
+else
+ echo "$ac_t""no" 1>&6
+echo $ac_n "checking for dld_link in -ldld""... $ac_c" 1>&6
+echo "$progname:2118: checking for dld_link in -ldld" >&5
+if test "X${ac_cv_lib_dld_dld_link+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ ac_save_LIBS="$LIBS"
+LIBS="-ldld $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 2125 "ltconfig"
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char dld_link();
+
+int main() {
+dld_link()
+; return 0; }
+EOF
+if { (eval echo $progname:2138: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ ac_cv_lib_dld_dld_link=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_lib_dld_dld_link=no
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if test "X$ac_cv_lib_dld_dld_link" = Xyes; then
+ echo "$ac_t""yes" 1>&6
+ lt_cv_dlopen="dld_link" lt_cv_dlopen_libs="-ldld"
+else
+ echo "$ac_t""no" 1>&6
+echo $ac_n "checking for shl_load""... $ac_c" 1>&6
+echo "$progname:2157: checking for shl_load" >&5
+if test "X${ac_cv_func_shl_load+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ cat > conftest.$ac_ext <<EOF
+#line 2162 "ltconfig"
+/* System header to define __stub macros and hopefully few prototypes,
+ which can conflict with char shl_load(); below. */
+#include <assert.h>
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char shl_load();
+
+int main() {
+
+/* The GNU C library defines this for functions which it implements
+ to always fail with ENOSYS. Some functions are actually named
+ something starting with __ and the normal name is an alias. */
+#if defined (__stub_shl_load) || defined (__stub___shl_load)
+choke me
+#else
+shl_load();
+#endif
+
+; return 0; }
+EOF
+if { (eval echo $progname:2187: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ ac_cv_func_shl_load=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_func_shl_load=no
+fi
+rm -f conftest*
+fi
+
+if test "X$ac_cv_func_shl_load" = Xyes; then
+ echo "$ac_t""yes" 1>&6
+ lt_cv_dlopen="shl_load"
+else
+ echo "$ac_t""no" 1>&6
+echo $ac_n "checking for shl_load in -ldld""... $ac_c" 1>&6
+echo "$progname:2205: checking for shl_load in -ldld" >&5
+if test "X${ac_cv_lib_dld_shl_load+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ ac_save_LIBS="$LIBS"
+LIBS="-ldld $LIBS"
+cat > conftest.$ac_ext <<EOF
+#line 2212 "ltconfig"
+#include "confdefs.h"
+/* Override any gcc2 internal prototype to avoid an error. */
+/* We use char because int might match the return type of a gcc2
+ builtin and then its argument prototype would still apply. */
+#ifdef __cplusplus
+extern "C"
+#endif
+char shl_load();
+
+int main() {
+shl_load()
+; return 0; }
+EOF
+if { (eval echo $progname:2226: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
+ rm -rf conftest*
+ ac_cv_lib_dld_shl_load=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ ac_cv_lib_dld_shl_load=no
+fi
+rm -f conftest*
+LIBS="$ac_save_LIBS"
+
+fi
+if test "X$ac_cv_lib_dld_shl_load" = Xyes; then
+ echo "$ac_t""yes" 1>&6
+ lt_cv_dlopen="shl_load" lt_cv_dlopen_libs="-ldld"
+else
+ echo "$ac_t""no" 1>&6
+fi
+
+
+fi
+
+
+fi
+
+
+fi
+
+
+fi
+
+fi
+
+ if test "x$lt_cv_dlopen" != xno; then
+ enable_dlopen=yes
+ fi
+
+ case "$lt_cv_dlopen" in
+ dlopen)
+for ac_hdr in dlfcn.h; do
+ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
+echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
+echo "$progname:2269: checking for $ac_hdr" >&5
+if eval "test \"`echo 'X$''{'ac_cv_header_$ac_safe'+set}'`\" = Xset"; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ cat > conftest.$ac_ext <<EOF
+#line 2274 "ltconfig"
+#include <$ac_hdr>
+int fnord = 0;
+int main () { }
+EOF
+ac_try="$ac_compile >/dev/null 2>conftest.out"
+{ (eval echo $progname:2280: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
+ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
+if test -z "$ac_err"; then
+ rm -rf conftest*
+ eval "ac_cv_header_$ac_safe=yes"
+else
+ echo "$ac_err" >&5
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -rf conftest*
+ eval "ac_cv_header_$ac_safe=no"
+fi
+rm -f conftest*
+fi
+if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
+ echo "$ac_t""yes" 1>&6
+else
+ echo "$ac_t""no" 1>&6
+fi
+done
+
+ if test "x$ac_cv_header_dlfcn_h" = xyes; then
+ CPPFLAGS="$CPPFLAGS -DHAVE_DLFCN_H"
+ fi
+ eval LDFLAGS=\"\$LDFLAGS $export_dynamic_flag_spec\"
+ LIBS="$lt_cv_dlopen_libs $LIBS"
+
+ echo $ac_n "checking whether a program can dlopen itself""... $ac_c" 1>&6
+echo "$progname:2308: checking whether a program can dlopen itself" >&5
+if test "X${lt_cv_dlopen_self+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ if test "$cross_compiling" = yes; then
+ lt_cv_dlopen_self=cross
+ else
+ cat > conftest.c <<EOF
+#line 2316 "ltconfig"
+
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+#include <stdio.h>
+
+#ifdef RTLD_GLOBAL
+# define LTDL_GLOBAL RTLD_GLOBAL
+#else
+# ifdef DL_GLOBAL
+# define LTDL_GLOBAL DL_GLOBAL
+# else
+# define LTDL_GLOBAL 0
+# endif
+#endif
+
+/* We may have to define LTDL_LAZY_OR_NOW in the command line if we
+ find out it does not work in some platform. */
+#ifndef LTDL_LAZY_OR_NOW
+# ifdef RTLD_LAZY
+# define LTDL_LAZY_OR_NOW RTLD_LAZY
+# else
+# ifdef DL_LAZY
+# define LTDL_LAZY_OR_NOW DL_LAZY
+# else
+# ifdef RTLD_NOW
+# define LTDL_LAZY_OR_NOW RTLD_NOW
+# else
+# ifdef DL_NOW
+# define LTDL_LAZY_OR_NOW DL_NOW
+# else
+# define LTDL_LAZY_OR_NOW 0
+# endif
+# endif
+# endif
+# endif
+#endif
+
+fnord() { int i=42;}
+main() { void *self, *ptr1, *ptr2; self=dlopen(0,LTDL_GLOBAL|LTDL_LAZY_OR_NOW);
+ if(self) { ptr1=dlsym(self,"fnord"); ptr2=dlsym(self,"_fnord");
+ if(ptr1 || ptr2) { dlclose(self); exit(0); } } exit(1); }
+
+EOF
+if { (eval echo $progname:2362: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null
+then
+ lt_cv_dlopen_self=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -fr conftest*
+ lt_cv_dlopen_self=no
+fi
+rm -fr conftest*
+fi
+
+fi
+
+echo "$ac_t""$lt_cv_dlopen_self" 1>&6
+
+ if test "$lt_cv_dlopen_self" = yes; then
+ LDFLAGS="$LDFLAGS $link_static_flag"
+ echo $ac_n "checking whether a statically linked program can dlopen itself""... $ac_c" 1>&6
+echo "$progname:2381: checking whether a statically linked program can dlopen itself" >&5
+if test "X${lt_cv_dlopen_self_static+set}" = Xset; then
+ echo $ac_n "(cached) $ac_c" 1>&6
+else
+ if test "$cross_compiling" = yes; then
+ lt_cv_dlopen_self_static=cross
+ else
+ cat > conftest.c <<EOF
+#line 2389 "ltconfig"
+
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+#include <stdio.h>
+
+#ifdef RTLD_GLOBAL
+# define LTDL_GLOBAL RTLD_GLOBAL
+#else
+# ifdef DL_GLOBAL
+# define LTDL_GLOBAL DL_GLOBAL
+# else
+# define LTDL_GLOBAL 0
+# endif
+#endif
+
+/* We may have to define LTDL_LAZY_OR_NOW in the command line if we
+ find out it does not work in some platform. */
+#ifndef LTDL_LAZY_OR_NOW
+# ifdef RTLD_LAZY
+# define LTDL_LAZY_OR_NOW RTLD_LAZY
+# else
+# ifdef DL_LAZY
+# define LTDL_LAZY_OR_NOW DL_LAZY
+# else
+# ifdef RTLD_NOW
+# define LTDL_LAZY_OR_NOW RTLD_NOW
+# else
+# ifdef DL_NOW
+# define LTDL_LAZY_OR_NOW DL_NOW
+# else
+# define LTDL_LAZY_OR_NOW 0
+# endif
+# endif
+# endif
+# endif
+#endif
+
+fnord() { int i=42;}
+main() { void *self, *ptr1, *ptr2; self=dlopen(0,LTDL_GLOBAL|LTDL_LAZY_OR_NOW);
+ if(self) { ptr1=dlsym(self,"fnord"); ptr2=dlsym(self,"_fnord");
+ if(ptr1 || ptr2) { dlclose(self); exit(0); } } exit(1); }
+
+EOF
+if { (eval echo $progname:2435: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest && (./conftest; exit) 2>/dev/null
+then
+ lt_cv_dlopen_self_static=yes
+else
+ echo "$progname: failed program was:" >&5
+ cat conftest.$ac_ext >&5
+ rm -fr conftest*
+ lt_cv_dlopen_self_static=no
+fi
+rm -fr conftest*
+fi
+
+fi
+
+echo "$ac_t""$lt_cv_dlopen_self_static" 1>&6
+fi
+ ;;
+ esac
+
+ case "$lt_cv_dlopen_self" in
+ yes|no) enable_dlopen_self=$lt_cv_dlopen_self ;;
+ *) enable_dlopen_self=unknown ;;
+ esac
+
+ case "$lt_cv_dlopen_self_static" in
+ yes|no) enable_dlopen_self_static=$lt_cv_dlopen_self_static ;;
+ *) enable_dlopen_self_static=unknown ;;
+ esac
+fi
+
+# Copy echo and quote the copy, instead of the original, because it is
+# used later.
+ltecho="$echo"
+if test "X$ltecho" = "X$CONFIG_SHELL $0 --fallback-echo"; then
+ ltecho="$CONFIG_SHELL \$0 --fallback-echo"
+fi
+LTSHELL="$SHELL"
+
+LTCONFIG_VERSION="$VERSION"
+
+# Only quote variables if we're using ltmain.sh.
+case "$ltmain" in
+*.sh)
+ # Now quote all the things that may contain metacharacters.
+ for var in ltecho old_AR old_CC old_CFLAGS old_CPPFLAGS \
+ old_MAGIC old_LD old_LDFLAGS old_LIBS \
+ old_LN_S old_NM old_RANLIB old_STRIP \
+ old_AS old_DLLTOOL old_OBJDUMP \
+ old_OBJEXT old_EXEEXT old_reload_flag \
+ old_deplibs_check_method old_file_magic_cmd \
+ AR CC LD LN_S NM LTSHELL LTCONFIG_VERSION \
+ reload_flag reload_cmds wl \
+ pic_flag link_static_flag no_builtin_flag export_dynamic_flag_spec \
+ thread_safe_flag_spec whole_archive_flag_spec libname_spec \
+ library_names_spec soname_spec \
+ RANLIB old_archive_cmds old_archive_from_new_cmds old_postinstall_cmds \
+ old_postuninstall_cmds archive_cmds archive_expsym_cmds postinstall_cmds \
+ postuninstall_cmds extract_expsyms_cmds old_archive_from_expsyms_cmds \
+ old_striplib striplib file_magic_cmd export_symbols_cmds \
+ deplibs_check_method allow_undefined_flag no_undefined_flag \
+ finish_cmds finish_eval global_symbol_pipe global_symbol_to_cdecl \
+ hardcode_libdir_flag_spec hardcode_libdir_separator \
+ sys_lib_search_path_spec sys_lib_dlsearch_path_spec \
+ compiler_c_o compiler_o_lo need_locks exclude_expsyms include_expsyms; do
+
+ case "$var" in
+ reload_cmds | old_archive_cmds | old_archive_from_new_cmds | \
+ old_postinstall_cmds | old_postuninstall_cmds | \
+ export_symbols_cmds | archive_cmds | archive_expsym_cmds | \
+ extract_expsyms_cmds | old_archive_from_expsyms_cmds | \
+ postinstall_cmds | postuninstall_cmds | \
+ finish_cmds | sys_lib_search_path_spec | sys_lib_dlsearch_path_spec)
+ # Double-quote double-evaled strings.
+ eval "$var=\\\"\`\$echo \"X\$$var\" | \$Xsed -e \"\$double_quote_subst\" -e \"\$sed_quote_subst\" -e \"\$delay_variable_subst\"\`\\\"" ### testsuite: skip nested quoting test
+ ;;
+ *)
+ eval "$var=\\\"\`\$echo \"X\$$var\" | \$Xsed -e \"\$sed_quote_subst\"\`\\\"" ### testsuite: skip nested quoting test
+ ;;
+ esac
+ done
+
+ case "$ltecho" in
+ *'\$0 --fallback-echo"')
+ ltecho=`$echo "X$ltecho" | $Xsed -e 's/\\\\\\\$0 --fallback-echo"$/$0 --fallback-echo"/'`
+ ;;
+ esac
+
+ trap "$rm \"$ofile\"; exit 1" 1 2 15
+ echo "creating $ofile"
+ $rm "$ofile"
+ cat <<EOF > "$ofile"
+#! $SHELL
+
+# `$echo "$ofile" | sed 's%^.*/%%'` - Provide generalized library-building support services.
+# Generated automatically by $PROGRAM (GNU $PACKAGE $VERSION$TIMESTAMP)
+# NOTE: Changes made to this file will be lost: look at ltconfig or ltmain.sh.
+#
+# Copyright (C) 1996-2000 Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Sed that helps us avoid accidentally triggering echo(1) options like -n.
+Xsed="sed -e s/^X//"
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+if test "X\${CDPATH+set}" = Xset; then CDPATH=:; export CDPATH; fi
+
+### BEGIN LIBTOOL CONFIG
+EOF
+ cfgfile="$ofile"
+ ;;
+
+*)
+ # Double-quote the variables that need it (for aesthetics).
+ for var in old_AR old_CC old_CFLAGS old_CPPFLAGS \
+ old_MAGIC old_LD old_LDFLAGS old_LIBS \
+ old_LN_S old_NM old_RANLIB old_STRIP \
+ old_AS old_DLLTOOL old_OBJDUMP \
+ old_OBJEXT old_EXEEXT old_reload_flag \
+ old_deplibs_check_method old_file_magic_cmd; do
+ eval "$var=\\\"\$var\\\""
+ done
+
+ # Just create a config file.
+ cfgfile="$ofile.cfg"
+ trap "$rm \"$cfgfile\"; exit 1" 1 2 15
+ echo "creating $cfgfile"
+ $rm "$cfgfile"
+ cat <<EOF > "$cfgfile"
+# `$echo "$cfgfile" | sed 's%^.*/%%'` - Libtool configuration file.
+# Generated automatically by $PROGRAM (GNU $PACKAGE $VERSION$TIMESTAMP)
+EOF
+ ;;
+esac
+
+cat <<EOF >> "$cfgfile"
+# Libtool was configured as follows, on host `(hostname || uname -n) 2>/dev/null | sed 1q`:
+#
+# AR=$old_AR CC=$old_CC CFLAGS=$old_CFLAGS CPPFLAGS=$old_CPPFLAGS \\
+# MAGIC=$old_MAGIC LD=$old_LD LDFLAGS=$old_LDFLAGS LIBS=$old_LIBS \\
+# LN_S=$old_LN_S NM=$old_NM RANLIB=$old_RANLIB STRIP=$old_STRIP \\
+# AS=$old_AS DLLTOOL=$old_DLLTOOL OBJDUMP=$old_OBJDUMP \\
+# objext=$old_OBJEXT exeext=$old_EXEEXT reload_flag=$old_reload_flag \\
+# deplibs_check_method=$old_deplibs_check_method file_magic_cmd=$old_file_magic_cmd \\
+# $0$ltconfig_args
+#
+# Compiler and other test output produced by $progname, useful for
+# debugging $progname, is in ./config.log if it exists.
+# The version of $progname that generated this script.
+LTCONFIG_VERSION=$LTCONFIG_VERSION
+
+# Shell to use when invoking shell scripts.
+SHELL=$LTSHELL
+
+# Whether or not to build shared libraries.
+build_libtool_libs=$enable_shared
+
+# Whether or not to build static libraries.
+build_old_libs=$enable_static
+
+# Whether or not to optimize for fast installation.
+fast_install=$enable_fast_install
+
+# The host system.
+host_alias=$host_alias
+host=$host
+
+# An echo program that does not interpret backslashes.
+echo=$ltecho
+
+# The archiver.
+AR=$AR
+
+# The default C compiler.
+CC=$CC
+
+# The linker used to build libraries.
+LD=$LD
+
+# Whether we need hard or soft links.
+LN_S=$LN_S
+
+# A BSD-compatible nm program.
+NM=$NM
+
+# A symbol stripping program
+STRIP=$STRIP
+
+# Used to examine libraries when file_magic_cmd begins "file"
+MAGIC=$MAGIC
+
+# Used on cygwin: DLL creation program.
+DLLTOOL="$DLLTOOL"
+
+# Used on cygwin: object dumper.
+OBJDUMP="$OBJDUMP"
+
+# Used on cygwin: assembler.
+AS="$AS"
+
+# The name of the directory that contains temporary libtool files.
+objdir=$objdir
+
+# How to create reloadable object files.
+reload_flag=$reload_flag
+reload_cmds=$reload_cmds
+
+# How to pass a linker flag through the compiler.
+wl=$wl
+
+# Object file suffix (normally "o").
+objext="$objext"
+
+# Old archive suffix (normally "a").
+libext="$libext"
+
+# Executable file suffix (normally "").
+exeext="$exeext"
+
+# Additional compiler flags for building library objects.
+pic_flag=$pic_flag
+pic_mode=$pic_mode
+
+# Does compiler simultaneously support -c and -o options?
+compiler_c_o=$compiler_c_o
+
+# Can we write directly to a .lo ?
+compiler_o_lo=$compiler_o_lo
+
+# Must we lock files when doing compilation ?
+need_locks=$need_locks
+
+# Do we need the lib prefix for modules?
+need_lib_prefix=$need_lib_prefix
+
+# Do we need a version for libraries?
+need_version=$need_version
+
+# Whether dlopen is supported.
+dlopen_support=$enable_dlopen
+
+# Whether dlopen of programs is supported.
+dlopen_self=$enable_dlopen_self
+
+# Whether dlopen of statically linked programs is supported.
+dlopen_self_static=$enable_dlopen_self_static
+
+# Compiler flag to prevent dynamic linking.
+link_static_flag=$link_static_flag
+
+# Compiler flag to turn off builtin functions.
+no_builtin_flag=$no_builtin_flag
+
+# Compiler flag to allow reflexive dlopens.
+export_dynamic_flag_spec=$export_dynamic_flag_spec
+
+# Compiler flag to generate shared objects directly from archives.
+whole_archive_flag_spec=$whole_archive_flag_spec
+
+# Compiler flag to generate thread-safe objects.
+thread_safe_flag_spec=$thread_safe_flag_spec
+
+# Library versioning type.
+version_type=$version_type
+
+# Format of library name prefix.
+libname_spec=$libname_spec
+
+# List of archive names. First name is the real one, the rest are links.
+# The last name is the one that the linker finds with -lNAME.
+library_names_spec=$library_names_spec
+
+# The coded name of the library, if different from the real name.
+soname_spec=$soname_spec
+
+# Commands used to build and install an old-style archive.
+RANLIB=$RANLIB
+old_archive_cmds=$old_archive_cmds
+old_postinstall_cmds=$old_postinstall_cmds
+old_postuninstall_cmds=$old_postuninstall_cmds
+
+# Create an old-style archive from a shared archive.
+old_archive_from_new_cmds=$old_archive_from_new_cmds
+
+# Create a temporary old-style archive to link instead of a shared archive.
+old_archive_from_expsyms_cmds=$old_archive_from_expsyms_cmds
+
+# Commands used to build and install a shared archive.
+archive_cmds=$archive_cmds
+archive_expsym_cmds=$archive_expsym_cmds
+postinstall_cmds=$postinstall_cmds
+postuninstall_cmds=$postuninstall_cmds
+
+# Commands to strip libraries.
+old_striplib=$old_striplib
+striplib=$striplib
+
+# Method to check whether dependent libraries are shared objects.
+deplibs_check_method=$deplibs_check_method
+
+# Command to use when deplibs_check_method == file_magic.
+file_magic_cmd=$file_magic_cmd
+
+# Flag that allows shared libraries with undefined symbols to be built.
+allow_undefined_flag=$allow_undefined_flag
+
+# Flag that forces no undefined symbols.
+no_undefined_flag=$no_undefined_flag
+
+# Commands used to finish a libtool library installation in a directory.
+finish_cmds=$finish_cmds
+
+# Same as above, but a single script fragment to be evaled but not shown.
+finish_eval=$finish_eval
+
+# Take the output of nm and produce a listing of raw symbols and C names.
+global_symbol_pipe=$global_symbol_pipe
+
+# Transform the output of nm in a proper C declaration
+global_symbol_to_cdecl=$global_symbol_to_cdecl
+
+# This is the shared library runtime path variable.
+runpath_var=$runpath_var
+
+# This is the shared library path variable.
+shlibpath_var=$shlibpath_var
+
+# Is shlibpath searched before the hard-coded library search path?
+shlibpath_overrides_runpath=$shlibpath_overrides_runpath
+
+# How to hardcode a shared library path into an executable.
+hardcode_action=$hardcode_action
+
+# Whether we should hardcode library paths into libraries.
+hardcode_into_libs=$hardcode_into_libs
+
+# Flag to hardcode \$libdir into a binary during linking.
+# This must work even if \$libdir does not exist.
+hardcode_libdir_flag_spec=$hardcode_libdir_flag_spec
+
+# Whether we need a single -rpath flag with a separated argument.
+hardcode_libdir_separator=$hardcode_libdir_separator
+
+# Set to yes if using DIR/libNAME.so during linking hardcodes DIR into the
+# resulting binary.
+hardcode_direct=$hardcode_direct
+
+# Set to yes if using the -LDIR flag during linking hardcodes DIR into the
+# resulting binary.
+hardcode_minus_L=$hardcode_minus_L
+
+# Set to yes if using SHLIBPATH_VAR=DIR during linking hardcodes DIR into
+# the resulting binary.
+hardcode_shlibpath_var=$hardcode_shlibpath_var
+
+# Whether libtool must link a program against all its dependency libraries.
+link_all_deplibs=$link_all_deplibs
+
+# Compile-time system search path for libraries
+sys_lib_search_path_spec=$sys_lib_search_path_spec
+
+# Run-time system search path for libraries
+sys_lib_dlsearch_path_spec=$sys_lib_dlsearch_path_spec
+
+# Fix the shell variable \$srcfile for the compiler.
+fix_srcfile_path="$fix_srcfile_path"
+
+# Set to yes if exported symbols are required.
+always_export_symbols=$always_export_symbols
+
+# The commands to list exported symbols.
+export_symbols_cmds=$export_symbols_cmds
+
+# The commands to extract the exported symbol list from a shared archive.
+extract_expsyms_cmds=$extract_expsyms_cmds
+
+# Symbols that should not be listed in the preloaded symbols.
+exclude_expsyms=$exclude_expsyms
+
+# Symbols that must always be exported.
+include_expsyms=$include_expsyms
+
+EOF
+
+case "$ltmain" in
+*.sh)
+ echo '### END LIBTOOL CONFIG' >> "$ofile"
+ echo >> "$ofile"
+ case "$host_os" in
+ aix3*)
+ cat <<\EOF >> "$ofile"
+
+# AIX sometimes has problems with the GCC collect2 program. For some
+# reason, if we set the COLLECT_NAMES environment variable, the problems
+# vanish in a puff of smoke.
+if test "X${COLLECT_NAMES+set}" != Xset; then
+ COLLECT_NAMES=
+ export COLLECT_NAMES
+fi
+EOF
+ ;;
+ esac
+ case "$host" in
+ *-*-cygwin* | *-*-mingw* | *-*-os2*)
+ cat <<'EOF' >> "$ofile"
+ # This is a source program that is used to create dlls on Windows
+ # Don't remove nor modify the starting and closing comments
+# /* ltdll.c starts here */
+# #define WIN32_LEAN_AND_MEAN
+# #include <windows.h>
+# #undef WIN32_LEAN_AND_MEAN
+# #include <stdio.h>
+#
+# #ifndef __CYGWIN__
+# # ifdef __CYGWIN32__
+# # define __CYGWIN__ __CYGWIN32__
+# # endif
+# #endif
+#
+# #ifdef __cplusplus
+# extern "C" {
+# #endif
+# BOOL APIENTRY DllMain (HINSTANCE hInst, DWORD reason, LPVOID reserved);
+# #ifdef __cplusplus
+# }
+# #endif
+#
+# #ifdef __CYGWIN__
+# #include <cygwin/cygwin_dll.h>
+# DECLARE_CYGWIN_DLL( DllMain );
+# #endif
+# HINSTANCE __hDllInstance_base;
+#
+# BOOL APIENTRY
+# DllMain (HINSTANCE hInst, DWORD reason, LPVOID reserved)
+# {
+# __hDllInstance_base = hInst;
+# return TRUE;
+# }
+# /* ltdll.c ends here */
+ # This is a source program that is used to create import libraries
+ # on Windows for dlls which lack them. Don't remove nor modify the
+ # starting and closing comments
+# /* impgen.c starts here */
+# /* Copyright (C) 1999-2000 Free Software Foundation, Inc.
+#
+# This file is part of GNU libtool.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# */
+#
+# #include <stdio.h> /* for printf() */
+# #include <unistd.h> /* for open(), lseek(), read() */
+# #include <fcntl.h> /* for O_RDONLY, O_BINARY */
+# #include <string.h> /* for strdup() */
+#
+# /* O_BINARY isn't required (or even defined sometimes) under Unix */
+# #ifndef O_BINARY
+# #define O_BINARY 0
+# #endif
+#
+# static unsigned int
+# pe_get16 (fd, offset)
+# int fd;
+# int offset;
+# {
+# unsigned char b[2];
+# lseek (fd, offset, SEEK_SET);
+# read (fd, b, 2);
+# return b[0] + (b[1]<<8);
+# }
+#
+# static unsigned int
+# pe_get32 (fd, offset)
+# int fd;
+# int offset;
+# {
+# unsigned char b[4];
+# lseek (fd, offset, SEEK_SET);
+# read (fd, b, 4);
+# return b[0] + (b[1]<<8) + (b[2]<<16) + (b[3]<<24);
+# }
+#
+# static unsigned int
+# pe_as32 (ptr)
+# void *ptr;
+# {
+# unsigned char *b = ptr;
+# return b[0] + (b[1]<<8) + (b[2]<<16) + (b[3]<<24);
+# }
+#
+# int
+# main (argc, argv)
+# int argc;
+# char *argv[];
+# {
+# int dll;
+# unsigned long pe_header_offset, opthdr_ofs, num_entries, i;
+# unsigned long export_rva, export_size, nsections, secptr, expptr;
+# unsigned long name_rvas, nexp;
+# unsigned char *expdata, *erva;
+# char *filename, *dll_name;
+#
+# filename = argv[1];
+#
+# dll = open(filename, O_RDONLY|O_BINARY);
+# if (!dll)
+# return 1;
+#
+# dll_name = filename;
+#
+# for (i=0; filename[i]; i++)
+# if (filename[i] == '/' || filename[i] == '\\' || filename[i] == ':')
+# dll_name = filename + i +1;
+#
+# pe_header_offset = pe_get32 (dll, 0x3c);
+# opthdr_ofs = pe_header_offset + 4 + 20;
+# num_entries = pe_get32 (dll, opthdr_ofs + 92);
+#
+# if (num_entries < 1) /* no exports */
+# return 1;
+#
+# export_rva = pe_get32 (dll, opthdr_ofs + 96);
+# export_size = pe_get32 (dll, opthdr_ofs + 100);
+# nsections = pe_get16 (dll, pe_header_offset + 4 +2);
+# secptr = (pe_header_offset + 4 + 20 +
+# pe_get16 (dll, pe_header_offset + 4 + 16));
+#
+# expptr = 0;
+# for (i = 0; i < nsections; i++)
+# {
+# char sname[8];
+# unsigned long secptr1 = secptr + 40 * i;
+# unsigned long vaddr = pe_get32 (dll, secptr1 + 12);
+# unsigned long vsize = pe_get32 (dll, secptr1 + 16);
+# unsigned long fptr = pe_get32 (dll, secptr1 + 20);
+# lseek(dll, secptr1, SEEK_SET);
+# read(dll, sname, 8);
+# if (vaddr <= export_rva && vaddr+vsize > export_rva)
+# {
+# expptr = fptr + (export_rva - vaddr);
+# if (export_rva + export_size > vaddr + vsize)
+# export_size = vsize - (export_rva - vaddr);
+# break;
+# }
+# }
+#
+# expdata = (unsigned char*)malloc(export_size);
+# lseek (dll, expptr, SEEK_SET);
+# read (dll, expdata, export_size);
+# erva = expdata - export_rva;
+#
+# nexp = pe_as32 (expdata+24);
+# name_rvas = pe_as32 (expdata+32);
+#
+# printf ("EXPORTS\n");
+# for (i = 0; i<nexp; i++)
+# {
+# unsigned long name_rva = pe_as32 (erva+name_rvas+i*4);
+# printf ("\t%s @ %ld ;\n", erva+name_rva, 1+ i);
+# }
+#
+# return 0;
+# }
+# /* impgen.c ends here */
+
+EOF
+ ;;
+ esac
+
+
+ # Append the ltmain.sh script.
+ sed '$q' "$ltmain" >> "$ofile" || (rm -f "$ofile"; exit 1)
+ # We use sed instead of cat because bash on DJGPP gets confused if
+ # if finds mixed CR/LF and LF-only lines. Since sed operates in
+ # text mode, it properly converts lines to CR/LF. This bash problem
+ # is reportedly fixed, but why not run on old versions too?
+
+ chmod +x "$ofile"
+ ;;
+
+*)
+ # Compile the libtool program.
+ echo "FIXME: would compile $ltmain"
+ ;;
+esac
+
+test -n "$cache_file" || exit 0
+
+# AC_CACHE_SAVE
+trap '' 1 2 15
+cat > confcache <<\EOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs. It is not useful on other systems.
+# If it contains results you don't want to keep, you may remove or edit it.
+#
+# By default, configure uses ./config.cache as the cache file,
+# creating it if it does not exist already. You can give configure
+# the --cache-file=FILE option to use a different cache file; that is
+# what configure does when it calls configure scripts in
+# subdirectories, so they share the cache.
+# Giving --cache-file=/dev/null disables caching, for debugging configure.
+# config.status only pays attention to the cache file if you give it the
+# --recheck option to rerun configure.
+#
+EOF
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, don't put newlines in cache variables' values.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(set) 2>&1 |
+ case `(ac_space=' '; set | grep ac_space) 2>&1` in
+ *ac_space=\ *)
+ # `set' does not quote correctly, so add quotes (double-quote substitution
+ # turns \\\\ into \\, and sed turns \\ into \).
+ sed -n \
+ -e "s/'/'\\\\''/g" \
+ -e "s/^\\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\\)=\\(.*\\)/\\1=\${\\1='\\2'}/p"
+ ;;
+ *)
+ # `set' quotes correctly as required by POSIX, so do not add quotes.
+ sed -n -e 's/^\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\)=\(.*\)/\1=${\1=\2}/p'
+ ;;
+ esac >> confcache
+if cmp -s $cache_file confcache; then
+ :
+else
+ if test -w $cache_file; then
+ echo "updating cache $cache_file"
+ cat confcache > $cache_file
+ else
+ echo "not updating unwritable cache $cache_file"
+ fi
+fi
+rm -f confcache
+
+exit 0
+
+# Local Variables:
+# mode:shell-script
+# sh-indentation:2
+# End:
diff --git a/rts/gmp/ltmain.sh b/rts/gmp/ltmain.sh
new file mode 100644
index 0000000000..d81d89f878
--- /dev/null
+++ b/rts/gmp/ltmain.sh
@@ -0,0 +1,4692 @@
+# ltmain.sh - Provide generalized library-building support services.
+# NOTE: Changing this file will not affect anything until you rerun ltconfig.
+#
+# Copyright (C) 1996-2000 Free Software Foundation, Inc.
+# Originally by Gordon Matzigkeit <gord@gnu.ai.mit.edu>, 1996
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Check that we have a working $echo.
+if test "X$1" = X--no-reexec; then
+ # Discard the --no-reexec flag, and continue.
+ shift
+elif test "X$1" = X--fallback-echo; then
+ # Avoid inline document here, it may be left over
+ :
+elif test "X`($echo '\t') 2>/dev/null`" = 'X\t'; then
+ # Yippee, $echo works!
+ :
+else
+ # Restart under the correct shell, and then maybe $echo will work.
+ exec $SHELL "$0" --no-reexec ${1+"$@"}
+fi
+
+if test "X$1" = X--fallback-echo; then
+ # used as fallback echo
+ shift
+ cat <<EOF
+$*
+EOF
+ exit 0
+fi
+
+# The name of this program.
+progname=`$echo "$0" | sed 's%^.*/%%'`
+modename="$progname"
+
+# Constants.
+PROGRAM=ltmain.sh
+PACKAGE=libtool
+VERSION=1.3c
+TIMESTAMP=" (1.696 2000/03/14 20:22:42)"
+
+default_mode=
+help="Try \`$progname --help' for more information."
+magic="%%%MAGIC variable%%%"
+mkdir="mkdir"
+mv="mv -f"
+rm="rm -f"
+
+# Sed substitution that helps us do robust quoting. It backslashifies
+# metacharacters that are still active within double-quoted strings.
+Xsed='sed -e 1s/^X//'
+sed_quote_subst='s/\([\\`\\"$\\\\]\)/\\\1/g'
+SP2NL='tr \040 \012'
+NL2SP='tr \015\012 \040\040'
+
+# NLS nuisances.
+# Only set LANG and LC_ALL to C if already set.
+# These must not be set unconditionally because not all systems understand
+# e.g. LANG=C (notably SCO).
+# We save the old values to restore during execute mode.
+if test "${LC_ALL+set}" = set; then
+ save_LC_ALL="$LC_ALL"; LC_ALL=C; export LC_ALL
+fi
+if test "${LANG+set}" = set; then
+ save_LANG="$LANG"; LANG=C; export LANG
+fi
+
+if test "$LTCONFIG_VERSION" != "$VERSION"; then
+ echo "$modename: ltconfig version \`$LTCONFIG_VERSION' does not match $PROGRAM version \`$VERSION'" 1>&2
+ echo "Fatal configuration error. See the $PACKAGE docs for more information." 1>&2
+ exit 1
+fi
+
+if test "$build_libtool_libs" != yes && test "$build_old_libs" != yes; then
+ echo "$modename: not configured to build any kind of library" 1>&2
+ echo "Fatal configuration error. See the $PACKAGE docs for more information." 1>&2
+ exit 1
+fi
+
+# Global variables.
+mode=$default_mode
+nonopt=
+prev=
+prevopt=
+run=
+show="$echo"
+show_help=
+execute_dlfiles=
+lo2o="s/\\.lo\$/.${objext}/"
+o2lo="s/\\.${objext}\$/.lo/"
+
+# Parse our command line options once, thoroughly.
+while test $# -gt 0
+do
+ arg="$1"
+ shift
+
+ case "$arg" in
+ -*=*) optarg=`$echo "X$arg" | $Xsed -e 's/[-_a-zA-Z0-9]*=//'` ;;
+ *) optarg= ;;
+ esac
+
+ # If the previous option needs an argument, assign it.
+ if test -n "$prev"; then
+ case "$prev" in
+ execute_dlfiles)
+ eval "$prev=\"\$$prev \$arg\""
+ ;;
+ *)
+ eval "$prev=\$arg"
+ ;;
+ esac
+
+ prev=
+ prevopt=
+ continue
+ fi
+
+ # Have we seen a non-optional argument yet?
+ case "$arg" in
+ --help)
+ show_help=yes
+ ;;
+
+ --version)
+ echo "$PROGRAM (GNU $PACKAGE) $VERSION$TIMESTAMP"
+ exit 0
+ ;;
+
+ --config)
+ sed -e '1,/^### BEGIN LIBTOOL CONFIG/d' -e '/^### END LIBTOOL CONFIG/,$d' $0
+ exit 0
+ ;;
+
+ --debug)
+ echo "$progname: enabling shell trace mode"
+ set -x
+ ;;
+
+ --dry-run | -n)
+ run=:
+ ;;
+
+ --features)
+ echo "host: $host"
+ if test "$build_libtool_libs" = yes; then
+ echo "enable shared libraries"
+ else
+ echo "disable shared libraries"
+ fi
+ if test "$build_old_libs" = yes; then
+ echo "enable static libraries"
+ else
+ echo "disable static libraries"
+ fi
+ exit 0
+ ;;
+
+ --finish) mode="finish" ;;
+
+ --mode) prevopt="--mode" prev=mode ;;
+ --mode=*) mode="$optarg" ;;
+
+ --quiet | --silent)
+ show=:
+ ;;
+
+ -dlopen)
+ prevopt="-dlopen"
+ prev=execute_dlfiles
+ ;;
+
+ -*)
+ $echo "$modename: unrecognized option \`$arg'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ ;;
+
+ *)
+ nonopt="$arg"
+ break
+ ;;
+ esac
+done
+
+if test -n "$prevopt"; then
+ $echo "$modename: option \`$prevopt' requires an argument" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+fi
+
+if test -z "$show_help"; then
+
+ # Infer the operation mode.
+ if test -z "$mode"; then
+ case "$nonopt" in
+ *cc | *++ | gcc* | *-gcc*)
+ mode=link
+ for arg
+ do
+ case "$arg" in
+ -c)
+ mode=compile
+ break
+ ;;
+ esac
+ done
+ ;;
+ *db | *dbx | *strace | *truss)
+ mode=execute
+ ;;
+ *install*|cp|mv)
+ mode=install
+ ;;
+ *rm)
+ mode=uninstall
+ ;;
+ *)
+ # If we have no mode, but dlfiles were specified, then do execute mode.
+ test -n "$execute_dlfiles" && mode=execute
+
+ # Just use the default operation mode.
+ if test -z "$mode"; then
+ if test -n "$nonopt"; then
+ $echo "$modename: warning: cannot infer operation mode from \`$nonopt'" 1>&2
+ else
+ $echo "$modename: warning: cannot infer operation mode without MODE-ARGS" 1>&2
+ fi
+ fi
+ ;;
+ esac
+ fi
+
+ # Only execute mode is allowed to have -dlopen flags.
+ if test -n "$execute_dlfiles" && test "$mode" != execute; then
+ $echo "$modename: unrecognized option \`-dlopen'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # Change the help message to a mode-specific one.
+ generic_help="$help"
+ help="Try \`$modename --help --mode=$mode' for more information."
+
+ # These modes are in order of execution frequency so that they run quickly.
+ case "$mode" in
+ # libtool compile mode
+ compile)
+ modename="$modename: compile"
+ # Get the compilation command and the source file.
+ base_compile=
+ prev=
+ lastarg=
+ srcfile="$nonopt"
+ suppress_output=
+
+ user_target=no
+ for arg
+ do
+ case "$prev" in
+ "") ;;
+ xcompiler)
+ # Aesthetically quote the previous argument.
+ prev=
+ lastarg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+
+ case "$arg" in
+ # Double-quote args containing other shell metacharacters.
+ # Many Bourne shells cannot handle close brackets correctly
+ # in scan sets, so we specify it separately.
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ arg="\"$arg\""
+ ;;
+ esac
+
+ # Add the previous argument to base_compile.
+ if test -z "$base_compile"; then
+ base_compile="$lastarg"
+ else
+ base_compile="$base_compile $lastarg"
+ fi
+ continue
+ ;;
+ esac
+
+ # Accept any command-line options.
+ case "$arg" in
+ -o)
+ if test "$user_target" != "no"; then
+ $echo "$modename: you cannot specify \`-o' more than once" 1>&2
+ exit 1
+ fi
+ user_target=next
+ ;;
+
+ -static)
+ build_old_libs=yes
+ continue
+ ;;
+
+ -Xcompiler)
+ prev=xcompiler
+ continue
+ ;;
+
+ -Wc,*)
+ args=`$echo "X$arg" | $Xsed -e "s/^-Wc,//"`
+ lastarg=
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS=','
+ for arg in $args; do
+ IFS="$save_ifs"
+
+ # Double-quote args containing other shell metacharacters.
+ # Many Bourne shells cannot handle close brackets correctly
+ # in scan sets, so we specify it separately.
+ case "$arg" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ arg="\"$arg\""
+ ;;
+ esac
+ lastarg="$lastarg $arg"
+ done
+ IFS="$save_ifs"
+ lastarg=`$echo "X$lastarg" | $Xsed -e "s/^ //"`
+
+ # Add the arguments to base_compile.
+ if test -z "$base_compile"; then
+ base_compile="$lastarg"
+ else
+ base_compile="$base_compile $lastarg"
+ fi
+ continue
+ ;;
+ esac
+
+ case "$user_target" in
+ next)
+ # The next one is the -o target name
+ user_target=yes
+ continue
+ ;;
+ yes)
+ # We got the output file
+ user_target=set
+ libobj="$arg"
+ continue
+ ;;
+ esac
+
+ # Accept the current argument as the source file.
+ lastarg="$srcfile"
+ srcfile="$arg"
+
+ # Aesthetically quote the previous argument.
+
+ # Backslashify any backslashes, double quotes, and dollar signs.
+ # These are the only characters that are still specially
+ # interpreted inside of double-quoted scrings.
+ lastarg=`$echo "X$lastarg" | $Xsed -e "$sed_quote_subst"`
+
+ # Double-quote args containing other shell metacharacters.
+ # Many Bourne shells cannot handle close brackets correctly
+ # in scan sets, so we specify it separately.
+ case "$lastarg" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ lastarg="\"$lastarg\""
+ ;;
+ esac
+
+ # Add the previous argument to base_compile.
+ if test -z "$base_compile"; then
+ base_compile="$lastarg"
+ else
+ base_compile="$base_compile $lastarg"
+ fi
+ done
+
+ case "$user_target" in
+ set)
+ ;;
+ no)
+ # Get the name of the library object.
+ libobj=`$echo "X$srcfile" | $Xsed -e 's%^.*/%%'`
+ ;;
+ *)
+ $echo "$modename: you must specify a target with \`-o'" 1>&2
+ exit 1
+ ;;
+ esac
+
+ # Recognize several different file suffixes.
+ # If the user specifies -o file.o, it is replaced with file.lo
+ xform='[cCFSfmso]'
+ case "$libobj" in
+ *.ada) xform=ada ;;
+ *.adb) xform=adb ;;
+ *.ads) xform=ads ;;
+ *.asm) xform=asm ;;
+ *.c++) xform=c++ ;;
+ *.cc) xform=cc ;;
+ *.cpp) xform=cpp ;;
+ *.cxx) xform=cxx ;;
+ *.f90) xform=f90 ;;
+ *.for) xform=for ;;
+ esac
+
+ libobj=`$echo "X$libobj" | $Xsed -e "s/\.$xform$/.lo/"`
+
+ case "$libobj" in
+ *.lo) obj=`$echo "X$libobj" | $Xsed -e "$lo2o"` ;;
+ *)
+ $echo "$modename: cannot determine name of library object from \`$libobj'" 1>&2
+ exit 1
+ ;;
+ esac
+
+ if test -z "$base_compile"; then
+ $echo "$modename: you must specify a compilation command" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # Delete any leftover library objects.
+ if test "$build_old_libs" = yes; then
+ removelist="$obj $libobj"
+ else
+ removelist="$libobj"
+ fi
+
+ $run $rm $removelist
+ trap "$run $rm $removelist; exit 1" 1 2 15
+
+ # Calculate the filename of the output object if compiler does
+ # not support -o with -c
+ if test "$compiler_c_o" = no; then
+ output_obj=`$echo "X$srcfile" | $Xsed -e 's%^.*/%%' -e 's%\..*$%%'`.${objext}
+ lockfile="$output_obj.lock"
+ removelist="$removelist $output_obj $lockfile"
+ trap "$run $rm $removelist; exit 1" 1 2 15
+ else
+ need_locks=no
+ lockfile=
+ fi
+
+ # Lock this critical section if it is needed
+ # We use this script file to make the link, it avoids creating a new file
+ if test "$need_locks" = yes; then
+ until ln "$0" "$lockfile" 2>/dev/null; do
+ $show "Waiting for $lockfile to be removed"
+ sleep 2
+ done
+ elif test "$need_locks" = warn; then
+ if test -f "$lockfile"; then
+ echo "\
+*** ERROR, $lockfile exists and contains:
+`cat $lockfile 2>/dev/null`
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together. If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+ $run $rm $removelist
+ exit 1
+ fi
+ echo $srcfile > "$lockfile"
+ fi
+
+ if test -n "$fix_srcfile_path"; then
+ eval srcfile=\"$fix_srcfile_path\"
+ fi
+
+ # Only build a PIC object if we are building libtool libraries.
+ if test "$build_libtool_libs" = yes; then
+ # Without this assignment, base_compile gets emptied.
+ fbsd_hideous_sh_bug=$base_compile
+
+ if test "$pic_mode" != no; then
+ # All platforms use -DPIC, to notify preprocessed assembler code.
+ command="$base_compile $srcfile $pic_flag -DPIC"
+ else
+ # Don't build PIC code
+ command="$base_compile $srcfile"
+ fi
+ if test "$build_old_libs" = yes; then
+ lo_libobj="$libobj"
+ dir=`$echo "X$libobj" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$dir" = "X$libobj"; then
+ dir="$objdir"
+ else
+ dir="$dir/$objdir"
+ fi
+ libobj="$dir/"`$echo "X$libobj" | $Xsed -e 's%^.*/%%'`
+
+ if test -d "$dir"; then
+ $show "$rm $libobj"
+ $run $rm $libobj
+ else
+ $show "$mkdir $dir"
+ $run $mkdir $dir
+ status=$?
+ if test $status -ne 0 && test ! -d $dir; then
+ exit $status
+ fi
+ fi
+ fi
+ if test "$compiler_o_lo" = yes; then
+ output_obj="$libobj"
+ command="$command -o $output_obj"
+ elif test "$compiler_c_o" = yes; then
+ output_obj="$obj"
+ command="$command -o $output_obj"
+ fi
+
+ $run $rm "$output_obj"
+ $show "$command"
+ if $run eval "$command"; then :
+ else
+ test -n "$output_obj" && $run $rm $removelist
+ exit 1
+ fi
+
+ if test "$need_locks" = warn &&
+ test x"`cat $lockfile 2>/dev/null`" != x"$srcfile"; then
+ echo "\
+*** ERROR, $lockfile contains:
+`cat $lockfile 2>/dev/null`
+
+but it should contain:
+$srcfile
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together. If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+ $run $rm $removelist
+ exit 1
+ fi
+
+ # Just move the object if needed, then go on to compile the next one
+ if test x"$output_obj" != x"$libobj"; then
+ $show "$mv $output_obj $libobj"
+ if $run $mv $output_obj $libobj; then :
+ else
+ error=$?
+ $run $rm $removelist
+ exit $error
+ fi
+ fi
+
+ # If we have no pic_flag, then copy the object into place and finish.
+ if (test -z "$pic_flag" || test "$pic_mode" != default) &&
+ test "$build_old_libs" = yes; then
+ # Rename the .lo from within objdir to obj
+ if test -f $obj; then
+ $show $rm $obj
+ $run $rm $obj
+ fi
+
+ $show "$mv $libobj $obj"
+ if $run $mv $libobj $obj; then :
+ else
+ error=$?
+ $run $rm $removelist
+ exit $error
+ fi
+
+ xdir=`$echo "X$obj" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$xdir" = "X$obj"; then
+ xdir="."
+ else
+ xdir="$xdir"
+ fi
+ baseobj=`$echo "X$obj" | $Xsed -e "s%.*/%%"`
+ libobj=`$echo "X$baseobj" | $Xsed -e "$o2lo"`
+ # Now arrange that obj and lo_libobj become the same file
+ $show "(cd $xdir && $LN_S $baseobj $libobj)"
+ if $run eval '(cd $xdir && $LN_S $baseobj $libobj)'; then
+ exit 0
+ else
+ error=$?
+ $run $rm $removelist
+ exit $error
+ fi
+ fi
+
+ # Allow error messages only from the first compilation.
+ suppress_output=' >/dev/null 2>&1'
+ fi
+
+ # Only build a position-dependent object if we build old libraries.
+ if test "$build_old_libs" = yes; then
+ if test "$pic_mode" != yes; then
+ # Don't build PIC code
+ command="$base_compile $srcfile"
+ else
+ # All platforms use -DPIC, to notify preprocessed assembler code.
+ command="$base_compile $srcfile $pic_flag -DPIC"
+ fi
+ if test "$compiler_c_o" = yes; then
+ command="$command -o $obj"
+ output_obj="$obj"
+ fi
+
+ # Suppress compiler output if we already did a PIC compilation.
+ command="$command$suppress_output"
+ $run $rm "$output_obj"
+ $show "$command"
+ if $run eval "$command"; then :
+ else
+ $run $rm $removelist
+ exit 1
+ fi
+
+ if test "$need_locks" = warn &&
+ test x"`cat $lockfile 2>/dev/null`" != x"$srcfile"; then
+ echo "\
+*** ERROR, $lockfile contains:
+`cat $lockfile 2>/dev/null`
+
+but it should contain:
+$srcfile
+
+This indicates that another process is trying to use the same
+temporary object file, and libtool could not work around it because
+your compiler does not support \`-c' and \`-o' together. If you
+repeat this compilation, it may succeed, by chance, but you had better
+avoid parallel builds (make -j) in this platform, or get a better
+compiler."
+
+ $run $rm $removelist
+ exit 1
+ fi
+
+ # Just move the object if needed
+ if test x"$output_obj" != x"$obj"; then
+ $show "$mv $output_obj $obj"
+ if $run $mv $output_obj $obj; then :
+ else
+ error=$?
+ $run $rm $removelist
+ exit $error
+ fi
+ fi
+
+ # Create an invalid libtool object if no PIC, so that we do not
+ # accidentally link it into a program.
+ if test "$build_libtool_libs" != yes; then
+ $show "echo timestamp > $libobj"
+ $run eval "echo timestamp > \$libobj" || exit $?
+ else
+ # Move the .lo from within objdir
+ $show "$mv $libobj $lo_libobj"
+ if $run $mv $libobj $lo_libobj; then :
+ else
+ error=$?
+ $run $rm $removelist
+ exit $error
+ fi
+ fi
+ fi
+
+ # Unlock the critical section if it was locked
+ if test "$need_locks" != no; then
+ $rm "$lockfile"
+ fi
+
+ exit 0
+ ;;
+
+ # libtool link mode
+ link | relink)
+ modename="$modename: link"
+ case "$host" in
+ *-*-cygwin* | *-*-mingw* | *-*-os2*)
+ # It is impossible to link a dll without this setting, and
+ # we shouldn't force the makefile maintainer to figure out
+ # which system we are compiling for in order to pass an extra
+ # flag for every libtool invokation.
+ # allow_undefined=no
+
+ # FIXME: Unfortunately, there are problems with the above when trying
+ # to make a dll which has undefined symbols, in which case not
+ # even a static library is built. For now, we need to specify
+ # -no-undefined on the libtool link line when we can be certain
+ # that all symbols are satisfied, otherwise we get a static library.
+ allow_undefined=yes
+ ;;
+ *)
+ allow_undefined=yes
+ ;;
+ esac
+ libtool_args="$nonopt"
+ compile_command="$nonopt"
+ finalize_command="$nonopt"
+
+ compile_rpath=
+ finalize_rpath=
+ compile_shlibpath=
+ finalize_shlibpath=
+ convenience=
+ old_convenience=
+ deplibs=
+ old_deplibs=
+ compiler_flags=
+ linker_flags=
+ dllsearchpath=
+ lib_search_path=`pwd`
+
+ avoid_version=no
+ dlfiles=
+ dlprefiles=
+ dlself=no
+ export_dynamic=no
+ export_symbols=
+ export_symbols_regex=
+ generated=
+ libobjs=
+ ltlibs=
+ module=no
+ no_install=no
+ objs=
+ prefer_static_libs=no
+ preload=no
+ prev=
+ prevarg=
+ release=
+ rpath=
+ xrpath=
+ perm_rpath=
+ temp_rpath=
+ thread_safe=no
+ vinfo=
+
+ # We need to know -static, to get the right output filenames.
+ for arg
+ do
+ case "$arg" in
+ -all-static | -static)
+ if test "X$arg" = "X-all-static"; then
+ if test "$build_libtool_libs" = yes && test -z "$link_static_flag"; then
+ $echo "$modename: warning: complete static linking is impossible in this configuration" 1>&2
+ fi
+ if test -n "$link_static_flag"; then
+ dlopen_self=$dlopen_self_static
+ fi
+ else
+ if test -z "$pic_flag" && test -n "$link_static_flag"; then
+ dlopen_self=$dlopen_self_static
+ fi
+ fi
+ build_libtool_libs=no
+ build_old_libs=yes
+ prefer_static_libs=yes
+ break
+ ;;
+ esac
+ done
+
+ # See if our shared archives depend on static archives.
+ test -n "$old_archive_from_new_cmds" && build_old_libs=yes
+
+ # Go through the arguments, transforming them on the way.
+ while test $# -gt 0; do
+ arg="$1"
+ shift
+ case "$arg" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ qarg=\"`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`\" ### testsuite: skip nested quoting test
+ ;;
+ *) qarg=$arg ;;
+ esac
+ libtool_args="$libtool_args $qarg"
+
+ # If the previous option needs an argument, assign it.
+ if test -n "$prev"; then
+ case "$prev" in
+ output)
+ compile_command="$compile_command @OUTPUT@"
+ finalize_command="$finalize_command @OUTPUT@"
+ ;;
+ esac
+
+ case "$prev" in
+ dlfiles|dlprefiles)
+ if test "$preload" = no; then
+ # Add the symbol object into the linking commands.
+ compile_command="$compile_command @SYMFILE@"
+ finalize_command="$finalize_command @SYMFILE@"
+ preload=yes
+ fi
+ case "$arg" in
+ *.la | *.lo) ;; # We handle these cases below.
+ force)
+ if test "$dlself" = no; then
+ dlself=needless
+ export_dynamic=yes
+ fi
+ prev=
+ continue
+ ;;
+ self)
+ if test "$prev" = dlprefiles; then
+ dlself=yes
+ elif test "$prev" = dlfiles && test "$dlopen_self" != yes; then
+ dlself=yes
+ else
+ dlself=needless
+ export_dynamic=yes
+ fi
+ prev=
+ continue
+ ;;
+ *)
+ if test "$prev" = dlfiles; then
+ dlfiles="$dlfiles $arg"
+ else
+ dlprefiles="$dlprefiles $arg"
+ fi
+ prev=
+ continue
+ ;;
+ esac
+ ;;
+ expsyms)
+ export_symbols="$arg"
+ if test ! -f "$arg"; then
+ $echo "$modename: symbol file \`$arg' does not exist"
+ exit 1
+ fi
+ prev=
+ continue
+ ;;
+ expsyms_regex)
+ export_symbols_regex="$arg"
+ prev=
+ continue
+ ;;
+ release)
+ release="-$arg"
+ prev=
+ continue
+ ;;
+ rpath | xrpath)
+ # We need an absolute path.
+ case "$arg" in
+ [\\/]* | [A-Za-z]:[\\/]*) ;;
+ *)
+ $echo "$modename: only absolute run-paths are allowed" 1>&2
+ exit 1
+ ;;
+ esac
+ if test "$prev" = rpath; then
+ case "$rpath " in
+ *" $arg "*) ;;
+ *) rpath="$rpath $arg" ;;
+ esac
+ else
+ case "$xrpath " in
+ *" $arg "*) ;;
+ *) xrpath="$xrpath $arg" ;;
+ esac
+ fi
+ prev=
+ continue
+ ;;
+ xcompiler)
+ compiler_flags="$compiler_flags $qarg"
+ prev=
+ compile_command="$compile_command $qarg"
+ finalize_command="$finalize_command $qarg"
+ continue
+ ;;
+ xlinker)
+ linker_flags="$linker_flags $qarg"
+ compiler_flags="$compiler_flags $wl$qarg"
+ prev=
+ compile_command="$compile_command $wl$qarg"
+ finalize_command="$finalize_command $wl$qarg"
+ continue
+ ;;
+ *)
+ eval "$prev=\"\$arg\""
+ prev=
+ continue
+ ;;
+ esac
+ fi
+
+ prevarg="$arg"
+
+ case "$arg" in
+ -all-static)
+ if test -n "$link_static_flag"; then
+ compile_command="$compile_command $link_static_flag"
+ finalize_command="$finalize_command $link_static_flag"
+ fi
+ continue
+ ;;
+
+ -allow-undefined)
+ # FIXME: remove this flag sometime in the future.
+ $echo "$modename: \`-allow-undefined' is deprecated because it is the default" 1>&2
+ continue
+ ;;
+
+ -avoid-version)
+ avoid_version=yes
+ continue
+ ;;
+
+ -dlopen)
+ prev=dlfiles
+ continue
+ ;;
+
+ -dlpreopen)
+ prev=dlprefiles
+ continue
+ ;;
+
+ -export-dynamic)
+ export_dynamic=yes
+ continue
+ ;;
+
+ -export-symbols | -export-symbols-regex)
+ if test -n "$export_symbols" || test -n "$export_symbols_regex"; then
+ $echo "$modename: not more than one -exported-symbols argument allowed"
+ exit 1
+ fi
+ if test "X$arg" = "X-export-symbols"; then
+ prev=expsyms
+ else
+ prev=expsyms_regex
+ fi
+ continue
+ ;;
+
+ -L*)
+ dir=`$echo "X$arg" | $Xsed -e 's/^-L//'`
+ # We need an absolute path.
+ case "$dir" in
+ [\\/]* | [A-Za-z]:[\\/]*) ;;
+ *)
+ absdir=`cd "$dir" && pwd`
+ if test -z "$absdir"; then
+ $echo "$modename: cannot determine absolute directory name of \`$dir'" 1>&2
+ exit 1
+ fi
+ dir="$absdir"
+ ;;
+ esac
+ case "$deplibs " in
+ *" -L$dir "*) ;;
+ *)
+ deplibs="$deplibs -L$dir"
+ lib_search_path="$lib_search_path $dir"
+ ;;
+ esac
+ case "$host" in
+ *-*-cygwin* | *-*-mingw* | *-*-os2*)
+ case ":$dllsearchpath:" in
+ *":$dir:"*) ;;
+ *) dllsearchpath="$dllsearchpath:$dir";;
+ esac
+ ;;
+ esac
+ continue
+ ;;
+
+ -l*)
+ if test "$arg" = "-lc"; then
+ case "$host" in
+ *-*-cygwin* | *-*-mingw* | *-*-os2* | *-*-beos*)
+ # These systems don't actually have c library (as such)
+ continue
+ ;;
+ esac
+ elif test "$arg" = "-lm"; then
+ case "$host" in
+ *-*-cygwin* | *-*-beos*)
+ # These systems don't actually have math library (as such)
+ continue
+ ;;
+ esac
+ fi
+ deplibs="$deplibs $arg"
+ continue
+ ;;
+
+ -module)
+ module=yes
+ continue
+ ;;
+
+ -no-fast-install)
+ fast_install=no
+ continue
+ ;;
+
+ -no-install)
+ case "$host" in
+ *-*-cygwin* | *-*-mingw* | *-*-os2*)
+ # The PATH hackery in wrapper scripts is required on Windows
+ # in order for the loader to find any dlls it needs.
+ $echo "$modename: warning: \`-no-install' is ignored for $host" 1>&2
+ $echo "$modename: warning: assuming \`-no-fast-install' instead" 1>&2
+ fast_install=no
+ ;;
+ *)
+ no_install=yes
+ ;;
+ esac
+ continue
+ ;;
+
+ -no-undefined)
+ allow_undefined=no
+ continue
+ ;;
+
+ -o) prev=output ;;
+
+ -release)
+ prev=release
+ continue
+ ;;
+
+ -rpath)
+ prev=rpath
+ continue
+ ;;
+
+ -R)
+ prev=xrpath
+ continue
+ ;;
+
+ -R*)
+ dir=`$echo "X$arg" | $Xsed -e 's/^-R//'`
+ # We need an absolute path.
+ case "$dir" in
+ [\\/]* | [A-Za-z]:[\\/]*) ;;
+ *)
+ $echo "$modename: only absolute run-paths are allowed" 1>&2
+ exit 1
+ ;;
+ esac
+ case "$xrpath " in
+ *" $dir "*) ;;
+ *) xrpath="$xrpath $dir" ;;
+ esac
+ continue
+ ;;
+
+ -static)
+ # If we have no pic_flag, then this is the same as -all-static.
+ if test -z "$pic_flag" && test -n "$link_static_flag"; then
+ compile_command="$compile_command $link_static_flag"
+ finalize_command="$finalize_command $link_static_flag"
+ fi
+ continue
+ ;;
+
+ -thread-safe)
+ thread_safe=yes
+ continue
+ ;;
+
+ -version-info)
+ prev=vinfo
+ continue
+ ;;
+
+ -Wc,*)
+ args=`$echo "X$arg" | $Xsed -e "$sed_quote_subst" -e 's/^-Wc,//'`
+ arg=
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS=','
+ for flag in $args; do
+ IFS="$save_ifs"
+ case "$flag" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ flag="\"$flag\""
+ ;;
+ esac
+ arg="$arg $wl$flag"
+ compiler_flags="$compiler_flags $flag"
+ done
+ IFS="$save_ifs"
+ arg=`$echo "X$arg" | $Xsed -e "s/^ //"`
+ ;;
+
+ -Wl,*)
+ args=`$echo "X$arg" | $Xsed -e "$sed_quote_subst" -e 's/^-Wl,//'`
+ arg=
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS=','
+ for flag in $args; do
+ IFS="$save_ifs"
+ case "$flag" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ flag="\"$flag\""
+ ;;
+ esac
+ arg="$arg $wl$flag"
+ compiler_flags="$compiler_flags $wl$flag"
+ linker_flags="$linker_flags $flag"
+ done
+ IFS="$save_ifs"
+ arg=`$echo "X$arg" | $Xsed -e "s/^ //"`
+ ;;
+
+ -Xcompiler)
+ prev=xcompiler
+ continue
+ ;;
+
+ -Xlinker)
+ prev=xlinker
+ continue
+ ;;
+
+ # Some other compiler flag.
+ -* | +*)
+ # Unknown arguments in both finalize_command and compile_command need
+ # to be aesthetically quoted because they are evaled later.
+ arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+ case "$arg" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ arg="\"$arg\""
+ ;;
+ esac
+ ;;
+
+ *.$objext)
+ # A standard object.
+ objs="$objs $arg"
+ ;;
+
+ *.lo)
+ # A library object.
+ if test "$prev" = dlfiles; then
+ # This file was specified with -dlopen.
+ if test "$build_libtool_libs" = yes && test "$dlopen_support" = yes; then
+ dlfiles="$dlfiles $arg"
+ prev=
+ continue
+ else
+ # If libtool objects are unsupported, then we need to preload.
+ prev=dlprefiles
+ fi
+ fi
+
+ if test "$prev" = dlprefiles; then
+ # Preload the old-style object.
+ dlprefiles="$dlprefiles "`$echo "X$arg" | $Xsed -e "$lo2o"`
+ prev=
+ else
+ libobjs="$libobjs $arg"
+ fi
+ ;;
+
+ *.$libext)
+ # An archive.
+ deplibs="$deplibs $arg"
+ old_deplibs="$old_deplibs $arg"
+ continue
+ ;;
+
+ *.la)
+ # A libtool-controlled library.
+
+ if test "$prev" = dlfiles; then
+ # This library was specified with -dlopen.
+ dlfiles="$dlfiles $arg"
+ prev=
+ elif test "$prev" = dlprefiles; then
+ # The library was specified with -dlpreopen.
+ dlprefiles="$dlprefiles $arg"
+ prev=
+ else
+ deplibs="$deplibs $arg"
+ fi
+ continue
+ ;;
+
+ # Some other compiler argument.
+ *)
+ # Unknown arguments in both finalize_command and compile_command need
+ # to be aesthetically quoted because they are evaled later.
+ arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+ case "$arg" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*|"")
+ arg="\"$arg\""
+ ;;
+ esac
+ ;;
+ esac
+
+ # Now actually substitute the argument into the commands.
+ if test -n "$arg"; then
+ compile_command="$compile_command $arg"
+ finalize_command="$finalize_command $arg"
+ fi
+ done
+
+ if test -n "$prev"; then
+ $echo "$modename: the \`$prevarg' option requires an argument" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ if test "$export_dynamic" = yes && test -n "$export_dynamic_flag_spec"; then
+ eval arg=\"$export_dynamic_flag_spec\"
+ compile_command="$compile_command $arg"
+ finalize_command="$finalize_command $arg"
+ fi
+
+ oldlibs=
+ # calculate the name of the file, without its directory
+ outputname=`$echo "X$output" | $Xsed -e 's%^.*/%%'`
+ libobjs_save="$libobjs"
+
+ if test -n "$shlibpath_var"; then
+ # get the directories listed in $shlibpath_var
+ eval shlib_search_path=\`\$echo \"X \${$shlibpath_var}\" \| \$Xsed -e \'s/:/ /g\'\`
+ else
+ shlib_search_path=
+ fi
+ eval sys_lib_search_path=\"$sys_lib_search_path_spec\"
+ eval sys_lib_dlsearch_path=\"$sys_lib_dlsearch_path_spec\"
+ lib_search_path="$lib_search_path $sys_lib_search_path $shlib_search_path"
+
+ output_objdir=`$echo "X$output" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$output_objdir" = "X$output"; then
+ output_objdir="$objdir"
+ else
+ output_objdir="$output_objdir/$objdir"
+ fi
+ # Create the object directory.
+ if test ! -d $output_objdir; then
+ $show "$mkdir $output_objdir"
+ $run $mkdir $output_objdir
+ status=$?
+ if test $status -ne 0 && test ! -d $output_objdir; then
+ exit $status
+ fi
+ fi
+
+ case "$output" in
+ "")
+ $echo "$modename: you must specify an output file" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ ;;
+ *.$libext)
+ linkmode=oldlib ;;
+ *.lo | *.$objext)
+ linkmode=obj ;;
+ *.la)
+ linkmode=lib ;;
+ *) # Anything else should be a program.
+ linkmode=prog ;;
+ esac
+
+ specialdeplibs=
+ libs=
+ # Find all interdependent deplibs that
+ # are linked more than once (e.g. -la -lb -la)
+ for deplib in $deplibs; do
+ case "$libs " in
+ *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+ esac
+ libs="$libs $deplib"
+ done
+ deplibs=
+ newdependency_libs=
+ uninst_path= # paths that contain uninstalled libtool libraries
+ new_lib_search_path=
+ need_relink=no # whether we're linking any uninstalled libtool libraries
+ case $linkmode in
+ lib)
+ passes="link"
+ for file in $dlfiles $dlprefiles; do
+ case "$file" in
+ *.la) ;;
+ *)
+ $echo "$modename: libraries can \`-dlopen' only libtool libraries" 1>&2
+ exit 1
+ ;;
+ esac
+ done
+ ;;
+ prog)
+ compile_deplibs=
+ finalize_deplibs=
+ alldeplibs=no
+ newdlfiles=
+ newdlprefiles=
+ link_against_libtool_libs=
+ passes="scan dlopen dlpreopen link"
+ ;;
+ *) passes="link"
+ ;;
+ esac
+ for pass in $passes; do
+ if test $linkmode = prog; then
+ case $pass in
+ dlopen) libs="$dlfiles" ;;
+ dlpreopen) libs="$dlprefiles" ;;
+ link) libs="$deplibs %DEPLIBS% $dependency_libs" ;;
+ esac
+ fi
+ if test $pass = dlopen; then
+ # Collect dlpreopened libraries
+ save_deplibs="$deplibs"
+ deplibs=
+ fi
+ for deplib in $libs; do
+ lib=
+ found=no
+ case "$deplib" in
+ -l*)
+ if test $linkmode != lib && test $linkmode != prog; then
+ $echo "$modename: warning: \`-l' is ignored for archives/objects" 1>&2
+ continue
+ fi
+ name=`$echo "X$deplib" | $Xsed -e 's/^-l//'`
+ for searchdir in $lib_search_path; do
+ # Search the libtool library
+ lib="$searchdir/lib${name}.la"
+ if test -f "$lib"; then
+ found=yes
+ break
+ fi
+ done
+ if test "$found" != yes; then
+ if test "$linkmode,$pass" = "prog,link"; then
+ compile_deplibs="$deplib $compile_deplibs"
+ finalize_deplibs="$deplib $finalize_deplibs"
+ else
+ deplibs="$deplib $deplibs"
+ test $linkmode = lib && newdependency_libs="$deplib $newdependency_libs"
+ fi
+ continue
+ fi
+ ;;
+ -L*)
+ case $linkmode in
+ lib)
+ deplibs="$deplib $deplibs"
+ newdependency_libs="$deplib $newdependency_libs"
+ new_lib_search_path="$new_lib_search_path "`$echo "X$deplib" | $Xsed -e 's/^-L//'`
+ ;;
+ prog)
+ if test $pass = scan; then
+ deplibs="$deplib $deplibs"
+ new_lib_search_path="$new_lib_search_path "`$echo "X$deplib" | $Xsed -e 's/^-L//'`
+ else
+ compile_deplibs="$deplib $compile_deplibs"
+ finalize_deplibs="$deplib $finalize_deplibs"
+ fi
+ ;;
+ *)
+ $echo "$modename: warning: \`-L' is ignored for archives/objects" 1>&2
+ ;;
+ esac
+ continue
+ ;;
+ -R*)
+ if test "$linkmode,$pass" = "prog,link"; then
+ dir=`$echo "X$deplib" | $Xsed -e 's/^-R//'`
+ # Make sure the xrpath contains only unique directories.
+ case "$xrpath " in
+ *" $dir "*) ;;
+ *) xrpath="$xrpath $dir" ;;
+ esac
+ fi
+ continue
+ ;;
+ *.la) lib="$deplib" ;;
+ *.$libext)
+ case $linkmode in
+ lib)
+ if test "$deplibs_check_method" != pass_all; then
+ echo
+ echo "*** Warning: This library needs some functionality provided by $deplib."
+ echo "*** I have the capability to make that library automatically link in when"
+ echo "*** you link to this library. But I can only do this if you have a"
+ echo "*** shared version of the library, which you do not appear to have."
+ else
+ echo
+ echo "*** Warning: Linking the shared library $output against the"
+ echo "*** static library $deplib is not portable!"
+ deplibs="$deplib $deplibs"
+ fi
+ continue
+ ;;
+ prog)
+ if test $pass != link; then
+ deplibs="$deplib $deplibs"
+ else
+ compile_deplibs="$deplib $compile_deplibs"
+ finalize_deplibs="$deplib $finalize_deplibs"
+ fi
+ continue
+ ;;
+ esac
+ ;;
+ *.lo | *.$objext)
+ if test $linkmode = prog; then
+ if test $pass = dlpreopen || test "$dlopen_support" != yes || test "$build_libtool_libs" = no; then
+ # If there is no dlopen support or we're linking statically,
+ # we need to preload.
+ newdlprefiles="$newdlprefiles $deplib"
+ compile_deplibs="$deplib $compile_deplibs"
+ finalize_deplibs="$deplib $finalize_deplibs"
+ else
+ newdlfiles="$newdlfiles $deplib"
+ fi
+ fi
+ continue
+ ;;
+ %DEPLIBS%)
+ alldeplibs=yes
+ continue
+ ;;
+ esac
+ if test $found = yes || test -f "$lib"; then :
+ else
+ $echo "$modename: cannot find the library \`$lib'" 1>&2
+ exit 1
+ fi
+
+ # Check to see that this really is a libtool archive.
+ if (sed -e '2q' $lib | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then :
+ else
+ $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+ exit 1
+ fi
+
+ ladir=`$echo "X$lib" | $Xsed -e 's%/[^/]*$%%'`
+ test "X$ladir" = "X$lib" && ladir="."
+
+ dlname=
+ dlopen=
+ dlpreopen=
+ libdir=
+ library_names=
+ old_library=
+ # If the library was installed with an old release of libtool,
+ # it will not redefine variable installed.
+ installed=yes
+
+ # Read the .la file
+ case "$lib" in
+ */* | *\\*) . $lib ;;
+ *) . ./$lib ;;
+ esac
+
+ if test $linkmode = lib || test "$linkmode,$pass" = "prog,scan"; then
+ test -n "$dlopen" && dlfiles="$dlfiles $dlopen"
+ test -n "$dlpreopen" && dlprefiles="$dlprefiles $dlpreopen"
+ fi
+
+ if test $linkmode != lib && test $linkmode != prog; then
+ # only check for convenience libraries
+ if test -z "$old_library"; then
+ $echo "$modename: cannot find name of link library for \`$lib'" 1>&2
+ exit 1
+ fi
+ if test -n "$libdir"; then
+ $echo "$modename: \`$lib' is not a convenience library" 1>&2
+ exit 1
+ fi
+ # It is a libtool convenience library, so add in its objects.
+ convenience="$convenience $ladir/$objdir/$old_library"
+ old_convenience="$old_convenience $ladir/$objdir/$old_library"
+ continue
+ fi
+
+ # Get the name of the library we link against.
+ linklib=
+ for l in $old_library $library_names; do
+ linklib="$l"
+ done
+ if test -z "$linklib"; then
+ $echo "$modename: cannot find name of link library for \`$lib'" 1>&2
+ exit 1
+ fi
+
+ # This library was specified with -dlopen.
+ if test $pass = dlopen; then
+ if test -z "$dlname" || test "$dlopen_support" != yes || test "$build_libtool_libs" = no; then
+ # If there is no dlname, no dlopen support or we're linking statically,
+ # we need to preload.
+ dlprefiles="$dlprefiles $lib"
+ else
+ newdlfiles="$newdlfiles $lib"
+ fi
+ continue
+ fi
+
+ # We need an absolute path.
+ case "$ladir" in
+ [\\/]* | [A-Za-z]:[\\/]*) abs_ladir="$ladir" ;;
+ *)
+ abs_ladir=`cd "$ladir" && pwd`
+ if test -z "$abs_ladir"; then
+ $echo "$modename: warning: cannot determine absolute directory name of \`$ladir'" 1>&2
+ $echo "$modename: passing it literally to the linker, although it might fail" 1>&2
+ abs_ladir="$ladir"
+ fi
+ ;;
+ esac
+ laname=`$echo "X$lib" | $Xsed -e 's%^.*/%%'`
+
+ # Find the relevant object directory and library name.
+ if test "X$installed" = Xyes; then
+ if test ! -f "$libdir/$linklib" && test -f "$abs_ladir/$linklib"; then
+ $echo "$modename: warning: library \`$lib' was moved." 1>&2
+ dir="$ladir"
+ absdir="$abs_ladir"
+ libdir="$abs_ladir"
+ else
+ dir="$libdir"
+ absdir="$libdir"
+ fi
+ else
+ dir="$ladir/$objdir"
+ absdir="$abs_ladir/$objdir"
+ # Remove this search path later
+ uninst_path="$uninst_path $abs_ladir"
+ fi
+ name=`$echo "X$laname" | $Xsed -e 's/\.la$//' -e 's/^lib//'`
+
+ # This library was specified with -dlpreopen.
+ if test $pass = dlpreopen; then
+ # Prefer using a static library (so that no silly _DYNAMIC symbols
+ # are required to link).
+ if test -n "$old_library"; then
+ newdlprefiles="$newdlprefiles $dir/$old_library"
+ else
+ newdlprefiles="$newdlprefiles $dir/$linklib"
+ fi
+ fi
+
+ if test $linkmode = prog && test $pass != link; then
+ new_lib_search_path="$new_lib_search_path $ladir"
+ deplibs="$lib $deplibs"
+
+ linkalldeplibs=no
+ if test "$link_all_deplibs" != no || test "$fast_install" != no || \
+ test "$build_libtool_libs" = no || test -z "$library_names"; then
+ linkalldeplibs=yes
+ fi
+
+ tmp_libs=
+ for deplib in $dependency_libs; do
+ case "$deplib" in
+ -L*) new_lib_search_path="$new_lib_search_path "`$echo "X$deplib" | $Xsed -e 's/^-L//'`;; ### testsuite: skip nested quoting test
+ esac
+ # Need to link against all dependency_libs?
+ if test $linkalldeplibs = yes; then
+ deplibs="$deplib $deplibs"
+ else
+ # Need to hardcode shared library paths
+ # or/and link against static libraries
+ newdependency_libs="$deplib $newdependency_libs"
+ fi
+ case "$tmp_libs " in
+ *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+ esac
+ tmp_libs="$tmp_libs $deplib"
+ done
+ continue
+ fi
+
+ if test -z "$libdir"; then
+ # It is a libtool convenience library, so add in its objects.
+ convenience="$convenience $dir/$old_library"
+ old_convenience="$old_convenience $dir/$old_library"
+ if test $linkmode = lib; then
+ deplibs="$dir/$old_library $deplibs"
+ tmp_libs=
+ for deplib in $dependency_libs; do
+ newdependency_libs="$deplib $newdependency_libs"
+ case "$tmp_libs " in
+ *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+ esac
+ tmp_libs="$tmp_libs $deplib"
+ done
+ elif test "$linkmode,$pass" = "prog,link"; then
+ compile_deplibs="$dir/$old_library $compile_deplibs"
+ finalize_deplibs="$dir/$old_library $finalize_deplibs"
+ fi
+ continue
+ fi
+
+ if test "$linkmode,$pass" = "prog,link"; then
+ if test -n "$library_names" &&
+ { test "$hardcode_into_libs" != all || test "$alldeplibs" != yes; } &&
+ { test "$prefer_static_libs" = no || test -z "$old_library"; }; then
+ # We need to hardcode the library path
+ if test -n "$shlibpath_var"; then
+ # Make sure the rpath contains only unique directories.
+ case "$temp_rpath " in
+ *" $dir "*) ;;
+ *" $absdir "*) ;;
+ *) temp_rpath="$temp_rpath $dir" ;;
+ esac
+ fi
+
+ # Hardcode the library path.
+ # Skip directories that are in the system default run-time
+ # search path.
+ case " $sys_lib_dlsearch_path " in
+ *" $absdir "*) ;;
+ *)
+ case "$compile_rpath " in
+ *" $absdir "*) ;;
+ *) compile_rpath="$compile_rpath $absdir"
+ esac
+ ;;
+ esac
+
+ case " $sys_lib_dlsearch_path " in
+ *" $libdir "*) ;;
+ *)
+ case "$finalize_rpath " in
+ *" $libdir "*) ;;
+ *) finalize_rpath="$finalize_rpath $libdir"
+ esac
+ ;;
+ esac
+ fi
+
+ if test "$alldeplibs" = yes &&
+ { test "$deplibs_check_method" = pass_all ||
+ { test "$build_libtool_libs" = yes &&
+ test -n "$library_names"; }; }; then
+ # Do we only need to link against static libraries?
+ continue
+ fi
+ fi
+
+ link_static=no # Whether this library is linked statically
+ if test -n "$library_names" &&
+ { test "$prefer_static_libs" = no || test -z "$old_library"; }; then
+ link_against_libtool_libs="$link_against_libtool_libs $lib"
+ test "X$installed" = xno && need_relink=yes
+ # This is a shared library
+ if test $linkmode = lib && test "$hardcode_into_libs" = all; then
+ # Hardcode the library path.
+ # Skip directories that are in the system default run-time
+ # search path.
+ case " $sys_lib_dlsearch_path " in
+ *" $absdir "*) ;;
+ *)
+ case "$compile_rpath " in
+ *" $absdir "*) ;;
+ *) compile_rpath="$compile_rpath $absdir"
+ esac
+ ;;
+ esac
+ case " $sys_lib_dlsearch_path " in
+ *" $libdir "*) ;;
+ *)
+ case "$finalize_rpath " in
+ *" $libdir "*) ;;
+ *) finalize_rpath="$finalize_rpath $libdir"
+ esac
+ ;;
+ esac
+ fi
+
+ if test -n "$old_archive_from_expsyms_cmds"; then
+ # figure out the soname
+ set dummy $library_names
+ realname="$2"
+ shift; shift
+ libname=`eval \\$echo \"$libname_spec\"`
+ if test -n "$soname_spec"; then
+ eval soname=\"$soname_spec\"
+ else
+ soname="$realname"
+ fi
+
+ # Make a new name for the extract_expsyms_cmds to use
+ newlib="libimp-`echo $soname | sed 's/^lib//;s/\.dll$//'`.a"
+
+ # If the library has no export list, then create one now
+ if test -f "$output_objdir/$soname-def"; then :
+ else
+ $show "extracting exported symbol list from \`$soname'"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ eval cmds=\"$extract_expsyms_cmds\"
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ fi
+
+ # Create $newlib
+ if test -f "$output_objdir/$newlib"; then :; else
+ $show "generating import library for \`$soname'"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ eval cmds=\"$old_archive_from_expsyms_cmds\"
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ fi
+ # make sure the library variables are pointing to the new library
+ dir=$output_objdir
+ linklib=$newlib
+ fi
+
+ if test $linkmode = prog || test "$mode" != relink; then
+ add_shlibpath=
+ add_dir=
+ add=
+ lib_linked=yes
+ case "$hardcode_action" in
+ immediate | unsupported)
+ if test "$hardcode_direct" = no; then
+ add="$dir/$linklib"
+ elif test "$hardcode_minus_L" = no; then
+ case "$host" in
+ *-*-sunos*) add_shlibpath="$dir" ;;
+ esac
+ add_dir="-L$dir"
+ add="-l$name"
+ elif test "$hardcode_shlibpath_var" = no; then
+ add_shlibpath="$dir"
+ add="-l$name"
+ else
+ lib_linked=no
+ fi
+ ;;
+ relink)
+ if test "$hardcode_direct" = yes; then
+ add="$dir/$linklib"
+ elif test "$hardcode_minus_L" = yes; then
+ add_dir="-L$dir"
+ add="-l$name"
+ elif test "$hardcode_shlibpath_var" = yes; then
+ add_shlibpath="$dir"
+ add="-l$name"
+ else
+ lib_linked=no
+ fi
+ ;;
+ *) lib_linked=no ;;
+ esac
+
+ if test "$lib_linked" != yes; then
+ $echo "$modename: configuration error: unsupported hardcode properties"
+ exit 1
+ fi
+
+ if test -n "$add_shlibpath"; then
+ case ":$compile_shlibpath:" in
+ *":$add_shlibpath:"*) ;;
+ *) compile_shlibpath="$compile_shlibpath$add_shlibpath:" ;;
+ esac
+ fi
+ if test $linkmode = prog; then
+ test -n "$add_dir" && compile_deplibs="$add_dir $compile_deplibs"
+ test -n "$add" && compile_deplibs="$add $compile_deplibs"
+ else
+ test -n "$add_dir" && deplibs="$add_dir $deplibs"
+ test -n "$add" && deplibs="$add $deplibs"
+ if test "$hardcode_direct" != yes && \
+ test "$hardcode_minus_L" != yes && \
+ test "$hardcode_shlibpath_var" = yes; then
+ case ":$finalize_shlibpath:" in
+ *":$libdir:"*) ;;
+ *) finalize_shlibpath="$finalize_shlibpath$libdir:" ;;
+ esac
+ fi
+ fi
+ fi
+
+	  if test $linkmode = prog || test "$mode" = relink; then
+	    add_shlibpath=
+	    add_dir=
+	    add=
+	    # Finalize command for both is simple: just hardcode it.
+	    if test "$hardcode_direct" = yes; then
+	      add="$libdir/$linklib"
+	    elif test "$hardcode_minus_L" = yes; then
+	      add_dir="-L$libdir"
+	      add="-l$name"
+	    elif test "$hardcode_shlibpath_var" = yes; then
+	      case ":$finalize_shlibpath:" in
+	      *":$libdir:"*) ;;
+	      *) finalize_shlibpath="$finalize_shlibpath$libdir:" ;;
+	      esac
+	      add="-l$name"
+	    else
+	      # We cannot seem to hardcode it, guess we'll fake it.
+	      add_dir="-L$libdir"
+	      add="-l$name"
+	    fi
+
+	    if test $linkmode = prog; then
+	      test -n "$add_dir" && finalize_deplibs="$add_dir $finalize_deplibs"
+	      test -n "$add" && finalize_deplibs="$add $finalize_deplibs"
+	    else
+	      test -n "$add_dir" && deplibs="$add_dir $deplibs"
+	      test -n "$add" && deplibs="$add $deplibs"
+	    fi
+	  fi
+ elif test $linkmode = prog; then
+ # Here we assume that one of hardcode_direct or hardcode_minus_L
+ # is not unsupported. This is valid on all known static and
+ # shared platforms.
+ if test "$hardcode_direct" != unsupported; then
+ test -n "$old_library" && linklib="$old_library"
+ compile_deplibs="$dir/$linklib $compile_deplibs"
+ finalize_deplibs="$dir/$linklib $finalize_deplibs"
+ else
+ compile_deplibs="-l$name -L$dir $compile_deplibs"
+ finalize_deplibs="-l$name -L$dir $finalize_deplibs"
+ fi
+ elif test "$build_libtool_libs" = yes; then
+ # Not a shared library
+ if test "$deplibs_check_method" != pass_all; then
+ # We're trying link a shared library against a static one
+ # but the system doesn't support it.
+ # Just print a warning and add the library to dependency_libs so
+ # that the program can be linked against the static library.
+ echo
+ echo "*** Warning: This library needs some functionality provided by $lib."
+ echo "*** I have the capability to make that library automatically link in when"
+ echo "*** you link to this library. But I can only do this if you have a"
+ echo "*** shared version of the library, which you do not appear to have."
+ else
+ convenience="$convenience $dir/$old_library"
+ old_convenience="$old_convenience $dir/$old_library"
+ deplibs="$dir/$old_library $deplibs"
+ link_static=yes
+ fi
+ fi
+
+ if test $linkmode = lib; then
+ if test -n "$dependency_libs" &&
+ { test "$hardcode_into_libs" = no || test $build_old_libs = yes ||
+ test $link_static = yes; }; then
+ # Extract -R from dependency_libs
+ temp_deplibs=
+ for libdir in $dependency_libs; do
+ case "$libdir" in
+ -R*) temp_xrpath=`$echo "X$libdir" | $Xsed -e 's/^-R//'`
+ case " $xrpath " in
+ *" $temp_xrpath "*) ;;
+ *) xrpath="$xrpath $temp_xrpath";;
+ esac;;
+ *) temp_deplibs="$temp_deplibs $libdir";;
+ esac
+ done
+ dependency_libs="$temp_deplibs"
+ fi
+
+ new_lib_search_path="$new_lib_search_path $absdir"
+ # Link against this library
+ test "$link_static" = no && newdependency_libs="$abs_ladir/$laname $newdependency_libs"
+ # ... and its dependency_libs
+ tmp_libs=
+ for deplib in $dependency_libs; do
+ newdependency_libs="$deplib $newdependency_libs"
+ case "$tmp_libs " in
+ *" $deplib "*) specialdeplibs="$specialdeplibs $deplib" ;;
+ esac
+ tmp_libs="$tmp_libs $deplib"
+ done
+
+ if test $link_all_deplibs != no; then
+ # Add the search paths of all dependency libraries
+ for deplib in $dependency_libs; do
+ case "$deplib" in
+ -L*) path="$deplib" ;;
+ *.la)
+ dir=`$echo "X$deplib" | $Xsed -e 's%/[^/]*$%%'`
+ test "X$dir" = "X$deplib" && dir="."
+ # We need an absolute path.
+ case "$dir" in
+ [\\/]* | [A-Za-z]:[\\/]*) absdir="$dir" ;;
+ *)
+ absdir=`cd "$dir" && pwd`
+ if test -z "$absdir"; then
+ $echo "$modename: warning: cannot determine absolute directory name of \`$dir'" 1>&2
+ absdir="$dir"
+ fi
+ ;;
+ esac
+ if grep "^installed=no" $deplib > /dev/null; then
+ path="-L$absdir/$objdir"
+ else
+ eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $deplib`
+ if test -z "$libdir"; then
+ $echo "$modename: \`$deplib' is not a valid libtool archive" 1>&2
+ exit 1
+ fi
+ if test "$absdir" != "$libdir"; then
+ $echo "$modename: warning: \`$deplib' seems to be moved" 1>&2
+ fi
+ path="-L$absdir"
+ fi
+ ;;
+ *) continue ;;
+ esac
+ case " $deplibs " in
+ *" $path "*) ;;
+ *) deplibs="$deplibs $path" ;;
+ esac
+ done
+ fi
+ fi
+ done
+ dependency_libs="$newdependency_libs"
+ if test $pass = dlpreopen; then
+ # Link the dlpreopened libraries before other libraries
+ deplibs="$deplibs $save_deplibs"
+ elif test $pass != dlopen; then
+ # Make sure lib_search_path contains only unique directories.
+ lib_search_path=
+ for dir in $new_lib_search_path; do
+ case "$lib_search_path " in
+ *" $dir "*) ;;
+ *) lib_search_path="$lib_search_path $dir" ;;
+ esac
+ done
+ lib_search_path="$lib_search_path $sys_lib_search_path"
+
+ if test "$linkmode,$pass" != "prog,link"; then
+ vars="deplibs"
+ else
+ vars="compile_deplibs finalize_deplibs"
+ fi
+ for var in $vars dependency_libs; do
+ # Make sure that $var contains only unique libraries
+ # and add them in reverse order
+ eval tmp_libs=\"\$$var\"
+ new_libs=
+ for deplib in $tmp_libs; do
+ case "$deplib" in
+ -L*) new_libs="$deplib $new_libs" ;;
+ *)
+ case " $specialdeplibs " in
+ *" $deplib "*) new_libs="$deplib $new_libs" ;;
+ *)
+ case " $new_libs " in
+ *" $deplib "*) ;;
+ *) new_libs="$deplib $new_libs" ;;
+ esac
+ ;;
+ esac
+ ;;
+ esac
+ done
+ tmp_libs=
+ for deplib in $new_libs; do
+ case "$deplib" in
+ -L*)
+ case " $tmp_libs " in
+ *" $deplib "*) ;;
+ *) tmp_libs="$tmp_libs $deplib" ;;
+ esac
+ ;;
+ *) tmp_libs="$tmp_libs $deplib" ;;
+ esac
+ done
+ eval $var=\"$tmp_libs\"
+ done
+ fi
+ done
+ if test $linkmode = prog; then
+ dlfiles="$newdlfiles"
+ dlprefiles="$newdlprefiles"
+ fi
+
+ case $linkmode in
+ oldlib)
+ if test -n "$deplibs"; then
+ $echo "$modename: warning: \`-l' and \`-L' are ignored for archives" 1>&2
+ fi
+
+ if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+ $echo "$modename: warning: \`-dlopen' is ignored for archives" 1>&2
+ fi
+
+ if test -n "$rpath"; then
+ $echo "$modename: warning: \`-rpath' is ignored for archives" 1>&2
+ fi
+
+ if test -n "$xrpath"; then
+ $echo "$modename: warning: \`-R' is ignored for archives" 1>&2
+ fi
+
+ if test -n "$vinfo"; then
+ $echo "$modename: warning: \`-version-info' is ignored for archives" 1>&2
+ fi
+
+ if test -n "$release"; then
+ $echo "$modename: warning: \`-release' is ignored for archives" 1>&2
+ fi
+
+ if test -n "$export_symbols" || test -n "$export_symbols_regex"; then
+ $echo "$modename: warning: \`-export-symbols' is ignored for archives" 1>&2
+ fi
+
+ # Now set the variables for building old libraries.
+ build_libtool_libs=no
+ oldlibs="$output"
+ objs="$objs$old_deplibs"
+ ;;
+
+ lib)
+ # Make sure we only generate libraries of the form `libNAME.la'.
+ case "$outputname" in
+ lib*)
+ name=`$echo "X$outputname" | $Xsed -e 's/\.la$//' -e 's/^lib//'`
+ eval libname=\"$libname_spec\"
+ ;;
+ *)
+ if test "$module" = no; then
+ $echo "$modename: libtool library \`$output' must begin with \`lib'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+ if test "$need_lib_prefix" != no; then
+ # Add the "lib" prefix for modules if required
+ name=`$echo "X$outputname" | $Xsed -e 's/\.la$//'`
+ eval libname=\"$libname_spec\"
+ else
+ libname=`$echo "X$outputname" | $Xsed -e 's/\.la$//'`
+ fi
+ ;;
+ esac
+
+ if test -n "$objs"; then
+ if test "$deplibs_check_method" != pass_all; then
+ $echo "$modename: cannot build libtool library \`$output' from non-libtool objects on this host:$objs" 2>&1
+ exit 1
+ else
+ echo
+ echo "*** Warning: Linking the shared library $output against the non-libtool"
+ echo "*** objects $objs is not portable!"
+ libobjs="$libobjs $objs"
+ fi
+ fi
+
+ if test "$dlself" != no; then
+ $echo "$modename: warning: \`-dlopen self' is ignored for libtool libraries" 1>&2
+ fi
+
+ set dummy $rpath
+ if test $# -gt 2; then
+ $echo "$modename: warning: ignoring multiple \`-rpath's for a libtool library" 1>&2
+ fi
+ install_libdir="$2"
+
+ oldlibs=
+ if test -z "$rpath"; then
+ if test "$build_libtool_libs" = yes; then
+ # Building a libtool convenience library.
+ libext=al
+ oldlibs="$output_objdir/$libname.$libext $oldlibs"
+ build_libtool_libs=convenience
+ build_old_libs=yes
+ fi
+
+ if test -n "$vinfo"; then
+ $echo "$modename: warning: \`-version-info' is ignored for convenience libraries" 1>&2
+ fi
+
+ if test -n "$release"; then
+ $echo "$modename: warning: \`-release' is ignored for convenience libraries" 1>&2
+ fi
+ else
+
+ # Parse the version information argument.
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS=':'
+ set dummy $vinfo 0 0 0
+ IFS="$save_ifs"
+
+ if test -n "$8"; then
+ $echo "$modename: too many parameters to \`-version-info'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ current="$2"
+ revision="$3"
+ age="$4"
+
+ # Check that each of the things are valid numbers.
+ case "$current" in
+ 0 | [1-9] | [1-9][0-9]*) ;;
+ *)
+ $echo "$modename: CURRENT \`$current' is not a nonnegative integer" 1>&2
+ $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+ exit 1
+ ;;
+ esac
+
+ case "$revision" in
+ 0 | [1-9] | [1-9][0-9]*) ;;
+ *)
+ $echo "$modename: REVISION \`$revision' is not a nonnegative integer" 1>&2
+ $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+ exit 1
+ ;;
+ esac
+
+ case "$age" in
+ 0 | [1-9] | [1-9][0-9]*) ;;
+ *)
+ $echo "$modename: AGE \`$age' is not a nonnegative integer" 1>&2
+ $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+ exit 1
+ ;;
+ esac
+
+ if test $age -gt $current; then
+ $echo "$modename: AGE \`$age' is greater than the current interface number \`$current'" 1>&2
+ $echo "$modename: \`$vinfo' is not valid version information" 1>&2
+ exit 1
+ fi
+
+ # Calculate the version variables.
+ major=
+ versuffix=
+ verstring=
+ case "$version_type" in
+ none) ;;
+
+ irix)
+ major=`expr $current - $age + 1`
+ versuffix="$major.$revision"
+ verstring="sgi$major.$revision"
+
+ # Add in all the interfaces that we are compatible with.
+ loop=$revision
+ while test $loop != 0; do
+ iface=`expr $revision - $loop`
+ loop=`expr $loop - 1`
+ verstring="sgi$major.$iface:$verstring"
+ done
+ ;;
+
+ linux)
+ major=.`expr $current - $age`
+ versuffix="$major.$age.$revision"
+ ;;
+
+ osf)
+ major=`expr $current - $age`
+ versuffix=".$current.$age.$revision"
+ verstring="$current.$age.$revision"
+
+ # Add in all the interfaces that we are compatible with.
+ loop=$age
+ while test $loop != 0; do
+ iface=`expr $current - $loop`
+ loop=`expr $loop - 1`
+ verstring="$verstring:${iface}.0"
+ done
+
+ # Make executables depend on our current version.
+ verstring="$verstring:${current}.0"
+ ;;
+
+ sunos)
+ major=".$current"
+ versuffix=".$current.$revision"
+ ;;
+
+ freebsd-aout)
+ major=".$current"
+ versuffix=".$current.$revision";
+ ;;
+
+ freebsd-elf)
+ major=".$current"
+ versuffix=".$current";
+ ;;
+
+ windows)
+ # Like Linux, but with '-' rather than '.', since we only
+ # want one extension on Windows 95.
+ major=`expr $current - $age`
+ versuffix="-$major-$age-$revision"
+ ;;
+
+ *)
+ $echo "$modename: unknown library version type \`$version_type'" 1>&2
+ echo "Fatal configuration error. See the $PACKAGE docs for more information." 1>&2
+ exit 1
+ ;;
+ esac
+
+ # Clear the version info if we defaulted, and they specified a release.
+ if test -z "$vinfo" && test -n "$release"; then
+ major=
+ verstring="0.0"
+ if test "$need_version" = no; then
+ versuffix=
+ else
+ versuffix=".0.0"
+ fi
+ fi
+
+ # Remove version info from name if versioning should be avoided
+ if test "$avoid_version" = yes && test "$need_version" = no; then
+ major=
+ versuffix=
+ verstring=""
+ fi
+
+ # Check to see if the archive will have undefined symbols.
+ if test "$allow_undefined" = yes; then
+ if test "$allow_undefined_flag" = unsupported; then
+ $echo "$modename: warning: undefined symbols not allowed in $host shared libraries" 1>&2
+ build_libtool_libs=no
+ build_old_libs=yes
+ fi
+ else
+ # Don't allow undefined symbols.
+ allow_undefined_flag="$no_undefined_flag"
+ fi
+ fi
+
+ if test "$mode" != relink; then
+ # Remove our outputs.
+ $show "${rm}r $output_objdir/$outputname $output_objdir/$libname.* $output_objdir/${libname}${release}.*"
+ $run ${rm}r $output_objdir/$outputname $output_objdir/$libname.* $output_objdir/${libname}${release}.*
+ fi
+
+ # Now set the variables for building old libraries.
+ if test "$build_old_libs" = yes && test "$build_libtool_libs" != convenience ; then
+ oldlibs="$oldlibs $output_objdir/$libname.$libext"
+
+ # Transform .lo files to .o files.
+ oldobjs="$objs "`$echo "X$libobjs" | $SP2NL | $Xsed -e '/\.'${libext}'$/d' -e "$lo2o" | $NL2SP`
+ fi
+
+ # Eliminate all temporary directories.
+ for path in $uninst_path; do
+ lib_search_path=`echo "$lib_search_path " | sed -e 's% $path % %g'`
+ deplibs=`echo "$deplibs " | sed -e 's% -L$path % %g'`
+ dependency_libs=`echo "$dependency_libs " | sed -e 's% -L$path % %g'`
+ done
+
+ if test -n "$xrpath"; then
+ # If the user specified any rpath flags, then add them.
+ temp_xrpath=
+ for libdir in $xrpath; do
+ temp_xrpath="$temp_xrpath -R$libdir"
+ case "$finalize_rpath " in
+ *" $libdir "*) ;;
+ *) finalize_rpath="$finalize_rpath $libdir" ;;
+ esac
+ done
+ if test "$hardcode_into_libs" = no || test $build_old_libs = yes; then
+ dependency_libs="$temp_xrpath $dependency_libs"
+ fi
+ fi
+
+ # Make sure dlfiles contains only unique files that won't be dlpreopened
+ old_dlfiles="$dlfiles"
+ dlfiles=
+ for lib in $old_dlfiles; do
+ case " $dlprefiles $dlfiles " in
+ *" $lib "*) ;;
+ *) dlfiles="$dlfiles $lib" ;;
+ esac
+ done
+
+ # Make sure dlprefiles contains only unique files
+ old_dlprefiles="$dlprefiles"
+ dlprefiles=
+ for lib in $old_dlprefiles; do
+ case "$dlprefiles " in
+ *" $lib "*) ;;
+ *) dlprefiles="$dlprefiles $lib" ;;
+ esac
+ done
+
+ if test "$build_libtool_libs" = yes; then
+ if test -n "$rpath"; then
+ case "$host" in
+ *-*-cygwin* | *-*-mingw* | *-*-os2* | *-*-beos*)
+ # these systems don't actually have a c library (as such)!
+ ;;
+ *)
+ # Add libc to deplibs on all other systems.
+ deplibs="$deplibs -lc"
+ ;;
+ esac
+ fi
+
+ # Transform deplibs into only deplibs that can be linked in shared.
+ name_save=$name
+ libname_save=$libname
+ release_save=$release
+ versuffix_save=$versuffix
+ major_save=$major
+ # I'm not sure if I'm treating the release correctly. I think
+ # release should show up in the -l (ie -lgmp5) so we don't want to
+ # add it in twice. Is that correct?
+ release=""
+ versuffix=""
+ major=""
+ newdeplibs=
+ droppeddeps=no
+ case "$deplibs_check_method" in
+ pass_all)
+ # Don't check for shared/static. Everything works.
+ # This might be a little naive. We might want to check
+ # whether the library exists or not. But this is on
+ # osf3 & osf4 and I'm not really sure... Just
+ # implementing what was already the behaviour.
+ newdeplibs=$deplibs
+ ;;
+ test_compile)
+ # This code stresses the "libraries are programs" paradigm to its
+ # limits. Maybe even breaks it. We compile a program, linking it
+ # against the deplibs as a proxy for the library. Then we can check
+ # whether they linked in statically or dynamically with ldd.
+ $rm conftest.c
+ cat > conftest.c <<EOF
+ int main() { return 0; }
+EOF
+ $rm conftest
+ $CC -o conftest conftest.c $deplibs
+ if test $? -eq 0 ; then
+ ldd_output=`ldd conftest`
+ for i in $deplibs; do
+ name="`expr $i : '-l\(.*\)'`"
+ # If $name is empty we are operating on a -L argument.
+ if test "$name" != "" ; then
+ libname=`eval \\$echo \"$libname_spec\"`
+ deplib_matches=`eval \\$echo \"$library_names_spec\"`
+ set dummy $deplib_matches
+ deplib_match=$2
+ if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0 ; then
+ newdeplibs="$newdeplibs $i"
+ else
+ droppeddeps=yes
+ echo
+ echo "*** Warning: This library needs some functionality provided by $i."
+ echo "*** I have the capability to make that library automatically link in when"
+ echo "*** you link to this library. But I can only do this if you have a"
+ echo "*** shared version of the library, which you do not appear to have."
+ fi
+ else
+ newdeplibs="$newdeplibs $i"
+ fi
+ done
+ else
+	      # Error occurred in the first compile.  Let's try to salvage the situation:
+	      # Compile a separate program for each library.
+ for i in $deplibs; do
+ name="`expr $i : '-l\(.*\)'`"
+ # If $name is empty we are operating on a -L argument.
+ if test "$name" != "" ; then
+ $rm conftest
+ $CC -o conftest conftest.c $i
+ # Did it work?
+ if test $? -eq 0 ; then
+ ldd_output=`ldd conftest`
+ libname=`eval \\$echo \"$libname_spec\"`
+ deplib_matches=`eval \\$echo \"$library_names_spec\"`
+ set dummy $deplib_matches
+ deplib_match=$2
+ if test `expr "$ldd_output" : ".*$deplib_match"` -ne 0 ; then
+ newdeplibs="$newdeplibs $i"
+ else
+ droppeddeps=yes
+ echo
+ echo "*** Warning: This library needs some functionality provided by $i."
+ echo "*** I have the capability to make that library automatically link in when"
+ echo "*** you link to this library. But I can only do this if you have a"
+ echo "*** shared version of the library, which you do not appear to have."
+ fi
+ else
+ droppeddeps=yes
+ echo
+ echo "*** Warning! Library $i is needed by this library but I was not able to"
+ echo "*** make it link in! You will probably need to install it or some"
+ echo "*** library that it depends on before this library will be fully"
+ echo "*** functional. Installing it before continuing would be even better."
+ fi
+ else
+ newdeplibs="$newdeplibs $i"
+ fi
+ done
+ fi
+ ;;
+ file_magic*)
+ set dummy $deplibs_check_method
+ file_magic_regex=`expr "$deplibs_check_method" : "$2 \(.*\)"`
+ for a_deplib in $deplibs; do
+ name="`expr $a_deplib : '-l\(.*\)'`"
+ # If $name is empty we are operating on a -L argument.
+ if test "$name" != "" ; then
+ libname=`eval \\$echo \"$libname_spec\"`
+ for i in $lib_search_path; do
+ potential_libs=`ls $i/$libname[.-]* 2>/dev/null`
+ for potent_lib in $potential_libs; do
+ # Follow soft links.
+ if ls -lLd "$potent_lib" 2>/dev/null \
+ | grep " -> " >/dev/null; then
+ continue
+ fi
+ # The statement above tries to avoid entering an
+ # endless loop below, in case of cyclic links.
+ # We might still enter an endless loop, since a link
+ # loop can be closed while we follow links,
+ # but so what?
+ potlib="$potent_lib"
+ while test -h "$potlib" 2>/dev/null; do
+ potliblink=`ls -ld $potlib | sed 's/.* -> //'`
+ case "$potliblink" in
+ [\\/]* | [A-Za-z]:[\\/]*) potlib="$potliblink";;
+ *) potlib=`$echo "X$potlib" | $Xsed -e 's,[^/]*$,,'`"$potliblink";;
+ esac
+ done
+ if eval $file_magic_cmd \"\$potlib\" 2>/dev/null \
+ | sed 10q \
+ | egrep "$file_magic_regex" > /dev/null; then
+ newdeplibs="$newdeplibs $a_deplib"
+ a_deplib=""
+ break 2
+ fi
+ done
+ done
+ if test -n "$a_deplib" ; then
+ droppeddeps=yes
+ echo
+ echo "*** Warning: This library needs some functionality provided by $a_deplib."
+ echo "*** I have the capability to make that library automatically link in when"
+ echo "*** you link to this library. But I can only do this if you have a"
+ echo "*** shared version of the library, which you do not appear to have."
+ fi
+ else
+ # Add a -L argument.
+ newdeplibs="$newdeplibs $a_deplib"
+ fi
+ done # Gone through all deplibs.
+ ;;
+ none | unknown | *)
+ newdeplibs=""
+ if $echo "X $deplibs" | $Xsed -e 's/ -lc$//' \
+ -e 's/ -[LR][^ ]*//g' -e 's/[ ]//g' |
+ grep . >/dev/null; then
+ echo
+ if test "X$deplibs_check_method" = "Xnone"; then
+ echo "*** Warning: inter-library dependencies are not supported in this platform."
+ else
+ echo "*** Warning: inter-library dependencies are not known to be supported."
+ fi
+ echo "*** All declared inter-library dependencies are being dropped."
+ droppeddeps=yes
+ fi
+ ;;
+ esac
+ versuffix=$versuffix_save
+ major=$major_save
+ release=$release_save
+ libname=$libname_save
+ name=$name_save
+
+ if test "$droppeddeps" = yes; then
+ if test "$module" = yes; then
+ echo
+ echo "*** Warning: libtool could not satisfy all declared inter-library"
+ echo "*** dependencies of module $libname. Therefore, libtool will create"
+ echo "*** a static module, that should work as long as the dlopening"
+ echo "*** application is linked with the -dlopen flag."
+ if test -z "$global_symbol_pipe"; then
+ echo
+ echo "*** However, this would only work if libtool was able to extract symbol"
+ echo "*** lists from a program, using \`nm' or equivalent, but libtool could"
+ echo "*** not find such a program. So, this module is probably useless."
+ echo "*** \`nm' from GNU binutils and a full rebuild may help."
+ fi
+ if test "$build_old_libs" = no; then
+ oldlibs="$output_objdir/$libname.$libext"
+ build_libtool_libs=module
+ build_old_libs=yes
+ else
+ build_libtool_libs=no
+ fi
+ else
+ echo "*** The inter-library dependencies that have been dropped here will be"
+ echo "*** automatically added whenever a program is linked with this library"
+ echo "*** or is declared to -dlopen it."
+ fi
+ fi
+ # Done checking deplibs!
+ deplibs=$newdeplibs
+ fi
+
+ # All the library-specific variables (install_libdir is set above).
+ library_names=
+ old_library=
+ dlname=
+
+ # Test again, we may have decided not to build it any more
+ if test "$build_libtool_libs" = yes; then
+ if test "$hardcode_into_libs" != no; then
+ # Hardcode the library paths
+ hardcode_libdirs=
+ dep_rpath=
+ rpath="$finalize_rpath"
+ test "$mode" != relink && rpath="$compile_rpath$rpath"
+ for libdir in $rpath; do
+ if test -n "$hardcode_libdir_flag_spec"; then
+ if test -n "$hardcode_libdir_separator"; then
+ if test -z "$hardcode_libdirs"; then
+ hardcode_libdirs="$libdir"
+ else
+ # Just accumulate the unique libdirs.
+ case "$hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator" in
+ *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+ ;;
+ *)
+ hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
+ ;;
+ esac
+ fi
+ else
+ eval flag=\"$hardcode_libdir_flag_spec\"
+ dep_rpath="$dep_rpath $flag"
+ fi
+ elif test -n "$runpath_var"; then
+ case "$perm_rpath " in
+ *" $libdir "*) ;;
+ *) perm_rpath="$perm_rpath $libdir" ;;
+ esac
+ fi
+ done
+ # Substitute the hardcoded libdirs into the rpath.
+ if test -n "$hardcode_libdir_separator" &&
+ test -n "$hardcode_libdirs"; then
+ libdir="$hardcode_libdirs"
+ eval dep_rpath=\"$hardcode_libdir_flag_spec\"
+ fi
+ if test -n "$runpath_var" && test -n "$perm_rpath"; then
+ # We should set the runpath_var.
+ rpath=
+ for dir in $perm_rpath; do
+ rpath="$rpath$dir:"
+ done
+ eval "$runpath_var='$rpath\$$runpath_var'; export $runpath_var"
+ fi
+ test -n "$dep_rpath" && deplibs="$dep_rpath $deplibs"
+ fi
+
+ shlibpath="$finalize_shlibpath"
+ test "$mode" != relink && shlibpath="$compile_shlibpath$shlibpath"
+ if test -n "$shlibpath"; then
+ eval "$shlibpath_var='$shlibpath\$$shlibpath_var'; export $shlibpath_var"
+ fi
+
+ # Get the real and link names of the library.
+ eval library_names=\"$library_names_spec\"
+ set dummy $library_names
+ realname="$2"
+ shift; shift
+
+ if test -n "$soname_spec"; then
+ eval soname=\"$soname_spec\"
+ else
+ soname="$realname"
+ fi
+
+ lib="$output_objdir/$realname"
+ for link
+ do
+ linknames="$linknames $link"
+ done
+
+ # Ensure that we have .o objects for linkers which dislike .lo
+ # (e.g. aix) in case we are running --disable-static
+ for obj in $libobjs; do
+ xdir=`$echo "X$obj" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$xdir" = "X$obj"; then
+ xdir="."
+ else
+ xdir="$xdir"
+ fi
+ baseobj=`$echo "X$obj" | $Xsed -e 's%^.*/%%'`
+ oldobj=`$echo "X$baseobj" | $Xsed -e "$lo2o"`
+ if test ! -f $xdir/$oldobj; then
+ $show "(cd $xdir && ${LN_S} $baseobj $oldobj)"
+ $run eval '(cd $xdir && ${LN_S} $baseobj $oldobj)' || exit $?
+ fi
+ done
+
+ # Use standard objects if they are pic
+ test -z "$pic_flag" && libobjs=`$echo "X$libobjs" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP`
+
+ # Prepare the list of exported symbols
+ if test -z "$export_symbols"; then
+ if test "$always_export_symbols" = yes || test -n "$export_symbols_regex"; then
+ $show "generating symbol list for \`$libname.la'"
+ export_symbols="$output_objdir/$libname.exp"
+ $run $rm $export_symbols
+ eval cmds=\"$export_symbols_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ if test -n "$export_symbols_regex"; then
+ $show "egrep -e \"$export_symbols_regex\" \"$export_symbols\" > \"${export_symbols}T\""
+ $run eval 'egrep -e "$export_symbols_regex" "$export_symbols" > "${export_symbols}T"'
+ $show "$mv \"${export_symbols}T\" \"$export_symbols\""
+ $run eval '$mv "${export_symbols}T" "$export_symbols"'
+ fi
+ fi
+ fi
+
+ if test -n "$export_symbols" && test -n "$include_expsyms"; then
+ $run eval '$echo "X$include_expsyms" | $SP2NL >> "$export_symbols"'
+ fi
+
+ if test -n "$convenience"; then
+ if test -n "$whole_archive_flag_spec"; then
+ eval libobjs=\"\$libobjs $whole_archive_flag_spec\"
+ else
+ gentop="$output_objdir/${outputname}x"
+ $show "${rm}r $gentop"
+ $run ${rm}r "$gentop"
+ $show "mkdir $gentop"
+ $run mkdir "$gentop"
+ status=$?
+ if test $status -ne 0 && test ! -d "$gentop"; then
+ exit $status
+ fi
+ generated="$generated $gentop"
+
+ for xlib in $convenience; do
+ # Extract the objects.
+ case "$xlib" in
+ [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;;
+ *) xabs=`pwd`"/$xlib" ;;
+ esac
+ xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'`
+ xdir="$gentop/$xlib"
+
+ $show "${rm}r $xdir"
+ $run ${rm}r "$xdir"
+ $show "mkdir $xdir"
+ $run mkdir "$xdir"
+ status=$?
+ if test $status -ne 0 && test ! -d "$xdir"; then
+ exit $status
+ fi
+ $show "(cd $xdir && $AR x $xabs)"
+ $run eval "(cd \$xdir && $AR x \$xabs)" || exit $?
+
+ libobjs="$libobjs "`find $xdir -name \*.o -print -o -name \*.lo -print | $NL2SP`
+ done
+ fi
+ fi
+
+ if test "$thread_safe" = yes && test -n "$thread_safe_flag_spec"; then
+ eval flag=\"$thread_safe_flag_spec\"
+ linker_flags="$linker_flags $flag"
+ fi
+
+ # Make a backup of the uninstalled library when relinking
+ if test "$mode" = relink && test "$hardcode_into_libs" = all; then
+ $run eval '(cd $output_objdir && $rm ${realname}U && $mv $realname ${realname}U)' || exit $?
+ fi
+
+ # Do each of the archive commands.
+ if test -n "$export_symbols" && test -n "$archive_expsym_cmds"; then
+ eval cmds=\"$archive_expsym_cmds\"
+ else
+ eval cmds=\"$archive_cmds\"
+ fi
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+
+ # Restore the uninstalled library and exit
+ if test "$mode" = relink && test "$hardcode_into_libs" = all; then
+ $run eval '(cd $output_objdir && $rm ${realname}T && $mv $realname ${realname}T && $mv "$realname"U $realname)' || exit $?
+ exit 0
+ fi
+
+ # Create links to the real library.
+ for linkname in $linknames; do
+ if test "$realname" != "$linkname"; then
+ $show "(cd $output_objdir && $rm $linkname && $LN_S $realname $linkname)"
+ $run eval '(cd $output_objdir && $rm $linkname && $LN_S $realname $linkname)' || exit $?
+ fi
+ done
+
+ # If -module or -export-dynamic was specified, set the dlname.
+ if test "$module" = yes || test "$export_dynamic" = yes; then
+ # On all known operating systems, these are identical.
+ dlname="$soname"
+ fi
+ fi
+ ;;
+
+ obj)
+ if test -n "$deplibs"; then
+ $echo "$modename: warning: \`-l' and \`-L' are ignored for objects" 1>&2
+ fi
+
+ if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+ $echo "$modename: warning: \`-dlopen' is ignored for objects" 1>&2
+ fi
+
+ if test -n "$rpath"; then
+ $echo "$modename: warning: \`-rpath' is ignored for objects" 1>&2
+ fi
+
+ if test -n "$xrpath"; then
+ $echo "$modename: warning: \`-R' is ignored for objects" 1>&2
+ fi
+
+ if test -n "$vinfo"; then
+ $echo "$modename: warning: \`-version-info' is ignored for objects" 1>&2
+ fi
+
+ if test -n "$release"; then
+ $echo "$modename: warning: \`-release' is ignored for objects" 1>&2
+ fi
+
+ case "$output" in
+ *.lo)
+ if test -n "$objs$old_deplibs"; then
+ $echo "$modename: cannot build library object \`$output' from non-libtool objects" 1>&2
+ exit 1
+ fi
+ libobj="$output"
+ obj=`$echo "X$output" | $Xsed -e "$lo2o"`
+ ;;
+ *)
+ libobj=
+ obj="$output"
+ ;;
+ esac
+
+ # Delete the old objects.
+ $run $rm $obj $libobj
+
+ # Objects from convenience libraries. This assumes
+ # single-version convenience libraries. Whenever we create
+      # different ones for PIC/non-PIC, then we'll have to duplicate
+ # the extraction.
+ reload_conv_objs=
+ gentop=
+ # reload_cmds runs $LD directly, so let us get rid of
+ # -Wl from whole_archive_flag_spec
+ wl=
+
+ if test -n "$convenience"; then
+ if test -n "$whole_archive_flag_spec"; then
+ eval reload_conv_objs=\"\$reload_objs $whole_archive_flag_spec\"
+ else
+ gentop="$output_objdir/${obj}x"
+ $show "${rm}r $gentop"
+ $run ${rm}r "$gentop"
+ $show "mkdir $gentop"
+ $run mkdir "$gentop"
+ status=$?
+ if test $status -ne 0 && test ! -d "$gentop"; then
+ exit $status
+ fi
+ generated="$generated $gentop"
+
+ for xlib in $convenience; do
+ # Extract the objects.
+ case "$xlib" in
+ [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;;
+ *) xabs=`pwd`"/$xlib" ;;
+ esac
+ xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'`
+ xdir="$gentop/$xlib"
+
+ $show "${rm}r $xdir"
+ $run ${rm}r "$xdir"
+ $show "mkdir $xdir"
+ $run mkdir "$xdir"
+ status=$?
+ if test $status -ne 0 && test ! -d "$xdir"; then
+ exit $status
+ fi
+ $show "(cd $xdir && $AR x $xabs)"
+ $run eval "(cd \$xdir && $AR x \$xabs)" || exit $?
+
+ reload_conv_objs="$reload_objs "`find $xdir -name \*.o -print -o -name \*.lo -print | $NL2SP`
+ done
+ fi
+ fi
+
+ # Create the old-style object.
+ reload_objs="$objs$old_deplibs "`$echo "X$libobjs" | $SP2NL | $Xsed -e '/\.'${libext}$'/d' -e '/\.lib$/d' -e "$lo2o" | $NL2SP`" $reload_conv_objs" ### testsuite: skip nested quoting test
+
+ output="$obj"
+ eval cmds=\"$reload_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+
+ # Exit if we aren't doing a library object file.
+ if test -z "$libobj"; then
+ if test -n "$gentop"; then
+ $show "${rm}r $gentop"
+ $run ${rm}r $gentop
+ fi
+
+ exit 0
+ fi
+
+ if test "$build_libtool_libs" != yes; then
+ if test -n "$gentop"; then
+ $show "${rm}r $gentop"
+ $run ${rm}r $gentop
+ fi
+
+ # Create an invalid libtool object if no PIC, so that we don't
+ # accidentally link it into a program.
+ $show "echo timestamp > $libobj"
+ $run eval "echo timestamp > $libobj" || exit $?
+ exit 0
+ fi
+
+ if test -n "$pic_flag" || test "$pic_mode" != default; then
+ # Only do commands if we really have different PIC objects.
+ reload_objs="$libobjs $reload_conv_objs"
+ output="$libobj"
+ eval cmds=\"$reload_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ else
+ # Just create a symlink.
+ $show $rm $libobj
+ $run $rm $libobj
+ xdir=`$echo "X$libobj" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$xdir" = "X$libobj"; then
+ xdir="."
+ else
+ xdir="$xdir"
+ fi
+ baseobj=`$echo "X$libobj" | $Xsed -e 's%^.*/%%'`
+ oldobj=`$echo "X$baseobj" | $Xsed -e "$lo2o"`
+ $show "(cd $xdir && $LN_S $oldobj $baseobj)"
+ $run eval '(cd $xdir && $LN_S $oldobj $baseobj)' || exit $?
+ fi
+
+ if test -n "$gentop"; then
+ $show "${rm}r $gentop"
+ $run ${rm}r $gentop
+ fi
+
+ exit 0
+ ;;
+
+ prog)
+ if test -n "$vinfo"; then
+ $echo "$modename: warning: \`-version-info' is ignored for programs" 1>&2
+ fi
+
+ if test -n "$release"; then
+ $echo "$modename: warning: \`-release' is ignored for programs" 1>&2
+ fi
+
+ if test "$preload" = yes; then
+ if test "$dlopen_support" = unknown && test "$dlopen_self" = unknown &&
+ test "$dlopen_self_static" = unknown; then
+ $echo "$modename: warning: \`AC_LIBTOOL_DLOPEN' not used. Assuming no dlopen support."
+ fi
+ fi
+
+ compile_command="$compile_command $compile_deplibs"
+ finalize_command="$finalize_command $finalize_deplibs"
+
+ if test -n "$rpath$xrpath"; then
+ # If the user specified any rpath flags, then add them.
+ for libdir in $rpath $xrpath; do
+ # This is the magic to use -rpath.
+ case "$finalize_rpath " in
+ *" $libdir "*) ;;
+ *) finalize_rpath="$finalize_rpath $libdir" ;;
+ esac
+ done
+ fi
+
+ # Now hardcode the library paths
+ rpath=
+ hardcode_libdirs=
+ for libdir in $compile_rpath $finalize_rpath; do
+ if test -n "$hardcode_libdir_flag_spec"; then
+ if test -n "$hardcode_libdir_separator"; then
+ if test -z "$hardcode_libdirs"; then
+ hardcode_libdirs="$libdir"
+ else
+ # Just accumulate the unique libdirs.
+ case "$hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator" in
+ *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+ ;;
+ *)
+ hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
+ ;;
+ esac
+ fi
+ else
+ eval flag=\"$hardcode_libdir_flag_spec\"
+ rpath="$rpath $flag"
+ fi
+ elif test -n "$runpath_var"; then
+ case "$perm_rpath " in
+ *" $libdir "*) ;;
+ *) perm_rpath="$perm_rpath $libdir" ;;
+ esac
+ fi
+ case "$host" in
+ *-*-cygwin* | *-*-mingw* | *-*-os2*)
+ case ":$dllsearchpath:" in
+ *":$libdir:"*) ;;
+ *) dllsearchpath="$dllsearchpath:$libdir";;
+ esac
+ ;;
+ esac
+ done
+ # Substitute the hardcoded libdirs into the rpath.
+ if test -n "$hardcode_libdir_separator" &&
+ test -n "$hardcode_libdirs"; then
+ libdir="$hardcode_libdirs"
+ eval rpath=\" $hardcode_libdir_flag_spec\"
+ fi
+ compile_rpath="$rpath"
+
+ rpath=
+ hardcode_libdirs=
+ for libdir in $finalize_rpath; do
+ if test -n "$hardcode_libdir_flag_spec"; then
+ if test -n "$hardcode_libdir_separator"; then
+ if test -z "$hardcode_libdirs"; then
+ hardcode_libdirs="$libdir"
+ else
+ # Just accumulate the unique libdirs.
+ case "$hardcode_libdir_separator$hardcode_libdirs$hardcode_libdir_separator" in
+ *"$hardcode_libdir_separator$libdir$hardcode_libdir_separator"*)
+ ;;
+ *)
+ hardcode_libdirs="$hardcode_libdirs$hardcode_libdir_separator$libdir"
+ ;;
+ esac
+ fi
+ else
+ eval flag=\"$hardcode_libdir_flag_spec\"
+ rpath="$rpath $flag"
+ fi
+ elif test -n "$runpath_var"; then
+ case "$finalize_perm_rpath " in
+ *" $libdir "*) ;;
+ *) finalize_perm_rpath="$finalize_perm_rpath $libdir" ;;
+ esac
+ fi
+ done
+ # Substitute the hardcoded libdirs into the rpath.
+ if test -n "$hardcode_libdir_separator" &&
+ test -n "$hardcode_libdirs"; then
+ libdir="$hardcode_libdirs"
+ eval rpath=\" $hardcode_libdir_flag_spec\"
+ fi
+ finalize_rpath="$rpath"
+
+ if test -n "$libobjs" && test "$build_old_libs" = yes; then
+ # Transform all the library objects into standard objects.
+ compile_command=`$echo "X$compile_command" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP`
+ finalize_command=`$echo "X$finalize_command" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP`
+ fi
+
+ dlsyms=
+ if test -n "$dlfiles$dlprefiles" || test "$dlself" != no; then
+ if test -n "$NM" && test -n "$global_symbol_pipe"; then
+ dlsyms="${outputname}S.c"
+ else
+ $echo "$modename: not configured to extract global symbols from dlpreopened files" 1>&2
+ fi
+ fi
+
+ if test -n "$dlsyms"; then
+ case "$dlsyms" in
+ "") ;;
+ *.c)
+ # Discover the nlist of each of the dlfiles.
+ nlist="$output_objdir/${outputname}.nm"
+
+ $show "$rm $nlist ${nlist}S ${nlist}T"
+ $run $rm "$nlist" "${nlist}S" "${nlist}T"
+
+ # Parse the name list into a source file.
+ $show "creating $output_objdir/$dlsyms"
+
+ test -z "$run" && $echo > "$output_objdir/$dlsyms" "\
+/* $dlsyms - symbol resolution table for \`$outputname' dlsym emulation. */
+/* Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP */
+
+#ifdef __cplusplus
+extern \"C\" {
+#endif
+
+/* Prevent the only kind of declaration conflicts we can make. */
+#define lt_preloaded_symbols some_other_symbol
+
+/* External symbol declarations for the compiler. */\
+"
+
+ if test "$dlself" = yes; then
+ $show "generating symbol list for \`$output'"
+
+ test -z "$run" && $echo ': @PROGRAM@ ' > "$nlist"
+
+ # Add our own program objects to the symbol list.
+ progfiles=`$echo "X$objs$old_deplibs" | $SP2NL | $Xsed -e "$lo2o" | $NL2SP`
+ for arg in $progfiles; do
+ $show "extracting global C symbols from \`$arg'"
+ $run eval "$NM $arg | $global_symbol_pipe >> '$nlist'"
+ done
+
+ if test -n "$exclude_expsyms"; then
+ $run eval 'egrep -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T'
+ $run eval '$mv "$nlist"T "$nlist"'
+ fi
+
+ if test -n "$export_symbols_regex"; then
+ $run eval 'egrep -e "$export_symbols_regex" "$nlist" > "$nlist"T'
+ $run eval '$mv "$nlist"T "$nlist"'
+ fi
+
+ # Prepare the list of exported symbols
+ if test -z "$export_symbols"; then
+ export_symbols="$output_objdir/$output.exp"
+ $run $rm $export_symbols
+ $run eval "sed -n -e '/^: @PROGRAM@$/d' -e 's/^.* \(.*\)$/\1/p' "'< "$nlist" > "$export_symbols"'
+ else
+ $run eval "sed -e 's/\([][.*^$]\)/\\\1/g' -e 's/^/ /' -e 's/$/$/'"' < "$export_symbols" > "$output_objdir/$output.exp"'
+ $run eval 'grep -f "$output_objdir/$output.exp" < "$nlist" > "$nlist"T'
+ $run eval 'mv "$nlist"T "$nlist"'
+ fi
+ fi
+
+ for arg in $dlprefiles; do
+ $show "extracting global C symbols from \`$arg'"
+ name=`echo "$arg" | sed -e 's%^.*/%%'`
+ $run eval 'echo ": $name " >> "$nlist"'
+ $run eval "$NM $arg | $global_symbol_pipe >> '$nlist'"
+ done
+
+ if test -z "$run"; then
+ # Make sure we have at least an empty file.
+ test -f "$nlist" || : > "$nlist"
+
+ if test -n "$exclude_expsyms"; then
+ egrep -v " ($exclude_expsyms)$" "$nlist" > "$nlist"T
+ $mv "$nlist"T "$nlist"
+ fi
+
+ # Try sorting and uniquifying the output.
+ if grep -v "^: " < "$nlist" | sort +2 | uniq > "$nlist"S; then
+ :
+ else
+ grep -v "^: " < "$nlist" > "$nlist"S
+ fi
+
+ if test -f "$nlist"S; then
+ eval "$global_symbol_to_cdecl"' < "$nlist"S >> "$output_objdir/$dlsyms"'
+ else
+ echo '/* NONE */' >> "$output_objdir/$dlsyms"
+ fi
+
+ $echo >> "$output_objdir/$dlsyms" "\
+
+#undef lt_preloaded_symbols
+
+#if defined (__STDC__) && __STDC__
+# define lt_ptr_t void *
+#else
+# define lt_ptr_t char *
+# define const
+#endif
+
+/* The mapping between symbol names and symbols. */
+const struct {
+ const char *name;
+ lt_ptr_t address;
+}
+lt_preloaded_symbols[] =
+{\
+"
+
+ sed -n -e 's/^: \([^ ]*\) $/ {\"\1\", (lt_ptr_t) 0},/p' \
+ -e 's/^. \([^ ]*\) \([^ ]*\)$/ {"\2", (lt_ptr_t) \&\2},/p' \
+ < "$nlist" >> "$output_objdir/$dlsyms"
+
+ $echo >> "$output_objdir/$dlsyms" "\
+ {0, (lt_ptr_t) 0}
+};
+
+/* This works around a problem in FreeBSD linker */
+#ifdef FREEBSD_WORKAROUND
+static const void *lt_preloaded_setup() {
+ return lt_preloaded_symbols;
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif\
+"
+ fi
+
+ pic_flag_for_symtable=
+ case "$host" in
+ # compiling the symbol table file with pic_flag works around
+ # a FreeBSD bug that causes programs to crash when -lm is
+ # linked before any other PIC object. But we must not use
+ # pic_flag when linking with -static. The problem exists in
+ # FreeBSD 2.2.6 and is fixed in FreeBSD 3.1.
+ *-*-freebsd2*|*-*-freebsd3.0*|*-*-freebsdelf3.0*)
+ case "$compile_command " in
+ *" -static "*) ;;
+ *) pic_flag_for_symtable=" $pic_flag -DPIC -DFREEBSD_WORKAROUND";;
+ esac;;
+ *-*-hpux*)
+ case "$compile_command " in
+ *" -static "*) ;;
+ *) pic_flag_for_symtable=" $pic_flag -DPIC";;
+ esac
+ esac
+
+ # Now compile the dynamic symbol file.
+ $show "(cd $output_objdir && $CC -c$no_builtin_flag$pic_flag_for_symtable \"$dlsyms\")"
+ $run eval '(cd $output_objdir && $CC -c$no_builtin_flag$pic_flag_for_symtable "$dlsyms")' || exit $?
+
+ # Clean up the generated files.
+ $show "$rm $output_objdir/$dlsyms $nlist ${nlist}S ${nlist}T"
+ $run $rm "$output_objdir/$dlsyms" "$nlist" "${nlist}S" "${nlist}T"
+
+ # Transform the symbol file into the correct name.
+ compile_command=`$echo "X$compile_command" | $Xsed -e "s%@SYMFILE@%$output_objdir/${outputname}S.${objext}%"`
+ finalize_command=`$echo "X$finalize_command" | $Xsed -e "s%@SYMFILE@%$output_objdir/${outputname}S.${objext}%"`
+ ;;
+ *)
+ $echo "$modename: unknown suffix for \`$dlsyms'" 1>&2
+ exit 1
+ ;;
+ esac
+ else
+ # We keep going just in case the user didn't refer to
+ # lt_preloaded_symbols. The linker will fail if global_symbol_pipe
+ # really was required.
+
+ # Nullify the symbol file.
+ compile_command=`$echo "X$compile_command" | $Xsed -e "s% @SYMFILE@%%"`
+ finalize_command=`$echo "X$finalize_command" | $Xsed -e "s% @SYMFILE@%%"`
+ fi
+
+ if test -z "$link_against_libtool_libs" || test "$build_libtool_libs" != yes; then
+ # Replace the output file specification.
+ compile_command=`$echo "X$compile_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'`
+ link_command="$compile_command$compile_rpath"
+
+ # We have no uninstalled library dependencies, so finalize right now.
+ $show "$link_command"
+ $run eval "$link_command"
+ status=$?
+
+ # Delete the generated files.
+ if test -n "$dlsyms"; then
+ $show "$rm $output_objdir/${outputname}S.${objext}"
+ $run $rm "$output_objdir/${outputname}S.${objext}"
+ fi
+
+ exit $status
+ fi
+
+ if test -n "$shlibpath_var"; then
+ # We should set the shlibpath_var
+ rpath=
+ for dir in $temp_rpath; do
+ case "$dir" in
+ [\\/]* | [A-Za-z]:[\\/]*)
+ # Absolute path.
+ rpath="$rpath$dir:"
+ ;;
+ *)
+ # Relative path: add a thisdir entry.
+ rpath="$rpath\$thisdir/$dir:"
+ ;;
+ esac
+ done
+ temp_rpath="$rpath"
+ fi
+
+ if test -n "$compile_shlibpath$finalize_shlibpath"; then
+ compile_command="$shlibpath_var=\"$compile_shlibpath$finalize_shlibpath\$$shlibpath_var\" $compile_command"
+ fi
+ if test -n "$finalize_shlibpath"; then
+ finalize_command="$shlibpath_var=\"$finalize_shlibpath\$$shlibpath_var\" $finalize_command"
+ fi
+
+ compile_var=
+ finalize_var=
+ if test -n "$runpath_var"; then
+ if test -n "$perm_rpath"; then
+ # We should set the runpath_var.
+ rpath=
+ for dir in $perm_rpath; do
+ rpath="$rpath$dir:"
+ done
+ compile_var="$runpath_var=\"$rpath\$$runpath_var\" "
+ fi
+ if test -n "$finalize_perm_rpath"; then
+ # We should set the runpath_var.
+ rpath=
+ for dir in $finalize_perm_rpath; do
+ rpath="$rpath$dir:"
+ done
+ finalize_var="$runpath_var=\"$rpath\$$runpath_var\" "
+ fi
+ fi
+
+ if test "$no_install" = yes; then
+ # We don't need to create a wrapper script.
+ link_command="$compile_var$compile_command$compile_rpath"
+ # Replace the output file specification.
+ link_command=`$echo "X$link_command" | $Xsed -e 's%@OUTPUT@%'"$output"'%g'`
+ # Delete the old output file.
+ $run $rm $output
+ # Link the executable and exit
+ $show "$link_command"
+ $run eval "$link_command" || exit $?
+ exit 0
+ fi
+
+ if test "$hardcode_action" = relink || test "$hardcode_into_libs" = all; then
+ # Fast installation is not supported
+ link_command="$compile_var$compile_command$compile_rpath"
+ relink_command="$finalize_var$finalize_command$finalize_rpath"
+
+ $echo "$modename: warning: this platform does not like uninstalled shared libraries" 1>&2
+ $echo "$modename: \`$output' will be relinked during installation" 1>&2
+ else
+ if test "$fast_install" != no; then
+ link_command="$finalize_var$compile_command$finalize_rpath"
+ if test "$fast_install" = yes; then
+ relink_command=`$echo "X$compile_var$compile_command$compile_rpath" | $Xsed -e 's%@OUTPUT@%\$progdir/\$file%g'`
+ else
+ # fast_install is set to needless
+ relink_command=
+ fi
+ else
+ link_command="$compile_var$compile_command$compile_rpath"
+ relink_command="$finalize_var$finalize_command$finalize_rpath"
+ fi
+ fi
+
+ # Replace the output file specification.
+ link_command=`$echo "X$link_command" | $Xsed -e 's%@OUTPUT@%'"$output_objdir/$outputname"'%g'`
+
+ # Delete the old output files.
+ $run $rm $output $output_objdir/$outputname $output_objdir/lt-$outputname
+
+ $show "$link_command"
+ $run eval "$link_command" || exit $?
+
+ # Now create the wrapper script.
+ $show "creating $output"
+
+ # Quote the relink command for shipping.
+ if test -n "$relink_command"; then
+ relink_command="cd `pwd`; $relink_command"
+ relink_command=`$echo "X$relink_command" | $Xsed -e "$sed_quote_subst"`
+ fi
+
+ # Quote $echo for shipping.
+ if test "X$echo" = "X$SHELL $0 --fallback-echo"; then
+ case "$0" in
+ [\\/]* | [A-Za-z]:[\\/]*) qecho="$SHELL $0 --fallback-echo";;
+ *) qecho="$SHELL `pwd`/$0 --fallback-echo";;
+ esac
+ qecho=`$echo "X$qecho" | $Xsed -e "$sed_quote_subst"`
+ else
+ qecho=`$echo "X$echo" | $Xsed -e "$sed_quote_subst"`
+ fi
+
+ # Only actually do things if our run command is non-null.
+ if test -z "$run"; then
+ # win32 will think the script is a binary if it has
+ # a .exe suffix, so we strip it off here.
+ case $output in
+ *.exe) output=`echo $output|sed 's,.exe$,,'` ;;
+ esac
+ $rm $output
+ trap "$rm $output; exit 1" 1 2 15
+
+ $echo > $output "\
+#! $SHELL
+
+# $output - temporary wrapper script for $objdir/$outputname
+# Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP
+#
+# The $output program cannot be directly executed until all the libtool
+# libraries that it depends on are installed.
+#
+# This wrapper script should never be moved out of the build directory.
+# If it is, it will not operate correctly.
+
+# Sed substitution that helps us do robust quoting. It backslashifies
+# metacharacters that are still active within double-quoted strings.
+Xsed='sed -e 1s/^X//'
+sed_quote_subst='$sed_quote_subst'
+
+# The HP-UX ksh and POSIX shell print the target directory to stdout
+# if CDPATH is set.
+if test \"\${CDPATH+set}\" = set; then CDPATH=:; export CDPATH; fi
+
+relink_command=\"$relink_command\"
+
+# This environment variable determines our operation mode.
+if test \"\$libtool_install_magic\" = \"$magic\"; then
+ # install mode needs the following variable:
+ link_against_libtool_libs='$link_against_libtool_libs'
+else
+ # When we are sourced in execute mode, \$file and \$echo are already set.
+ if test \"\$libtool_execute_magic\" != \"$magic\"; then
+ echo=\"$qecho\"
+ file=\"\$0\"
+ # Make sure echo works.
+ if test \"X\$1\" = X--no-reexec; then
+ # Discard the --no-reexec flag, and continue.
+ shift
+ elif test \"X\`(\$echo '\t') 2>/dev/null\`\" = 'X\t'; then
+ # Yippee, \$echo works!
+ :
+ else
+ # Restart under the correct shell, and then maybe \$echo will work.
+ exec $SHELL \"\$0\" --no-reexec \${1+\"\$@\"}
+ fi
+ fi\
+"
+ $echo >> $output "\
+
+ # Find the directory that this script lives in.
+ thisdir=\`\$echo \"X\$file\" | \$Xsed -e 's%/[^/]*$%%'\`
+ test \"x\$thisdir\" = \"x\$file\" && thisdir=.
+
+ # Follow symbolic links until we get to the real thisdir.
+ file=\`ls -ld \"\$file\" | sed -n 's/.*-> //p'\`
+ while test -n \"\$file\"; do
+ destdir=\`\$echo \"X\$file\" | \$Xsed -e 's%/[^/]*\$%%'\`
+
+ # If there was a directory component, then change thisdir.
+ if test \"x\$destdir\" != \"x\$file\"; then
+ case \"\$destdir\" in
+ [\\/]* | [A-Za-z]:[\\/]*) thisdir=\"\$destdir\" ;;
+ *) thisdir=\"\$thisdir/\$destdir\" ;;
+ esac
+ fi
+
+ file=\`\$echo \"X\$file\" | \$Xsed -e 's%^.*/%%'\`
+ file=\`ls -ld \"\$thisdir/\$file\" | sed -n 's/.*-> //p'\`
+ done
+
+ # Try to get the absolute directory name.
+ absdir=\`cd \"\$thisdir\" && pwd\`
+ test -n \"\$absdir\" && thisdir=\"\$absdir\"
+"
+
+ if test "$fast_install" = yes; then
+ echo >> $output "\
+ program=lt-'$outputname'
+ progdir=\"\$thisdir/$objdir\"
+
+ if test ! -f \"\$progdir/\$program\" || \\
+ { file=\`ls -1dt \"\$progdir/\$program\" \"\$progdir/../\$program\" 2>/dev/null | sed 1q\`; \\
+ test \"X\$file\" != \"X\$progdir/\$program\"; }; then
+
+ file=\"\$\$-\$program\"
+
+ if test ! -d \"\$progdir\"; then
+ $mkdir \"\$progdir\"
+ else
+ $rm \"\$progdir/\$file\"
+ fi"
+
+ echo >> $output "\
+
+ # relink executable if necessary
+ if test -n \"\$relink_command\"; then
+ if (eval \$relink_command); then :
+ else
+ $rm \"\$progdir/\$file\"
+ exit 1
+ fi
+ fi
+
+ $mv \"\$progdir/\$file\" \"\$progdir/\$program\" 2>/dev/null ||
+ { $rm \"\$progdir/\$program\";
+ $mv \"\$progdir/\$file\" \"\$progdir/\$program\"; }
+ $rm \"\$progdir/\$file\"
+ fi"
+ else
+ echo >> $output "\
+ program='$outputname'
+ progdir=\"\$thisdir/$objdir\"
+"
+ fi
+
+ echo >> $output "\
+
+ if test -f \"\$progdir/\$program\"; then"
+
+ # Export our shlibpath_var if we have one.
+ if test "$shlibpath_overrides_runpath" = yes && test -n "$shlibpath_var" && test -n "$temp_rpath"; then
+ $echo >> $output "\
+ # Add our own library path to $shlibpath_var
+ $shlibpath_var=\"$temp_rpath\$$shlibpath_var\"
+
+ # Some systems cannot cope with colon-terminated $shlibpath_var
+ # The second colon is a workaround for a bug in BeOS R4 sed
+ $shlibpath_var=\`\$echo \"X\$$shlibpath_var\" | \$Xsed -e 's/::*\$//'\`
+
+ export $shlibpath_var
+"
+ fi
+
+ # fixup the dll searchpath if we need to.
+ if test -n "$dllsearchpath"; then
+ $echo >> $output "\
+ # Add the dll search path components to the executable PATH
+ PATH=$dllsearchpath:\$PATH
+"
+ fi
+
+ $echo >> $output "\
+ if test \"\$libtool_execute_magic\" != \"$magic\"; then
+ # Run the actual program with our arguments.
+"
+ case $host in
+ *-*-cygwin* | *-*-mingw | *-*-os2*)
+ # win32 systems need to use the prog path for dll
+ # lookup to work
+ $echo >> $output "\
+ exec \$progdir\\\\\$program \${1+\"\$@\"}
+"
+ ;;
+ *)
+ $echo >> $output "\
+ # Export the path to the program.
+ PATH=\"\$progdir:\$PATH\"
+ export PATH
+
+ exec \$program \${1+\"\$@\"}
+"
+ ;;
+ esac
+ $echo >> $output "\
+ \$echo \"\$0: cannot exec \$program \${1+\"\$@\"}\"
+ exit 1
+ fi
+ else
+ # The program doesn't exist.
+ \$echo \"\$0: error: \$progdir/\$program does not exist\" 1>&2
+ \$echo \"This script is just a wrapper for \$program.\" 1>&2
+ echo \"See the $PACKAGE documentation for more information.\" 1>&2
+ exit 1
+ fi
+fi\
+"
+ chmod +x $output
+ fi
+ exit 0
+ ;;
+ esac
+
+ # See if we need to build an old-fashioned archive.
+ for oldlib in $oldlibs; do
+
+ if test "$build_libtool_libs" = convenience; then
+ oldobjs="$libobjs_save"
+ addlibs="$convenience"
+ build_libtool_libs=no
+ else
+ if test "$build_libtool_libs" = module; then
+ oldobjs="$libobjs_save"
+ build_libtool_libs=no
+ else
+ oldobjs="$objs$old_deplibs "`$echo "X$libobjs_save" | $SP2NL | $Xsed -e '/\.'${libext}'$/d' -e '/\.lib$/d' -e "$lo2o" | $NL2SP`
+ fi
+ addlibs="$old_convenience"
+ fi
+
+ if test -n "$addlibs"; then
+ gentop="$output_objdir/${outputname}x"
+ $show "${rm}r $gentop"
+ $run ${rm}r "$gentop"
+ $show "mkdir $gentop"
+ $run mkdir "$gentop"
+ status=$?
+ if test $status -ne 0 && test ! -d "$gentop"; then
+ exit $status
+ fi
+ generated="$generated $gentop"
+
+ # Add in members from convenience archives.
+ for xlib in $addlibs; do
+ # Extract the objects.
+ case "$xlib" in
+ [\\/]* | [A-Za-z]:[\\/]*) xabs="$xlib" ;;
+ *) xabs=`pwd`"/$xlib" ;;
+ esac
+ xlib=`$echo "X$xlib" | $Xsed -e 's%^.*/%%'`
+ xdir="$gentop/$xlib"
+
+ $show "${rm}r $xdir"
+ $run ${rm}r "$xdir"
+ $show "mkdir $xdir"
+ $run mkdir "$xdir"
+ status=$?
+ if test $status -ne 0 && test ! -d "$xdir"; then
+ exit $status
+ fi
+ $show "(cd $xdir && $AR x $xabs)"
+ $run eval "(cd \$xdir && $AR x \$xabs)" || exit $?
+
+ oldobjs="$oldobjs "`find $xdir -name \*.${objext} -print -o -name \*.lo -print | $NL2SP`
+ done
+ fi
+
+ # Do each command in the archive commands.
+ if test -n "$old_archive_from_new_cmds" && test "$build_libtool_libs" = yes; then
+ eval cmds=\"$old_archive_from_new_cmds\"
+ else
+ # Ensure that we have .o objects in place in case we decided
+ # not to build a shared library, and have fallen back to building
+ # static libs even though --disable-static was passed!
+ for oldobj in $oldobjs; do
+ if test ! -f $oldobj; then
+ xdir=`$echo "X$oldobj" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$xdir" = "X$oldobj"; then
+ xdir="."
+ else
+ xdir="$xdir"
+ fi
+ baseobj=`$echo "X$oldobj" | $Xsed -e 's%^.*/%%'`
+ obj=`$echo "X$baseobj" | $Xsed -e "$o2lo"`
+ $show "(cd $xdir && ${LN_S} $obj $baseobj)"
+ $run eval '(cd $xdir && ${LN_S} $obj $baseobj)' || exit $?
+ fi
+ done
+
+ eval cmds=\"$old_archive_cmds\"
+ fi
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ done
+
+ if test -n "$generated"; then
+ $show "${rm}r$generated"
+ $run ${rm}r$generated
+ fi
+
+ # Now create the libtool archive.
+ case "$output" in
+ *.la)
+ old_library=
+ test "$build_old_libs" = yes && old_library="$libname.$libext"
+ $show "creating $output"
+
+ # Quote the link command for shipping.
+ relink_command="cd `pwd`; $SHELL $0 --mode=relink $libtool_args"
+ relink_command=`$echo "X$relink_command" | $Xsed -e "$sed_quote_subst"`
+
+ # Only create the output if not a dry run.
+ if test -z "$run"; then
+ for installed in no yes; do
+ if test "$installed" = yes; then
+ if test -z "$install_libdir"; then
+ break
+ fi
+ output="$output_objdir/$outputname"i
+ # Replace all uninstalled libtool libraries with the installed ones
+ newdependency_libs=
+ for deplib in $dependency_libs; do
+ case "$deplib" in
+ *.la)
+ name=`$echo "X$deplib" | $Xsed -e 's%^.*/%%'`
+ eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $deplib`
+ if test -z "$libdir"; then
+ $echo "$modename: \`$deplib' is not a valid libtool archive" 1>&2
+ exit 1
+ fi
+ newdependency_libs="$newdependency_libs $libdir/$name"
+ ;;
+ *) newdependency_libs="$newdependency_libs $deplib" ;;
+ esac
+ done
+ dependency_libs="$newdependency_libs"
+ newdlfiles=
+ for lib in $dlfiles; do
+ name=`$echo "X$lib" | $Xsed -e 's%^.*/%%'`
+ eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
+ if test -z "$libdir"; then
+ $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+ exit 1
+ fi
+ newdlfiles="$newdlfiles $libdir/$name"
+ done
+ dlfiles="$newdlfiles"
+ newdlprefiles=
+ for lib in $dlprefiles; do
+ name=`$echo "X$lib" | $Xsed -e 's%^.*/%%'`
+ eval libdir=`sed -n -e 's/^libdir=\(.*\)$/\1/p' $lib`
+ if test -z "$libdir"; then
+ $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+ exit 1
+ fi
+ newdlprefiles="$newdlprefiles $libdir/$name"
+ done
+ dlprefiles="$newdlprefiles"
+ fi
+ $rm $output
+ $echo > $output "\
+# $outputname - a libtool library file
+# Generated by $PROGRAM - GNU $PACKAGE $VERSION$TIMESTAMP
+#
+# Please DO NOT delete this file!
+# It is necessary for linking the library.
+
+# The name that we can dlopen(3).
+dlname='$dlname'
+
+# Names of this library.
+library_names='$library_names'
+
+# The name of the static archive.
+old_library='$old_library'
+
+# Libraries that this one depends upon.
+dependency_libs='$dependency_libs'
+
+# Version information for $libname.
+current=$current
+age=$age
+revision=$revision
+
+# Is this an already installed library?
+installed=$installed
+
+# Files to dlopen/dlpreopen
+dlopen='$dlfiles'
+dlpreopen='$dlprefiles'
+
+# Directory that this library needs to be installed in:
+libdir='$install_libdir'"
+ if test "$installed" = no; then
+ $echo >> $output "\
+relink_command=\"$relink_command\""
+ fi
+ done
+ fi
+
+ # Do a symbolic link so that the libtool archive can be found in
+ # LD_LIBRARY_PATH before the program is installed.
+ $show "(cd $output_objdir && $rm $outputname && $LN_S ../$outputname $outputname)"
+ $run eval '(cd $output_objdir && $rm $outputname && $LN_S ../$outputname $outputname)' || exit $?
+ ;;
+ esac
+ exit 0
+ ;;
+
+ # libtool install mode
+ install)
+ modename="$modename: install"
+
+ # There may be an optional sh(1) argument at the beginning of
+ # install_prog (especially on Windows NT).
+ if test "$nonopt" = "$SHELL" || test "$nonopt" = /bin/sh; then
+ # Aesthetically quote it.
+ arg=`$echo "X$nonopt" | $Xsed -e "$sed_quote_subst"`
+ case "$arg" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*)
+ arg="\"$arg\""
+ ;;
+ esac
+ install_prog="$arg "
+ arg="$1"
+ shift
+ else
+ install_prog=
+ arg="$nonopt"
+ fi
+
+ # The real first argument should be the name of the installation program.
+ # Aesthetically quote it.
+ arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+ case "$arg" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*)
+ arg="\"$arg\""
+ ;;
+ esac
+ install_prog="$install_prog$arg"
+
+ # We need to accept at least all the BSD install flags.
+ dest=
+ files=
+ opts=
+ prev=
+ install_type=
+ isdir=no
+ stripme=
+ for arg
+ do
+ if test -n "$dest"; then
+ files="$files $dest"
+ dest="$arg"
+ continue
+ fi
+
+ case "$arg" in
+ -d) isdir=yes ;;
+ -f) prev="-f" ;;
+ -g) prev="-g" ;;
+ -m) prev="-m" ;;
+ -o) prev="-o" ;;
+ -s)
+ stripme=" -s"
+ continue
+ ;;
+ -*) ;;
+
+ *)
+ # If the previous option needed an argument, then skip it.
+ if test -n "$prev"; then
+ prev=
+ else
+ dest="$arg"
+ continue
+ fi
+ ;;
+ esac
+
+ # Aesthetically quote the argument.
+ arg=`$echo "X$arg" | $Xsed -e "$sed_quote_subst"`
+ case "$arg" in
+ *[\[\~\#\^\&\*\(\)\{\}\|\;\<\>\?\'\ \ ]*|*]*)
+ arg="\"$arg\""
+ ;;
+ esac
+ install_prog="$install_prog $arg"
+ done
+
+ if test -z "$install_prog"; then
+ $echo "$modename: you must specify an install program" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ if test -n "$prev"; then
+ $echo "$modename: the \`$prev' option requires an argument" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ if test -z "$files"; then
+ if test -z "$dest"; then
+ $echo "$modename: no file or destination specified" 1>&2
+ else
+ $echo "$modename: you must specify a destination" 1>&2
+ fi
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # Strip any trailing slash from the destination.
+ dest=`$echo "X$dest" | $Xsed -e 's%/$%%'`
+
+ # Check to see that the destination is a directory.
+ test -d "$dest" && isdir=yes
+ if test "$isdir" = yes; then
+ destdir="$dest"
+ destname=
+ else
+ destdir=`$echo "X$dest" | $Xsed -e 's%/[^/]*$%%'`
+ test "X$destdir" = "X$dest" && destdir=.
+ destname=`$echo "X$dest" | $Xsed -e 's%^.*/%%'`
+
+ # Not a directory, so check to see that there is only one file specified.
+ set dummy $files
+ if test $# -gt 2; then
+ $echo "$modename: \`$dest' is not a directory" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+ fi
+ case "$destdir" in
+ [\\/]* | [A-Za-z]:[\\/]*) ;;
+ *)
+ for file in $files; do
+ case "$file" in
+ *.lo) ;;
+ *)
+ $echo "$modename: \`$destdir' must be an absolute directory name" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ ;;
+ esac
+ done
+ ;;
+ esac
+
+ # This variable tells wrapper scripts just to set variables rather
+ # than running their programs.
+ libtool_install_magic="$magic"
+
+ staticlibs=
+ future_libdirs=
+ current_libdirs=
+ for file in $files; do
+
+ # Do each installation.
+ case "$file" in
+ *.$libext)
+ # Do the static libraries later.
+ staticlibs="$staticlibs $file"
+ ;;
+
+ *.la)
+ # Check to see that this really is a libtool archive.
+ if (sed -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then :
+ else
+ $echo "$modename: \`$file' is not a valid libtool archive" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ library_names=
+ old_library=
+ relink_command=
+ # If there is no directory component, then add one.
+ case "$file" in
+ */* | *\\*) . $file ;;
+ *) . ./$file ;;
+ esac
+
+ # Add the libdir to current_libdirs if it is the destination.
+ if test "X$destdir" = "X$libdir"; then
+ case "$current_libdirs " in
+ *" $libdir "*) ;;
+ *) current_libdirs="$current_libdirs $libdir" ;;
+ esac
+ else
+ # Note the libdir as a future libdir.
+ case "$future_libdirs " in
+ *" $libdir "*) ;;
+ *) future_libdirs="$future_libdirs $libdir" ;;
+ esac
+ fi
+
+ dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`/
+ test "X$dir" = "X$file/" && dir=
+ dir="$dir$objdir"
+
+ if test "$hardcode_into_libs" = all; then
+ if test -z "$relink_command"; then
+ $echo "$modename: invalid libtool pseudo library \`$file'" 1>&2
+ exit 1
+ fi
+ $echo "$modename: warning: relinking \`$file'" 1>&2
+ $show "$relink_command"
+ if $run eval "$relink_command"; then :
+ else
+ $echo "$modename: error: relink \`$file' with the above command before installing it" 1>&2
+ continue
+ fi
+ fi
+
+ # See the names of the shared library.
+ set dummy $library_names
+ if test -n "$2"; then
+ realname="$2"
+ shift
+ shift
+
+ srcname="$realname"
+ test "$hardcode_into_libs" = all && srcname="$realname"T
+
+ # Install the shared library and build the symlinks.
+ $show "$install_prog $dir/$srcname $destdir/$realname"
+ $run eval "$install_prog $dir/$srcname $destdir/$realname" || exit $?
+ if test -n "$stripme" && test -n "$striplib"; then
+ $show "$striplib $destdir/$realname"
+ $run eval "$striplib $destdir/$realname" || exit $?
+ fi
+
+ if test $# -gt 0; then
+ # Delete the old symlinks, and create new ones.
+ for linkname
+ do
+ if test "$linkname" != "$realname"; then
+ $show "(cd $destdir && $rm $linkname && $LN_S $realname $linkname)"
+ $run eval "(cd $destdir && $rm $linkname && $LN_S $realname $linkname)"
+ fi
+ done
+ fi
+
+ # Do each command in the postinstall commands.
+ lib="$destdir/$realname"
+ eval cmds=\"$postinstall_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ fi
+
+ # Install the pseudo-library for information purposes.
+ name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+ instname="$dir/$name"i
+ $show "$install_prog $instname $destdir/$name"
+ $run eval "$install_prog $instname $destdir/$name" || exit $?
+
+ # Maybe install the static library, too.
+ test -n "$old_library" && staticlibs="$staticlibs $dir/$old_library"
+ ;;
+
+ *.lo)
+ # Install (i.e. copy) a libtool object.
+
+ # Figure out destination file name, if it wasn't already specified.
+ if test -n "$destname"; then
+ destfile="$destdir/$destname"
+ else
+ destfile=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+ destfile="$destdir/$destfile"
+ fi
+
+ # Deduce the name of the destination old-style object file.
+ case "$destfile" in
+ *.lo)
+ staticdest=`$echo "X$destfile" | $Xsed -e "$lo2o"`
+ ;;
+ *.$objext)
+ staticdest="$destfile"
+ destfile=
+ ;;
+ *)
+ $echo "$modename: cannot copy a libtool object to \`$destfile'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ ;;
+ esac
+
+ # Install the libtool object if requested.
+ if test -n "$destfile"; then
+ $show "$install_prog $file $destfile"
+ $run eval "$install_prog $file $destfile" || exit $?
+ fi
+
+ # Install the old object if enabled.
+ if test "$build_old_libs" = yes; then
+ # Deduce the name of the old-style object file.
+ staticobj=`$echo "X$file" | $Xsed -e "$lo2o"`
+
+ $show "$install_prog $staticobj $staticdest"
+ $run eval "$install_prog \$staticobj \$staticdest" || exit $?
+ fi
+ exit 0
+ ;;
+
+ *)
+ # Figure out destination file name, if it wasn't already specified.
+ if test -n "$destname"; then
+ destfile="$destdir/$destname"
+ else
+ destfile=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+ destfile="$destdir/$destfile"
+ fi
+
+ # Do a test to see if this is really a libtool program.
+ if (sed -e '4q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+ link_against_libtool_libs=
+ relink_command=
+
+ # If there is no directory component, then add one.
+ case "$file" in
+ */* | *\\*) . $file ;;
+ *) . ./$file ;;
+ esac
+
+ # Check the variables that should have been set.
+ if test -z "$link_against_libtool_libs"; then
+ $echo "$modename: invalid libtool wrapper script \`$file'" 1>&2
+ exit 1
+ fi
+
+ finalize=yes
+ for lib in $link_against_libtool_libs; do
+ # Check to see that each library is installed.
+ libdir=
+ if test -f "$lib"; then
+ # If there is no directory component, then add one.
+ case "$lib" in
+ */* | *\\*) . $lib ;;
+ *) . ./$lib ;;
+ esac
+ fi
+ libfile="$libdir/"`$echo "X$lib" | $Xsed -e 's%^.*/%%g'` ### testsuite: skip nested quoting test
+ if test -n "$libdir" && test ! -f "$libfile"; then
+ $echo "$modename: warning: \`$lib' has not been installed in \`$libdir'" 1>&2
+ finalize=no
+ fi
+ done
+
+ relink_command=
+ # If there is no directory component, then add one.
+ case "$file" in
+ */* | *\\*) . $file ;;
+ *) . ./$file ;;
+ esac
+
+ outputname=
+ if test "$fast_install" = no && test -n "$relink_command"; then
+ if test "$finalize" = yes && test -z "$run"; then
+ tmpdir="/tmp"
+ test -n "$TMPDIR" && tmpdir="$TMPDIR"
+ tmpdir="$tmpdir/libtool-$$"
+ if $mkdir -p "$tmpdir" && chmod 700 "$tmpdir"; then :
+ else
+ $echo "$modename: error: cannot create temporary directory \`$tmpdir'" 1>&2
+ continue
+ fi
+ outputname="$tmpdir/$file"
+ # Replace the output file specification.
+ relink_command=`$echo "X$relink_command" | $Xsed -e 's%@OUTPUT@%'"$outputname"'%g'`
+
+ $show "$relink_command"
+ if $run eval "$relink_command"; then :
+ else
+ $echo "$modename: error: relink \`$file' with the above command before installing it" 1>&2
+ ${rm}r "$tmpdir"
+ continue
+ fi
+ file="$outputname"
+ else
+ $echo "$modename: warning: cannot relink \`$file'" 1>&2
+ fi
+ else
+ # Install the binary that we compiled earlier.
+ file=`$echo "X$file" | $Xsed -e "s%\([^/]*\)$%$objdir/\1%"`
+ fi
+ fi
+
+ $show "$install_prog$stripme $file $destfile"
+ $run eval "$install_prog\$stripme \$file \$destfile" || exit $?
+ test -n "$outputname" && ${rm}r "$tmpdir"
+ ;;
+ esac
+ done
+
+ for file in $staticlibs; do
+ name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+
+ # Set up the ranlib parameters.
+ oldlib="$destdir/$name"
+
+ $show "$install_prog $file $oldlib"
+ $run eval "$install_prog \$file \$oldlib" || exit $?
+
+ if test -n "$stripme" && test -n "$striplib"; then
+ $show "$old_striplib $oldlib"
+ $run eval "$old_striplib $oldlib" || exit $?
+ fi
+
+ # Do each command in the postinstall commands.
+ eval cmds=\"$old_postinstall_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || exit $?
+ done
+ IFS="$save_ifs"
+ done
+
+ if test -n "$future_libdirs"; then
+ $echo "$modename: warning: remember to run \`$progname --finish$future_libdirs'" 1>&2
+ fi
+
+ if test -n "$current_libdirs"; then
+ # Maybe just do a dry run.
+ test -n "$run" && current_libdirs=" -n$current_libdirs"
+ exec $SHELL $0 --finish$current_libdirs
+ exit 1
+ fi
+
+ exit 0
+ ;;
+
+ # libtool finish mode
+ finish)
+ modename="$modename: finish"
+ libdirs="$nonopt"
+ admincmds=
+
+ if test -n "$finish_cmds$finish_eval" && test -n "$libdirs"; then
+ for dir
+ do
+ libdirs="$libdirs $dir"
+ done
+
+ for libdir in $libdirs; do
+ if test -n "$finish_cmds"; then
+ # Do each command in the finish commands.
+ eval cmds=\"$finish_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd" || admincmds="$admincmds
+ $cmd"
+ done
+ IFS="$save_ifs"
+ fi
+ if test -n "$finish_eval"; then
+ # Do the single finish_eval.
+ eval cmds=\"$finish_eval\"
+ $run eval "$cmds" || admincmds="$admincmds
+ $cmds"
+ fi
+ done
+ fi
+
+ # Exit here if they wanted silent mode.
+ test "$show" = : && exit 0
+
+ echo "----------------------------------------------------------------------"
+ echo "Libraries have been installed in:"
+ for libdir in $libdirs; do
+ echo " $libdir"
+ done
+ echo
+ echo "If you ever happen to want to link against installed libraries"
+ echo "in a given directory, LIBDIR, you must either use libtool, and"
+ echo "specify the full pathname of the library, or use \`-LLIBDIR'"
+ echo "flag during linking and do at least one of the following:"
+ if test -n "$shlibpath_var"; then
+ echo " - add LIBDIR to the \`$shlibpath_var' environment variable"
+ echo " during execution"
+ fi
+ if test -n "$runpath_var"; then
+ echo " - add LIBDIR to the \`$runpath_var' environment variable"
+ echo " during linking"
+ fi
+ if test -n "$hardcode_libdir_flag_spec"; then
+ libdir=LIBDIR
+ eval flag=\"$hardcode_libdir_flag_spec\"
+
+ echo " - use the \`$flag' linker flag"
+ fi
+ if test -n "$admincmds"; then
+ echo " - have your system administrator run these commands:$admincmds"
+ fi
+ if test -f /etc/ld.so.conf; then
+ echo " - have your system administrator add LIBDIR to \`/etc/ld.so.conf'"
+ fi
+ echo
+ echo "See any operating system documentation about shared libraries for"
+ echo "more information, such as the ld(1) and ld.so(8) manual pages."
+ echo "----------------------------------------------------------------------"
+ exit 0
+ ;;
+
+ # libtool execute mode
+ execute)
+ modename="$modename: execute"
+
+ # The first argument is the command name.
+ cmd="$nonopt"
+ if test -z "$cmd"; then
+ $echo "$modename: you must specify a COMMAND" 1>&2
+ $echo "$help"
+ exit 1
+ fi
+
+ # Handle -dlopen flags immediately.
+ for file in $execute_dlfiles; do
+ if test ! -f "$file"; then
+ $echo "$modename: \`$file' is not a file" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ dir=
+ case "$file" in
+ *.la)
+ # Check to see that this really is a libtool archive.
+ if (sed -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then :
+ else
+ $echo "$modename: \`$lib' is not a valid libtool archive" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ # Read the libtool library.
+ dlname=
+ library_names=
+
+ # If there is no directory component, then add one.
+ case "$file" in
+ */* | *\\*) . $file ;;
+ *) . ./$file ;;
+ esac
+
+ # Skip this library if it cannot be dlopened.
+ if test -z "$dlname"; then
+ # Warn if it was a shared library.
+ test -n "$library_names" && $echo "$modename: warning: \`$file' was not linked with \`-export-dynamic'"
+ continue
+ fi
+
+ dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
+ test "X$dir" = "X$file" && dir=.
+
+ if test -f "$dir/$objdir/$dlname"; then
+ dir="$dir/$objdir"
+ else
+ $echo "$modename: cannot find \`$dlname' in \`$dir' or \`$dir/$objdir'" 1>&2
+ exit 1
+ fi
+ ;;
+
+ *.lo)
+ # Just add the directory containing the .lo file.
+ dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
+ test "X$dir" = "X$file" && dir=.
+ ;;
+
+ *)
+ $echo "$modename: warning \`-dlopen' is ignored for non-libtool libraries and objects" 1>&2
+ continue
+ ;;
+ esac
+
+ # Get the absolute pathname.
+ absdir=`cd "$dir" && pwd`
+ test -n "$absdir" && dir="$absdir"
+
+ # Now add the directory to shlibpath_var.
+ if eval "test -z \"\$$shlibpath_var\""; then
+ eval "$shlibpath_var=\"\$dir\""
+ else
+ eval "$shlibpath_var=\"\$dir:\$$shlibpath_var\""
+ fi
+ done
+
+ # This variable tells wrapper scripts just to set shlibpath_var
+ # rather than running their programs.
+ libtool_execute_magic="$magic"
+
+ # Check if any of the arguments is a wrapper script.
+ args=
+ for file
+ do
+ case "$file" in
+ -*) ;;
+ *)
+ # Do a test to see if this is really a libtool program.
+ if (sed -e '4q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+ # If there is no directory component, then add one.
+ case "$file" in
+ */* | *\\*) . $file ;;
+ *) . ./$file ;;
+ esac
+
+ # Transform arg to wrapped name.
+ file="$progdir/$program"
+ fi
+ ;;
+ esac
+ # Quote arguments (to preserve shell metacharacters).
+ file=`$echo "X$file" | $Xsed -e "$sed_quote_subst"`
+ args="$args \"$file\""
+ done
+
+ if test -z "$run"; then
+ if test -n "$shlibpath_var"; then
+ # Export the shlibpath_var.
+ eval "export $shlibpath_var"
+ fi
+
+ # Restore saved enviroment variables
+ if test "${save_LC_ALL+set}" = set; then
+ LC_ALL="$save_LC_ALL"; export LC_ALL
+ fi
+ if test "${save_LANG+set}" = set; then
+ LANG="$save_LANG"; export LANG
+ fi
+
+ # Now actually exec the command.
+ eval "exec \$cmd$args"
+
+ $echo "$modename: cannot exec \$cmd$args"
+ exit 1
+ else
+ # Display what would be done.
+ if test -n "$shlibpath_var"; then
+ eval "\$echo \"\$shlibpath_var=\$$shlibpath_var\""
+ $echo "export $shlibpath_var"
+ fi
+ $echo "$cmd$args"
+ exit 0
+ fi
+ ;;
+
+ # libtool clean and uninstall mode
+ clean | uninstall)
+ modename="$modename: $mode"
+ rm="$nonopt"
+ files=
+
+ # This variable tells wrapper scripts just to set variables rather
+ # than running their programs.
+ libtool_install_magic="$magic"
+
+ for arg
+ do
+ case "$arg" in
+ -*) rm="$rm $arg" ;;
+ *) files="$files $arg" ;;
+ esac
+ done
+
+ if test -z "$rm"; then
+ $echo "$modename: you must specify an RM program" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ fi
+
+ for file in $files; do
+ dir=`$echo "X$file" | $Xsed -e 's%/[^/]*$%%'`
+ if test "X$dir" = "X$file"; then
+ dir=.
+ objdir="$objdir"
+ else
+ objdir="$dir/$objdir"
+ fi
+ name=`$echo "X$file" | $Xsed -e 's%^.*/%%'`
+ test $mode = uninstall && objdir="$dir"
+
+ rmfiles="$file"
+
+ case "$name" in
+ *.la)
+ # Possibly a libtool archive, so verify it.
+ if (sed -e '2q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+ . $dir/$name
+
+ # Delete the libtool libraries and symlinks.
+ for n in $library_names; do
+ rmfiles="$rmfiles $objdir/$n"
+ done
+ test -n "$old_library" && rmfiles="$rmfiles $objdir/$old_library"
+ test $mode = clean && rmfiles="$rmfiles $objdir/$name $objdir/${name}i"
+
+ if test $mode = uninstall; then
+ if test -n "$library_names"; then
+ # Do each command in the postuninstall commands.
+ eval cmds=\"$postuninstall_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd"
+ done
+ IFS="$save_ifs"
+ fi
+
+ if test -n "$old_library"; then
+ # Do each command in the old_postuninstall commands.
+ eval cmds=\"$old_postuninstall_cmds\"
+ IFS="${IFS= }"; save_ifs="$IFS"; IFS='~'
+ for cmd in $cmds; do
+ IFS="$save_ifs"
+ $show "$cmd"
+ $run eval "$cmd"
+ done
+ IFS="$save_ifs"
+ fi
+ # FIXME: should reinstall the best remaining shared library.
+ fi
+ fi
+ ;;
+
+ *.lo)
+ if test "$build_old_libs" = yes; then
+ oldobj=`$echo "X$name" | $Xsed -e "$lo2o"`
+ rmfiles="$rmfiles $dir/$oldobj"
+ fi
+ ;;
+
+ *)
+ # Do a test to see if this is a libtool program.
+ if test $mode = clean &&
+ (sed -e '4q' $file | egrep "^# Generated by .*$PACKAGE") >/dev/null 2>&1; then
+ relink_command=
+ . $dir/$file
+
+ rmfiles="$rmfiles $objdir/$name $objdir/${name}S.${objext}"
+ if test "$fast_install" = yes && test -n "$relink_command"; then
+ rmfiles="$rmfiles $objdir/lt-$name"
+ fi
+ fi
+ ;;
+ esac
+ $show "$rm $rmfiles"
+ $run $rm $rmfiles
+ done
+ exit 0
+ ;;
+
+ "")
+ $echo "$modename: you must specify a MODE" 1>&2
+ $echo "$generic_help" 1>&2
+ exit 1
+ ;;
+ esac
+
+ $echo "$modename: invalid operation mode \`$mode'" 1>&2
+ $echo "$generic_help" 1>&2
+ exit 1
+fi # test -z "$show_help"
+
+# We need to display help for each of the modes.
+case "$mode" in
+"") $echo \
+"Usage: $modename [OPTION]... [MODE-ARG]...
+
+Provide generalized library-building support services.
+
+ --config show all configuration variables
+ --debug enable verbose shell tracing
+-n, --dry-run display commands without modifying any files
+ --features display basic configuration information and exit
+ --finish same as \`--mode=finish'
+ --help display this help message and exit
+ --mode=MODE use operation mode MODE [default=inferred from MODE-ARGS]
+ --quiet same as \`--silent'
+ --silent don't print informational messages
+ --version print version information
+
+MODE must be one of the following:
+
+ clean remove files from the build directory
+ compile compile a source file into a libtool object
+ execute automatically set library path, then run a program
+ finish complete the installation of libtool libraries
+ install install libraries or executables
+ link create a library or an executable
+ uninstall remove libraries from an installed directory
+
+MODE-ARGS vary depending on the MODE. Try \`$modename --help --mode=MODE' for
+a more detailed description of MODE."
+ exit 0
+ ;;
+
+clean)
+ $echo \
+"Usage: $modename [OPTION]... --mode=clean RM [RM-OPTION]... FILE...
+
+Remove files from the build directory.
+
+RM is the name of the program to use to delete files associated with each FILE
+(typically \`/bin/rm'). RM-OPTIONS are options (such as \`-f') to be passed
+to RM.
+
+If FILE is a libtool library, object or program, all the files associated
+with it are deleted. Otherwise, only FILE itself is deleted using RM."
+ ;;
+
+compile)
+ $echo \
+"Usage: $modename [OPTION]... --mode=compile COMPILE-COMMAND... SOURCEFILE
+
+Compile a source file into a libtool library object.
+
+This mode accepts the following additional options:
+
+ -o OUTPUT-FILE set the output file name to OUTPUT-FILE
+ -static always build a \`.o' file suitable for static linking
+
+COMPILE-COMMAND is a command to be used in creating a \`standard' object file
+from the given SOURCEFILE.
+
+The output file name is determined by removing the directory component from
+SOURCEFILE, then substituting the C source code suffix \`.c' with the
+library object suffix, \`.lo'."
+ ;;
+
+execute)
+ $echo \
+"Usage: $modename [OPTION]... --mode=execute COMMAND [ARGS]...
+
+Automatically set library path, then run a program.
+
+This mode accepts the following additional options:
+
+ -dlopen FILE add the directory containing FILE to the library path
+
+This mode sets the library path environment variable according to \`-dlopen'
+flags.
+
+If any of the ARGS are libtool executable wrappers, then they are translated
+into their corresponding uninstalled binary, and any of their required library
+directories are added to the library path.
+
+Then, COMMAND is executed, with ARGS as arguments."
+ ;;
+
+finish)
+ $echo \
+"Usage: $modename [OPTION]... --mode=finish [LIBDIR]...
+
+Complete the installation of libtool libraries.
+
+Each LIBDIR is a directory that contains libtool libraries.
+
+The commands that this mode executes may require superuser privileges. Use
+the \`--dry-run' option if you just want to see what would be executed."
+ ;;
+
+install)
+ $echo \
+"Usage: $modename [OPTION]... --mode=install INSTALL-COMMAND...
+
+Install executables or libraries.
+
+INSTALL-COMMAND is the installation command. The first component should be
+either the \`install' or \`cp' program.
+
+The rest of the components are interpreted as arguments to that command (only
+BSD-compatible install options are recognized)."
+ ;;
+
+link)
+ $echo \
+"Usage: $modename [OPTION]... --mode=link LINK-COMMAND...
+
+Link object files or libraries together to form another library, or to
+create an executable program.
+
+LINK-COMMAND is a command using the C compiler that you would use to create
+a program from several object files.
+
+The following components of LINK-COMMAND are treated specially:
+
+ -all-static do not do any dynamic linking at all
+ -avoid-version do not add a version suffix if possible
+ -dlopen FILE \`-dlpreopen' FILE if it cannot be dlopened at runtime
+ -dlpreopen FILE link in FILE and add its symbols to lt_preloaded_symbols
+ -export-dynamic allow symbols from OUTPUT-FILE to be resolved with dlsym(3)
+ -export-symbols SYMFILE
+ try to export only the symbols listed in SYMFILE
+ -export-symbols-regex REGEX
+ try to export only the symbols matching REGEX
+ -LLIBDIR search LIBDIR for required installed libraries
+ -lNAME OUTPUT-FILE requires the installed library libNAME
+ -module build a library that can dlopened
+ -no-fast-install disable the fast-install mode
+ -no-install link a not-installable executable
+ -no-undefined declare that a library does not refer to external symbols
+ -o OUTPUT-FILE create OUTPUT-FILE from the specified objects
+ -release RELEASE specify package release information
+ -rpath LIBDIR the created library will eventually be installed in LIBDIR
+ -R[ ]LIBDIR add LIBDIR to the runtime path of programs and libraries
+ -static do not do any dynamic linking of libtool libraries
+ -version-info CURRENT[:REVISION[:AGE]]
+ specify library version info [each variable defaults to 0]
+
+All other options (arguments beginning with \`-') are ignored.
+
+Every other argument is treated as a filename. Files ending in \`.la' are
+treated as uninstalled libtool libraries, other files are standard or library
+object files.
+
+If the OUTPUT-FILE ends in \`.la', then a libtool library is created,
+only library objects (\`.lo' files) may be specified, and \`-rpath' is
+required, except when creating a convenience library.
+
+If OUTPUT-FILE ends in \`.a' or \`.lib', then a standard library is created
+using \`ar' and \`ranlib', or on Windows using \`lib'.
+
+If OUTPUT-FILE ends in \`.lo' or \`.${objext}', then a reloadable object file
+is created, otherwise an executable program is created."
+ ;;
+
+uninstall)
+ $echo \
+"Usage: $modename [OPTION]... --mode=uninstall RM [RM-OPTION]... FILE...
+
+Remove libraries from an installation directory.
+
+RM is the name of the program to use to delete files associated with each FILE
+(typically \`/bin/rm'). RM-OPTIONS are options (such as \`-f') to be passed
+to RM.
+
+If FILE is a libtool library, all the files associated with it are deleted.
+Otherwise, only FILE itself is deleted using RM."
+ ;;
+
+*)
+ $echo "$modename: invalid operation mode \`$mode'" 1>&2
+ $echo "$help" 1>&2
+ exit 1
+ ;;
+esac
+
+echo
+$echo "Try \`$modename --help' for more information about other modes."
+
+exit 0
+
+# Local Variables:
+# mode:shell-script
+# sh-indentation:2
+# End:
diff --git a/rts/gmp/mdate-sh b/rts/gmp/mdate-sh
new file mode 100644
index 0000000000..37171f21fb
--- /dev/null
+++ b/rts/gmp/mdate-sh
@@ -0,0 +1,92 @@
+#!/bin/sh
+# Get modification time of a file or directory and pretty-print it.
+# Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
+# written by Ulrich Drepper <drepper@gnu.ai.mit.edu>, June 1995
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+# Prevent date giving response in another language.
+LANG=C
+export LANG
+LC_ALL=C
+export LC_ALL
+LC_TIME=C
+export LC_TIME
+
+# Get the extended ls output of the file or directory.
+# On HPUX /bin/sh, "set" interprets "-rw-r--r--" as options, so the "x" below.
+if ls -L /dev/null 1>/dev/null 2>&1; then
+ set - x`ls -L -l -d $1`
+else
+ set - x`ls -l -d $1`
+fi
+# The month is at least the fourth argument
+# (3 shifts here, the next inside the loop).
+shift
+shift
+shift
+
+# Find the month. Next argument is day, followed by the year or time.
+month=
+until test $month
+do
+ shift
+ case $1 in
+ Jan) month=January; nummonth=1;;
+ Feb) month=February; nummonth=2;;
+ Mar) month=March; nummonth=3;;
+ Apr) month=April; nummonth=4;;
+ May) month=May; nummonth=5;;
+ Jun) month=June; nummonth=6;;
+ Jul) month=July; nummonth=7;;
+ Aug) month=August; nummonth=8;;
+ Sep) month=September; nummonth=9;;
+ Oct) month=October; nummonth=10;;
+ Nov) month=November; nummonth=11;;
+ Dec) month=December; nummonth=12;;
+ esac
+done
+
+day=$2
+
+# Here we have to deal with the problem that the ls output gives either
+# the time of day or the year.
+case $3 in
+ *:*) set `date`; eval year=\$$#
+ case $2 in
+ Jan) nummonthtod=1;;
+ Feb) nummonthtod=2;;
+ Mar) nummonthtod=3;;
+ Apr) nummonthtod=4;;
+ May) nummonthtod=5;;
+ Jun) nummonthtod=6;;
+ Jul) nummonthtod=7;;
+ Aug) nummonthtod=8;;
+ Sep) nummonthtod=9;;
+ Oct) nummonthtod=10;;
+ Nov) nummonthtod=11;;
+ Dec) nummonthtod=12;;
+ esac
+ # For the first six months of the year the time notation can also
+ # be used for files modified in the last year.
+ if (expr $nummonth \> $nummonthtod) > /dev/null;
+ then
+ year=`expr $year - 1`
+ fi;;
+ *) year=$3;;
+esac
+
+# The result.
+echo $day $month $year
diff --git a/rts/gmp/memory.c b/rts/gmp/memory.c
new file mode 100644
index 0000000000..9df440ce22
--- /dev/null
+++ b/rts/gmp/memory.c
@@ -0,0 +1,160 @@
+/* Memory allocation routines.
+
+Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h>
+#include <stdlib.h> /* for malloc, realloc, free */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#ifdef __NeXT__
+#define static
+#endif
+
+
+void * (*_mp_allocate_func) _PROTO ((size_t)) = _mp_default_allocate;
+void * (*_mp_reallocate_func) _PROTO ((void *, size_t, size_t))
+ = _mp_default_reallocate;
+void (*_mp_free_func) _PROTO ((void *, size_t)) = _mp_default_free;
+
+
+/* Default allocation functions. In case of failure to allocate/reallocate
+ an error message is written to stderr and the program aborts. */
+
+void *
+#if __STDC__
+_mp_default_allocate (size_t size)
+#else
+_mp_default_allocate (size)
+ size_t size;
+#endif
+{
+ void *ret;
+#ifdef DEBUG
+ size_t req_size = size;
+ size += 2 * BYTES_PER_MP_LIMB;
+#endif
+ ret = malloc (size);
+ if (ret == 0)
+ {
+ perror ("cannot allocate in gmp");
+ abort ();
+ }
+
+#ifdef DEBUG
+ {
+ mp_ptr p = ret;
+ p++;
+ p[-1] = (0xdeadbeef << 31) + 0xdeafdeed;
+ if (req_size % BYTES_PER_MP_LIMB == 0)
+ p[req_size / BYTES_PER_MP_LIMB] = ~((0xdeadbeef << 31) + 0xdeafdeed);
+ ret = p;
+ }
+#endif
+ return ret;
+}
+
+void *
+#if __STDC__
+_mp_default_reallocate (void *oldptr, size_t old_size, size_t new_size)
+#else
+_mp_default_reallocate (oldptr, old_size, new_size)
+ void *oldptr;
+ size_t old_size;
+ size_t new_size;
+#endif
+{
+ void *ret;
+
+#ifdef DEBUG
+ size_t req_size = new_size;
+
+ if (old_size != 0)
+ {
+ mp_ptr p = oldptr;
+ if (p[-1] != (0xdeadbeef << 31) + 0xdeafdeed)
+ {
+ fprintf (stderr, "gmp: (realloc) data clobbered before allocation block\n");
+ abort ();
+ }
+ if (old_size % BYTES_PER_MP_LIMB == 0)
+ if (p[old_size / BYTES_PER_MP_LIMB] != ~((0xdeadbeef << 31) + 0xdeafdeed))
+ {
+ fprintf (stderr, "gmp: (realloc) data clobbered after allocation block\n");
+ abort ();
+ }
+ oldptr = p - 1;
+ }
+
+ new_size += 2 * BYTES_PER_MP_LIMB;
+#endif
+
+ ret = realloc (oldptr, new_size);
+ if (ret == 0)
+ {
+ perror ("cannot allocate in gmp");
+ abort ();
+ }
+
+#ifdef DEBUG
+ {
+ mp_ptr p = ret;
+ p++;
+ p[-1] = (0xdeadbeef << 31) + 0xdeafdeed;
+ if (req_size % BYTES_PER_MP_LIMB == 0)
+ p[req_size / BYTES_PER_MP_LIMB] = ~((0xdeadbeef << 31) + 0xdeafdeed);
+ ret = p;
+ }
+#endif
+ return ret;
+}
+
+void
+#if __STDC__
+_mp_default_free (void *blk_ptr, size_t blk_size)
+#else
+_mp_default_free (blk_ptr, blk_size)
+ void *blk_ptr;
+ size_t blk_size;
+#endif
+{
+#ifdef DEBUG
+ {
+ mp_ptr p = blk_ptr;
+ if (blk_size != 0)
+ {
+ if (p[-1] != (0xdeadbeef << 31) + 0xdeafdeed)
+ {
+ fprintf (stderr, "gmp: (free) data clobbered before allocation block\n");
+ abort ();
+ }
+ if (blk_size % BYTES_PER_MP_LIMB == 0)
+ if (p[blk_size / BYTES_PER_MP_LIMB] != ~((0xdeadbeef << 31) + 0xdeafdeed))
+ {
+ fprintf (stderr, "gmp: (free) data clobbered after allocation block\n");
+ abort ();
+ }
+ }
+ blk_ptr = p - 1;
+ }
+#endif
+ free (blk_ptr);
+}
diff --git a/rts/gmp/missing b/rts/gmp/missing
new file mode 100644
index 0000000000..c60e9d772f
--- /dev/null
+++ b/rts/gmp/missing
@@ -0,0 +1,244 @@
+#! /bin/sh
+# Common stub for a few missing GNU programs while installing.
+# Copyright (C) 1996, 1997, 1999 Free Software Foundation, Inc.
+# Originally by Fran,cois Pinard <pinard@iro.umontreal.ca>, 1996.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+
+if test $# -eq 0; then
+ echo 1>&2 "Try \`$0 --help' for more information"
+ exit 1
+fi
+
+run=:
+
+case "$1" in
+--run)
+ # Try to run requested program, and just exit if it succeeds.
+ run=
+ shift
+ "$@" && exit 0
+ ;;
+esac
+
+# If it does not exist, or fails to run (possibly an outdated version),
+# try to emulate it.
+case "$1" in
+
+ -h|--h|--he|--hel|--help)
+ echo "\
+$0 [OPTION]... PROGRAM [ARGUMENT]...
+
+Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an
+error status if there is no known handling for PROGRAM.
+
+Options:
+ -h, --help display this help and exit
+ -v, --version output version information and exit
+ --run try to run the given command, and emulate it if it fails
+
+Supported PROGRAM values:
+ aclocal touch file \`aclocal.m4'
+ autoconf touch file \`configure'
+ autoheader touch file \`config.h.in'
+ automake touch all \`Makefile.in' files
+ bison create \`y.tab.[ch]', if possible, from existing .[ch]
+ flex create \`lex.yy.c', if possible, from existing .c
+ lex create \`lex.yy.c', if possible, from existing .c
+ makeinfo touch the output file
+ tar try tar, gnutar, gtar, then tar without non-portable flags
+ yacc create \`y.tab.[ch]', if possible, from existing .[ch]"
+ ;;
+
+ -v|--v|--ve|--ver|--vers|--versi|--versio|--version)
+ echo "missing 0.2 - GNU automake"
+ ;;
+
+ -*)
+ echo 1>&2 "$0: Unknown \`$1' option"
+ echo 1>&2 "Try \`$0 --help' for more information"
+ exit 1
+ ;;
+
+ aclocal)
+ echo 1>&2 "\
+WARNING: \`$1' is missing on your system. You should only need it if
+ you modified \`acinclude.m4' or \`configure.in'. You might want
+ to install the \`Automake' and \`Perl' packages. Grab them from
+ any GNU archive site."
+ touch aclocal.m4
+ ;;
+
+ autoconf)
+ echo 1>&2 "\
+WARNING: \`$1' is missing on your system. You should only need it if
+ you modified \`configure.in'. You might want to install the
+ \`Autoconf' and \`GNU m4' packages. Grab them from any GNU
+ archive site."
+ touch configure
+ ;;
+
+ autoheader)
+ echo 1>&2 "\
+WARNING: \`$1' is missing on your system. You should only need it if
+ you modified \`acconfig.h' or \`configure.in'. You might want
+ to install the \`Autoconf' and \`GNU m4' packages. Grab them
+ from any GNU archive site."
+ files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' configure.in`
+ test -z "$files" && files="config.h"
+ touch_files=
+ for f in $files; do
+ case "$f" in
+ *:*) touch_files="$touch_files "`echo "$f" |
+ sed -e 's/^[^:]*://' -e 's/:.*//'`;;
+ *) touch_files="$touch_files $f.in";;
+ esac
+ done
+ touch $touch_files
+ ;;
+
+ automake)
+ echo 1>&2 "\
+WARNING: \`$1' is missing on your system. You should only need it if
+ you modified \`Makefile.am', \`acinclude.m4' or \`configure.in'.
+ You might want to install the \`Automake' and \`Perl' packages.
+ Grab them from any GNU archive site."
+ find . -type f -name Makefile.am -print |
+ sed 's/\.am$/.in/' |
+ while read f; do touch "$f"; done
+ ;;
+
+ bison|yacc)
+ echo 1>&2 "\
+WARNING: \`$1' is missing on your system. You should only need it if
+ you modified a \`.y' file. You may need the \`Bison' package
+ in order for those modifications to take effect. You can get
+ \`Bison' from any GNU archive site."
+ rm -f y.tab.c y.tab.h
+ if [ $# -ne 1 ]; then
+ eval LASTARG="\${$#}"
+ case "$LASTARG" in
+ *.y)
+ SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'`
+ if [ -f "$SRCFILE" ]; then
+ cp "$SRCFILE" y.tab.c
+ fi
+ SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'`
+ if [ -f "$SRCFILE" ]; then
+ cp "$SRCFILE" y.tab.h
+ fi
+ ;;
+ esac
+ fi
+ if [ ! -f y.tab.h ]; then
+ echo >y.tab.h
+ fi
+ if [ ! -f y.tab.c ]; then
+ echo 'main() { return 0; }' >y.tab.c
+ fi
+ ;;
+
+ lex|flex)
+ echo 1>&2 "\
+WARNING: \`$1' is missing on your system. You should only need it if
+ you modified a \`.l' file. You may need the \`Flex' package
+ in order for those modifications to take effect. You can get
+ \`Flex' from any GNU archive site."
+ rm -f lex.yy.c
+ if [ $# -ne 1 ]; then
+ eval LASTARG="\${$#}"
+ case "$LASTARG" in
+ *.l)
+ SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'`
+ if [ -f "$SRCFILE" ]; then
+ cp "$SRCFILE" lex.yy.c
+ fi
+ ;;
+ esac
+ fi
+ if [ ! -f lex.yy.c ]; then
+ echo 'main() { return 0; }' >lex.yy.c
+ fi
+ ;;
+
+ makeinfo)
+ echo 1>&2 "\
+WARNING: \`$1' is missing on your system. You should only need it if
+ you modified a \`.texi' or \`.texinfo' file, or any other file
+ indirectly affecting the aspect of the manual. The spurious
+ call might also be the consequence of using a buggy \`make' (AIX,
+ DU, IRIX). You might want to install the \`Texinfo' package or
+ the \`GNU make' package. Grab either from any GNU archive site."
+ file=`echo "$*" | sed -n 's/.*-o \([^ ]*\).*/\1/p'`
+ if test -z "$file"; then
+ file=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'`
+ file=`sed -n '/^@setfilename/ { s/.* \([^ ]*\) *$/\1/; p; q; }' $file`
+ fi
+ touch $file
+ ;;
+
+ tar)
+ shift
+ if test -n "$run"; then
+ echo 1>&2 "ERROR: \`tar' requires --run"
+ exit 1
+ fi
+
+ # We have already tried tar in the generic part.
+ # Look for gnutar/gtar before invocation to avoid ugly error
+ # messages.
+ if (gnutar --version > /dev/null 2>&1); then
+ gnutar ${1+"$@"} && exit 0
+ fi
+ if (gtar --version > /dev/null 2>&1); then
+ gtar ${1+"$@"} && exit 0
+ fi
+ firstarg="$1"
+ if shift; then
+ case "$firstarg" in
+ *o*)
+ firstarg=`echo "$firstarg" | sed s/o//`
+ tar "$firstarg" ${1+"$@"} && exit 0
+ ;;
+ esac
+ case "$firstarg" in
+ *h*)
+ firstarg=`echo "$firstarg" | sed s/h//`
+ tar "$firstarg" ${1+"$@"} && exit 0
+ ;;
+ esac
+ fi
+
+ echo 1>&2 "\
+WARNING: I can't seem to be able to run \`tar' with the given arguments.
+ You may want to install GNU tar or Free paxutils, or check the
+ command line arguments."
+ exit 1
+ ;;
+
+ *)
+ echo 1>&2 "\
+WARNING: \`$1' is needed, and you do not seem to have it handy on your
+ system. You might have modified some files without having the
+ proper tools for further handling them. Check the \`README' file,
+ it often tells you about the needed prerequirements for installing
+ this package. You may also peek at any GNU archive site, in case
+ some other package would contain this missing \`$1' program."
+ exit 1
+ ;;
+esac
+
+exit 0
diff --git a/rts/gmp/mkinstalldirs b/rts/gmp/mkinstalldirs
new file mode 100644
index 0000000000..5e17cd39fb
--- /dev/null
+++ b/rts/gmp/mkinstalldirs
@@ -0,0 +1,38 @@
+#! /bin/sh
+# mkinstalldirs --- make directory hierarchy
+# Author: Noah Friedman <friedman@prep.ai.mit.edu>
+# Created: 1993-05-16
+# Public domain
+
+errstatus=0
+
+for file
+do
+ set fnord `echo ":$file" | sed -ne 's/^:\//#/;s/^://;s/\// /g;s/^#/\//;p'`
+ shift
+
+ pathcomp=
+ for d
+ do
+ pathcomp="$pathcomp$d"
+ case "$pathcomp" in
+ -* ) pathcomp=./$pathcomp ;;
+ esac
+
+ if test ! -d "$pathcomp"; then
+ echo "mkdir $pathcomp"
+
+ mkdir "$pathcomp" || lasterr=$?
+
+ if test ! -d "$pathcomp"; then
+ errstatus=$lasterr
+ fi
+ fi
+
+ pathcomp="$pathcomp/"
+ done
+done
+
+exit $errstatus
+
+# mkinstalldirs ends here
diff --git a/rts/gmp/mp.h b/rts/gmp/mp.h
new file mode 100644
index 0000000000..ffab4cba82
--- /dev/null
+++ b/rts/gmp/mp.h
@@ -0,0 +1,124 @@
+/* mp.h -- Definitions for Berkeley compatible multiple precision functions.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#ifndef __MP_H__
+
+#ifndef __GNU_MP__ /* to allow inclusion of both gmp.h and mp.h */
+#define __GNU_MP__ 3
+#define __need_size_t
+#include <stddef.h>
+#undef __need_size_t
+
+#if defined (__STDC__) || defined (__cplusplus)
+#define __gmp_const const
+#else
+#define __gmp_const
+#endif
+
+#if defined (__GNUC__)
+#define __gmp_inline __inline__
+#else
+#define __gmp_inline
+#endif
+
+#ifndef _EXTERN_INLINE
+#ifdef __GNUC__
+#define _EXTERN_INLINE extern __inline__
+#else
+#define _EXTERN_INLINE static
+#endif
+#endif
+
+#ifdef _SHORT_LIMB
+typedef unsigned int mp_limb_t;
+typedef int mp_limb_signed_t;
+#else
+#ifdef _LONG_LONG_LIMB
+typedef unsigned long long int mp_limb_t;
+typedef long long int mp_limb_signed_t;
+#else
+typedef unsigned long int mp_limb_t;
+typedef long int mp_limb_signed_t;
+#endif
+#endif
+
+typedef mp_limb_t * mp_ptr;
+typedef __gmp_const mp_limb_t * mp_srcptr;
+typedef int mp_size_t;
+typedef long int mp_exp_t;
+
+typedef struct
+{
+ int _mp_alloc; /* Number of *limbs* allocated and pointed
+ to by the D field. */
+ int _mp_size; /* abs(SIZE) is the number of limbs
+ the last field points to. If SIZE
+ is negative this is a negative
+ number. */
+ mp_limb_t *_mp_d; /* Pointer to the limbs. */
+} __mpz_struct;
+#endif /* __GNU_MP__ */
+
+/* User-visible types. */
+typedef __mpz_struct MINT;
+
+
+#ifndef _PROTO
+#if (__STDC__-0) || defined (__cplusplus)
+#define _PROTO(x) x
+#else
+#define _PROTO(x) ()
+#endif
+#endif
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#define mp_set_memory_functions __gmp_set_memory_functions
+void mp_set_memory_functions _PROTO ((void *(*) (size_t),
+ void *(*) (void *, size_t, size_t),
+ void (*) (void *, size_t)));
+MINT *itom _PROTO ((signed short int));
+MINT *xtom _PROTO ((const char *));
+void move _PROTO ((const MINT *, MINT *));
+void madd _PROTO ((const MINT *, const MINT *, MINT *));
+void msub _PROTO ((const MINT *, const MINT *, MINT *));
+void mult _PROTO ((const MINT *, const MINT *, MINT *));
+void mdiv _PROTO ((const MINT *, const MINT *, MINT *, MINT *));
+void sdiv _PROTO ((const MINT *, signed short int, MINT *, signed short int *));
+void msqrt _PROTO ((const MINT *, MINT *, MINT *));
+void pow _PROTO ((const MINT *, const MINT *, const MINT *, MINT *));
+void rpow _PROTO ((const MINT *, signed short int, MINT *));
+void gcd _PROTO ((const MINT *, const MINT *, MINT *));
+int mcmp _PROTO ((const MINT *, const MINT *));
+void min _PROTO ((MINT *));
+void mout _PROTO ((const MINT *));
+char *mtox _PROTO ((const MINT *));
+void mfree _PROTO ((MINT *));
+
+#if defined (__cplusplus)
+}
+#endif
+
+#define __MP_H__
+#endif /* __MP_H__ */
diff --git a/rts/gmp/mp_bpl.c b/rts/gmp/mp_bpl.c
new file mode 100644
index 0000000000..df8b03e5ab
--- /dev/null
+++ b/rts/gmp/mp_bpl.c
@@ -0,0 +1,27 @@
+/*
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+const int mp_bits_per_limb = BITS_PER_MP_LIMB;
+const int __gmp_0 = 0;
+int __gmp_junk;
diff --git a/rts/gmp/mp_clz_tab.c b/rts/gmp/mp_clz_tab.c
new file mode 100644
index 0000000000..1bbd1d6a66
--- /dev/null
+++ b/rts/gmp/mp_clz_tab.c
@@ -0,0 +1,36 @@
+/* __clz_tab -- support for longlong.h
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+const
+unsigned char __clz_tab[] =
+{
+ 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+ 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+};
diff --git a/rts/gmp/mp_minv_tab.c b/rts/gmp/mp_minv_tab.c
new file mode 100644
index 0000000000..4afff85cfc
--- /dev/null
+++ b/rts/gmp/mp_minv_tab.c
@@ -0,0 +1,50 @@
+/* A table of data supporting modlimb_invert().
+
+ THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND MAY CHANGE
+ INCOMPATIBLY OR DISAPPEAR IN A FUTURE GNU MP RELEASE. */
+
+/*
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+/* modlimb_invert_table[i] is the multiplicative inverse of 2*i+1 mod 256,
+ ie. (modlimb_invert_table[i] * (2*i+1)) % 256 == 1 */
+
+const unsigned char modlimb_invert_table[128] = {
+ 0x01, 0xAB, 0xCD, 0xB7, 0x39, 0xA3, 0xC5, 0xEF,
+ 0xF1, 0x1B, 0x3D, 0xA7, 0x29, 0x13, 0x35, 0xDF,
+ 0xE1, 0x8B, 0xAD, 0x97, 0x19, 0x83, 0xA5, 0xCF,
+ 0xD1, 0xFB, 0x1D, 0x87, 0x09, 0xF3, 0x15, 0xBF,
+ 0xC1, 0x6B, 0x8D, 0x77, 0xF9, 0x63, 0x85, 0xAF,
+ 0xB1, 0xDB, 0xFD, 0x67, 0xE9, 0xD3, 0xF5, 0x9F,
+ 0xA1, 0x4B, 0x6D, 0x57, 0xD9, 0x43, 0x65, 0x8F,
+ 0x91, 0xBB, 0xDD, 0x47, 0xC9, 0xB3, 0xD5, 0x7F,
+ 0x81, 0x2B, 0x4D, 0x37, 0xB9, 0x23, 0x45, 0x6F,
+ 0x71, 0x9B, 0xBD, 0x27, 0xA9, 0x93, 0xB5, 0x5F,
+ 0x61, 0x0B, 0x2D, 0x17, 0x99, 0x03, 0x25, 0x4F,
+ 0x51, 0x7B, 0x9D, 0x07, 0x89, 0x73, 0x95, 0x3F,
+ 0x41, 0xEB, 0x0D, 0xF7, 0x79, 0xE3, 0x05, 0x2F,
+ 0x31, 0x5B, 0x7D, 0xE7, 0x69, 0x53, 0x75, 0x1F,
+ 0x21, 0xCB, 0xED, 0xD7, 0x59, 0xC3, 0xE5, 0x0F,
+ 0x11, 0x3B, 0x5D, 0xC7, 0x49, 0x33, 0x55, 0xFF
+};
diff --git a/rts/gmp/mp_set_fns.c b/rts/gmp/mp_set_fns.c
new file mode 100644
index 0000000000..55d4d9d6e4
--- /dev/null
+++ b/rts/gmp/mp_set_fns.c
@@ -0,0 +1,48 @@
+/* mp_set_memory_functions -- Set the allocate, reallocate, and free functions
+ for use by the mp package.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mp_set_memory_functions (void *(*alloc_func) (size_t),
+ void *(*realloc_func) (void *, size_t, size_t),
+ void (*free_func) (void *, size_t))
+#else
+mp_set_memory_functions (alloc_func, realloc_func, free_func)
+ void *(*alloc_func) ();
+ void *(*realloc_func) ();
+ void (*free_func) ();
+#endif
+{
+ if (alloc_func == 0)
+ alloc_func = _mp_default_allocate;
+ if (realloc_func == 0)
+ realloc_func = _mp_default_reallocate;
+ if (free_func == 0)
+ free_func = _mp_default_free;
+
+ _mp_allocate_func = alloc_func;
+ _mp_reallocate_func = realloc_func;
+ _mp_free_func = free_func;
+}
diff --git a/rts/gmp/mpn/Makefile.am b/rts/gmp/mpn/Makefile.am
new file mode 100644
index 0000000000..1c49ccda25
--- /dev/null
+++ b/rts/gmp/mpn/Makefile.am
@@ -0,0 +1,94 @@
+## Process this file with automake to generate Makefile.in
+
+# Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+AUTOMAKE_OPTIONS = gnu no-dependencies
+SUBDIRS = tests
+
+CPP = @CPP@
+
+# -DOPERATION_$* tells multi-function files which function to produce.
+INCLUDES = -I$(top_srcdir) -DOPERATION_$*
+
+GENERIC_SOURCES = mp_bases.c
+OFILES = @mpn_objects@
+
+noinst_LTLIBRARIES = libmpn.la
+libmpn_la_SOURCES = $(GENERIC_SOURCES)
+libmpn_la_LIBADD = $(OFILES)
+libmpn_la_DEPENDENCIES = $(OFILES)
+
+TARG_DIST = a29k alpha arm clipper cray generic hppa i960 lisp m68k m88k \
+ mips2 mips3 ns32k pa64 pa64w power powerpc32 powerpc64 pyr sh sparc32 \
+ sparc64 thumb vax x86 z8000 z8000x
+
+EXTRA_DIST = underscore.h asm-defs.m4 $(TARG_DIST)
+
+# COMPILE minus CC. FIXME: Really pass *_CFLAGS to CPP?
+COMPILE_FLAGS = \
+ $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+
+SUFFIXES = .s .S .asm
+
+# *.s are not preprocessed at all.
+.s.o:
+ $(CCAS) $(COMPILE_FLAGS) $<
+.s.obj:
+ $(CCAS) $(COMPILE_FLAGS) `cygpath -w $<`
+.s.lo:
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) $<
+
+# *.S are preprocessed with CPP.
+.S.o:
+ $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.S.obj:
+ $(CPP) $(COMPILE_FLAGS) `cygpath -w $<` | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+
+# We have to rebuild the static object file without passing -DPIC to
+# preprocessor. The overhead cost is one extra assembly pass. FIXME:
+# Teach libtool how to assemble with a preprocessor pass (CPP or m4).
+
+.S.lo:
+ $(CPP) $(COMPILE_FLAGS) -DPIC $< | grep -v '^#' >tmp-$*.s
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o
+ rm -f tmp-$*.s
+
+# *.asm files are preprocessed with m4.
+.asm.o:
+ $(M4) -DOPERATION_$* $< >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.asm.obj:
+ $(M4) -DOPERATION_$* `cygpath -w $<` >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.asm.lo:
+ $(M4) -DPIC -DOPERATION_$* $< >tmp-$*.s
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ $(M4) -DOPERATION_$* $< >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o
+ rm -f tmp-$*.s
diff --git a/rts/gmp/mpn/Makefile.in b/rts/gmp/mpn/Makefile.in
new file mode 100644
index 0000000000..59ee958c92
--- /dev/null
+++ b/rts/gmp/mpn/Makefile.in
@@ -0,0 +1,472 @@
+# Makefile.in generated automatically by automake 1.4a from Makefile.am
+
+# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+SHELL = @SHELL@
+
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+VPATH = @srcdir@
+prefix = @prefix@
+exec_prefix = @exec_prefix@
+
+bindir = @bindir@
+sbindir = @sbindir@
+libexecdir = @libexecdir@
+datadir = @datadir@
+sysconfdir = @sysconfdir@
+sharedstatedir = @sharedstatedir@
+localstatedir = @localstatedir@
+libdir = @libdir@
+infodir = @infodir@
+mandir = @mandir@
+includedir = @includedir@
+oldincludedir = /usr/include
+
+DESTDIR =
+
+pkgdatadir = $(datadir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+
+top_builddir = ..
+
+ACLOCAL = @ACLOCAL@
+AUTOCONF = @AUTOCONF@
+AUTOMAKE = @AUTOMAKE@
+AUTOHEADER = @AUTOHEADER@
+
+INSTALL = @INSTALL@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_FLAG =
+transform = @program_transform_name@
+
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+
+@SET_MAKE@
+build_alias = @build_alias@
+build_triplet = @build@
+host_alias = @host_alias@
+host_triplet = @host@
+target_alias = @target_alias@
+target_triplet = @target@
+AMDEP = @AMDEP@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AWK = @AWK@
+CALLING_CONVENTIONS_OBJS = @CALLING_CONVENTIONS_OBJS@
+CC = @CC@
+CCAS = @CCAS@
+CPP = @CPP@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+EXEEXT = @EXEEXT@
+LIBTOOL = @LIBTOOL@
+LN_S = @LN_S@
+M4 = @M4@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+PACKAGE = @PACKAGE@
+RANLIB = @RANLIB@
+SPEED_CYCLECOUNTER_OBJS = @SPEED_CYCLECOUNTER_OBJS@
+STRIP = @STRIP@
+U = @U@
+VERSION = @VERSION@
+gmp_srclinks = @gmp_srclinks@
+install_sh = @install_sh@
+mpn_objects = @mpn_objects@
+mpn_objs_in_libgmp = @mpn_objs_in_libgmp@
+
+# Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+AUTOMAKE_OPTIONS = gnu no-dependencies
+SUBDIRS =
+
+CPP = @CPP@
+
+# -DOPERATION_$* tells multi-function files which function to produce.
+INCLUDES = -I$(top_srcdir) -DOPERATION_$*
+
+GENERIC_SOURCES = mp_bases.c
+OFILES = @mpn_objects@
+
+noinst_LTLIBRARIES = libmpn.la
+libmpn_la_SOURCES = $(GENERIC_SOURCES)
+libmpn_la_LIBADD = $(OFILES)
+libmpn_la_DEPENDENCIES = $(OFILES)
+
+TARG_DIST = a29k alpha arm clipper cray generic hppa i960 lisp m68k m88k \
+ mips2 mips3 ns32k pa64 pa64w power powerpc32 powerpc64 pyr sh sparc32 \
+ sparc64 thumb vax x86 z8000 z8000x
+
+
+EXTRA_DIST = underscore.h asm-defs.m4 $(TARG_DIST)
+
+# COMPILE minus CC. FIXME: Really pass *_CFLAGS to CPP?
+COMPILE_FLAGS = \
+ $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+
+
+SUFFIXES = .s .S .asm
+subdir = mpn
+mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs
+CONFIG_HEADER = ../config.h
+CONFIG_CLEAN_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+
+
+DEFS = @DEFS@ -I. -I$(srcdir) -I..
+CPPFLAGS = @CPPFLAGS@
+LDFLAGS = @LDFLAGS@
+LIBS = @LIBS@
+libmpn_la_LDFLAGS =
+am_libmpn_la_OBJECTS = mp_bases.lo
+libmpn_la_OBJECTS = $(am_libmpn_la_OBJECTS)
+COMPILE = $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CFLAGS = @CFLAGS@
+CCLD = $(CC)
+LINK = $(LIBTOOL) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+DIST_SOURCES = $(libmpn_la_SOURCES)
+DIST_COMMON = README Makefile.am Makefile.in
+
+
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+
+GZIP_ENV = --best
+depcomp =
+SOURCES = $(libmpn_la_SOURCES)
+OBJECTS = $(am_libmpn_la_OBJECTS)
+
+all: all-redirect
+.SUFFIXES:
+.SUFFIXES: .S .asm .c .lo .o .obj .s
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4)
+ cd $(top_srcdir) && $(AUTOMAKE) --gnu mpn/Makefile
+
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+ cd $(top_builddir) \
+ && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
+
+
+mostlyclean-noinstLTLIBRARIES:
+
+clean-noinstLTLIBRARIES:
+ -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+
+distclean-noinstLTLIBRARIES:
+
+maintainer-clean-noinstLTLIBRARIES:
+
+mostlyclean-compile:
+ -rm -f *.o core *.core
+ -rm -f *.$(OBJEXT)
+
+clean-compile:
+
+distclean-compile:
+ -rm -f *.tab.c
+
+maintainer-clean-compile:
+
+mostlyclean-libtool:
+ -rm -f *.lo
+
+clean-libtool:
+ -rm -rf .libs _libs
+
+distclean-libtool:
+
+maintainer-clean-libtool:
+
+libmpn.la: $(libmpn_la_OBJECTS) $(libmpn_la_DEPENDENCIES)
+ $(LINK) $(libmpn_la_LDFLAGS) $(libmpn_la_OBJECTS) $(libmpn_la_LIBADD) $(LIBS)
+.c.o:
+ $(COMPILE) -c $<
+.c.obj:
+ $(COMPILE) -c `cygpath -w $<`
+.c.lo:
+ $(LTCOMPILE) -c -o $@ $<
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+# (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+
+all-recursive install-data-recursive install-exec-recursive \
+installdirs-recursive install-recursive uninstall-recursive \
+check-recursive installcheck-recursive info-recursive dvi-recursive:
+ @set fnord $(MAKEFLAGS); amf=$$2; \
+ dot_seen=no; \
+ target=`echo $@ | sed s/-recursive//`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ echo "Making $$target in $$subdir"; \
+ if test "$$subdir" = "."; then \
+ dot_seen=yes; \
+ local_target="$$target-am"; \
+ else \
+ local_target="$$target"; \
+ fi; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
+ done; \
+ if test "$$dot_seen" = "no"; then \
+ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+ fi; test -z "$$fail"
+
+mostlyclean-recursive clean-recursive distclean-recursive \
+maintainer-clean-recursive:
+ @set fnord $(MAKEFLAGS); amf=$$2; \
+ dot_seen=no; \
+ rev=''; list='$(SUBDIRS)'; for subdir in $$list; do \
+ rev="$$subdir $$rev"; \
+ if test "$$subdir" = "."; then dot_seen=yes; else :; fi; \
+ done; \
+ test "$$dot_seen" = "no" && rev=". $$rev"; \
+ target=`echo $@ | sed s/-recursive//`; \
+ for subdir in $$rev; do \
+ echo "Making $$target in $$subdir"; \
+ if test "$$subdir" = "."; then \
+ local_target="$$target-am"; \
+ else \
+ local_target="$$target"; \
+ fi; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
+ done && test -z "$$fail"
+tags-recursive:
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+ done
+
+tags: TAGS
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+ list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ mkid -f$$here/ID $$unique $(LISP)
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \
+ $(TAGS_FILES) $(LISP)
+ tags=; \
+ here=`pwd`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ if test "$$subdir" = .; then :; else \
+ test -f $$subdir/TAGS && tags="$$tags -i $$here/$$subdir/TAGS"; \
+ fi; \
+ done; \
+ list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ test -z "$(ETAGS_ARGS)$$unique$(LISP)$$tags" \
+ || etags $(ETAGS_ARGS) $$tags $$unique $(LISP)
+
+mostlyclean-tags:
+
+clean-tags:
+
+distclean-tags:
+ -rm -f TAGS ID
+
+maintainer-clean-tags:
+
+distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir)
+
+distdir: $(DISTFILES)
+ @for file in $(DISTFILES); do \
+ d=$(srcdir); \
+ if test -d $$d/$$file; then \
+ cp -pR $$d/$$file $(distdir); \
+ else \
+ test -f $(distdir)/$$file \
+ || cp -p $$d/$$file $(distdir)/$$file || :; \
+ fi; \
+ done
+ for subdir in $(SUBDIRS); do \
+ if test "$$subdir" = .; then :; else \
+ test -d $(distdir)/$$subdir \
+ || mkdir $(distdir)/$$subdir \
+ || exit 1; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir=../$(top_distdir) distdir=../$(distdir)/$$subdir distdir) \
+ || exit 1; \
+ fi; \
+ done
+info-am:
+info: info-recursive
+dvi-am:
+dvi: dvi-recursive
+check-am: all-am
+check: check-recursive
+installcheck-am:
+installcheck: installcheck-recursive
+install-exec-am:
+install-exec: install-exec-recursive
+
+install-data-am:
+install-data: install-data-recursive
+
+install-am: all-am
+ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+install: install-recursive
+uninstall-am:
+uninstall: uninstall-recursive
+all-am: Makefile $(LTLIBRARIES)
+all-redirect: all-recursive
+install-strip:
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_STRIP_FLAG=-s install
+installdirs: installdirs-recursive
+installdirs-am:
+
+
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+ -rm -f Makefile $(CONFIG_CLEAN_FILES)
+ -rm -f config.cache config.log stamp-h stamp-h[0-9]*
+
+maintainer-clean-generic:
+ -rm -f Makefile.in
+mostlyclean-am: mostlyclean-noinstLTLIBRARIES mostlyclean-compile \
+ mostlyclean-libtool mostlyclean-tags \
+ mostlyclean-generic
+
+mostlyclean: mostlyclean-recursive
+
+clean-am: clean-noinstLTLIBRARIES clean-compile clean-libtool \
+ clean-tags clean-generic mostlyclean-am
+
+clean: clean-recursive
+
+distclean-am: distclean-noinstLTLIBRARIES distclean-compile \
+ distclean-libtool distclean-tags distclean-generic \
+ clean-am
+ -rm -f libtool
+
+distclean: distclean-recursive
+
+maintainer-clean-am: maintainer-clean-noinstLTLIBRARIES \
+ maintainer-clean-compile maintainer-clean-libtool \
+ maintainer-clean-tags maintainer-clean-generic \
+ distclean-am
+ @echo "This command is intended for maintainers to use;"
+ @echo "it deletes files that may require special tools to rebuild."
+
+maintainer-clean: maintainer-clean-recursive
+
+.PHONY: mostlyclean-noinstLTLIBRARIES distclean-noinstLTLIBRARIES \
+clean-noinstLTLIBRARIES maintainer-clean-noinstLTLIBRARIES \
+mostlyclean-compile distclean-compile clean-compile \
+maintainer-clean-compile mostlyclean-libtool distclean-libtool \
+clean-libtool maintainer-clean-libtool install-recursive \
+uninstall-recursive install-data-recursive uninstall-data-recursive \
+install-exec-recursive uninstall-exec-recursive installdirs-recursive \
+uninstalldirs-recursive all-recursive check-recursive \
+installcheck-recursive info-recursive dvi-recursive \
+mostlyclean-recursive distclean-recursive clean-recursive \
+maintainer-clean-recursive tags tags-recursive mostlyclean-tags \
+distclean-tags clean-tags maintainer-clean-tags distdir info-am info \
+dvi-am dvi check check-am installcheck-am installcheck install-exec-am \
+install-exec install-data-am install-data install-am install \
+uninstall-am uninstall all-redirect all-am all install-strip \
+installdirs-am installdirs mostlyclean-generic distclean-generic \
+clean-generic maintainer-clean-generic clean mostlyclean distclean \
+maintainer-clean
+
+
+# *.s are not preprocessed at all.
+.s.o:
+ $(CCAS) $(COMPILE_FLAGS) $<
+.s.obj:
+ $(CCAS) $(COMPILE_FLAGS) `cygpath -w $<`
+.s.lo:
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) $<
+
+# *.S are preprocessed with CPP.
+.S.o:
+ $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.S.obj:
+ $(CPP) $(COMPILE_FLAGS) `cygpath -w $<` | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+
+# We have to rebuild the static object file without passing -DPIC to
+# preprocessor. The overhead cost is one extra assembly pass. FIXME:
+# Teach libtool how to assemble with a preprocessor pass (CPP or m4).
+
+.S.lo:
+ $(CPP) $(COMPILE_FLAGS) -DPIC $< | grep -v '^#' >tmp-$*.s
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ $(CPP) $(COMPILE_FLAGS) $< | grep -v '^#' >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o
+ rm -f tmp-$*.s
+
+# *.asm files are preprocessed with m4.
+.asm.o:
+ $(M4) -DOPERATION_$* $< >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.asm.obj:
+ $(M4) -DOPERATION_$* `cygpath -w $<` >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ rm -f tmp-$*.s
+.asm.lo:
+ $(M4) -DPIC -DOPERATION_$* $< >tmp-$*.s
+ $(LIBTOOL) --mode=compile $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+ $(M4) -DOPERATION_$* $< >tmp-$*.s
+ $(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $*.o
+ rm -f tmp-$*.s
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/rts/gmp/mpn/README b/rts/gmp/mpn/README
new file mode 100644
index 0000000000..7453c9d03e
--- /dev/null
+++ b/rts/gmp/mpn/README
@@ -0,0 +1,13 @@
+This directory contains all code for the mpn layer of GMP.
+
+Most subdirectories contain machine-dependent code, written in assembly or C.
+The `generic' subdirectory contains default code, used when there is no
+machine-dependent replacement for a particular machine.
+
+There is one subdirectory for each ISA family. Note that e.g., 32-bit SPARC
+and 64-bit SPARC are very different ISA's, and thus cannot share any code.
+
+A particular compile will only use code from one subdirectory, and the
+`generic' subdirectory. The ISA-specific subdirectories contain hierarchies of
+directories for various architecture variants and implementations; the
+top-most level contains code that runs correctly on all variants.
diff --git a/rts/gmp/mpn/a29k/add_n.s b/rts/gmp/mpn/a29k/add_n.s
new file mode 100644
index 0000000000..e3ee6dfa60
--- /dev/null
+++ b/rts/gmp/mpn/a29k/add_n.s
@@ -0,0 +1,120 @@
+; 29000 __gmpn_add -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; s2_ptr lr4
+; size lr5
+
+; We use the loadm/storem instructions and operate on chunks of 8
+; limbs/per iteration, until less than 8 limbs remain.
+
+; The 29k has no addition or subtraction instructions that don't
+; affect carry, so we need to save and restore that as soon as we
+; adjust the pointers. gr116 is used for this purpose. Note that
+; gr116==0 means that carry should be set.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_add_n
+ .word 0x60000
+___gmpn_add_n:
+ srl gr117,lr5,3
+ sub gr118,gr117,1
+ jmpt gr118,Ltail
+ constn gr116,-1 ; init cy reg
+ sub gr117,gr117,2 ; count for jmpfdec
+
+; Main loop working 8 limbs/iteration.
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr3
+ add lr3,lr3,32
+ mtsrim cr,(8-1)
+ loadm 0,0,gr104,lr4
+ add lr4,lr4,32
+
+ subr gr116,gr116,0 ; restore carry
+ addc gr96,gr96,gr104
+ addc gr97,gr97,gr105
+ addc gr98,gr98,gr106
+ addc gr99,gr99,gr107
+ addc gr100,gr100,gr108
+ addc gr101,gr101,gr109
+ addc gr102,gr102,gr110
+ addc gr103,gr103,gr111
+ subc gr116,gr116,gr116 ; gr116 = not(cy)
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr96,lr2
+ jmpfdec gr117,Loop
+ add lr2,lr2,32
+
+; Code for the last up-to-7 limbs.
+; This code might look very strange, but it's hard to write it
+; differently without major slowdown.
+
+ and lr5,lr5,(8-1)
+Ltail: sub gr118,lr5,1 ; count for CR
+ jmpt gr118,Lend
+ sub gr117,lr5,2 ; count for jmpfdec
+
+ mtsr cr,gr118
+ loadm 0,0,gr96,lr3
+ mtsr cr,gr118
+ loadm 0,0,gr104,lr4
+
+ subr gr116,gr116,0 ; restore carry
+
+ jmpfdec gr117,L1
+ addc gr96,gr96,gr104
+ jmp Lstore
+ mtsr cr,gr118
+L1: jmpfdec gr117,L2
+ addc gr97,gr97,gr105
+ jmp Lstore
+ mtsr cr,gr118
+L2: jmpfdec gr117,L3
+ addc gr98,gr98,gr106
+ jmp Lstore
+ mtsr cr,gr118
+L3: jmpfdec gr117,L4
+ addc gr99,gr99,gr107
+ jmp Lstore
+ mtsr cr,gr118
+L4: jmpfdec gr117,L5
+ addc gr100,gr100,gr108
+ jmp Lstore
+ mtsr cr,gr118
+L5: jmpfdec gr117,L6
+ addc gr101,gr101,gr109
+ jmp Lstore
+ mtsr cr,gr118
+L6: addc gr102,gr102,gr110
+
+Lstore: storem 0,0,gr96,lr2
+ subc gr116,gr116,gr116 ; gr116 = not(cy)
+
+Lend: jmpi lr0
+ add gr96,gr116,1
diff --git a/rts/gmp/mpn/a29k/addmul_1.s b/rts/gmp/mpn/a29k/addmul_1.s
new file mode 100644
index 0000000000..f51b6d7af6
--- /dev/null
+++ b/rts/gmp/mpn/a29k/addmul_1.s
@@ -0,0 +1,113 @@
+; 29000 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and
+; add the product to a second limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; size lr4
+; s2_limb lr5
+
+ .cputype 29050
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_addmul_1
+ .word 0x60000
+___gmpn_addmul_1:
+ sub lr4,lr4,8
+ jmpt lr4,Ltail
+ const gr120,0 ; init cylimb reg
+
+ srl gr117,lr4,3 ; divide by 8
+ sub gr117,gr117,1 ; count for jmpfdec
+
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr3
+ add lr3,lr3,32
+
+ multiplu gr104,gr96,lr5
+ multmu gr96,gr96,lr5
+ multiplu gr105,gr97,lr5
+ multmu gr97,gr97,lr5
+ multiplu gr106,gr98,lr5
+ multmu gr98,gr98,lr5
+ multiplu gr107,gr99,lr5
+ multmu gr99,gr99,lr5
+ multiplu gr108,gr100,lr5
+ multmu gr100,gr100,lr5
+ multiplu gr109,gr101,lr5
+ multmu gr101,gr101,lr5
+ multiplu gr110,gr102,lr5
+ multmu gr102,gr102,lr5
+ multiplu gr111,gr103,lr5
+ multmu gr103,gr103,lr5
+
+ add gr104,gr104,gr120
+ addc gr105,gr105,gr96
+ addc gr106,gr106,gr97
+ addc gr107,gr107,gr98
+ addc gr108,gr108,gr99
+ addc gr109,gr109,gr100
+ addc gr110,gr110,gr101
+ addc gr111,gr111,gr102
+ addc gr120,gr103,0
+
+ mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr2
+
+ add gr104,gr96,gr104
+ addc gr105,gr97,gr105
+ addc gr106,gr98,gr106
+ addc gr107,gr99,gr107
+ addc gr108,gr100,gr108
+ addc gr109,gr101,gr109
+ addc gr110,gr102,gr110
+ addc gr111,gr103,gr111
+ addc gr120,gr120,0
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr104,lr2
+ jmpfdec gr117,Loop
+ add lr2,lr2,32
+
+Ltail: and lr4,lr4,(8-1)
+ sub gr118,lr4,1 ; count for CR
+ jmpt gr118,Lend
+ sub lr4,lr4,2
+ sub lr2,lr2,4 ; offset res_ptr by one limb
+
+Loop2: load 0,0,gr116,lr3
+ add lr3,lr3,4
+ multiplu gr117,gr116,lr5
+ multmu gr118,gr116,lr5
+ add lr2,lr2,4
+ load 0,0,gr119,lr2
+ add gr117,gr117,gr120
+ addc gr118,gr118,0
+ add gr117,gr117,gr119
+ store 0,0,gr117,lr2
+ jmpfdec lr4,Loop2
+ addc gr120,gr118,0
+
+Lend: jmpi lr0
+ or gr96,gr120,0 ; copy
diff --git a/rts/gmp/mpn/a29k/lshift.s b/rts/gmp/mpn/a29k/lshift.s
new file mode 100644
index 0000000000..93e1917127
--- /dev/null
+++ b/rts/gmp/mpn/a29k/lshift.s
@@ -0,0 +1,93 @@
+; 29000 __gmpn_lshift --
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; size	lr4
+; cnt	lr5
+
+; We use the loadm/storem instructions and operate on chunks of 8
+; limbs/per iteration, until less than 8 limbs remain.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_lshift
+ .word 0x60000
+___gmpn_lshift:
+ sll gr116,lr4,2
+ add lr3,gr116,lr3
+ add lr2,gr116,lr2
+ sub lr3,lr3,4
+ load 0,0,gr119,lr3
+
+ subr gr116,lr5,32
+ srl gr96,gr119,gr116 ; return value
+ sub lr4,lr4,1 ; actual loop count is SIZE - 1
+
+ srl gr117,lr4,3 ; chunk count = (actual count) / 8
+ cpeq gr118,gr117,0
+ jmpt gr118,Ltail
+ mtsr fc,lr5
+
+ sub gr117,gr117,2 ; count for jmpfdec
+
+; Main loop working 8 limbs/iteration.
+Loop: sub lr3,lr3,32
+ mtsrim cr,(8-1)
+ loadm 0,0,gr100,lr3
+
+ extract gr109,gr119,gr107
+ extract gr108,gr107,gr106
+ extract gr107,gr106,gr105
+ extract gr106,gr105,gr104
+ extract gr105,gr104,gr103
+ extract gr104,gr103,gr102
+ extract gr103,gr102,gr101
+ extract gr102,gr101,gr100
+
+ sub lr2,lr2,32
+ mtsrim cr,(8-1)
+ storem 0,0,gr102,lr2
+ jmpfdec gr117,Loop
+ or gr119,gr100,0
+
+; Code for the last up-to-7 limbs.
+
+ and lr4,lr4,(8-1)
+Ltail: cpeq gr118,lr4,0
+ jmpt gr118,Lend
+ sub lr4,lr4,2 ; count for jmpfdec
+
+Loop2: sub lr3,lr3,4
+ load 0,0,gr116,lr3
+ extract gr117,gr119,gr116
+ sub lr2,lr2,4
+ store 0,0,gr117,lr2
+ jmpfdec lr4,Loop2
+ or gr119,gr116,0
+
+Lend: extract gr117,gr119,0
+ sub lr2,lr2,4
+ jmpi lr0
+ store 0,0,gr117,lr2
diff --git a/rts/gmp/mpn/a29k/mul_1.s b/rts/gmp/mpn/a29k/mul_1.s
new file mode 100644
index 0000000000..6bcf7ce0cf
--- /dev/null
+++ b/rts/gmp/mpn/a29k/mul_1.s
@@ -0,0 +1,97 @@
+; 29000 __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; size lr4
+; s2_limb lr5
+
+ .cputype 29050
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_mul_1
+ .word 0x60000
+___gmpn_mul_1:
+ sub lr4,lr4,8
+ jmpt lr4,Ltail
+ const gr120,0 ; init cylimb reg
+
+ srl gr117,lr4,3 ; divide by 8
+ sub gr117,gr117,1 ; count for jmpfdec
+
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr3
+ add lr3,lr3,32
+
+ multiplu gr104,gr96,lr5
+ multmu gr96,gr96,lr5
+ multiplu gr105,gr97,lr5
+ multmu gr97,gr97,lr5
+ multiplu gr106,gr98,lr5
+ multmu gr98,gr98,lr5
+ multiplu gr107,gr99,lr5
+ multmu gr99,gr99,lr5
+ multiplu gr108,gr100,lr5
+ multmu gr100,gr100,lr5
+ multiplu gr109,gr101,lr5
+ multmu gr101,gr101,lr5
+ multiplu gr110,gr102,lr5
+ multmu gr102,gr102,lr5
+ multiplu gr111,gr103,lr5
+ multmu gr103,gr103,lr5
+
+ add gr104,gr104,gr120
+ addc gr105,gr105,gr96
+ addc gr106,gr106,gr97
+ addc gr107,gr107,gr98
+ addc gr108,gr108,gr99
+ addc gr109,gr109,gr100
+ addc gr110,gr110,gr101
+ addc gr111,gr111,gr102
+ addc gr120,gr103,0
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr104,lr2
+ jmpfdec gr117,Loop
+ add lr2,lr2,32
+
+Ltail: and lr4,lr4,(8-1)
+ sub gr118,lr4,1 ; count for CR
+ jmpt gr118,Lend
+ sub lr4,lr4,2
+ sub lr2,lr2,4 ; offset res_ptr by one limb
+
+Loop2: load 0,0,gr116,lr3
+ add lr3,lr3,4
+ multiplu gr117,gr116,lr5
+ multmu gr118,gr116,lr5
+ add lr2,lr2,4
+ add gr117,gr117,gr120
+ store 0,0,gr117,lr2
+ jmpfdec lr4,Loop2
+ addc gr120,gr118,0
+
+Lend: jmpi lr0
+ or gr96,gr120,0 ; copy
diff --git a/rts/gmp/mpn/a29k/rshift.s b/rts/gmp/mpn/a29k/rshift.s
new file mode 100644
index 0000000000..ea163bff2b
--- /dev/null
+++ b/rts/gmp/mpn/a29k/rshift.s
@@ -0,0 +1,89 @@
+; 29000 __gmpn_rshift --
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; s2_ptr lr4
+; size lr5
+
+; We use the loadm/storem instructions and operate on chunks of 8
+; limbs/per iteration, until less than 8 limbs remain.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_rshift
+ .word 0x60000
+___gmpn_rshift:
+ load 0,0,gr119,lr3
+ add lr3,lr3,4
+
+ subr gr116,lr5,32
+ sll gr96,gr119,gr116 ; return value
+ sub lr4,lr4,1 ; actual loop count is SIZE - 1
+
+ srl gr117,lr4,3 ; chunk count = (actual count) / 8
+ cpeq gr118,gr117,0
+ jmpt gr118,Ltail
+ mtsr fc,gr116
+
+ sub gr117,gr117,2 ; count for jmpfdec
+
+; Main loop working 8 limbs/iteration.
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr100,lr3
+ add lr3,lr3,32
+
+ extract gr98,gr100,gr119
+ extract gr99,gr101,gr100
+ extract gr100,gr102,gr101
+ extract gr101,gr103,gr102
+ extract gr102,gr104,gr103
+ extract gr103,gr105,gr104
+ extract gr104,gr106,gr105
+ extract gr105,gr107,gr106
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr98,lr2
+ add lr2,lr2,32
+ jmpfdec gr117,Loop
+ or gr119,gr107,0
+
+; Code for the last up-to-7 limbs.
+
+ and lr4,lr4,(8-1)
+Ltail: cpeq gr118,lr4,0
+ jmpt gr118,Lend
+ sub lr4,lr4,2 ; count for jmpfdec
+
+Loop2: load 0,0,gr100,lr3
+ add lr3,lr3,4
+ extract gr117,gr100,gr119
+ store 0,0,gr117,lr2
+ add lr2,lr2,4
+ jmpfdec lr4,Loop2
+ or gr119,gr100,0
+
+Lend: srl gr117,gr119,lr5
+ jmpi lr0
+ store 0,0,gr117,lr2
diff --git a/rts/gmp/mpn/a29k/sub_n.s b/rts/gmp/mpn/a29k/sub_n.s
new file mode 100644
index 0000000000..c6b64c5bee
--- /dev/null
+++ b/rts/gmp/mpn/a29k/sub_n.s
@@ -0,0 +1,120 @@
+; 29000 __gmpn_sub -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; s2_ptr lr4
+; size lr5
+
+; We use the loadm/storem instructions and operate on chunks of 8
+; limbs/per iteration, until less than 8 limbs remain.
+
+; The 29k has no addition or subtraction instructions that don't
+; affect carry, so we need to save and restore that as soon as we
+; adjust the pointers. gr116 is used for this purpose. Note that
+; gr116==0 means that carry should be set.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_sub_n
+ .word 0x60000
+___gmpn_sub_n:
+ srl gr117,lr5,3
+ sub gr118,gr117,1
+ jmpt gr118,Ltail
+ constn gr116,-1 ; init cy reg
+ sub gr117,gr117,2 ; count for jmpfdec
+
+; Main loop working 8 limbs/iteration.
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr3
+ add lr3,lr3,32
+ mtsrim cr,(8-1)
+ loadm 0,0,gr104,lr4
+ add lr4,lr4,32
+
+ subr gr116,gr116,0 ; restore carry
+ subc gr96,gr96,gr104
+ subc gr97,gr97,gr105
+ subc gr98,gr98,gr106
+ subc gr99,gr99,gr107
+ subc gr100,gr100,gr108
+ subc gr101,gr101,gr109
+ subc gr102,gr102,gr110
+ subc gr103,gr103,gr111
+ subc gr116,gr116,gr116 ; gr116 = not(cy)
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr96,lr2
+ jmpfdec gr117,Loop
+ add lr2,lr2,32
+
+; Code for the last up-to-7 limbs.
+; This code might look very strange, but it's hard to write it
+; differently without major slowdown.
+
+ and lr5,lr5,(8-1)
+Ltail: sub gr118,lr5,1 ; count for CR
+ jmpt gr118,Lend
+ sub gr117,lr5,2 ; count for jmpfdec
+
+ mtsr cr,gr118
+ loadm 0,0,gr96,lr3
+ mtsr cr,gr118
+ loadm 0,0,gr104,lr4
+
+ subr gr116,gr116,0 ; restore carry
+
+ jmpfdec gr117,L1
+ subc gr96,gr96,gr104
+ jmp Lstore
+ mtsr cr,gr118
+L1: jmpfdec gr117,L2
+ subc gr97,gr97,gr105
+ jmp Lstore
+ mtsr cr,gr118
+L2: jmpfdec gr117,L3
+ subc gr98,gr98,gr106
+ jmp Lstore
+ mtsr cr,gr118
+L3: jmpfdec gr117,L4
+ subc gr99,gr99,gr107
+ jmp Lstore
+ mtsr cr,gr118
+L4: jmpfdec gr117,L5
+ subc gr100,gr100,gr108
+ jmp Lstore
+ mtsr cr,gr118
+L5: jmpfdec gr117,L6
+ subc gr101,gr101,gr109
+ jmp Lstore
+ mtsr cr,gr118
+L6: subc gr102,gr102,gr110
+
+Lstore: storem 0,0,gr96,lr2
+ subc gr116,gr116,gr116 ; gr116 = not(cy)
+
+Lend: jmpi lr0
+ add gr96,gr116,1
diff --git a/rts/gmp/mpn/a29k/submul_1.s b/rts/gmp/mpn/a29k/submul_1.s
new file mode 100644
index 0000000000..ef97d8d4e5
--- /dev/null
+++ b/rts/gmp/mpn/a29k/submul_1.s
@@ -0,0 +1,116 @@
+; 29000 __gmpn_submul_1 -- Multiply a limb vector with a single limb and
+; subtract the product from a second limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr lr2
+; s1_ptr lr3
+; size lr4
+; s2_limb lr5
+
+ .cputype 29050
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___gmpn_submul_1
+ .word 0x60000
+___gmpn_submul_1:
+ sub lr4,lr4,8
+ jmpt lr4,Ltail
+ const gr120,0 ; init cylimb reg
+
+ srl gr117,lr4,3 ; divide by 8
+ sub gr117,gr117,1 ; count for jmpfdec
+
+Loop: mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr3
+ add lr3,lr3,32
+
+ multiplu gr104,gr96,lr5
+ multmu gr96,gr96,lr5
+ multiplu gr105,gr97,lr5
+ multmu gr97,gr97,lr5
+ multiplu gr106,gr98,lr5
+ multmu gr98,gr98,lr5
+ multiplu gr107,gr99,lr5
+ multmu gr99,gr99,lr5
+ multiplu gr108,gr100,lr5
+ multmu gr100,gr100,lr5
+ multiplu gr109,gr101,lr5
+ multmu gr101,gr101,lr5
+ multiplu gr110,gr102,lr5
+ multmu gr102,gr102,lr5
+ multiplu gr111,gr103,lr5
+ multmu gr103,gr103,lr5
+
+ add gr104,gr104,gr120
+ addc gr105,gr105,gr96
+ addc gr106,gr106,gr97
+ addc gr107,gr107,gr98
+ addc gr108,gr108,gr99
+ addc gr109,gr109,gr100
+ addc gr110,gr110,gr101
+ addc gr111,gr111,gr102
+ addc gr120,gr103,0
+
+ mtsrim cr,(8-1)
+ loadm 0,0,gr96,lr2
+
+ sub gr96,gr96,gr104
+ subc gr97,gr97,gr105
+ subc gr98,gr98,gr106
+ subc gr99,gr99,gr107
+ subc gr100,gr100,gr108
+ subc gr101,gr101,gr109
+ subc gr102,gr102,gr110
+ subc gr103,gr103,gr111
+
+ add gr104,gr103,gr111 ; invert carry from previous sub
+ addc gr120,gr120,0
+
+ mtsrim cr,(8-1)
+ storem 0,0,gr96,lr2
+ jmpfdec gr117,Loop
+ add lr2,lr2,32
+
+Ltail: and lr4,lr4,(8-1)
+ sub gr118,lr4,1 ; count for CR
+ jmpt gr118,Lend
+ sub lr4,lr4,2
+ sub lr2,lr2,4 ; offset res_ptr by one limb
+
+Loop2: load 0,0,gr116,lr3
+ add lr3,lr3,4
+ multiplu gr117,gr116,lr5
+ multmu gr118,gr116,lr5
+ add lr2,lr2,4
+ load 0,0,gr119,lr2
+ add gr117,gr117,gr120
+ addc gr118,gr118,0
+ sub gr119,gr119,gr117
+ add gr104,gr119,gr117 ; invert carry from previous sub
+ store 0,0,gr119,lr2
+ jmpfdec lr4,Loop2
+ addc gr120,gr118,0
+
+Lend: jmpi lr0
+ or gr96,gr120,0 ; copy
diff --git a/rts/gmp/mpn/a29k/udiv.s b/rts/gmp/mpn/a29k/udiv.s
new file mode 100644
index 0000000000..fdd53a9a88
--- /dev/null
+++ b/rts/gmp/mpn/a29k/udiv.s
@@ -0,0 +1,30 @@
+; Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___udiv_qrnnd
+ .word 0x60000
+___udiv_qrnnd:
+ mtsr q,lr3
+ dividu gr96,lr4,lr5
+ mfsr gr116,q
+ jmpi lr0
+ store 0,0,gr116,lr2
diff --git a/rts/gmp/mpn/a29k/umul.s b/rts/gmp/mpn/a29k/umul.s
new file mode 100644
index 0000000000..7741981167
--- /dev/null
+++ b/rts/gmp/mpn/a29k/umul.s
@@ -0,0 +1,29 @@
+; Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+ .sect .lit,lit
+ .text
+ .align 4
+ .global ___umul_ppmm
+ .word 0x50000
+___umul_ppmm:
+ multiplu gr116,lr3,lr4
+ multmu gr96,lr3,lr4
+ jmpi lr0
+ store 0,0,gr116,lr2
diff --git a/rts/gmp/mpn/alpha/README b/rts/gmp/mpn/alpha/README
new file mode 100644
index 0000000000..744260c7c5
--- /dev/null
+++ b/rts/gmp/mpn/alpha/README
@@ -0,0 +1,224 @@
+This directory contains mpn functions optimized for DEC Alpha processors.
+
+ALPHA ASSEMBLY RULES AND REGULATIONS
+
+The `.prologue N' pseudo op marks the end of instruction that needs
+special handling by unwinding. It also says whether $27 is really
+needed for computing the gp. The `.mask M' pseudo op says which
+registers are saved on the stack, and at what offset in the frame.
+
+Cray code is very very different...
+
+
+RELEVANT OPTIMIZATION ISSUES
+
+EV4
+
+1. This chip has very limited store bandwidth. The on-chip L1 cache is
+   write-through, and a cache line is transferred from the store buffer to
+   the off-chip L2 in as much as 15 cycles on most systems.  This delay hurts
+ mpn_add_n, mpn_sub_n, mpn_lshift, and mpn_rshift.
+
+2. Pairing is possible between memory instructions and integer arithmetic
+ instructions.
+
+3. mulq and umulh are documented to have a latency of 23 cycles, but 2 of
+ these cycles are pipelined. Thus, multiply instructions can be issued at
+ a rate of one each 21st cycle.
+
+EV5
+
+1. The memory bandwidth of this chip seems excellent, both for loads and
+ stores. Even when the working set is larger than the on-chip L1 and L2
+   caches, the performance remains almost unaffected.
+
+2. mulq has a latency of 12 cycles and an issue rate of 1 each 8th cycle.
+ umulh has a measured latency of 14 cycles and an issue rate of 1 each
+ 10th cycle. But the exact timing is somewhat confusing.
+
+3. mpn_add_n. With 4-fold unrolling, we need 37 instructions, whereof 12
+ are memory operations. This will take at least
+ ceil(37/2) [dual issue] + 1 [taken branch] = 19 cycles
+ We have 12 memory cycles, plus 4 after-store conflict cycles, or 16 data
+ cache cycles, which should be completely hidden in the 19 issue cycles.
+ The computation is inherently serial, with these dependencies:
+
+ ldq ldq
+ \ /\
+ (or) addq |
+ |\ / \ |
+ | addq cmpult
+ \ | |
+ cmpult |
+ \ /
+ or
+
+ I.e., 3 operations are needed between carry-in and carry-out, making 12
+ cycles the absolute minimum for the 4 limbs. We could replace the `or'
+   with a cmoveq/cmovne, which could issue one cycle earlier than the `or',
+   but that might waste a cycle on EV4.  The total depth remains unaffected,
+ since cmov has a latency of 2 cycles.
+
+ addq
+ / \
+ addq cmpult
+ | \
+ cmpult -> cmovne
+
+Montgomery has a slightly different way of computing carry that requires one
+less instruction, but has depth 4 (instead of the current 3). Since the
+code is currently instruction issue bound, Montgomery's idea should save us
+1/2 cycle per limb, or bring us down to a total of 17 cycles or 4.25
+cycles/limb. Unfortunately, this method will not be good for the EV6.
+
+EV6
+
+Here we have a really parallel pipeline, capable of issuing up to 4 integer
+instructions per cycle. One integer multiply instruction can issue each
+cycle. To get optimal speed, we need to pretend we are vectorizing the code,
+i.e., minimize the iterative dependencies.
+
+There are two dependencies to watch out for. 1) Address arithmetic
+dependencies, and 2) carry propagation dependencies.
+
+We can avoid serializing due to address arithmetic by unrolling the loop, so
+that addresses don't depend heavily on an index variable. Avoiding
+serializing because of carry propagation is trickier; the ultimate performance
+of the code will be determined of the number of latency cycles it takes from
+accepting carry-in to a vector point until we can generate carry-out.
+
+Most integer instructions can execute in either the L0, U0, L1, or U1
+pipelines. Shifts only execute in U0 and U1, and multiply only in U1.
+
+CMOV instructions split into two internal instructions, CMOV1 and CMOV2, but
+they execute efficiently.  But CMOV splits the mapping process (see pg 2-26 in
+cmpwrgd.pdf), suggesting the CMOV should always be placed as the last
+instruction of an aligned 4 instruction block (?).
+
+Perhaps the most important issue is the latency between the L0/U0 and L1/U1
+clusters; a result obtained on either cluster has an extra cycle of latency
+for consumers in the opposite cluster. Because of the dynamic nature of the
+implementation, it is hard to predict where an instruction will execute.
+
+The shift loops need (per limb):
+ 1 load (Lx pipes)
+ 1 store (Lx pipes)
+ 2 shift (Ux pipes)
+ 1 iaddlog (Lx pipes, Ux pipes)
+Obviously, since the pipes are very equally loaded, we should get 4 insn/cycle, or 1.25 cycles/limb.
+
+For mpn_add_n, we currently have
+ 2 load (Lx pipes)
+ 1 store (Lx pipes)
+ 5 iaddlog (Lx pipes, Ux pipes)
+
+Again, we have a perfect balance and will be limited by carry propagation
+delays, currently three cycles.  The superoptimizer indicates that there
+might be sequences that--using a final cmov--have a carry propagation delay
+of just two. Montgomery's subtraction sequence could perhaps be used, by
+complementing some operands. All in all, we should get down to 2 cycles
+without much problems.
+
+For mpn_mul_1, we could do, just like for mpn_add_n:
+ not newlo,notnewlo
+ addq cylimb,newlo,newlo || cmpult cylimb,notnewlo,cyout
+ addq cyout,newhi,cylimb
+and get 2-cycle carry propagation. The instructions needed will be
+ 1 ld (Lx pipes)
+ 1 st (Lx pipes)
+ 2 mul (U1 pipe)
+ 4 iaddlog (Lx pipes, Ux pipes)
+issue1: addq not mul ld
+issue2: cmpult addq mul st
+Conclusion: no cluster delays and 2-cycle carry delays will give us 2 cycles/limb!
+
+Last, we have mpn_addmul_1. Almost certainly, we will get down to 3
+cycles/limb, which would be absolutely awesome.
+
+Old, perhaps obsolete addmul_1 dependency diagram (needs 175 columns wide screen):
+
+ i
+ s
+ s i
+ u n
+ e s
+ d t
+ r
+ i u
+l n c
+i s t
+v t i
+e r o
+ u n
+v c
+a t t
+l i y
+u o p
+e n e
+s s s
+ issue
+ in
+ cycle
+ -1 ldq
+ / \
+ 0 | \
+ | \
+ 1 | |
+ | |
+ 2 | | ldq
+ | | / \
+ 3 | mulq | \
+ | \ | \
+ 4 umulh \ | |
+ | | | |
+ 5 | | | | ldq
+ | | | | / \
+ 4calm 6 | | ldq | mulq | \
+ | | / | \ | \
+ 4casm 7 | | / umulh \ | |
+6 | || | | | |
+ 3aal 8 | || | | | | ldq
+7 | || | | | | / \
+ 4calm 9 | || | | ldq | mulq | \
+9 | || | | / | \ | \
+ 4casm 10 | || | | / umulh \ | |
+9 | || | || | | | |
+ 3aal 11 | addq | || | | | | ldq
+9 | // \ | || | | | | / \
+ 4calm 12 \ cmpult addq<-cy | || | | ldq | mulq | \
+13 \ / // \ | || | | / | \ | \
+ 4casm 13 addq cmpult stq | || | | / umulh \ | |
+11 \ / | || | || | | | |
+ 3aal 14 addq | addq | || | | | | ldq
+10 \ | // \ | || | | | | / \
+ 4calm 15 cy ----> \ cmpult addq<-cy | || | | ldq | mulq | \
+13 \ / // \ | || | | / | \ | \
+ 4casm 16 addq cmpult stq | || | | / umulh \ | |
+11 \ / | || | || | | | |
+ 3aal 17 addq | addq | || | | | |
+10 \ | // \ | || | | | |
+ 4calm 18 cy ----> \ cmpult addq<-cy | || | | ldq | mulq
+13 \ / // \ | || | | / | \
+ 4casm 19 addq cmpult stq | || | | / umulh \
+11 \ / | || | || | |
+ 3aal 20 addq | addq | || | |
+10 \ | // \ | || | |
+ 4calm 21 cy ----> \ cmpult addq<-cy | || | | ldq
+ \ / // \ | || | | /
+ 22 addq cmpult stq | || | | /
+ \ / | || | ||
+ 23 addq | addq | ||
+ \ | // \ | ||
+ 24 cy ----> \ cmpult addq<-cy | ||
+ \ / // \ | ||
+ 25 addq cmpult stq | ||
+ \ / | ||
+ 26 addq | addq
+ \ | // \
+ 27 cy ----> \ cmpult addq<-cy
+ \ / // \
+ 28 addq cmpult stq
+ \ /
+As many as 6 consecutive points will be under execution simultaneously, or if we addq
+schedule loads even further away, maybe 7 or 8. But the number of live quantities \
+is reasonable, and can easily be satisfied. cy ---->
diff --git a/rts/gmp/mpn/alpha/add_n.asm b/rts/gmp/mpn/alpha/add_n.asm
new file mode 100644
index 0000000000..08d6a9f7b8
--- /dev/null
+++ b/rts/gmp/mpn/alpha/add_n.asm
@@ -0,0 +1,114 @@
+dnl Alpha mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ ldq r3,0(r17)
+ ldq r4,0(r18)
+
+ subq r19,1,r19
+ and r19,4-1,r2 C number of limbs in first loop
+ bis r31,r31,r0
+ beq r2,$L0 C if multiple of 4 limbs, skip first loop
+
+ subq r19,r2,r19
+
+$Loop0: subq r2,1,r2
+ ldq r5,8(r17)
+ addq r4,r0,r4
+ ldq r6,8(r18)
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+
+ addq r17,8,r17
+ addq r18,8,r18
+ bis r5,r5,r3
+ bis r6,r6,r4
+ addq r16,8,r16
+ bne r2,$Loop0
+
+$L0: beq r19,$Lend
+
+ ALIGN(8)
+$Loop: subq r19,4,r19
+
+ ldq r5,8(r17)
+ addq r4,r0,r4
+ ldq r6,8(r18)
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+
+ ldq r3,16(r17)
+ addq r6,r0,r6
+ ldq r4,16(r18)
+ cmpult r6,r0,r1
+ addq r5,r6,r6
+ cmpult r6,r5,r0
+ stq r6,8(r16)
+ bis r0,r1,r0
+
+ ldq r5,24(r17)
+ addq r4,r0,r4
+ ldq r6,24(r18)
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,16(r16)
+ bis r0,r1,r0
+
+ ldq r3,32(r17)
+ addq r6,r0,r6
+ ldq r4,32(r18)
+ cmpult r6,r0,r1
+ addq r5,r6,r6
+ cmpult r6,r5,r0
+ stq r6,24(r16)
+ bis r0,r1,r0
+
+ addq r17,32,r17
+ addq r18,32,r18
+ addq r16,32,r16
+ bne r19,$Loop
+
+$Lend: addq r4,r0,r4
+ cmpult r4,r0,r1
+ addq r3,r4,r4
+ cmpult r4,r3,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+ ret r31,(r26),1
+EPILOGUE(mpn_add_n)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/addmul_1.asm b/rts/gmp/mpn/alpha/addmul_1.asm
new file mode 100644
index 0000000000..4ea900be6b
--- /dev/null
+++ b/rts/gmp/mpn/alpha/addmul_1.asm
@@ -0,0 +1,87 @@
+dnl Alpha __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7
+dnl cycles/limb on EV6.
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Lend1 C jump if size was == 1
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ addq r5,r3,r3
+ cmpult r3,r5,r4
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ subq r18,1,r18 C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5,r3,r3
+ cmpult r3,r5,r5
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ addq r5,r0,r0 C combine carries
+ bne r18,$Loop
+
+$Lend2: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5,r3,r3
+ cmpult r3,r5,r5
+ stq r3,0(r16)
+ addq r5,r0,r0 C combine carries
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+$Lend1: addq r5,r3,r3
+ cmpult r3,r5,r5
+ stq r3,0(r16)
+ addq r0,r5,r0
+ ret r31,(r26),1
+EPILOGUE(mpn_addmul_1)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/cntlz.asm b/rts/gmp/mpn/alpha/cntlz.asm
new file mode 100644
index 0000000000..febb3b70d9
--- /dev/null
+++ b/rts/gmp/mpn/alpha/cntlz.asm
@@ -0,0 +1,68 @@
+dnl Alpha auxiliary for longlong.h's count_leading_zeros
+
+dnl Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl DISCUSSION:
+
+dnl Other methods have been tried, and using a 128-entry table actually trims
+dnl about 10% of the execution time (on a 21164) when the table is in the L1
+dnl cache. But under non-benchmarking conditions, the table will hardly be in
+dnl the L1 cache. Tricky bit-fiddling methods with multiplies and magic tables
+dnl are also possible, but they require many more instructions than the current
+dnl code. (But for count_trailing_zeros, such tricks are beneficial.)
+dnl Finally, converting to floating-point and extracting the exponent is much
+dnl slower.
+
+ASM_START()
+PROLOGUE(MPN(count_leading_zeros))
+ bis r31,63,r0 C initialize partial result count
+
+ srl r16,32,r1 C shift down 32 steps -> r1
+ cmovne r1,r1,r16 C select r1 if non-zero
+ cmovne r1,31,r0 C if r1 is nonzero choose smaller count
+
+ srl r16,16,r1 C shift down 16 steps -> r1
+ subq r0,16,r2 C generate new partial result count
+ cmovne r1,r1,r16 C choose new r1 if non-zero
+ cmovne r1,r2,r0 C choose new count if r1 was non-zero
+
+ srl r16,8,r1
+ subq r0,8,r2
+ cmovne r1,r1,r16
+ cmovne r1,r2,r0
+
+ srl r16,4,r1
+ subq r0,4,r2
+ cmovne r1,r1,r16
+ cmovne r1,r2,r0
+
+ srl r16,2,r1
+ subq r0,2,r2
+ cmovne r1,r1,r16
+ cmovne r1,r2,r0
+
+ srl r16,1,r1 C extract bit 1
+ subq r0,r1,r0 C subtract it from partial result
+
+ ret r31,(r26),1
+EPILOGUE(MPN(count_leading_zeros))
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/default.m4 b/rts/gmp/mpn/alpha/default.m4
new file mode 100644
index 0000000000..5f4c48dc73
--- /dev/null
+++ b/rts/gmp/mpn/alpha/default.m4
@@ -0,0 +1,77 @@
+divert(-1)
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+define(`ASM_START',
+ `
+ .set noreorder
+ .set noat')
+
+define(`X',`0x$1')
+define(`FLOAT64',
+ `
+ .align 3
+$1: .t_floating $2')
+
+define(`PROLOGUE',
+ `
+ .text
+ .align 3
+ .globl $1
+ .ent $1
+$1:
+ .frame r30,0,r26
+ .prologue 0')
+
+define(`PROLOGUE_GP',
+ `
+ .text
+ .align 3
+ .globl $1
+ .ent $1
+$1:
+ ldgp r29,0(r27)
+ .frame r30,0,r26
+ .prologue 1')
+
+define(`EPILOGUE',
+ `
+ .end $1')
+
+dnl Map register names r0, r1, etc, to `$0', `$1', etc.
+dnl This is needed on all systems but Unicos
+forloop(i,0,31,
+`define(`r'i,``$''i)'
+)
+forloop(i,0,31,
+`define(`f'i,``$f''i)'
+)
+
+define(`DATASTART',
+ `dnl
+ DATA
+$1:')
+define(`DATAEND',`dnl')
+
+define(`ASM_END',`dnl')
+
+divert
diff --git a/rts/gmp/mpn/alpha/ev5/add_n.asm b/rts/gmp/mpn/alpha/ev5/add_n.asm
new file mode 100644
index 0000000000..716d6404ae
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev5/add_n.asm
@@ -0,0 +1,143 @@
+dnl Alpha EV5 __gmpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ bis r31,r31,r25 C clear cy
+ subq r19,4,r19 C decr loop cnt
+ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ ldq r1,8(r18)
+ ldq r5,8(r17)
+ addq r17,32,r17 C update s1_ptr
+ ldq r2,16(r18)
+ addq r0,r4,r20 C 1st main add
+ ldq r3,24(r18)
+ subq r19,4,r19 C decr loop cnt
+ ldq r6,-16(r17)
+ cmpult r20,r0,r25 C compute cy from last add
+ ldq r7,-8(r17)
+ addq r1,r5,r28 C 2nd main add
+ addq r18,32,r18 C update s2_ptr
+ addq r28,r25,r21 C 2nd carry add
+ cmpult r28,r5,r8 C compute cy from last add
+ blt r19,$Lend1 C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+ ALIGN(16)
+$Loop: cmpult r21,r28,r25 C compute cy from last add
+ ldq r0,0(r18)
+ bis r8,r25,r25 C combine cy from the two adds
+ ldq r1,8(r18)
+ addq r2,r6,r28 C 3rd main add
+ ldq r4,0(r17)
+ addq r28,r25,r22 C 3rd carry add
+ ldq r5,8(r17)
+ cmpult r28,r6,r8 C compute cy from last add
+ cmpult r22,r28,r25 C compute cy from last add
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two adds
+ stq r21,8(r16)
+ addq r3,r7,r28 C 4th main add
+ addq r28,r25,r23 C 4th carry add
+ cmpult r28,r7,r8 C compute cy from last add
+ cmpult r23,r28,r25 C compute cy from last add
+ addq r17,32,r17 C update s1_ptr
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,32,r16 C update res_ptr
+ addq r0,r4,r28 C 1st main add
+ ldq r2,16(r18)
+ addq r25,r28,r20 C 1st carry add
+ ldq r3,24(r18)
+ cmpult r28,r4,r8 C compute cy from last add
+ ldq r6,-16(r17)
+ cmpult r20,r28,r25 C compute cy from last add
+ ldq r7,-8(r17)
+ bis r8,r25,r25 C combine cy from the two adds
+ subq r19,4,r19 C decr loop cnt
+ stq r22,-16(r16)
+ addq r1,r5,r28 C 2nd main add
+ stq r23,-8(r16)
+ addq r25,r28,r21 C 2nd carry add
+ addq r18,32,r18 C update s2_ptr
+ cmpult r28,r5,r8 C compute cy from last add
+ bge r19,$Loop
+C Finish software pipeline for 1st loop
+$Lend1: cmpult r21,r28,r25 C compute cy from last add
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r2,r6,r28 C 3rd main add
+ addq r28,r25,r22 C 3rd carry add
+ cmpult r28,r6,r8 C compute cy from last add
+ cmpult r22,r28,r25 C compute cy from last add
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two adds
+ stq r21,8(r16)
+ addq r3,r7,r28 C 4th main add
+ addq r28,r25,r23 C 4th carry add
+ cmpult r28,r7,r8 C compute cy from last add
+ cmpult r23,r28,r25 C compute cy from last add
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,32,r16 C update res_ptr
+ stq r22,-16(r16)
+ stq r23,-8(r16)
+$Lend2: addq r19,4,r19 C restore loop cnt
+ beq r19,$Lret
+C Start software pipeline for 2nd loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ subq r19,1,r19
+ beq r19,$Lend0
+C 2nd loop handles remaining 1-3 limbs
+ ALIGN(16)
+$Loop0: addq r0,r4,r28 C main add
+ ldq r0,8(r18)
+ cmpult r28,r4,r8 C compute cy from last add
+ ldq r4,8(r17)
+ addq r28,r25,r20 C carry add
+ addq r18,8,r18
+ addq r17,8,r17
+ stq r20,0(r16)
+ cmpult r20,r28,r25 C compute cy from last add
+ subq r19,1,r19 C decr loop cnt
+ bis r8,r25,r25 C combine cy from the two adds
+ addq r16,8,r16
+ bne r19,$Loop0
+$Lend0: addq r0,r4,r28 C main add
+ addq r28,r25,r20 C carry add
+ cmpult r28,r4,r8 C compute cy from last add
+ cmpult r20,r28,r25 C compute cy from last add
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two adds
+
+$Lret: bis r25,r31,r0 C return cy
+ ret r31,(r26),1
+EPILOGUE(mpn_add_n)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev5/lshift.asm b/rts/gmp/mpn/alpha/ev5/lshift.asm
new file mode 100644
index 0000000000..cb181dda66
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev5/lshift.asm
@@ -0,0 +1,169 @@
+dnl Alpha EV5 __gmpn_lshift -- Shift a number left.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 3.25 cycles/limb on the EV5.
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ s8addq r18,r17,r17 C make r17 point at end of s1
+ ldq r4,-8(r17) C load first limb
+ subq r31,r19,r20
+ s8addq r18,r16,r16 C make r16 point at end of RES
+ subq r18,1,r18
+ and r18,4-1,r28 C number of limbs in first loop
+ srl r4,r20,r0 C compute function result
+
+ beq r28,$L0
+ subq r18,r28,r18
+
+ ALIGN(8)
+$Loop0: ldq r3,-16(r17)
+ subq r16,8,r16
+ sll r4,r19,r5
+ subq r17,8,r17
+ subq r28,1,r28
+ srl r3,r20,r6
+ bis r3,r3,r4
+ bis r5,r6,r8
+ stq r8,0(r16)
+ bne r28,$Loop0
+
+$L0: sll r4,r19,r24
+ beq r18,$Lend
+C warm up phase 1
+ ldq r1,-16(r17)
+ subq r18,4,r18
+ ldq r2,-24(r17)
+ ldq r3,-32(r17)
+ ldq r4,-40(r17)
+ beq r18,$Lend1
+C warm up phase 2
+ srl r1,r20,r7
+ sll r1,r19,r21
+ srl r2,r20,r8
+ ldq r1,-48(r17)
+ sll r2,r19,r22
+ ldq r2,-56(r17)
+ srl r3,r20,r5
+ bis r7,r24,r7
+ sll r3,r19,r23
+ bis r8,r21,r8
+ srl r4,r20,r6
+ ldq r3,-64(r17)
+ sll r4,r19,r24
+ ldq r4,-72(r17)
+ subq r18,4,r18
+ beq r18,$Lend2
+ ALIGN(16)
+C main loop
+$Loop: stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+
+ srl r1,r20,r7
+ subq r18,4,r18
+ sll r1,r19,r21
+ unop C ldq r31,-96(r17)
+
+ srl r2,r20,r8
+ ldq r1,-80(r17)
+ sll r2,r19,r22
+ ldq r2,-88(r17)
+
+ stq r5,-24(r16)
+ bis r7,r24,r7
+ stq r6,-32(r16)
+ bis r8,r21,r8
+
+ srl r3,r20,r5
+ unop C ldq r31,-96(r17)
+ sll r3,r19,r23
+ subq r16,32,r16
+
+ srl r4,r20,r6
+ ldq r3,-96(r17)
+ sll r4,r19,r24
+ ldq r4,-104(r17)
+
+ subq r17,32,r17
+ bne r18,$Loop
+C cool down phase 2/1
+$Lend2: stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+ srl r1,r20,r7
+ sll r1,r19,r21
+ srl r2,r20,r8
+ sll r2,r19,r22
+ stq r5,-24(r16)
+ bis r7,r24,r7
+ stq r6,-32(r16)
+ bis r8,r21,r8
+ srl r3,r20,r5
+ sll r3,r19,r23
+ srl r4,r20,r6
+ sll r4,r19,r24
+C cool down phase 2/2
+ stq r7,-40(r16)
+ bis r5,r22,r5
+ stq r8,-48(r16)
+ bis r6,r23,r6
+ stq r5,-56(r16)
+ stq r6,-64(r16)
+C cool down phase 2/3
+ stq r24,-72(r16)
+ ret r31,(r26),1
+
+C cool down phase 1/1
+$Lend1: srl r1,r20,r7
+ sll r1,r19,r21
+ srl r2,r20,r8
+ sll r2,r19,r22
+ srl r3,r20,r5
+ bis r7,r24,r7
+ sll r3,r19,r23
+ bis r8,r21,r8
+ srl r4,r20,r6
+ sll r4,r19,r24
+C cool down phase 1/2
+ stq r7,-8(r16)
+ bis r5,r22,r5
+ stq r8,-16(r16)
+ bis r6,r23,r6
+ stq r5,-24(r16)
+ stq r6,-32(r16)
+ stq r24,-40(r16)
+ ret r31,(r26),1
+
+$Lend: stq r24,-8(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_lshift)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev5/rshift.asm b/rts/gmp/mpn/alpha/ev5/rshift.asm
new file mode 100644
index 0000000000..9940d83fad
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev5/rshift.asm
@@ -0,0 +1,167 @@
+dnl Alpha EV5 __gmpn_rshift -- Shift a number right.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 3.25 cycles/limb on the EV5.
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ldq r4,0(r17) C load first limb
+ subq r31,r19,r20
+ subq r18,1,r18
+ and r18,4-1,r28 C number of limbs in first loop
+ sll r4,r20,r0 C compute function result
+
+ beq r28,$L0
+ subq r18,r28,r18
+
+ ALIGN(8)
+$Loop0: ldq r3,8(r17)
+ addq r16,8,r16
+ srl r4,r19,r5
+ addq r17,8,r17
+ subq r28,1,r28
+ sll r3,r20,r6
+ bis r3,r3,r4
+ bis r5,r6,r8
+ stq r8,-8(r16)
+ bne r28,$Loop0
+
+$L0: srl r4,r19,r24
+ beq r18,$Lend
+C warm up phase 1
+ ldq r1,8(r17)
+ subq r18,4,r18
+ ldq r2,16(r17)
+ ldq r3,24(r17)
+ ldq r4,32(r17)
+ beq r18,$Lend1
+C warm up phase 2
+ sll r1,r20,r7
+ srl r1,r19,r21
+ sll r2,r20,r8
+ ldq r1,40(r17)
+ srl r2,r19,r22
+ ldq r2,48(r17)
+ sll r3,r20,r5
+ bis r7,r24,r7
+ srl r3,r19,r23
+ bis r8,r21,r8
+ sll r4,r20,r6
+ ldq r3,56(r17)
+ srl r4,r19,r24
+ ldq r4,64(r17)
+ subq r18,4,r18
+ beq r18,$Lend2
+ ALIGN(16)
+C main loop
+$Loop: stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+
+ sll r1,r20,r7
+ subq r18,4,r18
+ srl r1,r19,r21
+ unop C ldq r31,-96(r17)
+
+ sll r2,r20,r8
+ ldq r1,72(r17)
+ srl r2,r19,r22
+ ldq r2,80(r17)
+
+ stq r5,16(r16)
+ bis r7,r24,r7
+ stq r6,24(r16)
+ bis r8,r21,r8
+
+ sll r3,r20,r5
+ unop C ldq r31,-96(r17)
+ srl r3,r19,r23
+ addq r16,32,r16
+
+ sll r4,r20,r6
+ ldq r3,88(r17)
+ srl r4,r19,r24
+ ldq r4,96(r17)
+
+ addq r17,32,r17
+ bne r18,$Loop
+C cool down phase 2/1
+$Lend2: stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+ sll r1,r20,r7
+ srl r1,r19,r21
+ sll r2,r20,r8
+ srl r2,r19,r22
+ stq r5,16(r16)
+ bis r7,r24,r7
+ stq r6,24(r16)
+ bis r8,r21,r8
+ sll r3,r20,r5
+ srl r3,r19,r23
+ sll r4,r20,r6
+ srl r4,r19,r24
+C cool down phase 2/2
+ stq r7,32(r16)
+ bis r5,r22,r5
+ stq r8,40(r16)
+ bis r6,r23,r6
+ stq r5,48(r16)
+ stq r6,56(r16)
+C cool down phase 2/3
+ stq r24,64(r16)
+ ret r31,(r26),1
+
+C cool down phase 1/1
+$Lend1: sll r1,r20,r7
+ srl r1,r19,r21
+ sll r2,r20,r8
+ srl r2,r19,r22
+ sll r3,r20,r5
+ bis r7,r24,r7
+ srl r3,r19,r23
+ bis r8,r21,r8
+ sll r4,r20,r6
+ srl r4,r19,r24
+C cool down phase 1/2
+ stq r7,0(r16)
+ bis r5,r22,r5
+ stq r8,8(r16)
+ bis r6,r23,r6
+ stq r5,16(r16)
+ stq r6,24(r16)
+ stq r24,32(r16)
+ ret r31,(r26),1
+
+$Lend: stq r24,0(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_rshift)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev5/sub_n.asm b/rts/gmp/mpn/alpha/ev5/sub_n.asm
new file mode 100644
index 0000000000..5248a2aa38
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev5/sub_n.asm
@@ -0,0 +1,143 @@
+dnl Alpha EV5 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0
+dnl and store difference in a third limb vector.
+
+dnl Copyright (C) 1995, 1999, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ bis r31,r31,r25 C clear cy
+ subq r19,4,r19 C decr loop cnt
+ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ ldq r1,8(r18)
+ ldq r5,8(r17)
+ addq r17,32,r17 C update s1_ptr
+ ldq r2,16(r18)
+ subq r4,r0,r20 C 1st main subtract
+ ldq r3,24(r18)
+ subq r19,4,r19 C decr loop cnt
+ ldq r6,-16(r17)
+ cmpult r4,r0,r25 C compute cy from last subtract
+ ldq r7,-8(r17)
+ subq r5,r1,r28 C 2nd main subtract
+ addq r18,32,r18 C update s2_ptr
+ subq r28,r25,r21 C 2nd carry subtract
+ cmpult r5,r1,r8 C compute cy from last subtract
+ blt r19,$Lend1 C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+ ALIGN(16)
+$Loop: cmpult r28,r25,r25 C compute cy from last subtract
+ ldq r0,0(r18)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ ldq r1,8(r18)
+ subq r6,r2,r28 C 3rd main subtract
+ ldq r4,0(r17)
+ subq r28,r25,r22 C 3rd carry subtract
+ ldq r5,8(r17)
+ cmpult r6,r2,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ stq r21,8(r16)
+ subq r7,r3,r28 C 4th main subtract
+ subq r28,r25,r23 C 4th carry subtract
+ cmpult r7,r3,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ addq r17,32,r17 C update s1_ptr
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,32,r16 C update res_ptr
+ subq r4,r0,r28 C 1st main subtract
+ ldq r2,16(r18)
+ subq r28,r25,r20 C 1st carry subtract
+ ldq r3,24(r18)
+ cmpult r4,r0,r8 C compute cy from last subtract
+ ldq r6,-16(r17)
+ cmpult r28,r25,r25 C compute cy from last subtract
+ ldq r7,-8(r17)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ subq r19,4,r19 C decr loop cnt
+ stq r22,-16(r16)
+ subq r5,r1,r28 C 2nd main subtract
+ stq r23,-8(r16)
+ subq r28,r25,r21 C 2nd carry subtract
+ addq r18,32,r18 C update s2_ptr
+ cmpult r5,r1,r8 C compute cy from last subtract
+ bge r19,$Loop
+C Finish software pipeline for 1st loop
+$Lend1: cmpult r28,r25,r25 C compute cy from last subtract
+ bis r8,r25,r25 C combine cy from the two subtracts
+ subq r6,r2,r28 C cy add
+ subq r28,r25,r22 C 3rd main subtract
+ cmpult r6,r2,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+ stq r21,8(r16)
+ subq r7,r3,r28 C cy add
+ subq r28,r25,r23 C 4th main subtract
+ cmpult r7,r3,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,32,r16 C update res_ptr
+ stq r22,-16(r16)
+ stq r23,-8(r16)
+$Lend2: addq r19,4,r19 C restore loop cnt
+ beq r19,$Lret
+C Start software pipeline for 2nd loop
+ ldq r0,0(r18)
+ ldq r4,0(r17)
+ subq r19,1,r19
+ beq r19,$Lend0
+C 2nd loop handles remaining 1-3 limbs
+ ALIGN(16)
+$Loop0: subq r4,r0,r28 C main subtract
+ cmpult r4,r0,r8 C compute cy from last subtract
+ ldq r0,8(r18)
+ ldq r4,8(r17)
+ subq r28,r25,r20 C carry subtract
+ addq r18,8,r18
+ addq r17,8,r17
+ stq r20,0(r16)
+ cmpult r28,r25,r25 C compute cy from last subtract
+ subq r19,1,r19 C decr loop cnt
+ bis r8,r25,r25 C combine cy from the two subtracts
+ addq r16,8,r16
+ bne r19,$Loop0
+$Lend0: subq r4,r0,r28 C main subtract
+ subq r28,r25,r20 C carry subtract
+ cmpult r4,r0,r8 C compute cy from last subtract
+ cmpult r28,r25,r25 C compute cy from last subtract
+ stq r20,0(r16)
+ bis r8,r25,r25 C combine cy from the two subtracts
+
+$Lret: bis r25,r31,r0 C return cy
+ ret r31,(r26),1
+EPILOGUE(mpn_sub_n)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev6/addmul_1.asm b/rts/gmp/mpn/alpha/ev6/addmul_1.asm
new file mode 100644
index 0000000000..2f588626a5
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev6/addmul_1.asm
@@ -0,0 +1,474 @@
+dnl Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
+dnl exactly 3.625 cycles/limb on EV6...
+
+dnl This code was written in close cooperation with ev6 pipeline expert
+dnl Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though.
+dnl
+dnl Register usages for unrolled loop:
+dnl 0-3 mul's
+dnl 4-7 acc's
+dnl 8-15 mul results
+dnl 20,21 carry's
+dnl 22,23 save for stores
+
+dnl Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
+
+dnl The stores can issue a cycle late so we have paired no-op's to 'catch'
+dnl them, so that further disturbance to the schedule is damped.
+
+dnl We couldn't pair the loads, because the entangled schedule of the
+dnl carry's has to happen on one side {0} of the machine. Note, the total
+dnl use of U0, and the total use of L0 (after attending to the stores).
+dnl which is part of the reason why....
+
+dnl This is a great schedule for the d_cache, a poor schedule for the
+dnl b_cache. The lockup on U0 means that any stall can't be recovered
+dnl from. Consider a ldq in L1. Say that load gets stalled because it
+dnl collides with a fill from the b_cache. On the next cycle, this load
+dnl gets priority. It first looks at L0, and goes there. The instruction
+dnl we intended for L0 gets to look at L1, which is NOT where we want
+dnl it. It either stalls 1, because it can't go in L0, or goes there, and
+dnl causes a further instruction to stall.
+
+dnl So for b_cache, we're likely going to want to put one or more cycles
+dnl back into the code! And, of course, put in prefetches. For the
+dnl accumulator, lds, intent to modify. For the multiplier, you might
+dnl want ldq, evict next, if you're not wanting to use it again soon. Use
+dnl 256 ahead of present pointer value. At a place where we have an mt
+dnl followed by a bookkeeping, put the bookkeeping in upper, and the
+dnl prefetch into lower.
+
+dnl Note, the usage of physical registers per cycle is smoothed off, as
+dnl much as possible.
+
+dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
+dnl like not to have a ldq or stq to precede a conditional branch in a
+dnl quadpack. The conditional branch moves the retire pointer one cycle
+dnl later.
+
+dnl Optimization notes:
+dnl Callee-saves regs: r9 r10 r11 r12 r13 r14 r15 r26 ?r27?
+dnl Reserved regs: r29 r30 r31
+dnl Free caller-saves regs in unrolled code: r24 r25 r28
+dnl We should swap some of the callee-saves regs for some of the free
+dnl caller-saves regs, saving some overhead cycles.
+dnl Most importantly, we should write fast code for the 0-7 case.
+dnl The code we use there are for the 21164, and runs at 7 cycles/limb
+dnl on the 21264. Should not be hard, if we write specialized code for
+dnl 1-7 limbs (the one for 0 limbs should be straightforward). We then just
+dnl need a jump table indexed by the low 3 bits of the count argument.
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ cmpult r18, 8, r1
+ beq r1, $Large
+
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r18, 1, r18 C size--
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ umulh r2, r19, r0 C r0 = prod_high
+ beq r18, $Lend0b C jump if size was == 1
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r18, 1, r18 C size--
+ addq r5, r3, r3
+ cmpult r3, r5, r4
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ beq r18, $Lend0a C jump if size was == 2
+
+ ALIGN(8)
+$Loop0: mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ subq r18, 1, r18 C size--
+ umulh r2, r19, r4 C r4 = cy_limb
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r5, r0, r0 C combine carries
+ bne r18, $Loop0
+$Lend0a:
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ umulh r2, r19, r4 C r4 = cy_limb
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r5, r0, r0 C combine carries
+ addq r4, r0, r0 C cy_limb = prod_high + cy
+ ret r31, (r26), 1
+$Lend0b:
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r0, r5, r0
+ ret r31, (r26), 1
+
+$Large:
+ lda $30, -240($30)
+ stq $9, 8($30)
+ stq $10, 16($30)
+ stq $11, 24($30)
+ stq $12, 32($30)
+ stq $13, 40($30)
+ stq $14, 48($30)
+ stq $15, 56($30)
+
+ and r18, 7, r20 C count for the first loop, 0-7
+ srl r18, 3, r18 C count for unrolled loop
+ bis r31, r31, r0
+ beq r20, $Lunroll
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r20, 1, r20 C size--
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ umulh r2, r19, r0 C r0 = prod_high
+ beq r20, $Lend1b C jump if size was == 1
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ subq r20, 1, r20 C size--
+ addq r5, r3, r3
+ cmpult r3, r5, r4
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ beq r20, $Lend1a C jump if size was == 2
+
+ ALIGN(8)
+$Loop1: mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ subq r20, 1, r20 C size--
+ umulh r2, r19, r4 C r4 = cy_limb
+ ldq r2, 0(r17) C r2 = s1_limb
+ addq r17, 8, r17 C s1_ptr++
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r5, r0, r0 C combine carries
+ bne r20, $Loop1
+
+$Lend1a:
+ mulq r2, r19, r3 C r3 = prod_low
+ ldq r5, 0(r16) C r5 = *res_ptr
+ addq r4, r0, r0 C cy_limb = cy_limb + 'cy'
+ umulh r2, r19, r4 C r4 = cy_limb
+ addq r3, r0, r3 C r3 = cy_limb + prod_low
+ cmpult r3, r0, r0 C r0 = carry from (cy_limb + prod_low)
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r5, r0, r0 C combine carries
+ addq r4, r0, r0 C cy_limb = prod_high + cy
+ br r31, $Lunroll
+$Lend1b:
+ addq r5, r3, r3
+ cmpult r3, r5, r5
+ stq r3, 0(r16)
+ addq r16, 8, r16 C res_ptr++
+ addq r0, r5, r0
+
+$Lunroll:
+ lda r17, -16(r17) C L1 bookkeeping
+ lda r16, -16(r16) C L1 bookkeeping
+ bis r0, r31, r12
+
+C ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
+
+ ldq r2, 16(r17) C L1
+ ldq r3, 24(r17) C L1
+ lda r18, -1(r18) C L1 bookkeeping
+ ldq r6, 16(r16) C L1
+ ldq r7, 24(r16) C L1
+ ldq r0, 32(r17) C L1
+ mulq r19, r2, r13 C U1
+ ldq r1, 40(r17) C L1
+ umulh r19, r2, r14 C U1
+ mulq r19, r3, r15 C U1
+ lda r17, 64(r17) C L1 bookkeeping
+ ldq r4, 32(r16) C L1
+ ldq r5, 40(r16) C L1
+ umulh r19, r3, r8 C U1
+ ldq r2, -16(r17) C L1
+ mulq r19, r0, r9 C U1
+ ldq r3, -8(r17) C L1
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ mulq r19, r1, r11 C U1
+ cmpult r6, r13, r20 C L0 lo add => carry
+ lda r16, 64(r16) C L1 bookkeeping
+ addq r6, r12, r22 C U0 hi add => answer
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ ldq r6, -16(r16) C L1
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ ldq r7, -8(r16) C L1
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ ldq r0, 0(r17) C L1
+ mulq r19, r2, r13 C U1
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r1, 8(r17) C L1
+ umulh r19, r2, r14 C U1
+ addq r4, r9, r4 C L0 lo + acc
+ stq r22, -48(r16) C L0
+ stq r23, -40(r16) C L1
+ mulq r19, r3, r15 C U1
+ addq r8, r21, r8 C U0 hi mul + carry
+ cmpult r4, r9, r20 C L0 lo add => carry
+ addq r4, r8, r22 C U0 hi add => answer
+ ble r18, $Lend C U1 bookkeeping
+
+C ____ MAIN UNROLLED LOOP ____
+ ALIGN(16)
+$Loop:
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ ldq r4, 0(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ ldq r5, 8(r16) C L1
+
+ umulh r19, r3, r8 C U1
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ ldq r2, 16(r17) C L1
+
+ mulq r19, r0, r9 C U1
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ ldq r3, 24(r17) C L1
+
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ stq r22, -32(r16) C L0
+ stq r23, -24(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r1, r11 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r12, r21, r12 C U0 hi mul + carry
+
+ cmpult r6, r13, r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r18, -1(r18) C L1 bookkeeping
+ addq r6, r12, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ ldq r6, 16(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ ldq r7, 24(r16) C L1
+
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ ldq r0, 32(r17) C L1
+
+ mulq r19, r2, r13 C U1
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r1, 40(r17) C L1
+
+ umulh r19, r2, r14 C U1
+ addq r4, r9, r4 C U0 lo + acc
+ stq r22, -16(r16) C L0
+ stq r23, -8(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r3, r15 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C L0 hi mul + carry
+
+ cmpult r4, r9, r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r17, 64(r17) C L1 bookkeeping
+ addq r4, r8, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ ldq r4, 32(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ ldq r5, 40(r16) C L1
+
+ umulh r19, r3, r8 C U1
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ ldq r2, -16(r17) C L1
+
+ mulq r19, r0, r9 C U1
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ ldq r3, -8(r17) C L1
+
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ stq r22, 0(r16) C L0
+ stq r23, 8(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r1, r11 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r12, r21, r12 C U0 hi mul + carry
+
+ cmpult r6, r13, r20 C L0 lo add => carry
+ bis r31, r31, r31 C U1 mt
+ lda r16, 64(r16) C L1 bookkeeping
+ addq r6, r12, r22 C U0 hi add => answer
+
+ bis r31, r31, r31 C U1 mt
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ ldq r6, -16(r16) C L1
+
+ bis r31, r31, r31 C U1 mt
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ ldq r7, -8(r16) C L1
+
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ ldq r0, 0(r17) C L1
+
+ mulq r19, r2, r13 C U1
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ ldq r1, 8(r17) C L1
+
+ umulh r19, r2, r14 C U1
+ addq r4, r9, r4 C L0 lo + acc
+ stq r22, -48(r16) C L0
+ stq r23, -40(r16) C L1
+
+ bis r31, r31, r31 C L0 st slosh
+ mulq r19, r3, r15 C U1
+ bis r31, r31, r31 C L1 st slosh
+ addq r8, r21, r8 C U0 hi mul + carry
+
+ cmpult r4, r9, r20 C L0 lo add => carry
+ addq r4, r8, r22 C U0 hi add => answer
+ bis r31, r31, r31 C L1 mt
+ bgt r18, $Loop C U1 bookkeeping
+
+C ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
+$Lend:
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ ldq r4, 0(r16) C L1
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ ldq r5, 8(r16) C L1
+ umulh r19, r3, r8 C U1
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ mulq r19, r0, r9 C U1
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ umulh r19, r0, r10 C U1
+ addq r6, r13, r6 C L0 lo + acc
+ stq r22, -32(r16) C L0
+ stq r23, -24(r16) C L1
+ mulq r19, r1, r11 C U1
+ addq r12, r21, r12 C U0 hi mul + carry
+ cmpult r6, r13, r20 C L0 lo add => carry
+ addq r6, r12, r22 C U0 hi add => answer
+ cmpult r22, r12, r21 C L0 hi add => carry
+ addq r14, r20, r14 C U0 hi mul + carry
+ addq r7, r15, r23 C L0 lo + acc
+ addq r14, r21, r14 C U0 hi mul + carry
+ umulh r19, r1, r12 C U1
+ cmpult r23, r15, r20 C L0 lo add => carry
+ addq r23, r14, r23 C U0 hi add => answer
+ cmpult r23, r14, r21 C L0 hi add => carry
+ addq r8, r20, r8 C U0 hi mul + carry
+ addq r4, r9, r4 C U0 lo + acc
+ stq r22, -16(r16) C L0
+ stq r23, -8(r16) C L1
+ bis r31, r31, r31 C L0 st slosh
+ addq r8, r21, r8 C L0 hi mul + carry
+ cmpult r4, r9, r20 C L0 lo add => carry
+ addq r4, r8, r22 C U0 hi add => answer
+ cmpult r22, r8, r21 C L0 hi add => carry
+ addq r10, r20, r10 C U0 hi mul + carry
+ addq r5, r11, r23 C L0 lo + acc
+ addq r10, r21, r10 C L0 hi mul + carry
+ cmpult r23, r11, r20 C L0 lo add => carry
+ addq r23, r10, r23 C U0 hi add => answer
+ cmpult r23, r10, r21 C L0 hi add => carry
+ addq r12, r20, r12 C U0 hi mul + carry
+ stq r22, 0(r16) C L0
+ stq r23, 8(r16) C L1
+ addq r12, r21, r0 C U0 hi mul + carry
+
+ ldq $9, 8($30)
+ ldq $10, 16($30)
+ ldq $11, 24($30)
+ ldq $12, 32($30)
+ ldq $13, 40($30)
+ ldq $14, 48($30)
+ ldq $15, 56($30)
+ lda $30, 240($30)
+ ret r31, (r26), 1
+EPILOGUE(mpn_addmul_1)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/ev6/gmp-mparam.h b/rts/gmp/mpn/alpha/ev6/gmp-mparam.h
new file mode 100644
index 0000000000..7ea20577f8
--- /dev/null
+++ b/rts/gmp/mpn/alpha/ev6/gmp-mparam.h
@@ -0,0 +1,62 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* Generated by tuneup.c, 2000-08-02. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 47
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 70
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 94
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 101
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 33
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 70
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 29
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 46
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 33
+#endif
diff --git a/rts/gmp/mpn/alpha/gmp-mparam.h b/rts/gmp/mpn/alpha/gmp-mparam.h
new file mode 100644
index 0000000000..054ff2fe5f
--- /dev/null
+++ b/rts/gmp/mpn/alpha/gmp-mparam.h
@@ -0,0 +1,64 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values are for the 21164 family. The 21264 will require
+ different values, since it has such quick multiplication. */
+/* Generated by tuneup.c, 2000-07-19. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 22
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 53
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 31
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 47
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 64
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 98
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 17
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 4
+#endif
diff --git a/rts/gmp/mpn/alpha/invert_limb.asm b/rts/gmp/mpn/alpha/invert_limb.asm
new file mode 100644
index 0000000000..a921b32b3f
--- /dev/null
+++ b/rts/gmp/mpn/alpha/invert_limb.asm
@@ -0,0 +1,345 @@
+dnl Alpha mpn_invert_limb -- Invert a normalized limb.
+
+dnl Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+dnl
+dnl This is based on sophie:/gmp-stuff/dbg-inv-limb.c.
+dnl The ideas are due to Peter L. Montgomery
+dnl
+dnl The table below uses 4096 bytes. The file mentioned above has an
+dnl alternative function that doesn't require the table, but it runs 50%
+dnl slower than this.
+
+include(`../config.m4')
+
+ASM_START()
+
+FLOAT64($C36,9223372036854775808.0) C 2^63
+
+PROLOGUE_GP(mpn_invert_limb)
+ lda r30,-16(r30)
+ addq r16,r16,r1
+ bne r1,$73
+ lda r0,-1
+ br r31,$Lend
+$73:
+ srl r16,1,r1
+ stq r1,0(r30)
+ ldt f11,0(r30)
+ cvtqt f11,f1
+ lda r1,$C36
+ ldt f10,0(r1)
+ divt f10,f1,f10
+ lda r2,$invtab-4096
+ srl r16,52,r1
+ addq r1,r1,r1
+ addq r1,r2,r1
+ bic r1,6,r2
+ ldq r2,0(r2)
+ bic r1,1,r1
+ extwl r2,r1,r2
+ sll r2,48,r0
+ umulh r16,r0,r1
+ addq r16,r1,r3
+ stq r3,0(r30)
+ ldt f11,0(r30)
+ cvtqt f11,f1
+ mult f1,f10,f1
+ cvttqc f1,f1
+ stt f1,0(r30)
+ ldq r4,0(r30)
+ subq r0,r4,r0
+ umulh r16,r0,r1
+ mulq r16,r0,r2
+ addq r16,r1,r3
+ bge r3,$Loop2
+$Loop1: addq r2,r16,r2
+ cmpult r2,r16,r1
+ addq r3,r1,r3
+ addq r0,1,r0
+ blt r3,$Loop1
+$Loop2: cmpult r2,r16,r1
+ subq r0,1,r0
+ subq r3,r1,r3
+ subq r2,r16,r2
+ bge r3,$Loop2
+$Lend:
+ lda r30,16(r30)
+ ret r31,(r26),1
+EPILOGUE(mpn_invert_limb)
+DATASTART(`$invtab',4)
+ .word 0xffff,0xffc0,0xff80,0xff40,0xff00,0xfec0,0xfe81,0xfe41
+ .word 0xfe01,0xfdc2,0xfd83,0xfd43,0xfd04,0xfcc5,0xfc86,0xfc46
+ .word 0xfc07,0xfbc8,0xfb8a,0xfb4b,0xfb0c,0xfacd,0xfa8e,0xfa50
+ .word 0xfa11,0xf9d3,0xf994,0xf956,0xf918,0xf8d9,0xf89b,0xf85d
+ .word 0xf81f,0xf7e1,0xf7a3,0xf765,0xf727,0xf6ea,0xf6ac,0xf66e
+ .word 0xf631,0xf5f3,0xf5b6,0xf578,0xf53b,0xf4fd,0xf4c0,0xf483
+ .word 0xf446,0xf409,0xf3cc,0xf38f,0xf352,0xf315,0xf2d8,0xf29c
+ .word 0xf25f,0xf222,0xf1e6,0xf1a9,0xf16d,0xf130,0xf0f4,0xf0b8
+ .word 0xf07c,0xf03f,0xf003,0xefc7,0xef8b,0xef4f,0xef14,0xeed8
+ .word 0xee9c,0xee60,0xee25,0xede9,0xedae,0xed72,0xed37,0xecfb
+ .word 0xecc0,0xec85,0xec4a,0xec0e,0xebd3,0xeb98,0xeb5d,0xeb22
+ .word 0xeae8,0xeaad,0xea72,0xea37,0xe9fd,0xe9c2,0xe988,0xe94d
+ .word 0xe913,0xe8d8,0xe89e,0xe864,0xe829,0xe7ef,0xe7b5,0xe77b
+ .word 0xe741,0xe707,0xe6cd,0xe694,0xe65a,0xe620,0xe5e6,0xe5ad
+ .word 0xe573,0xe53a,0xe500,0xe4c7,0xe48d,0xe454,0xe41b,0xe3e2
+ .word 0xe3a9,0xe370,0xe336,0xe2fd,0xe2c5,0xe28c,0xe253,0xe21a
+ .word 0xe1e1,0xe1a9,0xe170,0xe138,0xe0ff,0xe0c7,0xe08e,0xe056
+ .word 0xe01e,0xdfe5,0xdfad,0xdf75,0xdf3d,0xdf05,0xdecd,0xde95
+ .word 0xde5d,0xde25,0xdded,0xddb6,0xdd7e,0xdd46,0xdd0f,0xdcd7
+ .word 0xdca0,0xdc68,0xdc31,0xdbf9,0xdbc2,0xdb8b,0xdb54,0xdb1d
+ .word 0xdae6,0xdaae,0xda78,0xda41,0xda0a,0xd9d3,0xd99c,0xd965
+ .word 0xd92f,0xd8f8,0xd8c1,0xd88b,0xd854,0xd81e,0xd7e8,0xd7b1
+ .word 0xd77b,0xd745,0xd70e,0xd6d8,0xd6a2,0xd66c,0xd636,0xd600
+ .word 0xd5ca,0xd594,0xd55f,0xd529,0xd4f3,0xd4bd,0xd488,0xd452
+ .word 0xd41d,0xd3e7,0xd3b2,0xd37c,0xd347,0xd312,0xd2dd,0xd2a7
+ .word 0xd272,0xd23d,0xd208,0xd1d3,0xd19e,0xd169,0xd134,0xd100
+ .word 0xd0cb,0xd096,0xd061,0xd02d,0xcff8,0xcfc4,0xcf8f,0xcf5b
+ .word 0xcf26,0xcef2,0xcebe,0xce89,0xce55,0xce21,0xcded,0xcdb9
+ .word 0xcd85,0xcd51,0xcd1d,0xcce9,0xccb5,0xcc81,0xcc4e,0xcc1a
+ .word 0xcbe6,0xcbb3,0xcb7f,0xcb4c,0xcb18,0xcae5,0xcab1,0xca7e
+ .word 0xca4b,0xca17,0xc9e4,0xc9b1,0xc97e,0xc94b,0xc918,0xc8e5
+ .word 0xc8b2,0xc87f,0xc84c,0xc819,0xc7e7,0xc7b4,0xc781,0xc74f
+ .word 0xc71c,0xc6e9,0xc6b7,0xc684,0xc652,0xc620,0xc5ed,0xc5bb
+ .word 0xc589,0xc557,0xc524,0xc4f2,0xc4c0,0xc48e,0xc45c,0xc42a
+ .word 0xc3f8,0xc3c7,0xc395,0xc363,0xc331,0xc300,0xc2ce,0xc29c
+ .word 0xc26b,0xc239,0xc208,0xc1d6,0xc1a5,0xc174,0xc142,0xc111
+ .word 0xc0e0,0xc0af,0xc07e,0xc04d,0xc01c,0xbfeb,0xbfba,0xbf89
+ .word 0xbf58,0xbf27,0xbef6,0xbec5,0xbe95,0xbe64,0xbe33,0xbe03
+ .word 0xbdd2,0xbda2,0xbd71,0xbd41,0xbd10,0xbce0,0xbcb0,0xbc80
+ .word 0xbc4f,0xbc1f,0xbbef,0xbbbf,0xbb8f,0xbb5f,0xbb2f,0xbaff
+ .word 0xbacf,0xba9f,0xba6f,0xba40,0xba10,0xb9e0,0xb9b1,0xb981
+ .word 0xb951,0xb922,0xb8f2,0xb8c3,0xb894,0xb864,0xb835,0xb806
+ .word 0xb7d6,0xb7a7,0xb778,0xb749,0xb71a,0xb6eb,0xb6bc,0xb68d
+ .word 0xb65e,0xb62f,0xb600,0xb5d1,0xb5a2,0xb574,0xb545,0xb516
+ .word 0xb4e8,0xb4b9,0xb48a,0xb45c,0xb42e,0xb3ff,0xb3d1,0xb3a2
+ .word 0xb374,0xb346,0xb318,0xb2e9,0xb2bb,0xb28d,0xb25f,0xb231
+ .word 0xb203,0xb1d5,0xb1a7,0xb179,0xb14b,0xb11d,0xb0f0,0xb0c2
+ .word 0xb094,0xb067,0xb039,0xb00b,0xafde,0xafb0,0xaf83,0xaf55
+ .word 0xaf28,0xaefb,0xaecd,0xaea0,0xae73,0xae45,0xae18,0xadeb
+ .word 0xadbe,0xad91,0xad64,0xad37,0xad0a,0xacdd,0xacb0,0xac83
+ .word 0xac57,0xac2a,0xabfd,0xabd0,0xaba4,0xab77,0xab4a,0xab1e
+ .word 0xaaf1,0xaac5,0xaa98,0xaa6c,0xaa40,0xaa13,0xa9e7,0xa9bb
+ .word 0xa98e,0xa962,0xa936,0xa90a,0xa8de,0xa8b2,0xa886,0xa85a
+ .word 0xa82e,0xa802,0xa7d6,0xa7aa,0xa77e,0xa753,0xa727,0xa6fb
+ .word 0xa6d0,0xa6a4,0xa678,0xa64d,0xa621,0xa5f6,0xa5ca,0xa59f
+ .word 0xa574,0xa548,0xa51d,0xa4f2,0xa4c6,0xa49b,0xa470,0xa445
+ .word 0xa41a,0xa3ef,0xa3c4,0xa399,0xa36e,0xa343,0xa318,0xa2ed
+ .word 0xa2c2,0xa297,0xa26d,0xa242,0xa217,0xa1ed,0xa1c2,0xa197
+ .word 0xa16d,0xa142,0xa118,0xa0ed,0xa0c3,0xa098,0xa06e,0xa044
+ .word 0xa01a,0x9fef,0x9fc5,0x9f9b,0x9f71,0x9f47,0x9f1c,0x9ef2
+ .word 0x9ec8,0x9e9e,0x9e74,0x9e4b,0x9e21,0x9df7,0x9dcd,0x9da3
+ .word 0x9d79,0x9d50,0x9d26,0x9cfc,0x9cd3,0x9ca9,0x9c80,0x9c56
+ .word 0x9c2d,0x9c03,0x9bda,0x9bb0,0x9b87,0x9b5e,0x9b34,0x9b0b
+ .word 0x9ae2,0x9ab9,0x9a8f,0x9a66,0x9a3d,0x9a14,0x99eb,0x99c2
+ .word 0x9999,0x9970,0x9947,0x991e,0x98f6,0x98cd,0x98a4,0x987b
+ .word 0x9852,0x982a,0x9801,0x97d8,0x97b0,0x9787,0x975f,0x9736
+ .word 0x970e,0x96e5,0x96bd,0x9695,0x966c,0x9644,0x961c,0x95f3
+ .word 0x95cb,0x95a3,0x957b,0x9553,0x952b,0x9503,0x94db,0x94b3
+ .word 0x948b,0x9463,0x943b,0x9413,0x93eb,0x93c3,0x939b,0x9374
+ .word 0x934c,0x9324,0x92fd,0x92d5,0x92ad,0x9286,0x925e,0x9237
+ .word 0x920f,0x91e8,0x91c0,0x9199,0x9172,0x914a,0x9123,0x90fc
+ .word 0x90d4,0x90ad,0x9086,0x905f,0x9038,0x9011,0x8fea,0x8fc3
+ .word 0x8f9c,0x8f75,0x8f4e,0x8f27,0x8f00,0x8ed9,0x8eb2,0x8e8b
+ .word 0x8e65,0x8e3e,0x8e17,0x8df1,0x8dca,0x8da3,0x8d7d,0x8d56
+ .word 0x8d30,0x8d09,0x8ce3,0x8cbc,0x8c96,0x8c6f,0x8c49,0x8c23
+ .word 0x8bfc,0x8bd6,0x8bb0,0x8b8a,0x8b64,0x8b3d,0x8b17,0x8af1
+ .word 0x8acb,0x8aa5,0x8a7f,0x8a59,0x8a33,0x8a0d,0x89e7,0x89c1
+ .word 0x899c,0x8976,0x8950,0x892a,0x8904,0x88df,0x88b9,0x8893
+ .word 0x886e,0x8848,0x8823,0x87fd,0x87d8,0x87b2,0x878d,0x8767
+ .word 0x8742,0x871d,0x86f7,0x86d2,0x86ad,0x8687,0x8662,0x863d
+ .word 0x8618,0x85f3,0x85ce,0x85a9,0x8583,0x855e,0x8539,0x8514
+ .word 0x84f0,0x84cb,0x84a6,0x8481,0x845c,0x8437,0x8412,0x83ee
+ .word 0x83c9,0x83a4,0x8380,0x835b,0x8336,0x8312,0x82ed,0x82c9
+ .word 0x82a4,0x8280,0x825b,0x8237,0x8212,0x81ee,0x81ca,0x81a5
+ .word 0x8181,0x815d,0x8138,0x8114,0x80f0,0x80cc,0x80a8,0x8084
+ .word 0x8060,0x803c,0x8018,0x7ff4,0x7fd0,0x7fac,0x7f88,0x7f64
+ .word 0x7f40,0x7f1c,0x7ef8,0x7ed4,0x7eb1,0x7e8d,0x7e69,0x7e45
+ .word 0x7e22,0x7dfe,0x7ddb,0x7db7,0x7d93,0x7d70,0x7d4c,0x7d29
+ .word 0x7d05,0x7ce2,0x7cbf,0x7c9b,0x7c78,0x7c55,0x7c31,0x7c0e
+ .word 0x7beb,0x7bc7,0x7ba4,0x7b81,0x7b5e,0x7b3b,0x7b18,0x7af5
+ .word 0x7ad2,0x7aaf,0x7a8c,0x7a69,0x7a46,0x7a23,0x7a00,0x79dd
+ .word 0x79ba,0x7997,0x7975,0x7952,0x792f,0x790c,0x78ea,0x78c7
+ .word 0x78a4,0x7882,0x785f,0x783c,0x781a,0x77f7,0x77d5,0x77b2
+ .word 0x7790,0x776e,0x774b,0x7729,0x7706,0x76e4,0x76c2,0x76a0
+ .word 0x767d,0x765b,0x7639,0x7617,0x75f5,0x75d2,0x75b0,0x758e
+ .word 0x756c,0x754a,0x7528,0x7506,0x74e4,0x74c2,0x74a0,0x747e
+ .word 0x745d,0x743b,0x7419,0x73f7,0x73d5,0x73b4,0x7392,0x7370
+ .word 0x734f,0x732d,0x730b,0x72ea,0x72c8,0x72a7,0x7285,0x7264
+ .word 0x7242,0x7221,0x71ff,0x71de,0x71bc,0x719b,0x717a,0x7158
+ .word 0x7137,0x7116,0x70f5,0x70d3,0x70b2,0x7091,0x7070,0x704f
+ .word 0x702e,0x700c,0x6feb,0x6fca,0x6fa9,0x6f88,0x6f67,0x6f46
+ .word 0x6f26,0x6f05,0x6ee4,0x6ec3,0x6ea2,0x6e81,0x6e60,0x6e40
+ .word 0x6e1f,0x6dfe,0x6dde,0x6dbd,0x6d9c,0x6d7c,0x6d5b,0x6d3a
+ .word 0x6d1a,0x6cf9,0x6cd9,0x6cb8,0x6c98,0x6c77,0x6c57,0x6c37
+ .word 0x6c16,0x6bf6,0x6bd6,0x6bb5,0x6b95,0x6b75,0x6b54,0x6b34
+ .word 0x6b14,0x6af4,0x6ad4,0x6ab4,0x6a94,0x6a73,0x6a53,0x6a33
+ .word 0x6a13,0x69f3,0x69d3,0x69b3,0x6993,0x6974,0x6954,0x6934
+ .word 0x6914,0x68f4,0x68d4,0x68b5,0x6895,0x6875,0x6855,0x6836
+ .word 0x6816,0x67f6,0x67d7,0x67b7,0x6798,0x6778,0x6758,0x6739
+ .word 0x6719,0x66fa,0x66db,0x66bb,0x669c,0x667c,0x665d,0x663e
+ .word 0x661e,0x65ff,0x65e0,0x65c0,0x65a1,0x6582,0x6563,0x6544
+ .word 0x6524,0x6505,0x64e6,0x64c7,0x64a8,0x6489,0x646a,0x644b
+ .word 0x642c,0x640d,0x63ee,0x63cf,0x63b0,0x6391,0x6373,0x6354
+ .word 0x6335,0x6316,0x62f7,0x62d9,0x62ba,0x629b,0x627c,0x625e
+ .word 0x623f,0x6221,0x6202,0x61e3,0x61c5,0x61a6,0x6188,0x6169
+ .word 0x614b,0x612c,0x610e,0x60ef,0x60d1,0x60b3,0x6094,0x6076
+ .word 0x6058,0x6039,0x601b,0x5ffd,0x5fdf,0x5fc0,0x5fa2,0x5f84
+ .word 0x5f66,0x5f48,0x5f2a,0x5f0b,0x5eed,0x5ecf,0x5eb1,0x5e93
+ .word 0x5e75,0x5e57,0x5e39,0x5e1b,0x5dfd,0x5de0,0x5dc2,0x5da4
+ .word 0x5d86,0x5d68,0x5d4a,0x5d2d,0x5d0f,0x5cf1,0x5cd3,0x5cb6
+ .word 0x5c98,0x5c7a,0x5c5d,0x5c3f,0x5c21,0x5c04,0x5be6,0x5bc9
+ .word 0x5bab,0x5b8e,0x5b70,0x5b53,0x5b35,0x5b18,0x5afb,0x5add
+ .word 0x5ac0,0x5aa2,0x5a85,0x5a68,0x5a4b,0x5a2d,0x5a10,0x59f3
+ .word 0x59d6,0x59b8,0x599b,0x597e,0x5961,0x5944,0x5927,0x590a
+ .word 0x58ed,0x58d0,0x58b3,0x5896,0x5879,0x585c,0x583f,0x5822
+ .word 0x5805,0x57e8,0x57cb,0x57ae,0x5791,0x5775,0x5758,0x573b
+ .word 0x571e,0x5702,0x56e5,0x56c8,0x56ac,0x568f,0x5672,0x5656
+ .word 0x5639,0x561c,0x5600,0x55e3,0x55c7,0x55aa,0x558e,0x5571
+ .word 0x5555,0x5538,0x551c,0x5500,0x54e3,0x54c7,0x54aa,0x548e
+ .word 0x5472,0x5456,0x5439,0x541d,0x5401,0x53e5,0x53c8,0x53ac
+ .word 0x5390,0x5374,0x5358,0x533c,0x5320,0x5304,0x52e8,0x52cb
+ .word 0x52af,0x5293,0x5277,0x525c,0x5240,0x5224,0x5208,0x51ec
+ .word 0x51d0,0x51b4,0x5198,0x517c,0x5161,0x5145,0x5129,0x510d
+ .word 0x50f2,0x50d6,0x50ba,0x509f,0x5083,0x5067,0x504c,0x5030
+ .word 0x5015,0x4ff9,0x4fdd,0x4fc2,0x4fa6,0x4f8b,0x4f6f,0x4f54
+ .word 0x4f38,0x4f1d,0x4f02,0x4ee6,0x4ecb,0x4eb0,0x4e94,0x4e79
+ .word 0x4e5e,0x4e42,0x4e27,0x4e0c,0x4df0,0x4dd5,0x4dba,0x4d9f
+ .word 0x4d84,0x4d69,0x4d4d,0x4d32,0x4d17,0x4cfc,0x4ce1,0x4cc6
+ .word 0x4cab,0x4c90,0x4c75,0x4c5a,0x4c3f,0x4c24,0x4c09,0x4bee
+ .word 0x4bd3,0x4bb9,0x4b9e,0x4b83,0x4b68,0x4b4d,0x4b32,0x4b18
+ .word 0x4afd,0x4ae2,0x4ac7,0x4aad,0x4a92,0x4a77,0x4a5d,0x4a42
+ .word 0x4a27,0x4a0d,0x49f2,0x49d8,0x49bd,0x49a3,0x4988,0x496e
+ .word 0x4953,0x4939,0x491e,0x4904,0x48e9,0x48cf,0x48b5,0x489a
+ .word 0x4880,0x4865,0x484b,0x4831,0x4817,0x47fc,0x47e2,0x47c8
+ .word 0x47ae,0x4793,0x4779,0x475f,0x4745,0x472b,0x4711,0x46f6
+ .word 0x46dc,0x46c2,0x46a8,0x468e,0x4674,0x465a,0x4640,0x4626
+ .word 0x460c,0x45f2,0x45d8,0x45be,0x45a5,0x458b,0x4571,0x4557
+ .word 0x453d,0x4523,0x4509,0x44f0,0x44d6,0x44bc,0x44a2,0x4489
+ .word 0x446f,0x4455,0x443c,0x4422,0x4408,0x43ef,0x43d5,0x43bc
+ .word 0x43a2,0x4388,0x436f,0x4355,0x433c,0x4322,0x4309,0x42ef
+ .word 0x42d6,0x42bc,0x42a3,0x428a,0x4270,0x4257,0x423d,0x4224
+ .word 0x420b,0x41f2,0x41d8,0x41bf,0x41a6,0x418c,0x4173,0x415a
+ .word 0x4141,0x4128,0x410e,0x40f5,0x40dc,0x40c3,0x40aa,0x4091
+ .word 0x4078,0x405f,0x4046,0x402d,0x4014,0x3ffb,0x3fe2,0x3fc9
+ .word 0x3fb0,0x3f97,0x3f7e,0x3f65,0x3f4c,0x3f33,0x3f1a,0x3f01
+ .word 0x3ee8,0x3ed0,0x3eb7,0x3e9e,0x3e85,0x3e6c,0x3e54,0x3e3b
+ .word 0x3e22,0x3e0a,0x3df1,0x3dd8,0x3dc0,0x3da7,0x3d8e,0x3d76
+ .word 0x3d5d,0x3d45,0x3d2c,0x3d13,0x3cfb,0x3ce2,0x3cca,0x3cb1
+ .word 0x3c99,0x3c80,0x3c68,0x3c50,0x3c37,0x3c1f,0x3c06,0x3bee
+ .word 0x3bd6,0x3bbd,0x3ba5,0x3b8d,0x3b74,0x3b5c,0x3b44,0x3b2b
+ .word 0x3b13,0x3afb,0x3ae3,0x3acb,0x3ab2,0x3a9a,0x3a82,0x3a6a
+ .word 0x3a52,0x3a3a,0x3a22,0x3a09,0x39f1,0x39d9,0x39c1,0x39a9
+ .word 0x3991,0x3979,0x3961,0x3949,0x3931,0x3919,0x3901,0x38ea
+ .word 0x38d2,0x38ba,0x38a2,0x388a,0x3872,0x385a,0x3843,0x382b
+ .word 0x3813,0x37fb,0x37e3,0x37cc,0x37b4,0x379c,0x3785,0x376d
+ .word 0x3755,0x373e,0x3726,0x370e,0x36f7,0x36df,0x36c8,0x36b0
+ .word 0x3698,0x3681,0x3669,0x3652,0x363a,0x3623,0x360b,0x35f4
+ .word 0x35dc,0x35c5,0x35ae,0x3596,0x357f,0x3567,0x3550,0x3539
+ .word 0x3521,0x350a,0x34f3,0x34db,0x34c4,0x34ad,0x3496,0x347e
+ .word 0x3467,0x3450,0x3439,0x3422,0x340a,0x33f3,0x33dc,0x33c5
+ .word 0x33ae,0x3397,0x3380,0x3368,0x3351,0x333a,0x3323,0x330c
+ .word 0x32f5,0x32de,0x32c7,0x32b0,0x3299,0x3282,0x326c,0x3255
+ .word 0x323e,0x3227,0x3210,0x31f9,0x31e2,0x31cb,0x31b5,0x319e
+ .word 0x3187,0x3170,0x3159,0x3143,0x312c,0x3115,0x30fe,0x30e8
+ .word 0x30d1,0x30ba,0x30a4,0x308d,0x3076,0x3060,0x3049,0x3033
+ .word 0x301c,0x3005,0x2fef,0x2fd8,0x2fc2,0x2fab,0x2f95,0x2f7e
+ .word 0x2f68,0x2f51,0x2f3b,0x2f24,0x2f0e,0x2ef8,0x2ee1,0x2ecb
+ .word 0x2eb4,0x2e9e,0x2e88,0x2e71,0x2e5b,0x2e45,0x2e2e,0x2e18
+ .word 0x2e02,0x2dec,0x2dd5,0x2dbf,0x2da9,0x2d93,0x2d7c,0x2d66
+ .word 0x2d50,0x2d3a,0x2d24,0x2d0e,0x2cf8,0x2ce1,0x2ccb,0x2cb5
+ .word 0x2c9f,0x2c89,0x2c73,0x2c5d,0x2c47,0x2c31,0x2c1b,0x2c05
+ .word 0x2bef,0x2bd9,0x2bc3,0x2bad,0x2b97,0x2b81,0x2b6c,0x2b56
+ .word 0x2b40,0x2b2a,0x2b14,0x2afe,0x2ae8,0x2ad3,0x2abd,0x2aa7
+ .word 0x2a91,0x2a7c,0x2a66,0x2a50,0x2a3a,0x2a25,0x2a0f,0x29f9
+ .word 0x29e4,0x29ce,0x29b8,0x29a3,0x298d,0x2977,0x2962,0x294c
+ .word 0x2937,0x2921,0x290c,0x28f6,0x28e0,0x28cb,0x28b5,0x28a0
+ .word 0x288b,0x2875,0x2860,0x284a,0x2835,0x281f,0x280a,0x27f5
+ .word 0x27df,0x27ca,0x27b4,0x279f,0x278a,0x2774,0x275f,0x274a
+ .word 0x2735,0x271f,0x270a,0x26f5,0x26e0,0x26ca,0x26b5,0x26a0
+ .word 0x268b,0x2676,0x2660,0x264b,0x2636,0x2621,0x260c,0x25f7
+ .word 0x25e2,0x25cd,0x25b8,0x25a2,0x258d,0x2578,0x2563,0x254e
+ .word 0x2539,0x2524,0x250f,0x24fa,0x24e5,0x24d1,0x24bc,0x24a7
+ .word 0x2492,0x247d,0x2468,0x2453,0x243e,0x2429,0x2415,0x2400
+ .word 0x23eb,0x23d6,0x23c1,0x23ad,0x2398,0x2383,0x236e,0x235a
+ .word 0x2345,0x2330,0x231c,0x2307,0x22f2,0x22dd,0x22c9,0x22b4
+ .word 0x22a0,0x228b,0x2276,0x2262,0x224d,0x2239,0x2224,0x2210
+ .word 0x21fb,0x21e6,0x21d2,0x21bd,0x21a9,0x2194,0x2180,0x216c
+ .word 0x2157,0x2143,0x212e,0x211a,0x2105,0x20f1,0x20dd,0x20c8
+ .word 0x20b4,0x20a0,0x208b,0x2077,0x2063,0x204e,0x203a,0x2026
+ .word 0x2012,0x1ffd,0x1fe9,0x1fd5,0x1fc1,0x1fac,0x1f98,0x1f84
+ .word 0x1f70,0x1f5c,0x1f47,0x1f33,0x1f1f,0x1f0b,0x1ef7,0x1ee3
+ .word 0x1ecf,0x1ebb,0x1ea7,0x1e93,0x1e7f,0x1e6a,0x1e56,0x1e42
+ .word 0x1e2e,0x1e1a,0x1e06,0x1df3,0x1ddf,0x1dcb,0x1db7,0x1da3
+ .word 0x1d8f,0x1d7b,0x1d67,0x1d53,0x1d3f,0x1d2b,0x1d18,0x1d04
+ .word 0x1cf0,0x1cdc,0x1cc8,0x1cb5,0x1ca1,0x1c8d,0x1c79,0x1c65
+ .word 0x1c52,0x1c3e,0x1c2a,0x1c17,0x1c03,0x1bef,0x1bdb,0x1bc8
+ .word 0x1bb4,0x1ba0,0x1b8d,0x1b79,0x1b66,0x1b52,0x1b3e,0x1b2b
+ .word 0x1b17,0x1b04,0x1af0,0x1add,0x1ac9,0x1ab6,0x1aa2,0x1a8f
+ .word 0x1a7b,0x1a68,0x1a54,0x1a41,0x1a2d,0x1a1a,0x1a06,0x19f3
+ .word 0x19e0,0x19cc,0x19b9,0x19a5,0x1992,0x197f,0x196b,0x1958
+ .word 0x1945,0x1931,0x191e,0x190b,0x18f8,0x18e4,0x18d1,0x18be
+ .word 0x18ab,0x1897,0x1884,0x1871,0x185e,0x184b,0x1837,0x1824
+ .word 0x1811,0x17fe,0x17eb,0x17d8,0x17c4,0x17b1,0x179e,0x178b
+ .word 0x1778,0x1765,0x1752,0x173f,0x172c,0x1719,0x1706,0x16f3
+ .word 0x16e0,0x16cd,0x16ba,0x16a7,0x1694,0x1681,0x166e,0x165b
+ .word 0x1648,0x1635,0x1623,0x1610,0x15fd,0x15ea,0x15d7,0x15c4
+ .word 0x15b1,0x159f,0x158c,0x1579,0x1566,0x1553,0x1541,0x152e
+ .word 0x151b,0x1508,0x14f6,0x14e3,0x14d0,0x14bd,0x14ab,0x1498
+ .word 0x1485,0x1473,0x1460,0x144d,0x143b,0x1428,0x1416,0x1403
+ .word 0x13f0,0x13de,0x13cb,0x13b9,0x13a6,0x1394,0x1381,0x136f
+ .word 0x135c,0x1349,0x1337,0x1325,0x1312,0x1300,0x12ed,0x12db
+ .word 0x12c8,0x12b6,0x12a3,0x1291,0x127f,0x126c,0x125a,0x1247
+ .word 0x1235,0x1223,0x1210,0x11fe,0x11ec,0x11d9,0x11c7,0x11b5
+ .word 0x11a3,0x1190,0x117e,0x116c,0x1159,0x1147,0x1135,0x1123
+ .word 0x1111,0x10fe,0x10ec,0x10da,0x10c8,0x10b6,0x10a4,0x1091
+ .word 0x107f,0x106d,0x105b,0x1049,0x1037,0x1025,0x1013,0x1001
+ .word 0x0fef,0x0fdc,0x0fca,0x0fb8,0x0fa6,0x0f94,0x0f82,0x0f70
+ .word 0x0f5e,0x0f4c,0x0f3a,0x0f28,0x0f17,0x0f05,0x0ef3,0x0ee1
+ .word 0x0ecf,0x0ebd,0x0eab,0x0e99,0x0e87,0x0e75,0x0e64,0x0e52
+ .word 0x0e40,0x0e2e,0x0e1c,0x0e0a,0x0df9,0x0de7,0x0dd5,0x0dc3
+ .word 0x0db2,0x0da0,0x0d8e,0x0d7c,0x0d6b,0x0d59,0x0d47,0x0d35
+ .word 0x0d24,0x0d12,0x0d00,0x0cef,0x0cdd,0x0ccb,0x0cba,0x0ca8
+ .word 0x0c97,0x0c85,0x0c73,0x0c62,0x0c50,0x0c3f,0x0c2d,0x0c1c
+ .word 0x0c0a,0x0bf8,0x0be7,0x0bd5,0x0bc4,0x0bb2,0x0ba1,0x0b8f
+ .word 0x0b7e,0x0b6c,0x0b5b,0x0b4a,0x0b38,0x0b27,0x0b15,0x0b04
+ .word 0x0af2,0x0ae1,0x0ad0,0x0abe,0x0aad,0x0a9c,0x0a8a,0x0a79
+ .word 0x0a68,0x0a56,0x0a45,0x0a34,0x0a22,0x0a11,0x0a00,0x09ee
+ .word 0x09dd,0x09cc,0x09bb,0x09a9,0x0998,0x0987,0x0976,0x0965
+ .word 0x0953,0x0942,0x0931,0x0920,0x090f,0x08fe,0x08ec,0x08db
+ .word 0x08ca,0x08b9,0x08a8,0x0897,0x0886,0x0875,0x0864,0x0853
+ .word 0x0842,0x0831,0x081f,0x080e,0x07fd,0x07ec,0x07db,0x07ca
+ .word 0x07b9,0x07a8,0x0798,0x0787,0x0776,0x0765,0x0754,0x0743
+ .word 0x0732,0x0721,0x0710,0x06ff,0x06ee,0x06dd,0x06cd,0x06bc
+ .word 0x06ab,0x069a,0x0689,0x0678,0x0668,0x0657,0x0646,0x0635
+ .word 0x0624,0x0614,0x0603,0x05f2,0x05e1,0x05d1,0x05c0,0x05af
+ .word 0x059e,0x058e,0x057d,0x056c,0x055c,0x054b,0x053a,0x052a
+ .word 0x0519,0x0508,0x04f8,0x04e7,0x04d6,0x04c6,0x04b5,0x04a5
+ .word 0x0494,0x0484,0x0473,0x0462,0x0452,0x0441,0x0431,0x0420
+ .word 0x0410,0x03ff,0x03ef,0x03de,0x03ce,0x03bd,0x03ad,0x039c
+ .word 0x038c,0x037b,0x036b,0x035b,0x034a,0x033a,0x0329,0x0319
+ .word 0x0309,0x02f8,0x02e8,0x02d7,0x02c7,0x02b7,0x02a6,0x0296
+ .word 0x0286,0x0275,0x0265,0x0255,0x0245,0x0234,0x0224,0x0214
+ .word 0x0204,0x01f3,0x01e3,0x01d3,0x01c3,0x01b2,0x01a2,0x0192
+ .word 0x0182,0x0172,0x0161,0x0151,0x0141,0x0131,0x0121,0x0111
+ .word 0x0101,0x00f0,0x00e0,0x00d0,0x00c0,0x00b0,0x00a0,0x0090
+ .word 0x0080,0x0070,0x0060,0x0050,0x0040,0x0030,0x0020,0x0010
+DATAEND()
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/lshift.asm b/rts/gmp/mpn/alpha/lshift.asm
new file mode 100644
index 0000000000..87c46f6fe7
--- /dev/null
+++ b/rts/gmp/mpn/alpha/lshift.asm
@@ -0,0 +1,104 @@
+dnl Alpha mpn_lshift -- Shift a number left.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
+dnl it would take 4 cycles/limb. It should be possible to get down to 3
+dnl cycles/limb since both ldq and stq can be paired with the other used
+dnl instructions. But there are many restrictions in the 21064 pipeline that
+dnl makes it hard, if not impossible, to get down to 3 cycles/limb:
+
+dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+dnl 2. Only aligned instruction pairs can be paired.
+dnl 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ s8addq r18,r17,r17 C make r17 point at end of s1
+ ldq r4,-8(r17) C load first limb
+ subq r17,8,r17
+ subq r31,r19,r7
+ s8addq r18,r16,r16 C make r16 point at end of RES
+ subq r18,1,r18
+ and r18,4-1,r20 C number of limbs in first loop
+ srl r4,r7,r0 C compute function result
+
+ beq r20,$L0
+ subq r18,r20,r18
+
+ ALIGN(8)
+$Loop0:
+ ldq r3,-8(r17)
+ subq r16,8,r16
+ subq r17,8,r17
+ subq r20,1,r20
+ sll r4,r19,r5
+ srl r3,r7,r6
+ bis r3,r3,r4
+ bis r5,r6,r8
+ stq r8,0(r16)
+ bne r20,$Loop0
+
+$L0: beq r18,$Lend
+
+ ALIGN(8)
+$Loop: ldq r3,-8(r17)
+ subq r16,32,r16
+ subq r18,4,r18
+ sll r4,r19,r5
+ srl r3,r7,r6
+
+ ldq r4,-16(r17)
+ sll r3,r19,r1
+ bis r5,r6,r8
+ stq r8,24(r16)
+ srl r4,r7,r2
+
+ ldq r3,-24(r17)
+ sll r4,r19,r5
+ bis r1,r2,r8
+ stq r8,16(r16)
+ srl r3,r7,r6
+
+ ldq r4,-32(r17)
+ sll r3,r19,r1
+ bis r5,r6,r8
+ stq r8,8(r16)
+ srl r4,r7,r2
+
+ subq r17,32,r17
+ bis r1,r2,r8
+ stq r8,0(r16)
+
+ bgt r18,$Loop
+
+$Lend: sll r4,r19,r8
+ stq r8,-8(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_lshift)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/mul_1.asm b/rts/gmp/mpn/alpha/mul_1.asm
new file mode 100644
index 0000000000..46b8df34f5
--- /dev/null
+++ b/rts/gmp/mpn/alpha/mul_1.asm
@@ -0,0 +1,71 @@
+dnl Alpha __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl the result in a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7
+dnl cycles/limb on EV6.
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ ldq r2,0(r17) C r2 = s1_limb
+ subq r18,1,r18 C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ bic r31,r31,r4 C clear cy_limb
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Lend1 C jump if size was == 1
+ ldq r2,8(r17) C r2 = s1_limb
+ subq r18,1,r18 C size--
+ stq r3,0(r16)
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ subq r18,1,r18 C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,16(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ stq r3,8(r16)
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ addq r16,8,r16 C res_ptr++
+ bne r18,$Loop
+
+$Lend2: mulq r2,r19,r3 C r3 = prod_low
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ stq r3,8(r16)
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+$Lend1: stq r3,0(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_mul_1)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/rshift.asm b/rts/gmp/mpn/alpha/rshift.asm
new file mode 100644
index 0000000000..aa25eda54e
--- /dev/null
+++ b/rts/gmp/mpn/alpha/rshift.asm
@@ -0,0 +1,102 @@
+dnl Alpha mpn_rshift -- Shift a number right.
+
+dnl Copyright (C) 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl cnt r19
+
+dnl This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
+dnl it would take 4 cycles/limb. It should be possible to get down to 3
+dnl cycles/limb since both ldq and stq can be paired with the other used
+dnl instructions. But there are many restrictions in the 21064 pipeline that
+dnl makes it hard, if not impossible, to get down to 3 cycles/limb:
+
+dnl 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
+dnl 2. Only aligned instruction pairs can be paired.
+dnl 3. The store buffer or silo might not be able to deal with the bandwidth.
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ldq r4,0(r17) C load first limb
+ addq r17,8,r17
+ subq r31,r19,r7
+ subq r18,1,r18
+ and r18,4-1,r20 C number of limbs in first loop
+ sll r4,r7,r0 C compute function result
+
+ beq r20,$L0
+ subq r18,r20,r18
+
+ ALIGN(8)
+$Loop0:
+ ldq r3,0(r17)
+ addq r16,8,r16
+ addq r17,8,r17
+ subq r20,1,r20
+ srl r4,r19,r5
+ sll r3,r7,r6
+ bis r3,r3,r4
+ bis r5,r6,r8
+ stq r8,-8(r16)
+ bne r20,$Loop0
+
+$L0: beq r18,$Lend
+
+ ALIGN(8)
+$Loop: ldq r3,0(r17)
+ addq r16,32,r16
+ subq r18,4,r18
+ srl r4,r19,r5
+ sll r3,r7,r6
+
+ ldq r4,8(r17)
+ srl r3,r19,r1
+ bis r5,r6,r8
+ stq r8,-32(r16)
+ sll r4,r7,r2
+
+ ldq r3,16(r17)
+ srl r4,r19,r5
+ bis r1,r2,r8
+ stq r8,-24(r16)
+ sll r3,r7,r6
+
+ ldq r4,24(r17)
+ srl r3,r19,r1
+ bis r5,r6,r8
+ stq r8,-16(r16)
+ sll r4,r7,r2
+
+ addq r17,32,r17
+ bis r1,r2,r8
+ stq r8,-8(r16)
+
+ bgt r18,$Loop
+
+$Lend: srl r4,r19,r8
+ stq r8,0(r16)
+ ret r31,(r26),1
+EPILOGUE(mpn_rshift)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/sub_n.asm b/rts/gmp/mpn/alpha/sub_n.asm
new file mode 100644
index 0000000000..718f657141
--- /dev/null
+++ b/rts/gmp/mpn/alpha/sub_n.asm
@@ -0,0 +1,114 @@
+dnl Alpha mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector.
+
+dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl s2_ptr r18
+dnl size r19
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ ldq r3,0(r17)
+ ldq r4,0(r18)
+
+ subq r19,1,r19
+ and r19,4-1,r2 C number of limbs in first loop
+ bis r31,r31,r0
+ beq r2,$L0 C if multiple of 4 limbs, skip first loop
+
+ subq r19,r2,r19
+
+$Loop0: subq r2,1,r2
+ ldq r5,8(r17)
+ addq r4,r0,r4
+ ldq r6,8(r18)
+ cmpult r4,r0,r1
+ subq r3,r4,r4
+ cmpult r3,r4,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+
+ addq r17,8,r17
+ addq r18,8,r18
+ bis r5,r5,r3
+ bis r6,r6,r4
+ addq r16,8,r16
+ bne r2,$Loop0
+
+$L0: beq r19,$Lend
+
+ ALIGN(8)
+$Loop: subq r19,4,r19
+
+ ldq r5,8(r17)
+ addq r4,r0,r4
+ ldq r6,8(r18)
+ cmpult r4,r0,r1
+ subq r3,r4,r4
+ cmpult r3,r4,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+
+ ldq r3,16(r17)
+ addq r6,r0,r6
+ ldq r4,16(r18)
+ cmpult r6,r0,r1
+ subq r5,r6,r6
+ cmpult r5,r6,r0
+ stq r6,8(r16)
+ bis r0,r1,r0
+
+ ldq r5,24(r17)
+ addq r4,r0,r4
+ ldq r6,24(r18)
+ cmpult r4,r0,r1
+ subq r3,r4,r4
+ cmpult r3,r4,r0
+ stq r4,16(r16)
+ bis r0,r1,r0
+
+ ldq r3,32(r17)
+ addq r6,r0,r6
+ ldq r4,32(r18)
+ cmpult r6,r0,r1
+ subq r5,r6,r6
+ cmpult r5,r6,r0
+ stq r6,24(r16)
+ bis r0,r1,r0
+
+ addq r17,32,r17
+ addq r18,32,r18
+ addq r16,32,r16
+ bne r19,$Loop
+
+$Lend: addq r4,r0,r4
+ cmpult r4,r0,r1
+ subq r3,r4,r4
+ cmpult r3,r4,r0
+ stq r4,0(r16)
+ bis r0,r1,r0
+ ret r31,(r26),1
+EPILOGUE(mpn_sub_n)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/submul_1.asm b/rts/gmp/mpn/alpha/submul_1.asm
new file mode 100644
index 0000000000..caec1a720b
--- /dev/null
+++ b/rts/gmp/mpn/alpha/submul_1.asm
@@ -0,0 +1,87 @@
+dnl Alpha __gmpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+dnl res_ptr r16
+dnl s1_ptr r17
+dnl size r18
+dnl s2_limb r19
+
+dnl This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and 7
+dnl cycles/limb on EV6.
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ umulh r2,r19,r0 C r0 = prod_high
+ beq r18,$Lend1 C jump if size was == 1
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ subq r18,1,r18 C size--
+ subq r5,r3,r3
+ cmpult r5,r3,r4
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ beq r18,$Lend2 C jump if size was == 2
+
+ ALIGN(8)
+$Loop: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ subq r18,1,r18 C size--
+ umulh r2,r19,r4 C r4 = cy_limb
+ ldq r2,0(r17) C r2 = s1_limb
+ addq r17,8,r17 C s1_ptr++
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ subq r5,r3,r3
+ cmpult r5,r3,r5
+ stq r3,0(r16)
+ addq r16,8,r16 C res_ptr++
+ addq r5,r0,r0 C combine carries
+ bne r18,$Loop
+
+$Lend2: mulq r2,r19,r3 C r3 = prod_low
+ ldq r5,0(r16) C r5 = *res_ptr
+ addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
+ umulh r2,r19,r4 C r4 = cy_limb
+ addq r3,r0,r3 C r3 = cy_limb + prod_low
+ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
+ subq r5,r3,r3
+ cmpult r5,r3,r5
+ stq r3,0(r16)
+ addq r5,r0,r0 C combine carries
+ addq r4,r0,r0 C cy_limb = prod_high + cy
+ ret r31,(r26),1
+$Lend1: subq r5,r3,r3
+ cmpult r5,r3,r5
+ stq r3,0(r16)
+ addq r0,r5,r0
+ ret r31,(r26),1
+EPILOGUE(mpn_submul_1)
+ASM_END()
diff --git a/rts/gmp/mpn/alpha/udiv_qrnnd.S b/rts/gmp/mpn/alpha/udiv_qrnnd.S
new file mode 100644
index 0000000000..53814bbcb0
--- /dev/null
+++ b/rts/gmp/mpn/alpha/udiv_qrnnd.S
@@ -0,0 +1,151 @@
+ # Alpha 21064 __udiv_qrnnd
+
+ # Copyright (C) 1992, 1994, 1995, 1997, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ .set noreorder
+ .set noat
+.text
+ .align 3
+ .globl __gmpn_udiv_qrnnd
+ .ent __gmpn_udiv_qrnnd
+__gmpn_udiv_qrnnd:
+ .frame $30,0,$26,0
+ .prologue 0
+#define cnt $2
+#define tmp $3
+#define rem_ptr $16
+#define n1 $17
+#define n0 $18
+#define d $19
+#define qb $20
+
+ ldiq cnt,16
+ blt d,.Largedivisor
+
+.Loop1: cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule d,n1,qb
+ subq n1,d,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule d,n1,qb
+ subq n1,d,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule d,n1,qb
+ subq n1,d,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule d,n1,qb
+ subq n1,d,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ subq cnt,1,cnt
+ bgt cnt,.Loop1
+ stq n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+.Largedivisor:
+ and n0,1,$4
+
+ srl n0,1,n0
+ sll n1,63,tmp
+ or tmp,n0,n0
+ srl n1,1,n1
+
+ and d,1,$6
+ srl d,1,$5
+ addq $5,$6,$5
+
+.Loop2: cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule $5,n1,qb
+ subq n1,$5,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule $5,n1,qb
+ subq n1,$5,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule $5,n1,qb
+ subq n1,$5,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ cmplt n0,0,tmp
+ addq n1,n1,n1
+ bis n1,tmp,n1
+ addq n0,n0,n0
+ cmpule $5,n1,qb
+ subq n1,$5,tmp
+ cmovne qb,tmp,n1
+ bis n0,qb,n0
+ subq cnt,1,cnt
+ bgt cnt,.Loop2
+
+ addq n1,n1,n1
+ addq $4,n1,n1
+ bne $6,.LOdd
+ stq n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+.LOdd:
+ /* q' in n0. r' in n1 */
+ addq n1,n0,n1
+ cmpult n1,n0,tmp # tmp := carry from addq
+ beq tmp,.LLp6
+ addq n0,1,n0
+ subq n1,d,n1
+.LLp6: cmpult n1,d,tmp
+ bne tmp,.LLp7
+ addq n0,1,n0
+ subq n1,d,n1
+.LLp7:
+ stq n1,0(rem_ptr)
+ bis $31,n0,$0
+ ret $31,($26),1
+
+ .end __gmpn_udiv_qrnnd
diff --git a/rts/gmp/mpn/alpha/umul.asm b/rts/gmp/mpn/alpha/umul.asm
new file mode 100644
index 0000000000..44428ed5f5
--- /dev/null
+++ b/rts/gmp/mpn/alpha/umul.asm
@@ -0,0 +1,39 @@
+dnl Currently unused.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+ .set noreorder
+ .set volatile
+ .set noat
+
+.text
+ .align 3
+ .globl __umul_ppmm
+ .ent __umul_ppmm
+__umul_ppmm:
+__umul_ppmm..ng:
+ .frame $30,0,$26,0
+ .prologue 0
+ mulq $17,$18,$1
+ umulh $17,$18,$0
+ stq $1,0($16)
+ ret $31,($26),1
+ .end __umul_ppmm
diff --git a/rts/gmp/mpn/alpha/unicos.m4 b/rts/gmp/mpn/alpha/unicos.m4
new file mode 100644
index 0000000000..7ff26c090c
--- /dev/null
+++ b/rts/gmp/mpn/alpha/unicos.m4
@@ -0,0 +1,63 @@
+divert(-1)
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+define(`ASM_START',
+ `.ident dummy')
+
+define(`X',`^X$1')
+define(`FLOAT64',
+ `dnl
+ .psect $1@crud,data
+$1: .t_floating $2
+ .endp')
+
+define(`PROLOGUE',
+ `dnl
+ .stack 192 ; What does this mean? Only Cray knows.
+ .psect $1@code,code,cache
+$1::')
+define(`PROLOGUE_GP', `PROLOGUE($1)')
+
+define(`EPILOGUE',
+ `dnl
+ .endp')
+
+define(`DATASTART',
+ `dnl
+ .psect $1@crud,data
+$1:')
+define(`DATAEND',
+ `dnl
+ .endp')
+
+define(`ASM_END',
+ `dnl
+ .end')
+
+define(`unop',`bis r31,r31,r31') ; Unicos assembler lacks unop
+define(`cvttqc',`cvttq/c')
+
+define(`ALIGN',`') ; Unicos assembler seems to align using garbage
+
+divert
+
diff --git a/rts/gmp/mpn/arm/add_n.S b/rts/gmp/mpn/arm/add_n.S
new file mode 100644
index 0000000000..fb3f8f703b
--- /dev/null
+++ b/rts/gmp/mpn/arm/add_n.S
@@ -0,0 +1,77 @@
+@ ARM mpn_add_n -- Add two limb vectors of the same length > 0 and store sum in
+@ a third limb vector.
+@ Contributed by Robert Harley.
+
+@ Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+#define s r0
+#define a r1
+#define b r2
+#define n r3
+
+#define sl r10
+#define fp r11
+#define ip r12
+#define sp r13
+#define lr r14
+#define pc r15
+
+.text
+ .align 0
+ .global __gmpn_add_n
+ .type __gmpn_add_n,%function
+__gmpn_add_n:
+ stmfd sp!, { r8, r9, lr }
+ movs n, n, lsr #1
+ bcc skip1
+ ldr ip, [a], #4
+ ldr lr, [b], #4
+ adds ip, ip, lr
+ str ip, [s], #4
+skip1:
+ tst n, #1
+ beq skip2
+ ldmia a!, { r8, r9 }
+ ldmia b!, { ip, lr }
+ adcs r8, r8, ip
+ adcs r9, r9, lr
+ stmia s!, { r8, r9 }
+skip2:
+ bics n, n, #1
+ beq return
+ stmfd sp!, { r4, r5, r6, r7 }
+add_n_loop:
+ ldmia a!, { r4, r5, r6, r7 }
+ ldmia b!, { r8, r9, ip, lr }
+ adcs r4, r4, r8
+ ldr r8, [s] /* Bring stuff into cache. */
+ adcs r5, r5, r9
+ adcs r6, r6, ip
+ adcs r7, r7, lr
+ stmia s!, { r4, r5, r6, r7 }
+ sub n, n, #2
+ teq n, #0
+ bne add_n_loop
+ ldmfd sp!, { r4, r5, r6, r7 }
+return:
+ adc r0, n, #0
+ ldmfd sp!, { r8, r9, pc }
+end:
+ .size __gmpn_add_n, end - __gmpn_add_n
diff --git a/rts/gmp/mpn/arm/addmul_1.S b/rts/gmp/mpn/arm/addmul_1.S
new file mode 100644
index 0000000000..396fff77a3
--- /dev/null
+++ b/rts/gmp/mpn/arm/addmul_1.S
@@ -0,0 +1,89 @@
+@ ARM mpn_addmul_1 -- Multiply a limb vector with a limb and add the result to a
+@ second limb vector.
+@ Contributed by Robert Harley.
+
+@ Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+#define p r0
+#define a r1
+#define n r2
+#define w r3
+
+#define z r11
+
+#define ip r12
+#define sp r13
+#define lr r14
+#define pc r15
+
+.text
+ .align 0
+ .global __gmpn_addmul_1
+ .type __gmpn_addmul_1,%function
+__gmpn_addmul_1:
+ stmfd sp!, { r8-r11, lr }
+ mov z, #0
+ mov ip, #0
+ movs n, n, lsr #1
+ bcc skip1
+ ldr lr, [a], #4
+ ldr r9, [p]
+ umlal r9, ip, w, lr
+ str r9, [p], #4
+skip1:
+ movs n, n, lsr #1
+ bcc skip2
+ ldmia p, { r9, r10 }
+ adds r8, ip, r9
+ adc r9, z, #0
+ ldmia a!, { ip, lr }
+ umlal r8, r9, w, ip
+ adds r9, r9, r10
+ adc ip, z, #0
+ umlal r9, ip, w, lr
+ stmia p!, { r8, r9 }
+skip2:
+ teq n, #0
+ beq return
+ stmfd sp!, { r4-r7 }
+addmul_loop:
+ ldmia p, { r5, r6, r7, r8 }
+ adds r4, ip, r5
+ adc r5, z, #0
+ ldmia a!, { r9, r10, ip, lr }
+ umlal r4, r5, w, r9
+ adds r5, r5, r6
+ adc r6, z, #0
+ umlal r5, r6, w, r10
+ adds r6, r6, r7
+ adc r7, z, #0
+ umlal r6, r7, w, ip
+ adds r7, r7, r8
+ adc ip, z, #0
+ umlal r7, ip, w, lr
+ subs n, n, #1
+ stmia p!, { r4, r5, r6, r7 }
+ bne addmul_loop
+ ldmfd sp!, { r4-r7 }
+return:
+ mov r0, ip
+ ldmfd sp!, { r8-r11, pc }
+end:
+ .size __gmpn_addmul_1, end - __gmpn_addmul_1
diff --git a/rts/gmp/mpn/arm/gmp-mparam.h b/rts/gmp/mpn/arm/gmp-mparam.h
new file mode 100644
index 0000000000..a35b0c7b66
--- /dev/null
+++ b/rts/gmp/mpn/arm/gmp-mparam.h
@@ -0,0 +1,34 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 21
+#endif
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 48
+#endif
diff --git a/rts/gmp/mpn/arm/mul_1.S b/rts/gmp/mpn/arm/mul_1.S
new file mode 100644
index 0000000000..bae526a0f0
--- /dev/null
+++ b/rts/gmp/mpn/arm/mul_1.S
@@ -0,0 +1,81 @@
+@ ARM mpn_mul_1 -- Multiply a limb vector with a limb and store the result
+@ in a second limb vector.
+@ Contributed by Robert Harley.
+
+@ Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+#define p r0
+#define a r1
+#define n r2
+#define w r3
+
+#define sl r10
+#define fp r11
+#define ip r12
+#define sp r13
+#define lr r14
+#define pc r15
+
+.text
+ .align 0
+ .global __gmpn_mul_1
+ .type __gmpn_mul_1,%function
+__gmpn_mul_1:
+ stmfd sp!, { r8, r9, lr }
+ ands ip, n, #1
+ beq skip1
+ ldr lr, [a], #4
+ umull r9, ip, w, lr
+ str r9, [p], #4
+skip1:
+ tst n, #2
+ beq skip2
+ mov r8, ip
+ ldmia a!, { ip, lr }
+ mov r9, #0
+ umlal r8, r9, w, ip
+ mov ip, #0
+ umlal r9, ip, w, lr
+ stmia p!, { r8, r9 }
+skip2:
+ bics n, n, #3
+ beq return
+ stmfd sp!, { r6, r7 }
+mul_1_loop:
+ mov r6, ip
+ ldmia a!, { r8, r9, ip, lr }
+ ldr r7, [p] /* Bring stuff into cache. */
+ mov r7, #0
+ umlal r6, r7, w, r8
+ mov r8, #0
+ umlal r7, r8, w, r9
+ mov r9, #0
+ umlal r8, r9, w, ip
+ mov ip, #0
+ umlal r9, ip, w, lr
+ subs n, n, #4
+ stmia p!, { r6, r7, r8, r9 }
+ bne mul_1_loop
+ ldmfd sp!, { r6, r7 }
+return:
+ mov r0, ip
+ ldmfd sp!, { r8, r9, pc }
+end:
+ .size __gmpn_mul_1, end - __gmpn_mul_1
diff --git a/rts/gmp/mpn/arm/sub_n.S b/rts/gmp/mpn/arm/sub_n.S
new file mode 100644
index 0000000000..856505fe21
--- /dev/null
+++ b/rts/gmp/mpn/arm/sub_n.S
@@ -0,0 +1,79 @@
+@ ARM mpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
+@ difference in a third limb vector.
+@ Contributed by Robert Harley.
+
+@ Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+#define d r0
+#define a r1
+#define b r2
+#define n r3
+
+#define sl r10
+#define fp r11
+#define ip r12
+#define sp r13
+#define lr r14
+#define pc r15
+
+.text
+ .align 0
+ .global __gmpn_sub_n
+ .type __gmpn_sub_n,%function
+__gmpn_sub_n:
+ stmfd sp!, { r8, r9, lr }
+ subs ip, ip, ip
+ tst n, #1
+ beq skip1
+ ldr ip, [a], #4
+ ldr lr, [b], #4
+ subs ip, ip, lr
+ str ip, [d], #4
+skip1:
+ tst n, #2
+ beq skip2
+ ldmia a!, { r8, r9 }
+ ldmia b!, { ip, lr }
+ sbcs r8, r8, ip
+ sbcs r9, r9, lr
+ stmia d!, { r8, r9 }
+skip2:
+ bics n, n, #3
+ beq return
+ stmfd sp!, { r4, r5, r6, r7 }
+sub_n_loop:
+ ldmia a!, { r4, r5, r6, r7 }
+ ldmia b!, { r8, r9, ip, lr }
+ sbcs r4, r4, r8
+ ldr r8, [d] /* Bring stuff into cache. */
+ sbcs r5, r5, r9
+ sbcs r6, r6, ip
+ sbcs r7, r7, lr
+ stmia d!, { r4, r5, r6, r7 }
+ sub n, n, #4
+ teq n, #0
+ bne sub_n_loop
+ ldmfd sp!, { r4, r5, r6, r7 }
+return:
+ sbc r0, r0, r0
+ and r0, r0, #1
+ ldmfd sp!, { r8, r9, pc }
+end:
+ .size __gmpn_sub_n, end - __gmpn_sub_n
diff --git a/rts/gmp/mpn/asm-defs.m4 b/rts/gmp/mpn/asm-defs.m4
new file mode 100644
index 0000000000..aa2024138b
--- /dev/null
+++ b/rts/gmp/mpn/asm-defs.m4
@@ -0,0 +1,1182 @@
+divert(-1)
+dnl
+dnl m4 macros for gmp assembly code, shared by all CPUs.
+dnl
+dnl These macros are designed for use with any m4 and have been used on
+dnl GNU, FreeBSD, OpenBSD and SysV.
+dnl
+dnl GNU m4 and OpenBSD 2.7 m4 will give filenames and line numbers in error
+dnl messages.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl Macros:
+dnl
+dnl Most new m4 specific macros have an "m4_" prefix to emphasise they're
+dnl m4 expansions. But new defining things like deflit() and defreg() are
+dnl named like the builtin define(), and forloop() is named following the
+dnl GNU m4 example on which it's based.
+dnl
+dnl GNU m4 with the -P option uses "m4_" as a prefix for builtins, but that
+dnl option isn't going to be used, so there's no conflict or confusion.
+dnl
+dnl
+dnl Comments in output:
+dnl
+dnl The m4 comment delimiters are left at # and \n, the normal assembler
+dnl commenting for most CPUs. m4 passes comment text through without
+dnl expanding macros in it, which is generally a good thing since it stops
+dnl unexpected expansions and possible resultant errors.
+dnl
+dnl But note that when a quoted string is being read, a # isn't special, so
+dnl apostrophes in comments in quoted strings must be avoided or they'll be
+dnl interpreted as a closing quote mark. But when the quoted text is
+dnl re-read # will still act like a normal comment, supressing macro
+dnl expansion.
+dnl
+dnl For example,
+dnl
+dnl # apostrophes in comments that're outside quotes are ok
+dnl # and using macro names like PROLOGUE is ok too
+dnl ...
+dnl ifdef(`PIC',`
+dnl # but apostrophes aren't ok inside quotes
+dnl # ^--wrong
+dnl ...
+dnl # though macro names like PROLOGUE are still ok
+dnl ...
+dnl ')
+dnl
+dnl If macro expansion in a comment is wanted, use `#' in the .asm (ie. a
+dnl quoted hash symbol), which will turn into # in the .s but get
+dnl expansions done on that line. This can make the .s more readable to
+dnl humans, but it won't make a blind bit of difference to the assembler.
+dnl
+dnl All the above applies, mutatis mutandis, when changecom() is used to
+dnl select @ ! ; or whatever other commenting.
+dnl
+dnl
+dnl Variations in m4 affecting gmp:
+dnl
+dnl $# - When a macro is called as "foo" with no brackets, BSD m4 sets $#
+dnl to 1, whereas GNU or SysV m4 set it to 0. In all cases though
+dnl "foo()" sets $# to 1. This is worked around in various places.
+dnl
+dnl len() - When "len()" is given an empty argument, BSD m4 evaluates to
+dnl nothing, whereas GNU, SysV, and the new OpenBSD, evaluate to 0.
+dnl See m4_length() below which works around this.
+dnl
+dnl translit() - GNU m4 accepts character ranges like A-Z, and the new
+dnl OpenBSD m4 does under option -g, but basic BSD and SysV don't.
+dnl
+dnl popdef() - in BSD and SysV m4 popdef() takes multiple arguments and
+dnl pops each, but GNU m4 only takes one argument.
+dnl
+dnl push back - BSD m4 has some limits on the amount of text that can be
+dnl pushed back. The limit is reasonably big and so long as macros
+dnl don't gratuitously duplicate big arguments it isn't a problem.
+dnl Normally an error message is given, but sometimes it just hangs.
+dnl
+dnl eval() &,|,^ - GNU and SysV m4 have bitwise operators &,|,^ available,
+dnl but BSD m4 doesn't (contrary to what the man page suggests) and
+dnl instead ^ is exponentiation.
+dnl
+dnl eval() ?: - The C ternary operator "?:" is available in BSD m4, but not
+dnl in SysV or GNU m4 (as of GNU m4 1.4 and betas of 1.5).
+dnl
+dnl eval() -2^31 - BSD m4 has a bug where an eval() resulting in -2^31
+dnl (ie. -2147483648) gives "-(". Using -2147483648 within an
+dnl expression is ok, it just can't be a final result. "-(" will of
+dnl course upset parsing, with all sorts of strange effects.
+dnl
+dnl eval() <<,>> - SysV m4 doesn't support shift operators in eval() (on
+dnl SunOS 5.7 /usr/xpg4/m4 has them but /usr/ccs/m4 doesn't). See
+dnl m4_lshift() and m4_rshift() below for workarounds.
+dnl
+dnl m4wrap() - in BSD m4, m4wrap() replaces any previous m4wrap() string,
+dnl in SysV m4 it appends to it, and in GNU m4 it prepends. See
+dnl m4wrap_prepend() below which brings uniformity to this.
+dnl
+dnl __file__,__line__ - GNU m4 and OpenBSD 2.7 m4 provide these, and
+dnl they're used here to make error messages more informative. GNU m4
+dnl gives an unhelpful "NONE 0" in an m4wrap(), but that's worked
+dnl around.
+dnl
+dnl __file__ quoting - OpenBSD m4, unlike GNU m4, doesn't quote the
+dnl filename in __file__, so care should be taken that no macro has
+dnl the same name as a file, or an unwanted expansion will occur when
+dnl printing an error or warning.
+dnl
+dnl OpenBSD 2.6 m4 - this m4 rejects decimal constants containing an 8 or 9
+dnl in eval(), making it pretty much unusable. This bug is confined
+dnl to version 2.6 (it's not in 2.5, and has been fixed in 2.7).
+dnl
+dnl SunOS /usr/bin/m4 - this m4 lacks a number of desired features,
+dnl including $# and $@, defn(), m4exit(), m4wrap(), pushdef(),
+dnl popdef(). /usr/5bin/m4 is a SysV style m4 which should always be
+dnl available, and "configure" will reject /usr/bin/m4 in favour of
+dnl /usr/5bin/m4 (if necessary).
+dnl
+dnl The sparc code actually has modest m4 requirements currently and
+dnl could manage with /usr/bin/m4, but there's no reason to put our
+dnl macros through contortions when /usr/5bin/m4 is available or GNU
+dnl m4 can be installed.
+
+
+ifdef(`__ASM_DEFS_M4_INCLUDED__',
+`m4_error(`asm-defs.m4 already included, dont include it twice
+')m4exit(1)')
+define(`__ASM_DEFS_M4_INCLUDED__')
+
+
+dnl Detect and give a message about the unsuitable OpenBSD 2.6 m4.
+
+ifelse(eval(89),89,,
+`errprint(
+`This m4 doesnt accept 8 and/or 9 in constants in eval(), making it unusable.
+This is probably OpenBSD 2.6 m4 (September 1999). Upgrade to OpenBSD 2.7,
+or get a bug fix from the CVS (expr.c rev 1.9), or get GNU m4. Dont forget
+to configure with M4=/wherever/m4 if you install one of these in a directory
+not in $PATH.
+')m4exit(1)')
+
+
+dnl Detect and give a message about the unsuitable SunOS /usr/bin/m4.
+dnl
+dnl Unfortunately this test doesn't work when m4 is run in the normal way
+dnl from mpn/Makefile with "m4 -DOPERATION_foo foo.asm", since the bad m4
+dnl takes "-" in "-D..." to mean read stdin, so it will look like it just
+dnl hangs. But running "m4 asm-defs.m4" to try it out will work.
+dnl
+dnl We'd like to abort immediately on finding a problem, but unfortunately
+dnl the bad m4 doesn't have an m4exit(), nor does an invalid eval() kill
+dnl it. Unexpanded $#'s in some m4_assert_numargs() later on will comment
+dnl out some closing parentheses and kill it with "m4: arg stack overflow".
+
+define(m4_dollarhash_works_test,``$#'')
+ifelse(m4_dollarhash_works_test(x),1,,
+`errprint(
+`This m4 doesnt support $# and cant be used for GMP asm processing.
+If this is on SunOS, ./configure should choose /usr/5bin/m4 if you have that
+or can get it, otherwise install GNU m4. Dont forget to configure with
+M4=/wherever/m4 if you install in a directory not in $PATH.
+')')
+undefine(`m4_dollarhash_works_test')
+
+
+dnl --------------------------------------------------------------------------
+dnl Basic error handling things.
+
+
+dnl Usage: m4_dollarhash_1_if_noparen_p
+dnl
+dnl Expand to 1 if a call "foo" gives $# set to 1 (as opposed to 0 like GNU
+dnl and SysV m4 give).
+
+define(m4_dollarhash_1_if_noparen_test,`$#')
+define(m4_dollarhash_1_if_noparen_p,
+eval(m4_dollarhash_1_if_noparen_test==1))
+undefine(`m4_dollarhash_1_if_noparen_test')
+
+
+dnl Usage: m4wrap_prepend(string)
+dnl
+dnl Prepend the given string to what will be expanded under m4wrap at the
+dnl end of input.
+dnl
+dnl This macro exists to work around variations in m4wrap() behaviour in
+dnl the various m4s (notes at the start of this file). Don't use m4wrap()
+dnl directly since it will interfere with this scheme.
+
+define(m4wrap_prepend,
+m4_assert_numargs(1)
+`define(`m4wrap_string',`$1'defn(`m4wrap_string'))')
+
+m4wrap(`m4wrap_string')
+define(m4wrap_string,`')
+
+
+dnl Usage: m4_file_and_line
+dnl
+dnl Expand to the current file and line number, if the GNU m4 extensions
+dnl __file__ and __line__ are available.
+dnl
+dnl In GNU m4 1.4 at the end of input when m4wrap text is expanded,
+dnl __file__ is NONE and __line__ is 0, which is not a helpful thing to
+dnl print. If m4_file_seen() has been called to note the last file seen,
+dnl then that file at a big line number is used, otherwise "end of input"
+dnl is used (although "end of input" won't parse as an error message).
+
+define(m4_file_and_line,
+`ifdef(`__file__',
+`ifelse(__file__`'__line__,`NONE0',
+`ifdef(`m4_file_seen_last',`m4_file_seen_last: 999999: ',`end of input: ')',
+`__file__: __line__: ')')')
+
+
+dnl Usage: m4_errprint_commas(arg,...)
+dnl
+dnl The same as errprint(), but commas are printed between arguments
+dnl instead of spaces.
+
+define(m4_errprint_commas,
+`errprint(`$1')dnl
+ifelse(eval($#>1),1,`errprint(`,')m4_errprint_commas(shift($@))')')
+
+
+dnl Usage: m4_error(args...)
+dnl m4_warning(args...)
+dnl
+dnl Print an error message, using m4_errprint_commas, prefixed with the
+dnl current filename and line number (if available). m4_error sets up to
+dnl give an error exit at the end of processing, m4_warning just prints.
+dnl These macros are the recommended way to print errors.
+dnl
+dnl The arguments here should be quoted in the usual way to prevent them
+dnl being expanded when the macro call is read. (m4_error takes care not
+dnl to do any further expansion.)
+dnl
+dnl For example,
+dnl
+dnl m4_error(`some error message
+dnl ')
+dnl
+dnl which prints
+dnl
+dnl foo.asm:123: some error message
+dnl
+dnl or if __file__ and __line__ aren't available
+dnl
+dnl some error message
+dnl
+dnl The "file:line:" format is a basic style, used by gcc and GNU m4, so
+dnl emacs and other editors will recognise it in their normal error message
+dnl parsing.
+
+define(m4_warning,
+`m4_errprint_commas(m4_file_and_line`'$@)')
+
+define(m4_error,
+`define(`m4_error_occurred',1)m4_warning($@)')
+
+define(`m4_error_occurred',0)
+
+dnl This m4wrap_prepend() is first, so it'll be executed last.
+m4wrap_prepend(
+`ifelse(m4_error_occurred,1,
+`m4_error(`Errors occurred during m4 processing
+')m4exit(1)')')
+
+
+dnl Usage: m4_assert_numargs(num)
+dnl
+dnl Put this unquoted on a line on its own at the start of a macro
+dnl definition to add some code to check that num many arguments get passed
+dnl to the macro. For example,
+dnl
+dnl define(foo,
+dnl m4_assert_numargs(2)
+dnl `something `$1' and `$2' blah blah')
+dnl
+dnl Then a call like foo(one,two,three) will provoke an error like
+dnl
+dnl file:10: foo expected 2 arguments, got 3 arguments
+dnl
+dnl Here are some calls and how many arguments they're interpreted as passing.
+dnl
+dnl foo(abc,def) 2
+dnl foo(xyz) 1
+dnl foo() 0
+dnl foo -1
+dnl
+dnl The -1 for no parentheses at all means a macro that's meant to be used
+dnl that way can be checked with m4_assert_numargs(-1). For example,
+dnl
+dnl define(SPECIAL_SUFFIX,
+dnl m4_assert_numargs(-1)
+dnl `ifdef(`FOO',`_foo',`_bar')')
+dnl
+dnl But as an alternative see also deflit() below where parenthesized
+dnl expressions following a macro are passed through to the output.
+dnl
+dnl Note that in BSD m4 there's no way to differentiate calls "foo" and
+dnl "foo()", so in BSD m4 the distinction between the two isn't enforced.
+dnl (In GNU and SysV m4 it can be checked, and is.)
+
+
+dnl m4_assert_numargs is able to check its own arguments by calling
+dnl assert_numargs_internal directly.
+dnl
+dnl m4_doublequote($`'0) expands to ``$0'', whereas ``$`'0'' would expand
+dnl to `$`'0' and do the wrong thing, and likewise for $1. The same is
+dnl done in other assert macros.
+dnl
+dnl $`#' leaves $# in the new macro being defined, and stops # being
+dnl interpreted as a comment character.
+dnl
+dnl `dnl ' means an explicit dnl isn't necessary when m4_assert_numargs is
+dnl used. The space means that if there is a dnl it'll still work.
+
+dnl Usage: m4_doublequote(x) expands to ``x''
+define(m4_doublequote,
+`m4_assert_numargs_internal(`$0',1,$#,len(`$1'))``$1''')
+
+define(m4_assert_numargs,
+`m4_assert_numargs_internal(`$0',1,$#,len(`$1'))dnl
+`m4_assert_numargs_internal'(m4_doublequote($`'0),$1,$`#',`len'(m4_doublequote($`'1)))`dnl '')
+
+dnl Called: m4_assert_numargs_internal(`macroname',wantargs,$#,len(`$1'))
+define(m4_assert_numargs_internal,
+`m4_assert_numargs_internal_check(`$1',`$2',m4_numargs_count(`$3',`$4'))')
+
+dnl Called: m4_assert_numargs_internal_check(`macroname',wantargs,gotargs)
+dnl
+dnl If m4_dollarhash_1_if_noparen_p (BSD m4) then gotargs can be 0 when it
+dnl should be -1. If wantargs is -1 but gotargs is 0 and the two can't be
+dnl distinguished then it's allowed to pass.
+dnl
+define(m4_assert_numargs_internal_check,
+`ifelse(eval($2 == $3
+ || ($2==-1 && $3==0 && m4_dollarhash_1_if_noparen_p)),0,
+`m4_error(`$1 expected 'm4_Narguments(`$2')`, got 'm4_Narguments(`$3')
+)')')
+
+dnl Called: m4_numargs_count($#,len(`$1'))
+dnl If $#==0 then -1 args, if $#==1 but len(`$1')==0 then 0 args, otherwise
+dnl $# args.
+define(m4_numargs_count,
+`ifelse($1,0, -1,
+`ifelse(eval($1==1 && $2-0==0),1, 0, $1)')')
+
+dnl Usage: m4_Narguments(N)
+dnl "$1 argument" or "$1 arguments" with the plural according to $1.
+define(m4_Narguments,
+`$1 argument`'ifelse(`$1',1,,s)')
+
+
+dnl --------------------------------------------------------------------------
+dnl Additional error checking things.
+
+
+dnl Usage: m4_file_seen()
+dnl
+dnl Record __file__ for the benefit of m4_file_and_line in m4wrap text.
+dnl The basic __file__ macro comes out quoted, like `foo.asm', and
+dnl m4_file_seen_last is defined like that too.
+dnl
+dnl This only needs to be used with something that could generate an error
+dnl message in m4wrap text. The x86 PROLOGUE is the only such at the
+dnl moment (at end of input its m4wrap checks for missing EPILOGUE). A few
+dnl include()s can easily trick this scheme, but you'd expect an EPILOGUE
+dnl in the same file as the PROLOGUE.
+
+define(m4_file_seen,
+m4_assert_numargs(0)
+`ifelse(__file__,`NONE',,
+`define(`m4_file_seen_last',m4_doublequote(__file__))')')
+
+
+dnl Usage: m4_assert_onearg()
+dnl
+dnl Put this, unquoted, at the start of a macro definition to add some code
+dnl to check that one argument is passed to the macro, but with that
+dnl argument allowed to be empty. For example,
+dnl
+dnl define(foo,
+dnl m4_assert_onearg()
+dnl `blah blah $1 blah blah')
+dnl
+dnl Calls "foo(xyz)" or "foo()" are accepted. A call "foo(xyz,abc)" fails.
+dnl A call "foo" fails too, but BSD m4 can't detect this case (GNU and SysV
+dnl m4 can).
+
+define(m4_assert_onearg,
+m4_assert_numargs(0)
+`m4_assert_onearg_internal'(m4_doublequote($`'0),$`#')`dnl ')
+
+dnl Called: m4_assert_onearg(`macroname',$#)
+define(m4_assert_onearg_internal,
+`ifelse($2,1,,
+`m4_error(`$1 expected 1 argument, got 'm4_Narguments(`$2')
+)')')
+
+
+dnl Usage: m4_assert_numargs_range(low,high)
+dnl
+dnl Put this, unquoted, at the start of a macro definition to add some code
+dnl to check that between low and high many arguments get passed to the
+dnl macro. For example,
+dnl
+dnl define(foo,
+dnl m4_assert_numargs_range(3,5)
+dnl `mandatory $1 $2 $3 optional $4 $5 end')
+dnl
+dnl See m4_assert_numargs() for more info.
+
+define(m4_assert_numargs_range,
+m4_assert_numargs(2)
+``m4_assert_numargs_range_internal'(m4_doublequote($`'0),$1,$2,$`#',`len'(m4_doublequote($`'1)))`dnl '')
+
+dnl Called: m4_assert_numargs_range_internal(`name',low,high,$#,len(`$1'))
+define(m4_assert_numargs_range_internal,
+m4_assert_numargs(5)
+`m4_assert_numargs_range_check(`$1',`$2',`$3',m4_numargs_count(`$4',`$5'))')
+
+dnl Called: m4_assert_numargs_range_check(`name',low,high,gotargs)
+dnl
+dnl If m4_dollarhash_1_if_noparen_p (BSD m4) then gotargs can be 0 when it
+dnl should be -1. To ensure a `high' of -1 works, a fudge is applied to
+dnl gotargs if it's 0 and the 0 and -1 cases can't be distinguished.
+dnl
+define(m4_assert_numargs_range_check,
+m4_assert_numargs(4)
+`ifelse(eval($2 <= $4 &&
+ ($4 - ($4==0 && m4_dollarhash_1_if_noparen_p) <= $3)),0,
+`m4_error(`$1 expected $2 to $3 arguments, got 'm4_Narguments(`$4')
+)')')
+
+
+dnl Usage: m4_assert_defined(symbol)
+dnl
+dnl Put this unquoted on a line of its own at the start of a macro
+dnl definition to add some code to check that the given symbol is defined
+dnl when the macro is used. For example,
+dnl
+dnl define(foo,
+dnl m4_assert_defined(`FOO_PREFIX')
+dnl `FOO_PREFIX whatever')
+dnl
+dnl This is a convenient way to check that the user or ./configure or
+dnl whatever has defined the things needed by a macro, as opposed to
+dnl silently generating garbage.
+
+define(m4_assert_defined,
+m4_assert_numargs(1)
+``m4_assert_defined_internal'(m4_doublequote($`'0),``$1'')`dnl '')
+
+dnl Called: m4_assert_defined_internal(`macroname',`define_required')
+define(m4_assert_defined_internal,
+m4_assert_numargs(2)
+`ifdef(`$2',,
+`m4_error(`$1 needs $2 defined
+')')')
+
+
+dnl Usage: m4_not_for_expansion(`SYMBOL')
+dnl define_not_for_expansion(`SYMBOL')
+dnl
+dnl m4_not_for_expansion turns SYMBOL, if defined, into something which
+dnl will give an error if expanded. For example,
+dnl
+dnl m4_not_for_expansion(`PIC')
+dnl
+dnl define_not_for_expansion is the same, but always makes a definition.
+dnl
+dnl These are for symbols that should be tested with ifdef(`FOO',...)
+dnl rather than be expanded as such. They guard against accidentally
+dnl omitting the quotes, as in ifdef(FOO,...). Note though that they only
+dnl catches this when FOO is defined, so be sure to test code both with and
+dnl without each definition.
+
+define(m4_not_for_expansion,
+m4_assert_numargs(1)
+`ifdef(`$1',`define_not_for_expansion(`$1')')')
+
+define(define_not_for_expansion,
+m4_assert_numargs(1)
+`ifelse(defn(`$1'),,,
+`m4_error(``$1' has a non-empty value, maybe it shouldnt be munged with m4_not_for_expansion()
+')')dnl
+define(`$1',`m4_not_for_expansion_internal(`$1')')')
+
+define(m4_not_for_expansion_internal,
+`m4_error(``$1' is not meant to be expanded, perhaps you mean `ifdef(`$1',...)'
+')')
+
+
+dnl --------------------------------------------------------------------------
+dnl Various generic m4 things.
+
+
+dnl Usage: m4_ifdef_anyof_p(`symbol',...)
+dnl
+dnl Expand to 1 if any of the symbols in the argument list are defined, or
+dnl to 0 if not.
+
+define(m4_ifdef_anyof_p,
+`ifelse(eval($#<=1 && m4_length(`$1')==0),1, 0,
+`ifdef(`$1', 1,
+`m4_ifdef_anyof_p(shift($@))')')')
+
+
+dnl Usage: m4_length(string)
+dnl
+dnl Determine the length of a string. This is the same as len(), but
+dnl always expands to a number, working around the BSD len() which
+dnl evaluates to nothing given an empty argument.
+
+define(m4_length,
+m4_assert_onearg()
+`eval(len(`$1')-0)')
+
+
+dnl Usage: m4_stringequal_p(x,y)
+dnl
+dnl Expand to 1 or 0 according as strings x and y are equal or not.
+
+define(m4_stringequal_p,
+`ifelse(`$1',`$2',1,0)')
+
+
+dnl Usage: m4_incr_or_decr(n,last)
+dnl
+dnl Do an incr(n) or decr(n), whichever is in the direction of "last".
+dnl Both n and last must be numbers of course.
+
+define(m4_incr_or_decr,
+m4_assert_numargs(2)
+`ifelse(eval($1<$2),1,incr($1),decr($1))')
+
+
+dnl Usage: forloop(i, first, last, statement)
+dnl
+dnl Based on GNU m4 examples/forloop.m4, but extended.
+dnl
+dnl statement is expanded repeatedly, with i successively defined as
+dnl
+dnl first, first+1, ..., last-1, last
+dnl
+dnl Or if first > last, then it's
+dnl
+dnl first, first-1, ..., last+1, last
+dnl
+dnl If first == last, then one expansion is done.
+dnl
+dnl A pushdef/popdef of i is done to preserve any previous definition (or
+dnl lack of definition). first and last are eval()ed and so can be
+dnl expressions.
+dnl
+dnl forloop_first is defined to 1 on the first iteration, 0 on the rest.
+dnl forloop_last is defined to 1 on the last iteration, 0 on the others.
+dnl Nested forloops are allowed, in which case forloop_first and
+dnl forloop_last apply to the innermost loop that's open.
+dnl
+dnl A simple example,
+dnl
+dnl forloop(i, 1, 2*2+1, `dnl
+dnl iteration number i ... ifelse(forloop_first,1,FIRST)
+dnl ')
+
+
+dnl "i" and "statement" are carefully quoted, but "first" and "last" are
+dnl just plain numbers once eval()ed.
+
+define(`forloop',
+m4_assert_numargs(4)
+`pushdef(`$1',eval(`$2'))dnl
+pushdef(`forloop_first',1)dnl
+pushdef(`forloop_last',0)dnl
+forloop_internal(`$1',eval(`$3'),`$4')`'dnl
+popdef(`forloop_first')dnl
+popdef(`forloop_last')dnl
+popdef(`$1')')
+
+dnl Called: forloop_internal(`var',last,statement)
+define(`forloop_internal',
+m4_assert_numargs(3)
+`ifelse($1,$2,
+`define(`forloop_last',1)$3',
+`$3`'dnl
+define(`forloop_first',0)dnl
+define(`$1',m4_incr_or_decr($1,$2))dnl
+forloop_internal(`$1',$2,`$3')')')
+
+
+dnl Usage: m4_toupper(x)
+dnl m4_tolower(x)
+dnl
+dnl Convert the argument string to upper or lower case, respectively.
+dnl Only one argument accepted.
+dnl
+dnl BSD m4 doesn't take ranges like a-z in translit(), so the full alphabet
+dnl is written out.
+
+define(m4_alphabet_lower, `abcdefghijklmnopqrstuvwxyz')
+define(m4_alphabet_upper, `ABCDEFGHIJKLMNOPQRSTUVWXYZ')
+
+define(m4_toupper,
+m4_assert_onearg()
+`translit(`$1', m4_alphabet_lower, m4_alphabet_upper)')
+
+define(m4_tolower,
+m4_assert_onearg()
+`translit(`$1', m4_alphabet_upper, m4_alphabet_lower)')
+
+
+dnl Usage: m4_empty_if_zero(x)
+dnl
+dnl Evaluate to x, or to nothing if x is 0. x is eval()ed and so can be an
+dnl expression.
+dnl
+dnl This is useful for x86 addressing mode displacements since forms like
+dnl (%ebx) are one byte shorter than 0(%ebx). A macro `foo' for use as
+dnl foo(%ebx) could be defined with the following so it'll be empty if the
+dnl expression comes out zero.
+dnl
+dnl deflit(`foo', `m4_empty_if_zero(a+b*4-c)')
+dnl
+dnl Naturally this shouldn't be done if, say, a computed jump depends on
+dnl the code being a particular size.
+
+define(m4_empty_if_zero,
+m4_assert_onearg()
+`ifelse(eval($1),0,,eval($1))')
+
+
+dnl Usage: m4_log2(x)
+dnl
+dnl Calculate a logarithm to base 2.
+dnl x must be an integral power of 2, between 2**0 and 2**30.
+dnl x is eval()ed, so it can be an expression.
+dnl An error results if x is invalid.
+dnl
+dnl 2**31 isn't supported, because an unsigned 2147483648 is out of range
+dnl of a 32-bit signed int. Also, the bug in BSD m4 where an eval()
+dnl resulting in 2147483648 (or -2147483648 as the case may be) gives `-('
+dnl means tests like eval(1<<31==(x)) would be necessary, but that then
+dnl gives an unattractive explosion of eval() error messages if x isn't
+dnl numeric.
+
+define(m4_log2,
+m4_assert_numargs(1)
+`m4_log2_internal(0,1,eval(`$1'))')
+
+dnl Called: m4_log2_internal(n,2**n,target)
+define(m4_log2_internal,
+m4_assert_numargs(3)
+`ifelse($2,$3,$1,
+`ifelse($1,30,
+`m4_error(`m4_log2() argument too big or not a power of two: $3
+')',
+`m4_log2_internal(incr($1),eval(2*$2),$3)')')')
+
+
+dnl Usage: m4_div2_towards_zero
+dnl
+dnl m4 division is probably whatever a C signed division is, and C doesn't
+dnl specify what rounding gets used on negatives, so this expression forces
+dnl a rounding towards zero.
+
+define(m4_div2_towards_zero,
+m4_assert_numargs(1)
+`eval((($1) + ((($1)<0) & ($1))) / 2)')
+
+
+dnl Usage: m4_lshift(n,count)
+dnl m4_rshift(n,count)
+dnl
+dnl Calculate n shifted left or right by count many bits. Both n and count
+dnl are eval()ed and so can be expressions.
+dnl
+dnl Negative counts are allowed and mean a shift in the opposite direction.
+dnl Negative n is allowed and right shifts will be arithmetic (meaning
+dnl divide by 2**count, rounding towards zero, also meaning the sign bit is
+dnl duplicated).
+dnl
+dnl Use these macros instead of << and >> in eval() since the basic ccs
+dnl SysV m4 doesn't have those operators.
+
+define(m4_rshift,
+m4_assert_numargs(2)
+`m4_lshift(`$1',-(`$2'))')
+
+define(m4_lshift,
+m4_assert_numargs(2)
+`m4_lshift_internal(eval(`$1'),eval(`$2'))')
+
+define(m4_lshift_internal,
+m4_assert_numargs(2)
+`ifelse(eval($2-0==0),1,$1,
+`ifelse(eval($2>0),1,
+`m4_lshift_internal(eval($1*2),decr($2))',
+`m4_lshift_internal(m4_div2_towards_zero($1),incr($2))')')')
+
+
+dnl Usage: deflit(name,value)
+dnl
+dnl Like define(), but "name" expands like a literal, rather than taking
+dnl arguments. For example "name(%eax)" expands to "value(%eax)".
+dnl
+dnl Limitations:
+dnl
+dnl $ characters in the value part must have quotes to stop them looking
+dnl like macro parameters. For example, deflit(reg,`123+$`'4+567'). See
+dnl defreg() below for handling simple register definitions like $7 etc.
+dnl
+dnl "name()" is turned into "name", unfortunately. In GNU and SysV m4 an
+dnl error is generated when this happens, but in BSD m4 it will happen
+dnl silently. The problem is that in BSD m4 $# is 1 in both "name" or
+dnl "name()", so there's no way to differentiate them. Because we want
+dnl plain "name" to turn into plain "value", we end up with "name()"
+dnl turning into plain "value" too.
+dnl
+dnl "name(foo)" will lose any whitespace after commas in "foo", for example
+dnl "disp(%eax, %ecx)" would become "128(%eax,%ecx)".
+dnl
+dnl These parentheses oddities shouldn't matter in assembler text, but if
+dnl they do the suggested workaround is to write "name ()" or "name (foo)"
+dnl to stop the parentheses looking like a macro argument list. If a space
+dnl isn't acceptable in the output, then write "name`'()" or "name`'(foo)".
+dnl The `' is stripped when read, but again stops the parentheses looking
+dnl like parameters.
+
+dnl Quoting for deflit_emptyargcheck is similar to m4_assert_numargs. The
+dnl stuff in the ifelse gives a $#, $1 and $@ evaluated in the new macro
+dnl created, not in deflit.
+define(deflit,
+m4_assert_numargs(2)
+`define(`$1',
+`deflit_emptyargcheck'(``$1'',$`#',m4_doublequote($`'1))`dnl
+$2`'dnl
+ifelse(eval($'`#>1 || m4_length('m4_doublequote($`'1)`)!=0),1,($'`@))')')
+
+dnl Called: deflit_emptyargcheck(macroname,$#,`$1')
+define(deflit_emptyargcheck,
+`ifelse(eval($2==1 && !m4_dollarhash_1_if_noparen_p && m4_length(`$3')==0),1,
+`m4_error(`dont use a deflit as $1() because it loses the brackets (see deflit in asm-incl.m4 for more information)
+')')')
+
+
+dnl Usage: m4_assert(`expr')
+dnl
+dnl Test a compile-time requirement with an m4 expression. The expression
+dnl should be quoted, and will be eval()ed and expected to be non-zero.
+dnl For example,
+dnl
+dnl m4_assert(`FOO*2+6 < 14')
+
+define(m4_assert,
+m4_assert_numargs(1)
+`ifelse(eval($1),1,,
+`m4_error(`assertion failed: $1
+')')')
+
+
+dnl --------------------------------------------------------------------------
+dnl Various assembler things, not specific to any particular CPU.
+dnl
+
+
+dnl Usage: include_mpn(`filename')
+dnl
+dnl Like include(), but adds a path to the mpn source directory. For
+dnl example,
+dnl
+dnl include_mpn(`sparc64/addmul_1h.asm')
+
+define(include_mpn,
+m4_assert_numargs(1)
+m4_assert_defined(`CONFIG_TOP_SRCDIR')
+`include(CONFIG_TOP_SRCDIR`/mpn/$1')')
+
+
+dnl Usage: C comment ...
+dnl
+dnl "C" works like a FORTRAN-style comment character. This can be used for
+dnl comments to the right of assembly instructions, where just dnl would
+dnl remove the linefeed, and concatenate adjacent lines.
+dnl
+dnl "C" and/or "dnl" are useful when an assembler doesn't support comments,
+dnl or where different assemblers for a particular CPU have different
+dnl comment styles. The intermediate ".s" files will end up with no
+dnl comments, just code.
+dnl
+dnl Using "C" is not intended to cause offence to anyone who doesn't like
+dnl FORTRAN; but if that happens it's an unexpected bonus.
+
+define(C, `
+dnl')
+
+
+dnl Various possible defines passed from the Makefile that are to be tested
+dnl with ifdef() rather than be expanded.
+
+m4_not_for_expansion(`PIC')
+
+dnl aors_n
+m4_not_for_expansion(`OPERATION_add_n')
+m4_not_for_expansion(`OPERATION_sub_n')
+
+dnl aorsmul_n
+m4_not_for_expansion(`OPERATION_addmul_1')
+m4_not_for_expansion(`OPERATION_submul_1')
+
+dnl logops_n
+m4_not_for_expansion(`OPERATION_and_n')
+m4_not_for_expansion(`OPERATION_andn_n')
+m4_not_for_expansion(`OPERATION_nand_n')
+m4_not_for_expansion(`OPERATION_ior_n')
+m4_not_for_expansion(`OPERATION_iorn_n')
+m4_not_for_expansion(`OPERATION_nior_n')
+m4_not_for_expansion(`OPERATION_xor_n')
+m4_not_for_expansion(`OPERATION_xnor_n')
+
+dnl popham
+m4_not_for_expansion(`OPERATION_popcount')
+m4_not_for_expansion(`OPERATION_hamdist')
+
+
+dnl Usage: m4_config_gmp_mparam(`symbol')
+dnl
+dnl Check that `symbol' is defined. If it isn't, issue an error and
+dnl terminate immediately. The error message explains that the symbol
+dnl should be in config.m4, copied from gmp-mparam.h.
+dnl
+dnl Processing is terminated immediately since missing something like
+dnl KARATSUBA_SQR_THRESHOLD can lead to infinite loops with endless error
+dnl messages.
+
+define(m4_config_gmp_mparam,
+m4_assert_numargs(1)
+`ifdef(`$1',,
+`m4_error(`$1 is not defined.
+ "configure" should have extracted this from gmp-mparam.h and put it
+ in config.m4, but somehow this has failed.
+')m4exit(1)')')
+
+
+dnl Usage: defreg(name,reg)
+dnl
+dnl Give a name to a $ style register. For example,
+dnl
+dnl defreg(foo,$12)
+dnl
+dnl defreg() inserts an extra pair of quotes after the $ so that it's not
+dnl interpreted as an m4 macro parameter, ie. foo is actually $`'12. m4
+dnl strips those quotes when foo is expanded.
+dnl
+dnl deflit() is used to make the new definition, so it will expand
+dnl literally even if followed by parentheses ie. foo(99) will become
+dnl $12(99). (But there's nowhere that would be used is there?)
+dnl
+dnl When making further definitions from existing defreg() macros, remember
+dnl to use defreg() again to protect the $ in the new definitions too. For
+dnl example,
+dnl
+dnl defreg(a0,$4)
+dnl defreg(a1,$5)
+dnl ...
+dnl
+dnl defreg(PARAM_DST,a0)
+dnl
+dnl This is only because a0 is expanding at the time the PARAM_DST
+dnl definition is made, leaving a literal $4 that must be re-quoted. On
+dnl the other hand in something like the following ra is only expanded when
+dnl ret is used and its $`'31 protection will have its desired effect at
+dnl that time.
+dnl
+dnl defreg(ra,$31)
+dnl ...
+dnl define(ret,`j ra')
+dnl
+dnl Note that only $n forms are meant to be used here, and something like
+dnl 128($30) doesn't get protected and will come out wrong.
+
+define(defreg,
+m4_assert_numargs(2)
+`deflit(`$1',
+substr(`$2',0,1)``''substr(`$2',1))')
+
+
+dnl Usage: m4_instruction_wrapper(num)
+dnl
+dnl Put this, unquoted, on a line on its own, at the start of a macro
+dnl that's a wrapper around an assembler instruction. It adds code to give
+dnl a descriptive error message if the macro is invoked without arguments.
+dnl
+dnl For example, suppose jmp needs to be wrapped,
+dnl
+dnl define(jmp,
+dnl m4_instruction_wrapper()
+dnl m4_assert_numargs(1)
+dnl `.byte 0x42
+dnl .long $1
+dnl nop')
+dnl
+dnl The point of m4_instruction_wrapper is to get a better error message
+dnl than m4_assert_numargs would give if jmp is accidentally used as plain
+dnl "jmp foo" instead of the intended "jmp( foo)". "jmp()" with no
+dnl argument also provokes the error message.
+dnl
+dnl m4_instruction_wrapper should only be used with wrapped instructions
+dnl that take arguments, since obviously something meant to be used as
+dnl plain "ret", say, doesn't want to give an error when used that way.
+
+define(m4_instruction_wrapper,
+m4_assert_numargs(0)
+``m4_instruction_wrapper_internal'(m4_doublequote($`'0),dnl
+m4_doublequote(ifdef(`__file__',__file__,`the m4 sources')),dnl
+$`#',m4_doublequote($`'1))`dnl'')
+
+dnl Called: m4_instruction_wrapper_internal($0,`filename',$#,$1)
+define(m4_instruction_wrapper_internal,
+`ifelse(eval($3<=1 && m4_length(`$4')==0),1,
+`m4_error(`$1 is a macro replacing that instruction and needs arguments, see $2 for details
+')')')
+
+
+dnl Usage: UNROLL_LOG2, UNROLL_MASK, UNROLL_BYTES
+dnl CHUNK_LOG2, CHUNK_MASK, CHUNK_BYTES
+dnl
+dnl When code supports a variable amount of loop unrolling, the convention
+dnl is to define UNROLL_COUNT to the number of limbs processed per loop.
+dnl When testing code this can be varied to see how much the loop overhead
+dnl is costing. For example,
+dnl
+dnl deflit(UNROLL_COUNT, 32)
+dnl
+dnl If the forloop() generating the unrolled loop has a pattern processing
+dnl more than one limb, the convention is to express this with CHUNK_COUNT.
+dnl For example,
+dnl
+dnl deflit(CHUNK_COUNT, 2)
+dnl
+dnl The LOG2, MASK and BYTES definitions below are derived from these COUNT
+dnl definitions. If COUNT is redefined, the LOG2, MASK and BYTES follow
+dnl the new definition automatically.
+dnl
+dnl LOG2 is the log base 2 of COUNT. MASK is COUNT-1, which can be used as
+dnl a bit mask. BYTES is BYTES_PER_MP_LIMB*COUNT, the number of bytes
+dnl processed in each unrolled loop.
+dnl
+dnl BYTES_PER_MP_LIMB is defined in a CPU specific m4 include file. It
+dnl exists only so the BYTES definitions here can be common to all CPUs.
+dnl In the actual code for a given CPU, an explicit 4 or 8 may as well be
+dnl used because the code is only for a particular CPU, it doesn't need to
+dnl be general.
+dnl
+dnl Note that none of these macros do anything except give conventional
+dnl names to commonly used things. You still have to write your own
+dnl expressions for a forloop() and the resulting address displacements.
+dnl Something like the following would be typical for 4 bytes per limb.
+dnl
+dnl forloop(`i',0,UNROLL_COUNT-1,`
+dnl deflit(`disp',eval(i*4))
+dnl ...
+dnl ')
+dnl
+dnl Or when using CHUNK_COUNT,
+dnl
+dnl forloop(`i',0,UNROLL_COUNT/CHUNK_COUNT-1,`
+dnl deflit(`disp0',eval(i*CHUNK_COUNT*4))
+dnl deflit(`disp1',eval(disp0+4))
+dnl ...
+dnl ')
+dnl
+dnl Clearly `i' can be run starting from 1, or from high to low or whatever
+dnl best suits.
+
+deflit(UNROLL_LOG2,
+m4_assert_defined(`UNROLL_COUNT')
+`m4_log2(UNROLL_COUNT)')
+
+deflit(UNROLL_MASK,
+m4_assert_defined(`UNROLL_COUNT')
+`eval(UNROLL_COUNT-1)')
+
+deflit(UNROLL_BYTES,
+m4_assert_defined(`UNROLL_COUNT')
+m4_assert_defined(`BYTES_PER_MP_LIMB')
+`eval(UNROLL_COUNT * BYTES_PER_MP_LIMB)')
+
+deflit(CHUNK_LOG2,
+m4_assert_defined(`CHUNK_COUNT')
+`m4_log2(CHUNK_COUNT)')
+
+deflit(CHUNK_MASK,
+m4_assert_defined(`CHUNK_COUNT')
+`eval(CHUNK_COUNT-1)')
+
+deflit(CHUNK_BYTES,
+m4_assert_defined(`CHUNK_COUNT')
+m4_assert_defined(`BYTES_PER_MP_LIMB')
+`eval(CHUNK_COUNT * BYTES_PER_MP_LIMB)')
+
+
+dnl Usage: MPN(name)
+dnl
+dnl Add MPN_PREFIX to a name.
+dnl MPN_PREFIX defaults to "__gmpn_" if not defined.
+
+ifdef(`MPN_PREFIX',,
+`define(`MPN_PREFIX',`__gmpn_')')
+
+define(MPN,
+m4_assert_numargs(1)
+`MPN_PREFIX`'$1')
+
+
+dnl Usage: mpn_add_n, etc
+dnl
+dnl Convenience definitions using MPN(), like the #defines in gmp.h. Each
+dnl function that might be implemented in assembler is here.
+
+define(define_mpn,
+m4_assert_numargs(1)
+`define(`mpn_$1',`MPN(`$1')')')
+
+define_mpn(add)
+define_mpn(add_1)
+define_mpn(add_n)
+define_mpn(add_nc)
+define_mpn(addmul_1)
+define_mpn(addmul_1c)
+define_mpn(addsub_n)
+define_mpn(addsub_nc)
+define_mpn(and_n)
+define_mpn(andn_n)
+define_mpn(bdivmod)
+define_mpn(cmp)
+define_mpn(com_n)
+define_mpn(copyd)
+define_mpn(copyi)
+define_mpn(divexact_by3c)
+define_mpn(divrem)
+define_mpn(divrem_1)
+define_mpn(divrem_1c)
+define_mpn(divrem_2)
+define_mpn(divrem_classic)
+define_mpn(divrem_newton)
+define_mpn(dump)
+define_mpn(gcd)
+define_mpn(gcd_1)
+define_mpn(gcdext)
+define_mpn(get_str)
+define_mpn(hamdist)
+define_mpn(invert_limb)
+define_mpn(ior_n)
+define_mpn(iorn_n)
+define_mpn(kara_mul_n)
+define_mpn(kara_sqr_n)
+define_mpn(lshift)
+define_mpn(lshiftc)
+define_mpn(mod_1)
+define_mpn(mod_1c)
+define_mpn(mul)
+define_mpn(mul_1)
+define_mpn(mul_1c)
+define_mpn(mul_basecase)
+define_mpn(mul_n)
+define_mpn(perfect_square_p)
+define_mpn(popcount)
+define_mpn(preinv_mod_1)
+define_mpn(nand_n)
+define_mpn(nior_n)
+define_mpn(random)
+define_mpn(random2)
+define_mpn(rshift)
+define_mpn(rshiftc)
+define_mpn(scan0)
+define_mpn(scan1)
+define_mpn(set_str)
+define_mpn(sqr_basecase)
+define_mpn(sub_n)
+define_mpn(sqrtrem)
+define_mpn(sub)
+define_mpn(sub_1)
+define_mpn(sub_n)
+define_mpn(sub_nc)
+define_mpn(submul_1)
+define_mpn(submul_1c)
+define_mpn(toom3_mul_n)
+define_mpn(toom3_sqr_n)
+define_mpn(umul_ppmm)
+define_mpn(udiv_qrnnd)
+define_mpn(xnor_n)
+define_mpn(xor_n)
+
+dnl Usage: ASM_START
+dnl
+dnl Emitted once at the top of each generated .s file. Empty for this
+dnl CPU family -- no file-level directives are needed.
+
+define(`ASM_START',
+	`')
+
+dnl Usage: PROLOGUE(name) ... EPILOGUE(name)
+dnl
+dnl Bracket an assembler function. PROLOGUE switches to the text section,
+dnl aligns, exports the GSYM_PREFIX-decorated symbol, marks it as a
+dnl function, and emits its label. EPILOGUE emits the matching SIZE
+dnl directive (symbol size = here minus the label).
+
+define(`PROLOGUE',
+	`
+	TEXT
+	ALIGN(4)
+	GLOBL	GSYM_PREFIX`$1'
+	TYPE(GSYM_PREFIX`$1',`function')
+GSYM_PREFIX`$1':')
+
+define(`EPILOGUE',
+	`
+	SIZE(GSYM_PREFIX`$1',.-GSYM_PREFIX`$1')')
+
+dnl Usage: L(name)
+dnl
+dnl Emit a local-label name, decorated with LSYM_PREFIX.
+dnl LSYM_PREFIX might be L$, so defn() must be used to quote it or the L
+dnl will expand as the L macro, an infinite recursion.
+define(`L',`defn(`LSYM_PREFIX')$1')
+
+dnl Usage: INT32(label,value) and INT64(label,hi,lo)
+dnl
+dnl Emit an aligned, labelled 32-bit constant, or a 64-bit constant as
+dnl two W32 words. NOTE(review): the word order of the two W32s in INT64
+dnl (argument $2 first, then $3) is endian-convention dependent --
+dnl confirm against the per-CPU m4 include that defines W32.
+
+define(`INT32',
+	`
+	ALIGN(4)
+$1:
+	W32	$2
+	')
+
+define(`INT64',
+	`
+	ALIGN(8)
+$1:
+	W32	$2
+	W32	$3
+	')
+
+
+dnl Usage: ALIGN(bytes)
+dnl
+dnl Emit a ".align" directive.  The alignment is specified in bytes, and
+dnl will normally need to be a power of 2.  The actual ".align" generated
+dnl is either bytes or logarithmic according to what ./configure detects.
+dnl
+dnl ALIGN_FILL_0x90, if defined and equal to "yes", means a ", 0x90" should
+dnl be appended (this is for x86).
+
+define(ALIGN,
+m4_assert_numargs(1)
+m4_assert_defined(`ALIGN_LOGARITHMIC')
+`.align	ifelse(ALIGN_LOGARITHMIC,yes,`m4_log2($1)',`eval($1)')dnl
+ifelse(ALIGN_FILL_0x90,yes,`, 0x90')')
+
+
+dnl Usage: MULFUNC_PROLOGUE(function function...)
+dnl
+dnl A dummy macro which is grepped for by ./configure to know what
+dnl functions a multi-function file is providing.  Use this if there aren't
+dnl explicit PROLOGUE()s for each possible function.
+dnl
+dnl Multiple MULFUNC_PROLOGUEs can be used, or just one with the function
+dnl names separated by spaces.
+
+define(`MULFUNC_PROLOGUE',
+m4_assert_numargs(1)
+`')
+
+
+divert`'dnl
diff --git a/rts/gmp/mpn/clipper/add_n.s b/rts/gmp/mpn/clipper/add_n.s
new file mode 100644
index 0000000000..538a1caed0
--- /dev/null
+++ b/rts/gmp/mpn/clipper/add_n.s
@@ -0,0 +1,48 @@
+; Clipper __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+.text
+	.align 16
+.globl ___gmpn_add_n
+___gmpn_add_n:
+; Entry: res_ptr and s1_ptr appear to arrive in r0/r1, with the remaining
+; arguments on the stack -- the two loadw's below pull what are presumably
+; s2_ptr and size into r2/r3. NOTE(review): confirm against the Clipper
+; calling convention; only the register usage visible here is certain.
+	subq	$8,sp		; make room to preserve r6
+	storw	r6,(sp)		; callee-saved r6 is used as the carry register
+	loadw	12(sp),r2
+	loadw	16(sp),r3
+	loadq	$0,r6		; clear carry-save register
+
+; Main loop: one limb (4 bytes) per iteration. The carry flag is parked
+; in r6 between iterations via the addwc/subwc pair below.
+.Loop:	loadw	(r1),r4
+	loadw	(r2),r5
+	addwc	r6,r6		; restore carry from r6
+	addwc	r5,r4		; limb add with carry-in
+	storw	r4,(r0)
+	subwc	r6,r6		; save carry in r6
+	addq	$4,r0		; advance all three limb pointers
+	addq	$4,r1
+	addq	$4,r2
+	subq	$1,r3		; count down remaining limbs
+	brne	.Loop
+
+	negw	r6,r0		; return value: carry-out as 0/1
+	loadw	(sp),r6		; restore saved r6
+	addq	$8,sp
+	ret	sp
diff --git a/rts/gmp/mpn/clipper/mul_1.s b/rts/gmp/mpn/clipper/mul_1.s
new file mode 100644
index 0000000000..c0c756488c
--- /dev/null
+++ b/rts/gmp/mpn/clipper/mul_1.s
@@ -0,0 +1,47 @@
+; Clipper __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+; the result in a second limb vector.
+
+; Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+.text
+	.align 16
+.globl ___gmpn_mul_1
+___gmpn_mul_1:
+; Entry: res_ptr/s1_ptr appear to arrive in r0/r1; the loadw's below pull
+; what are presumably size and the multiplier limb into r2/r3.
+; NOTE(review): confirm argument layout against the Clipper ABI.
+	subq	$8,sp		; make room to preserve r6
+	storw	r6,(sp)		; callee-saved r6 holds the running carry limb
+	loadw	12(sp),r2
+	loadw	16(sp),r3
+	loadq	$0,r6		; clear carry limb
+
+; Main loop: one limb per iteration. mulwux presumably leaves the
+; double-width product split across r4 (low) and r5 (high) --
+; NOTE(review): verify mulwux's result registers in the Clipper manual.
+.Loop:	loadw	(r1),r4
+	mulwux	r3,r4
+	addw	r6,r4		; add old carry limb into low product limb
+	loadq	$0,r6
+	addwc	r5,r6		; propagate cy into high product limb
+	storw	r4,(r0)
+	addq	$4,r0		; advance source and destination pointers
+	addq	$4,r1
+	subq	$1,r2		; count down remaining limbs
+	brne	.Loop
+
+	movw	r6,r0		; return the final carry limb
+	loadw	0(sp),r6	; restore saved r6
+	addq	$8,sp
+	ret	sp
diff --git a/rts/gmp/mpn/clipper/sub_n.s b/rts/gmp/mpn/clipper/sub_n.s
new file mode 100644
index 0000000000..44d8797289
--- /dev/null
+++ b/rts/gmp/mpn/clipper/sub_n.s
@@ -0,0 +1,48 @@
+; Clipper __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+.text
+	.align 16
+.globl ___gmpn_sub_n
+___gmpn_sub_n:
+; Mirror image of ___gmpn_add_n in this directory, with subwc in place of
+; addwc for the limb operation. Entry: res_ptr/s1_ptr appear to arrive in
+; r0/r1; r2/r3 below are presumably s2_ptr and size -- NOTE(review):
+; confirm against the Clipper calling convention.
+	subq	$8,sp		; make room to preserve r6
+	storw	r6,(sp)		; callee-saved r6 is used as the borrow register
+	loadw	12(sp),r2
+	loadw	16(sp),r3
+	loadq	$0,r6		; clear carry-save register
+
+; Main loop: one limb (4 bytes) per iteration; borrow parked in r6.
+.Loop:	loadw	(r1),r4
+	loadw	(r2),r5
+	addwc	r6,r6		; restore carry from r6
+	subwc	r5,r4		; limb subtract with borrow-in
+	storw	r4,(r0)
+	subwc	r6,r6		; save carry in r6
+	addq	$4,r0		; advance all three limb pointers
+	addq	$4,r1
+	addq	$4,r2
+	subq	$1,r3		; count down remaining limbs
+	brne	.Loop
+
+	negw	r6,r0		; return value: borrow-out as 0/1
+	loadw	(sp),r6		; restore saved r6
+	addq	$8,sp
+	ret	sp
diff --git a/rts/gmp/mpn/cray/README b/rts/gmp/mpn/cray/README
new file mode 100644
index 0000000000..8195c67e21
--- /dev/null
+++ b/rts/gmp/mpn/cray/README
@@ -0,0 +1,14 @@
+The (poorly optimized) code in this directory was originally written for a
+j90 system, but finished on a c90. It should work on all Cray vector
+computers. For the T3E and T3D systems, the `alpha' subdirectory at the
+same level as the directory containing this file, is much better.
+
+* `+' seems to be faster than `|' when combining carries.
+
+* It is possible that the best multiply performance would be achived by
+ storing only 24 bits per element, and using lazy carry propagation. Before
+ calling i24mult, full carry propagation would be needed.
+
+* Supply tasking versions of the C loops.
+
+
diff --git a/rts/gmp/mpn/cray/add_n.c b/rts/gmp/mpn/cray/add_n.c
new file mode 100644
index 0000000000..1fdb394993
--- /dev/null
+++ b/rts/gmp/mpn/cray/add_n.c
@@ -0,0 +1,96 @@
+/* mpn_add_n -- Add two limb vectors of equal, non-zero length.
+ For Cray vector processors.
+
+ Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Add the n-limb vectors a[] and b[] into c[], returning the carry out
+   (0 or 1).  Strategy for Cray PVP: compute each limb sum plus its
+   incoming single-place carry in one vectorizable pass; if any carry
+   would need to ripple further than one place (recorded in
+   more_carries), redo the whole thing with the scalar ripple loop.  */
+mp_limb_t
+mpn_add_n (c, a, b, n)
+     mp_ptr c;
+     mp_srcptr a, b;
+     mp_size_t n;
+{
+  mp_size_t i;
+  mp_size_t nm1 = n - 1;
+  int more_carries = 0;		/* count of sums where the carry ripples */
+  int carry_out;
+
+  /* For small operands the non-vector code is faster.  */
+  if (n < 16)
+    goto sequential;
+
+  /* The vector pass reads a[] and b[] after c[] has been partly written,
+     so an overlapping destination must be copied aside first, then the
+     call retried on the non-overlapping copies.  */
+  if (a == c || b == c)
+    {
+      TMP_DECL (marker);
+      TMP_MARK (marker);
+      if (c == a)
+	{
+	  /* allocate temp space for a */
+	  mp_ptr ax = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+	  MPN_COPY (ax, a, n);
+	  a = (mp_srcptr) ax;
+	}
+      if (c == b)
+	{
+	  /* allocate temp space for b */
+	  mp_ptr bx = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+	  MPN_COPY (bx, b, n);
+	  b = (mp_srcptr) bx;
+	}
+      carry_out = mpn_add_n (c, a, b, n);
+      TMP_FREE (marker);
+      return carry_out;
+    }
+
+  /* Carry out of the most significant limb sum.  */
+  carry_out = a[nm1] + b[nm1] < a[nm1];
+
+#pragma _CRI ivdep			/* Cray PVP systems */
+  for (i = nm1; i > 0; i--)
+    {
+      int cy_in;
+      /* cy_in is the carry generated one place below (position i-1).  */
+      cy_in = a[i - 1] + b[i - 1] < a[i - 1];
+      c[i] = a[i] + b[i] + cy_in;
+      /* If adding cy_in itself wrapped, the carry must ripple further;
+	 flag it so the scalar fallback below recomputes everything.  */
+      more_carries += c[i] < cy_in;
+    }
+  c[0] = a[0] + b[0];		/* least significant limb has no carry-in */
+
+  if (more_carries)
+    {
+      /* This won't vectorize, but we should come here rarely.  */
+      int cy;
+    sequential:
+      cy = 0;
+      for (i = 0; i < n; i++)
+	{
+	  mp_limb_t ai, ci, t;
+	  ai = a[i];
+	  t = b[i] + cy;	/* add previous carry into one addend */
+	  cy = t < cy;		/* carry from that addition */
+	  ci = ai + t;
+	  cy += ci < ai;	/* combine carry from the main addition */
+	  c[i] = ci;
+	}
+      carry_out = cy;
+    }
+
+  return carry_out;
+}
diff --git a/rts/gmp/mpn/cray/addmul_1.c b/rts/gmp/mpn/cray/addmul_1.c
new file mode 100644
index 0000000000..031b4e8e8d
--- /dev/null
+++ b/rts/gmp/mpn/cray/addmul_1.c
@@ -0,0 +1,46 @@
+/* mpn_addmul_1 for Cray PVP.
+
+Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* rp[0..n-1] += up[0..n-1] * limb; return the carry limb.
+   GMPN_MULWW (the Fortran/CAL helper in this directory) produces the
+   high and low product limbs as two separate vectors p1[] and p0[];
+   the result is then assembled with two vectorized mpn_add_n calls:
+   first rp += p0, then the shifted-by-one-limb high parts p1.  */
+mp_limb_t
+mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb)
+{
+  mp_ptr p0, p1, tp;
+  mp_limb_t cy_limb;
+  TMP_DECL (marker);
+  TMP_MARK (marker);
+
+  /* p1 = high product limbs, p0 = low product limbs, tp = scratch sum.  */
+  p1 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+  p0 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+  tp = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+
+  GMPN_MULWW (p1, p0, up, &n, &limb);	/* note: args passed by reference */
+  cy_limb = mpn_add_n (tp, rp, p0, n);	/* tp = rp + low limbs */
+  rp[0] = tp[0];			/* low limb gets no high-part term */
+  cy_limb += mpn_add_n (rp + 1, tp + 1, p1, n - 1);  /* add p1 shifted up */
+  cy_limb += p1[n - 1];			/* top high limb goes into the carry */
+
+  TMP_FREE (marker);
+  return cy_limb;
+}
diff --git a/rts/gmp/mpn/cray/gmp-mparam.h b/rts/gmp/mpn/cray/gmp-mparam.h
new file mode 100644
index 0000000000..14f7b8e05b
--- /dev/null
+++ b/rts/gmp/mpn/cray/gmp-mparam.h
@@ -0,0 +1,27 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 64
+#define BITS_PER_SHORTINT 32
+#define BITS_PER_CHAR 8
diff --git a/rts/gmp/mpn/cray/mul_1.c b/rts/gmp/mpn/cray/mul_1.c
new file mode 100644
index 0000000000..0c8750b4ac
--- /dev/null
+++ b/rts/gmp/mpn/cray/mul_1.c
@@ -0,0 +1,44 @@
+/* mpn_mul_1 for Cray PVP.
+
+Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* rp[0..n-1] = up[0..n-1] * limb; return the carry limb.
+   Same scheme as mpn_addmul_1 in this directory: GMPN_MULWW computes
+   high (p1[]) and low (p0[]) product limbs as separate vectors, which
+   are then combined with one vectorized mpn_add_n.  */
+mp_limb_t
+mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb)
+{
+  mp_ptr p0, p1;
+  mp_limb_t cy_limb;
+  TMP_DECL (marker);
+  TMP_MARK (marker);
+
+  /* p1 = high product limbs, p0 = low product limbs.  */
+  p1 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+  p0 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+
+  GMPN_MULWW (p1, p0, up, &n, &limb);	/* note: args passed by reference */
+  rp[0] = p0[0];			/* low limb gets no high-part term */
+  cy_limb = mpn_add_n (rp + 1, p0 + 1, p1, n - 1);  /* add p1 shifted up */
+  cy_limb += p1[n - 1];			/* top high limb goes into the carry */
+
+  TMP_FREE (marker);
+  return cy_limb;
+}
diff --git a/rts/gmp/mpn/cray/mulww.f b/rts/gmp/mpn/cray/mulww.f
new file mode 100644
index 0000000000..99507c1e44
--- /dev/null
+++ b/rts/gmp/mpn/cray/mulww.f
@@ -0,0 +1,54 @@
+c Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP.
+
+c Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+c This file is part of the GNU MP Library.
+
+c The GNU MP Library is free software; you can redistribute it and/or
+c modify it under the terms of the GNU Lesser General Public License as
+c published by the Free Software Foundation; either version 2.1 of the
+c License, or (at your option) any later version.
+
+c The GNU MP Library is distributed in the hope that it will be useful,
+c but WITHOUT ANY WARRANTY; without even the implied warranty of
+c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+c Lesser General Public License for more details.
+
+c You should have received a copy of the GNU Lesser General Public
+c License along with the GNU MP Library; see the file COPYING.LIB. If
+c not, write to the Free Software Foundation, Inc., 59 Temple Place -
+c Suite 330, Boston, MA 02111-1307, USA.
+
+c p1[] = hi(a[]*s); the upper limbs of each product
+c p0[] = low(a[]*s); the corresponding lower limbs
+c n is number of limbs in the vectors
+
+c For each i: (p1(i),p0(i)) = a(i) * s as a double-width product.
+c Each 64-bit operand is split into three 22-bit pieces (4194303 =
+c 2**22-1), pre-shifted left 24 so the i24mult 24x24-bit multiply
+c hardware can be used; the partial products t0..t4 are then
+c recombined with shifts, with c collecting the inter-piece carries.
+c NOTE(review): the exact shift counts in the recombination assume the
+c pre-shift-by-24 layout above -- see mulww.s for the compiled form.
+	subroutine gmpn_mulww(p1,p0,a,n,s)
+	integer*8 p1(0:*),p0(0:*),a(0:*),s
+	integer n
+
+	integer*8 a0,a1,a2,s0,s1,s2,c
+	integer*8 ai,t0,t1,t2,t3,t4
+
+c Split the invariant multiplier s once, outside the loop.
+	s0 = shiftl(and(s,4194303),24)
+	s1 = shiftl(and(shiftr(s,22),4194303),24)
+	s2 = shiftl(and(shiftr(s,44),4194303),24)
+
+	do i = 0,n-1
+	   ai = a(i)
+	   a0 = shiftl(and(ai,4194303),24)
+	   a1 = shiftl(and(shiftr(ai,22),4194303),24)
+	   a2 = shiftl(and(shiftr(ai,44),4194303),24)
+
+c Partial products by weight: t_k collects the pieces of weight 2**(22k).
+	   t0 = i24mult(a0,s0)
+	   t1 = i24mult(a0,s1)+i24mult(a1,s0)
+	   t2 = i24mult(a0,s2)+i24mult(a1,s1)+i24mult(a2,s0)
+	   t3 = i24mult(a1,s2)+i24mult(a2,s1)
+	   t4 = i24mult(a2,s2)
+
+	   p0(i)=shiftl(t2,44)+shiftl(t1,22)+t0
+c c = carries out of the low word from the overlapping partial products.
+	   c=shiftr(shiftr(t0,22)+and(t1,4398046511103)+
+     $	     shiftl(and(t2,1048575),22),42)
+	   p1(i)=shiftl(t4,24)+shiftl(t3,2)+shiftr(t2,20)+shiftr(t1,42)+c
+	end do
+	end
diff --git a/rts/gmp/mpn/cray/mulww.s b/rts/gmp/mpn/cray/mulww.s
new file mode 100644
index 0000000000..890cdcf94d
--- /dev/null
+++ b/rts/gmp/mpn/cray/mulww.s
@@ -0,0 +1,245 @@
+* Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP.
+
+* Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+* This file is generated from mulww.f in this same directory.
+
+* This file is part of the GNU MP Library.
+
+* The GNU MP Library is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public License as
+* published by the Free Software Foundation; either version 2.1 of the
+* License, or (at your option) any later version.
+
+* The GNU MP Library is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* Lesser General Public License for more details.
+
+* You should have received a copy of the GNU Lesser General Public
+* License along with the GNU MP Library; see the file COPYING.LIB. If
+* not, write to the Free Software Foundation, Inc., 59 Temple Place -
+* Suite 330, Boston, MA 02111-1307, USA.
+
+ IDENT GMPN_MULWW
+**********************************************
+* Assemble with Cal Version 2.0 *
+* *
+* Generated by CFT77 6.0.4.19 *
+* on 06/27/00 at 04:34:13 *
+* *
+**********************************************
+* ALLOW UNDERSCORES IN IDENTIFIERS
+ EDIT OFF
+ FORMAT NEW
+@DATA SECTION DATA,CM
+@DATA = W.*
+ CON O'0000000000040000000000
+ CON O'0435152404713723252514 ;GMPN_MUL 1
+ CON O'0535270000000000000000 ;WW 1
+ CON O'0000000000000001200012 ;trbk tbl 1
+ VWD 32/0,32/P.GMPN_MULWW ;trbk tbl 1
+ CON O'0014003000000000001416 ;trbk tbl 1
+ CON O'0000000000000000000011 ;trbk tbl 1
+ CON O'0000000000000000000215 ;trbk tbl 1
+ BSSZ 1 ;trbk tbl 1
+@CODE SECTION CODE
+@CODE = P.*
+L3 = P.* ; 1
+ A0 A6 ;arg base 1
+ A5 6 ;num Darg 1
+ B03,A5 0,A0 ;load DAs 1
+ A0 A1+A2 ; 1
+ A5 1 ;num Ts 1
+ 0,A0 T00,A5 ; 1
+ B02 A2 ;new base 1
+ B66 A3 ;stk top 1
+ B01 A6 ;arg base 1
+ A7 P.L4 ;ofrn rtn 1
+ B00 A7 ;return 1
+ A6 @DATA ; 1
+ J $STKOFEN ;$STKOFEN 1
+GMPN_MULWW = P.* ; 1
+ A0 @DATA+3 ;(trbk) 1
+ B77 A0 ;(trbk) 1
+ A1 13 ;num Bs 1
+ A0 B66 ;stk top 1
+ A2 B66 ;stk tmp 1
+ A4 B67 ;stk limt 1
+ 0,A0 B77,A1 ; 1
+ A7 782 ;stk size 1
+ A3 A2+A7 ; 1
+ A0 A4-A3 ; 1
+ JAM L3 ;overflow 1
+ A0 A6 ;arg base 1
+ A5 6 ;num Darg 1
+ B03,A5 0,A0 ;load DAs 1
+ A0 A1+A2 ; 1
+ A5 1 ;num Ts 1
+ 0,A0 T00,A5 ; 1
+ B02 A2 ;new base 1
+ B66 A3 ;new top 1
+ B01 A6 ;arg base 1
+L4 = P.* ;ofrn rtn 1
+ A7 B07 ;regs 14
+ S7 0,A7 ; 14
+ A6 B10 ;regs 9
+ S6 0,A6 ; 9
+ S5 1 ; 14
+ S4 <22 ; 9
+ S7 S7-S5 ; 14
+ S5 #S7 ; 14
+ T00 S6 ;regs 10
+ S6 S6>22 ; 10
+ S7 T00 ;regs 11
+ S7 S7>44 ; 11
+ S3 T00 ;regs 9
+ S3 S3&S4 ; 9
+ S6 S6&S4 ; 10
+ S7 S7&S4 ; 11
+ S3 S3<24 ; 9
+ S6 S6<24 ; 10
+ S7 S7<24 ; 11
+ S0 S5 ;regs 14
+ S4 S5 ;regs 14
+ S1 S6 ;regs 14
+ S2 S3 ;regs 14
+ S3 S7 ;regs 14
+ JSP L5 ; 14
+L6 = P.* ; 14
+ S7 -S4 ; 14
+ A2 S7 ;regs 14
+ VL A2 ;regs 14
+ A3 B06 ;s_bt_sp 14
+ A5 B05 ;s_bt_sp 14
+ A4 B04 ;s_bt_sp 14
+ A1 VL ; 14
+ A2 S4 ;regs 14
+L7 = P.* ; 14
+ A0 A3 ;regs 15
+ VL A1 ;regs 15
+ V7 ,A0,1 ; 15
+ B11 A5 ;s_bt_sp 15
+ A7 22 ; 17
+ B12 A4 ;s_bt_sp 17
+ V6 V7>A7 ; 17
+ B13 A3 ;s_bt_sp 17
+ S7 <22 ; 17
+ A3 B02 ;s_bt_sp 17
+ V5 S7&V6 ; 17
+ A6 24 ; 17
+ V4 V5<A6 ; 17
+ V3 S1*FV4 ; 22
+ V2 S7&V7 ; 16
+ V1 V2<A6 ; 16
+ V0 S3*FV1 ; 22
+ V6 V0+V3 ; 22
+ A5 44 ; 18
+ V5 V7>A5 ; 18
+ V2 S1*FV1 ; 21
+ V3 S7&V5 ; 18
+ A0 14 ; 34
+ B77 A0 ;regs 34
+ A4 B77 ;regs 34
+ A0 A4+A3 ; 34
+ ,A0,1 V2 ;v_ld_str 34
+ V0 V3<A6 ; 18
+ V7 S2*FV1 ; 20
+ A4 142 ; 34
+ A0 A4+A3 ; 34
+ ,A0,1 V7 ;v_ld_str 34
+ V5 V7>A7 ; 28
+ V2 S2*FV0 ; 22
+ V3 V6+V2 ; 22
+ S7 <20 ; 28
+ V1 S7&V3 ; 28
+ A4 270 ; 34
+ A0 A4+A3 ; 34
+ ,A0,1 V0 ;v_ld_str 34
+ A4 14 ; 34
+ A0 A4+A3 ; 34
+ V7 ,A0,1 ;v_ld_str 34
+ V6 V1<A7 ; 28
+ V2 S2*FV4 ; 21
+ V0 V7+V2 ; 21
+ S7 <42 ; 28
+ V1 S7&V0 ; 28
+ A4 398 ; 34
+ A0 A4+A3 ; 34
+ ,A0,1 V0 ;v_ld_str 34
+ V7 S3*FV4 ; 23
+ V2 V5+V1 ; 28
+ V0 V3<A5 ; 26
+ A5 526 ; 34
+ A0 A5+A3 ; 34
+ ,A0,1 V0 ;v_ld_str 34
+ A5 270 ; 34
+ A0 A5+A3 ; 34
+ V4 ,A0,1 ;v_ld_str 34
+ V5 V2+V6 ; 28
+ A5 20 ; 32
+ V1 V3>A5 ; 32
+ V0 S1*FV4 ; 23
+ A5 654 ; 34
+ A0 A5+A3 ; 34
+ ,A0,1 V1 ;v_ld_str 34
+ V6 V7+V0 ; 23
+ A5 2 ; 32
+ V2 V6<A5 ; 32
+ V3 S3*FV4 ; 24
+ A5 142 ; 34
+ A0 A5+A3 ; 34
+ V1 ,A0,1 ;v_ld_str 34
+ A5 526 ; 34
+ A0 A5+A3 ; 34
+ V7 ,A0,1 ;v_ld_str 34
+ V0 V1+V7 ; 26
+ V6 V3<A6 ; 32
+ V4 V6+V2 ; 32
+ A6 42 ; 28
+ V7 V5>A6 ; 28
+ A5 654 ; 34
+ CPW ;cmr_vrsp 34
+ A0 A5+A3 ; 34
+ V1 ,A0,1 ;v_ld_str 34
+ A5 398 ; 34
+ A0 A5+A3 ; 34
+ V3 ,A0,1 ;v_ld_str 34
+ V6 V4+V1 ; 32
+ V2 V3>A6 ; 32
+ V5 V6+V2 ; 32
+ A6 B12 ;s_bt_sp 32
+ V4 V3<A7 ; 26
+ A7 B13 ;regs 34
+ A3 A7+A1 ; 34
+ A7 B11 ;regs 34
+ A5 A7+A1 ; 34
+ A4 A6+A1 ; 34
+ A7 A2+A1 ; 34
+ A0 A2+A1 ; 34
+ A2 128 ; 34
+ B13 A0 ;s_bt_sp 34
+ V1 V0+V4 ; 26
+ A0 B11 ;regs 31
+ ,A0,1 V1 ; 31
+ V6 V5+V7 ; 33
+ A0 A6 ;regs 33
+ ,A0,1 V6 ; 33
+ A0 B13 ;regs 34
+ A1 A2 ;regs 34
+ A2 A7 ;regs 34
+ JAN L7 ; 34
+L8 = P.* ; 34
+L5 = P.* ; 34
+ S1 0 ; 35
+ A0 B02 ; 35
+ A2 B02 ; 35
+ A1 13 ;num Bs 35
+ B66 A0 ; 35
+ B77,A1 0,A0 ; 35
+ A0 A2+A1 ; 35
+ A1 1 ;num Ts 35
+ T00,A1 0,A0 ; 35
+ J B00 ; 35
+ EXT $STKOFEN:p
+ ENTRY GMPN_MULWW
+ END
diff --git a/rts/gmp/mpn/cray/sub_n.c b/rts/gmp/mpn/cray/sub_n.c
new file mode 100644
index 0000000000..902e07a727
--- /dev/null
+++ b/rts/gmp/mpn/cray/sub_n.c
@@ -0,0 +1,97 @@
+/* mpn_sub_n -- Subtract two limb vectors of equal, non-zero length.
+ For Cray vector processors.
+
+ Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Subtract the n-limb vector b[] from a[] into c[], returning the
+   borrow out (0 or 1).  Mirror of mpn_add_n in this directory: one
+   vectorizable pass handles single-place borrows; if a borrow would
+   ripple further (recorded in more_carries), the whole operation is
+   redone with the scalar ripple loop.  */
+mp_limb_t
+mpn_sub_n (c, a, b, n)
+     mp_ptr c;
+     mp_srcptr a, b;
+     mp_size_t n;
+{
+  mp_size_t i;
+  mp_size_t nm1 = n - 1;
+  int more_carries = 0;		/* count of limbs where the borrow ripples */
+  int carry_out;
+
+  /* For small operands the non-vector code is faster.  */
+  if (n < 16)
+    goto sequential;
+
+  /* The vector pass reads a[] and b[] after c[] has been partly written,
+     so an overlapping destination must be copied aside first, then the
+     call retried on the non-overlapping copies.  */
+  if (a == c || b == c)
+    {
+      TMP_DECL (marker);
+      TMP_MARK (marker);
+      if (c == a)
+	{
+	  /* allocate temp space for a */
+	  mp_ptr ax = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+	  MPN_COPY (ax, a, n);
+	  a = (mp_srcptr) ax;
+	}
+      if (c == b)
+	{
+	  /* allocate temp space for b */
+	  mp_ptr bx = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+	  MPN_COPY (bx, b, n);
+	  b = (mp_srcptr) bx;
+	}
+      carry_out = mpn_sub_n (c, a, b, n);
+      TMP_FREE (marker);
+      return carry_out;
+    }
+
+  /* Borrow out of the most significant limb difference.  */
+  carry_out = a[nm1] < b[nm1];
+
+#pragma _CRI ivdep			/* Cray PVP systems */
+  for (i = nm1; i > 0; i--)
+    {
+      int cy_in; mp_limb_t t;
+      /* cy_in is the borrow generated one place below (position i-1).  */
+      cy_in = a[i - 1] < b[i - 1];
+      t = a[i] - b[i];
+      /* If subtracting cy_in would underflow t, the borrow must ripple
+	 further; flag it so the scalar fallback recomputes everything.  */
+      more_carries += t < cy_in;
+      c[i] = t - cy_in;
+    }
+  c[0] = a[0] - b[0];		/* least significant limb has no borrow-in */
+
+  if (more_carries)
+    {
+      /* This won't vectorize, but we should come here rarely.  */
+      int cy;
+    sequential:
+      cy = 0;
+      for (i = 0; i < n; i++)
+	{
+	  mp_limb_t ai, ci, t;
+	  ai = a[i];
+	  t = b[i] + cy;	/* fold previous borrow into the subtrahend */
+	  cy = t < cy;		/* borrow from that addition */
+	  ci = ai - t;
+	  cy += ci > ai;	/* combine borrow from the main subtraction */
+	  c[i] = ci;
+	}
+      carry_out = cy;
+    }
+
+  return carry_out;
+}
diff --git a/rts/gmp/mpn/cray/submul_1.c b/rts/gmp/mpn/cray/submul_1.c
new file mode 100644
index 0000000000..4d2fb13c62
--- /dev/null
+++ b/rts/gmp/mpn/cray/submul_1.c
@@ -0,0 +1,46 @@
+/* mpn_submul_1 for Cray PVP.
+
+Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* rp[0..n-1] -= up[0..n-1] * limb; return the borrow limb.
+   Mirror of mpn_addmul_1 in this directory: GMPN_MULWW produces the
+   high (p1[]) and low (p0[]) product limbs as separate vectors, which
+   are subtracted from rp with two vectorized mpn_sub_n calls.  */
+mp_limb_t
+mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb)
+{
+  mp_ptr p0, p1, tp;
+  mp_limb_t cy_limb;
+  TMP_DECL (marker);
+  TMP_MARK (marker);
+
+  /* p1 = high product limbs, p0 = low product limbs, tp = scratch.  */
+  p1 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+  p0 = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+  tp = TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+
+  GMPN_MULWW (p1, p0, up, &n, &limb);	/* note: args passed by reference */
+  cy_limb = mpn_sub_n (tp, rp, p0, n);	/* tp = rp - low limbs */
+  rp[0] = tp[0];			/* low limb gets no high-part term */
+  cy_limb += mpn_sub_n (rp + 1, tp + 1, p1, n - 1);  /* subtract p1 shifted */
+  cy_limb += p1[n - 1];			/* top high limb goes into the borrow */
+
+  TMP_FREE (marker);
+  return cy_limb;
+}
diff --git a/rts/gmp/mpn/generic/add_n.c b/rts/gmp/mpn/generic/add_n.c
new file mode 100644
index 0000000000..5fcb7e4835
--- /dev/null
+++ b/rts/gmp/mpn/generic/add_n.c
@@ -0,0 +1,62 @@
+/* mpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+#if __STDC__
+mpn_add_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size)
+#else
+mpn_add_n (res_ptr, s1_ptr, s2_ptr, size)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ register mp_srcptr s2_ptr;
+ mp_size_t size;
+#endif
+{
+ register mp_limb_t x, y, cy;
+ register mp_size_t j;
+
+ /* The loop counter and index J goes from -SIZE to -1. This way
+ the loop becomes faster. */
+ j = -size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ s1_ptr -= j;
+ s2_ptr -= j;
+ res_ptr -= j;
+
+ cy = 0;
+ do
+ {
+ y = s2_ptr[j];
+ x = s1_ptr[j];
+ y += cy; /* add previous carry to one addend */
+ cy = (y < cy); /* get out carry from that addition */
+ y = x + y; /* add other addend */
+ cy = (y < x) + cy; /* get out carry from that add, combine */
+ res_ptr[j] = y;
+ }
+ while (++j != 0);
+
+ return cy;
+}
diff --git a/rts/gmp/mpn/generic/addmul_1.c b/rts/gmp/mpn/generic/addmul_1.c
new file mode 100644
index 0000000000..746ae31307
--- /dev/null
+++ b/rts/gmp/mpn/generic/addmul_1.c
@@ -0,0 +1,65 @@
+/* mpn_addmul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR
+ by S2_LIMB, add the S1_SIZE least significant limbs of the product to the
+ limb vector pointed to by RES_PTR. Return the most significant limb of
+ the product, adjusted for carry-out from the addition.
+
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_addmul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ mp_size_t s1_size;
+ register mp_limb_t s2_limb;
+{
+ register mp_limb_t cy_limb;
+ register mp_size_t j;
+ register mp_limb_t prod_high, prod_low;
+ register mp_limb_t x;
+
+ /* The loop counter and index J goes from -SIZE to -1. This way
+ the loop becomes faster. */
+ j = -s1_size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ res_ptr -= j;
+ s1_ptr -= j;
+
+ cy_limb = 0;
+ do
+ {
+ umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb);
+
+ prod_low += cy_limb;
+ cy_limb = (prod_low < cy_limb) + prod_high;
+
+ x = res_ptr[j];
+ prod_low = x + prod_low;
+ cy_limb += (prod_low < x);
+ res_ptr[j] = prod_low;
+ }
+ while (++j != 0);
+
+ return cy_limb;
+}
diff --git a/rts/gmp/mpn/generic/addsub_n.c b/rts/gmp/mpn/generic/addsub_n.c
new file mode 100644
index 0000000000..c9bab3ef60
--- /dev/null
+++ b/rts/gmp/mpn/generic/addsub_n.c
@@ -0,0 +1,167 @@
+/* mpn_addsub_n -- Add and Subtract two limb vectors of equal, non-zero length.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#ifndef L1_CACHE_SIZE
+#define L1_CACHE_SIZE 8192 /* only 68040 has less than this */
+#endif
+
+#define PART_SIZE (L1_CACHE_SIZE / BYTES_PER_MP_LIMB / 6)
+
+
+/* mpn_addsub_n.
+ r1[] = s1[] + s2[]
+ r2[] = s1[] - s2[]
+ All operands have n limbs.
+ In-place operations allowed. */
+mp_limb_t
+#if __STDC__
+mpn_addsub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
+#else
+mpn_addsub_n (r1p, r2p, s1p, s2p, n)
+ mp_ptr r1p, r2p;
+ mp_srcptr s1p, s2p;
+ mp_size_t n;
+#endif
+{
+ mp_limb_t acyn, acyo; /* carry for add */
+ mp_limb_t scyn, scyo; /* carry for subtract */
+ mp_size_t off; /* offset in operands */
+ mp_size_t this_n; /* size of current chunk */
+
+ /* We alternatingly add and subtract in chunks that fit into the (L1)
+ cache. Since the chunks are several hundred limbs, the function call
+ overhead is insignificant, but we get much better locality. */
+
+  /* We have three variants of the inner loop; the proper loop is chosen
+     depending on whether r1 or r2 is the same operand as s1 or s2.  */
+
+ if (r1p != s1p && r1p != s2p)
+ {
+ /* r1 is not identical to either input operand. We can therefore write
+ to r1 directly, without using temporary storage. */
+ acyo = 0;
+ scyo = 0;
+ for (off = 0; off < n; off += PART_SIZE)
+ {
+ this_n = MIN (n - off, PART_SIZE);
+#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
+ acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
+#else
+ acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
+ acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
+#endif
+#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
+ scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
+#else
+ scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
+ scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
+#endif
+ }
+ }
+ else if (r2p != s1p && r2p != s2p)
+ {
+ /* r2 is not identical to either input operand. We can therefore write
+ to r2 directly, without using temporary storage. */
+ acyo = 0;
+ scyo = 0;
+ for (off = 0; off < n; off += PART_SIZE)
+ {
+ this_n = MIN (n - off, PART_SIZE);
+#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
+ scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
+#else
+ scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
+ scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
+#endif
+#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
+ acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
+#else
+ acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
+ acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
+#endif
+ }
+ }
+ else
+ {
+ /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2=s2 or vice versa)
+ Need temporary storage. */
+ mp_limb_t tp[PART_SIZE];
+ acyo = 0;
+ scyo = 0;
+ for (off = 0; off < n; off += PART_SIZE)
+ {
+ this_n = MIN (n - off, PART_SIZE);
+#if HAVE_NATIVE_mpn_add_nc || !HAVE_NATIVE_mpn_add_n
+ acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo);
+#else
+ acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n);
+ acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo);
+#endif
+#if HAVE_NATIVE_mpn_sub_nc || !HAVE_NATIVE_mpn_sub_n
+ scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
+#else
+ scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
+ scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
+#endif
+ MPN_COPY (r1p + off, tp, this_n);
+ }
+ }
+
+ return 2 * acyo + scyo;
+}
+
+#ifdef MAIN
+#include <stdlib.h>
+#include <stdio.h>
+#include "timing.h"
+
+long cputime ();
+
+int
+main (int argc, char **argv)
+{
+ mp_ptr r1p, r2p, s1p, s2p;
+ double t;
+ mp_size_t n;
+
+ n = strtol (argv[1], 0, 0);
+
+ r1p = malloc (n * BYTES_PER_MP_LIMB);
+ r2p = malloc (n * BYTES_PER_MP_LIMB);
+ s1p = malloc (n * BYTES_PER_MP_LIMB);
+ s2p = malloc (n * BYTES_PER_MP_LIMB);
+ TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n)));
+ printf (" separate add and sub: %.3f\n", t);
+ TIME (t,mpn_addsub_n(r1p,r2p,s1p,s2p,n));
+ printf ("combined addsub separate variables: %.3f\n", t);
+ TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n));
+ printf (" combined addsub r1 overlap: %.3f\n", t);
+ TIME (t,mpn_addsub_n(r1p,r2p,r1p,s2p,n));
+ printf (" combined addsub r2 overlap: %.3f\n", t);
+ TIME (t,mpn_addsub_n(r1p,r2p,r1p,r2p,n));
+ printf (" combined addsub in-place: %.3f\n", t);
+
+ return 0;
+}
+#endif
diff --git a/rts/gmp/mpn/generic/bdivmod.c b/rts/gmp/mpn/generic/bdivmod.c
new file mode 100644
index 0000000000..c4bcb414e6
--- /dev/null
+++ b/rts/gmp/mpn/generic/bdivmod.c
@@ -0,0 +1,120 @@
+/* mpn/bdivmod.c: mpn_bdivmod for computing U/V mod 2^d.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* q_high = mpn_bdivmod (qp, up, usize, vp, vsize, d).
+
+ Puts the low d/BITS_PER_MP_LIMB limbs of Q = U / V mod 2^d at qp, and
+ returns the high d%BITS_PER_MP_LIMB bits of Q as the result.
+
+ Also, U - Q * V mod 2^(usize*BITS_PER_MP_LIMB) is placed at up. Since the
+ low d/BITS_PER_MP_LIMB limbs of this difference are zero, the code allows
+ the limb vectors at qp to overwrite the low limbs at up, provided qp <= up.
+
+ Preconditions:
+ 1. V is odd.
+ 2. usize * BITS_PER_MP_LIMB >= d.
+ 3. If Q and U overlap, qp <= up.
+
+ Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
+
+ Funding for this work has been partially provided by Conselho Nacional
+ de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant
+   301314194-2, and was done while I was a visiting researcher in the Instituto
+ de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS).
+
+ References:
+ T. Jebelean, An algorithm for exact division, Journal of Symbolic
+ Computation, v. 15, 1993, pp. 169-180.
+
+ K. Weber, The accelerated integer GCD algorithm, ACM Transactions on
+ Mathematical Software, v. 21 (March), 1995, pp. 111-122. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+#if __STDC__
+mpn_bdivmod (mp_ptr qp, mp_ptr up, mp_size_t usize,
+ mp_srcptr vp, mp_size_t vsize, unsigned long int d)
+#else
+mpn_bdivmod (qp, up, usize, vp, vsize, d)
+ mp_ptr qp;
+ mp_ptr up;
+ mp_size_t usize;
+ mp_srcptr vp;
+ mp_size_t vsize;
+ unsigned long int d;
+#endif
+{
+ mp_limb_t v_inv;
+
+ /* 1/V mod 2^BITS_PER_MP_LIMB. */
+ modlimb_invert (v_inv, vp[0]);
+
+ /* Fast code for two cases previously used by the accel part of mpn_gcd.
+ (Could probably remove this now it's inlined there.) */
+ if (usize == 2 && vsize == 2 &&
+ (d == BITS_PER_MP_LIMB || d == 2*BITS_PER_MP_LIMB))
+ {
+ mp_limb_t hi, lo;
+ mp_limb_t q = up[0] * v_inv;
+ umul_ppmm (hi, lo, q, vp[0]);
+ up[0] = 0, up[1] -= hi + q*vp[1], qp[0] = q;
+ if (d == 2*BITS_PER_MP_LIMB)
+ q = up[1] * v_inv, up[1] = 0, qp[1] = q;
+ return 0;
+ }
+
+ /* Main loop. */
+ while (d >= BITS_PER_MP_LIMB)
+ {
+ mp_limb_t q = up[0] * v_inv;
+ mp_limb_t b = mpn_submul_1 (up, vp, MIN (usize, vsize), q);
+ if (usize > vsize)
+ mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
+ d -= BITS_PER_MP_LIMB;
+ up += 1, usize -= 1;
+ *qp++ = q;
+ }
+
+ if (d)
+ {
+ mp_limb_t b;
+ mp_limb_t q = (up[0] * v_inv) & (((mp_limb_t)1<<d) - 1);
+ if (q <= 1)
+ {
+ if (q == 0)
+ return 0;
+ else
+ b = mpn_sub_n (up, up, vp, MIN (usize, vsize));
+ }
+ else
+ b = mpn_submul_1 (up, vp, MIN (usize, vsize), q);
+
+ if (usize > vsize)
+ mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
+ return q;
+ }
+
+ return 0;
+}
diff --git a/rts/gmp/mpn/generic/bz_divrem_n.c b/rts/gmp/mpn/generic/bz_divrem_n.c
new file mode 100644
index 0000000000..d234b22af5
--- /dev/null
+++ b/rts/gmp/mpn/generic/bz_divrem_n.c
@@ -0,0 +1,153 @@
+/* mpn_bz_divrem_n and auxiliary routines.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE
+ INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
+ IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A
+ FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+Contributed by Paul Zimmermann.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/*
+[1] Fast Recursive Division, by Christoph Burnikel and Joachim Ziegler,
+ Technical report MPI-I-98-1-022, october 1998.
+ http://www.mpi-sb.mpg.de/~ziegler/TechRep.ps.gz
+*/
+
+static mp_limb_t mpn_bz_div_3_halves_by_2
+ _PROTO ((mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n));
+
+
+/* mpn_bz_divrem_n(n) calls 2*mul(n/2)+2*div(n/2), thus to be faster than
+ div(n) = 4*div(n/2), we need mul(n/2) to be faster than the classic way,
+ i.e. n/2 >= KARATSUBA_MUL_THRESHOLD */
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD)
+#endif
+
+#if 0
+static
+unused_mpn_divrem (qp, qxn, np, nn, dp, dn)
+ mp_ptr qp;
+ mp_size_t qxn;
+ mp_ptr np;
+ mp_size_t nn;
+ mp_srcptr dp;
+ mp_size_t dn;
+{
+ /* This might be useful: */
+ if (qxn != 0)
+ {
+ mp_limb_t c;
+ mp_ptr tp = alloca ((nn + qxn) * BYTES_PER_MP_LIMB);
+ MPN_COPY (tp + qxn - nn, np, nn);
+ MPN_ZERO (tp, qxn);
+ c = mpn_divrem (qp, 0L, tp, nn + qxn, dp, dn);
+ /* Maybe copy proper part of tp to np? Documentation is unclear about
+ the returned np value when qxn != 0 */
+ return c;
+ }
+}
+#endif
+
+
+/* mpn_bz_divrem_n - Implements algorithm of page 8 in [1]: divides (np,2n)
+ by (dp,n) and puts the quotient in (qp,n), the remainder in (np,n).
+ Returns most significant limb of the quotient, which is 0 or 1.
+ Requires that the most significant bit of the divisor is set. */
+
+mp_limb_t
+#if __STDC__
+mpn_bz_divrem_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)
+#else
+mpn_bz_divrem_n (qp, np, dp, n)
+ mp_ptr qp;
+ mp_ptr np;
+ mp_srcptr dp;
+ mp_size_t n;
+#endif
+{
+ mp_limb_t qhl, cc;
+
+ if (n % 2 != 0)
+ {
+ qhl = mpn_bz_divrem_n (qp + 1, np + 2, dp + 1, n - 1);
+ cc = mpn_submul_1 (np + 1, qp + 1, n - 1, dp[0]);
+ cc = mpn_sub_1 (np + n, np + n, 1, cc);
+ if (qhl) cc += mpn_sub_1 (np + n, np + n, 1, dp[0]);
+ while (cc)
+ {
+ qhl -= mpn_sub_1 (qp + 1, qp + 1, n - 1, (mp_limb_t) 1);
+ cc -= mpn_add_n (np + 1, np + 1, dp, n);
+ }
+ qhl += mpn_add_1 (qp + 1, qp + 1, n - 1,
+ mpn_sb_divrem_mn (qp, np, n + 1, dp, n));
+ }
+ else
+ {
+ mp_size_t n2 = n/2;
+ qhl = mpn_bz_div_3_halves_by_2 (qp + n2, np + n2, dp, n2);
+ qhl += mpn_add_1 (qp + n2, qp + n2, n2,
+ mpn_bz_div_3_halves_by_2 (qp, np, dp, n2));
+ }
+ return qhl;
+}
+
+
+/* divides (np, 3n) by (dp, 2n) and puts the quotient in (qp, n),
+ the remainder in (np, 2n) */
+
+static mp_limb_t
+#if __STDC__
+mpn_bz_div_3_halves_by_2 (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n)
+#else
+mpn_bz_div_3_halves_by_2 (qp, np, dp, n)
+ mp_ptr qp;
+ mp_ptr np;
+ mp_srcptr dp;
+ mp_size_t n;
+#endif
+{
+ mp_size_t twon = n + n;
+ mp_limb_t qhl, cc;
+ mp_ptr tmp;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ if (n < BZ_THRESHOLD)
+ qhl = mpn_sb_divrem_mn (qp, np + n, twon, dp + n, n);
+ else
+ qhl = mpn_bz_divrem_n (qp, np + n, dp + n, n);
+ tmp = (mp_ptr) TMP_ALLOC (twon * BYTES_PER_MP_LIMB);
+ mpn_mul_n (tmp, qp, dp, n);
+ cc = mpn_sub_n (np, np, tmp, twon);
+ TMP_FREE (marker);
+ if (qhl) cc += mpn_sub_n (np + n, np + n, dp, n);
+ while (cc)
+ {
+ qhl -= mpn_sub_1 (qp, qp, n, (mp_limb_t) 1);
+ cc -= mpn_add_n (np, np, dp, twon);
+ }
+ return qhl;
+}
diff --git a/rts/gmp/mpn/generic/cmp.c b/rts/gmp/mpn/generic/cmp.c
new file mode 100644
index 0000000000..8e9792f54e
--- /dev/null
+++ b/rts/gmp/mpn/generic/cmp.c
@@ -0,0 +1,56 @@
+/* mpn_cmp -- Compare two low-level natural-number integers.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Compare OP1_PTR/OP1_SIZE with OP2_PTR/OP2_SIZE.
+ There are no restrictions on the relative sizes of
+ the two arguments.
+ Return 1 if OP1 > OP2, 0 if they are equal, and -1 if OP1 < OP2. */
+
+int
+#if __STDC__
+mpn_cmp (mp_srcptr op1_ptr, mp_srcptr op2_ptr, mp_size_t size)
+#else
+mpn_cmp (op1_ptr, op2_ptr, size)
+ mp_srcptr op1_ptr;
+ mp_srcptr op2_ptr;
+ mp_size_t size;
+#endif
+{
+ mp_size_t i;
+ mp_limb_t op1_word, op2_word;
+
+ for (i = size - 1; i >= 0; i--)
+ {
+ op1_word = op1_ptr[i];
+ op2_word = op2_ptr[i];
+ if (op1_word != op2_word)
+ goto diff;
+ }
+ return 0;
+ diff:
+  /* This can *not* be simplified to
+	op1_word - op2_word
+     since that expression might give signed overflow.  */
+ return (op1_word > op2_word) ? 1 : -1;
+}
diff --git a/rts/gmp/mpn/generic/diveby3.c b/rts/gmp/mpn/generic/diveby3.c
new file mode 100644
index 0000000000..a2fb552bfa
--- /dev/null
+++ b/rts/gmp/mpn/generic/diveby3.c
@@ -0,0 +1,77 @@
+/* mpn_divexact_by3 -- mpn division by 3, expecting no remainder. */
+
+/*
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB.
+ 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */
+#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1)
+
+
+/* The "c += ..."s are adding the high limb of 3*l to c. That high limb
+ will be 0, 1 or 2. Doing two separate "+="s seems to turn out better
+ code on gcc (as of 2.95.2 at least).
+
+ When a subtraction of a 0,1,2 carry value causes a borrow, that leaves a
+ limb value of either 0xFF...FF or 0xFF...FE and the multiply by INVERSE_3
+ gives 0x55...55 or 0xAA...AA respectively, producing a further borrow of
+ only 0 or 1 respectively. Hence the carry out of each stage and for the
+ return value is always only 0, 1 or 2. */
+
+mp_limb_t
+#if __STDC__
+mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t c)
+#else
+mpn_divexact_by3c (dst, src, size, c)
+ mp_ptr dst;
+ mp_srcptr src;
+ mp_size_t size;
+ mp_limb_t c;
+#endif
+{
+ mp_size_t i;
+
+ ASSERT (size >= 1);
+
+ i = 0;
+ do
+ {
+ mp_limb_t l, s;
+
+ s = src[i];
+ l = s - c;
+ c = (l > s);
+
+ l *= INVERSE_3;
+ dst[i] = l;
+
+ c += (l > MP_LIMB_T_MAX/3);
+ c += (l > (MP_LIMB_T_MAX/3)*2);
+ }
+ while (++i < size);
+
+ return c;
+}
diff --git a/rts/gmp/mpn/generic/divrem.c b/rts/gmp/mpn/generic/divrem.c
new file mode 100644
index 0000000000..30673e76d9
--- /dev/null
+++ b/rts/gmp/mpn/generic/divrem.c
@@ -0,0 +1,101 @@
+/* mpn_divrem -- Divide natural numbers, producing both remainder and
+ quotient. This is now just a middle layer for calling the new
+ internal mpn_tdiv_qr.
+
+Copyright (C) 1993, 1994, 1995, 1996, 1997, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+#if __STDC__
+mpn_divrem (mp_ptr qp, mp_size_t qxn,
+ mp_ptr np, mp_size_t nn,
+ mp_srcptr dp, mp_size_t dn)
+#else
+mpn_divrem (qp, qxn, np, nn, dp, dn)
+ mp_ptr qp;
+ mp_size_t qxn;
+ mp_ptr np;
+ mp_size_t nn;
+ mp_srcptr dp;
+ mp_size_t dn;
+#endif
+{
+ if (dn == 1)
+ {
+ mp_limb_t ret;
+ mp_ptr q2p;
+ mp_size_t qn;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ q2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB);
+
+ np[0] = mpn_divrem_1 (q2p, qxn, np, nn, dp[0]);
+ qn = nn + qxn - 1;
+ MPN_COPY (qp, q2p, qn);
+ ret = q2p[qn];
+
+ TMP_FREE (marker);
+ return ret;
+ }
+ else if (dn == 2)
+ {
+ return mpn_divrem_2 (qp, qxn, np, nn, dp);
+ }
+ else
+ {
+ mp_ptr rp, q2p;
+ mp_limb_t qhl;
+ mp_size_t qn;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ if (qxn != 0)
+ {
+ mp_ptr n2p;
+ n2p = (mp_ptr) TMP_ALLOC ((nn + qxn) * BYTES_PER_MP_LIMB);
+ MPN_ZERO (n2p, qxn);
+ MPN_COPY (n2p + qxn, np, nn);
+ q2p = (mp_ptr) TMP_ALLOC ((nn - dn + qxn + 1) * BYTES_PER_MP_LIMB);
+ rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+ mpn_tdiv_qr (q2p, rp, 0L, n2p, nn + qxn, dp, dn);
+ MPN_COPY (np, rp, dn);
+ qn = nn - dn + qxn;
+ MPN_COPY (qp, q2p, qn);
+ qhl = q2p[qn];
+ }
+ else
+ {
+ q2p = (mp_ptr) TMP_ALLOC ((nn - dn + 1) * BYTES_PER_MP_LIMB);
+ rp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+ mpn_tdiv_qr (q2p, rp, 0L, np, nn, dp, dn);
+ MPN_COPY (np, rp, dn); /* overwrite np area with remainder */
+ qn = nn - dn;
+ MPN_COPY (qp, q2p, qn);
+ qhl = q2p[qn];
+ }
+ TMP_FREE (marker);
+ return qhl;
+ }
+}
diff --git a/rts/gmp/mpn/generic/divrem_1.c b/rts/gmp/mpn/generic/divrem_1.c
new file mode 100644
index 0000000000..e93f241c9d
--- /dev/null
+++ b/rts/gmp/mpn/generic/divrem_1.c
@@ -0,0 +1,248 @@
+/* mpn_divrem_1(quot_ptr, qsize, dividend_ptr, dividend_size, divisor_limb) --
+ Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
+ Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR.
+ Return the single-limb remainder.
+ There are no constraints on the value of the divisor.
+
+ QUOT_PTR and DIVIDEND_PTR might point to the same limb.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1998, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+
+/* __gmpn_divmod_1_internal(quot_ptr,dividend_ptr,dividend_size,divisor_limb)
+ Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
+ Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR.
+ Return the single-limb remainder.
+ There are no constraints on the value of the divisor.
+
+ QUOT_PTR and DIVIDEND_PTR might point to the same limb. */
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 1
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME UMUL_TIME
+#endif
+
+static mp_limb_t
+#if __STDC__
+__gmpn_divmod_1_internal (mp_ptr quot_ptr,
+ mp_srcptr dividend_ptr, mp_size_t dividend_size,
+ mp_limb_t divisor_limb)
+#else
+__gmpn_divmod_1_internal (quot_ptr, dividend_ptr, dividend_size, divisor_limb)
+ mp_ptr quot_ptr;
+ mp_srcptr dividend_ptr;
+ mp_size_t dividend_size;
+ mp_limb_t divisor_limb;
+#endif
+{
+ mp_size_t i;
+ mp_limb_t n1, n0, r;
+ int dummy;
+
+ /* ??? Should this be handled at all? Rely on callers? */
+ if (dividend_size == 0)
+ return 0;
+
+ /* If multiplication is much faster than division, and the
+ dividend is large, pre-invert the divisor, and use
+ only multiplications in the inner loop. */
+
+ /* This test should be read:
+ Does it ever help to use udiv_qrnnd_preinv?
+ && Does what we save compensate for the inversion overhead? */
+ if (UDIV_TIME > (2 * UMUL_TIME + 6)
+ && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME)
+ {
+ int normalization_steps;
+
+ count_leading_zeros (normalization_steps, divisor_limb);
+ if (normalization_steps != 0)
+ {
+ mp_limb_t divisor_limb_inverted;
+
+ divisor_limb <<= normalization_steps;
+ invert_limb (divisor_limb_inverted, divisor_limb);
+
+ n1 = dividend_ptr[dividend_size - 1];
+ r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
+
+ /* Possible optimization:
+ if (r == 0
+ && divisor_limb > ((n1 << normalization_steps)
+ | (dividend_ptr[dividend_size - 2] >> ...)))
+ ...one division less... */
+
+ for (i = dividend_size - 2; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd_preinv (quot_ptr[i + 1], r, r,
+ ((n1 << normalization_steps)
+ | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
+ divisor_limb, divisor_limb_inverted);
+ n1 = n0;
+ }
+ udiv_qrnnd_preinv (quot_ptr[0], r, r,
+ n1 << normalization_steps,
+ divisor_limb, divisor_limb_inverted);
+ return r >> normalization_steps;
+ }
+ else
+ {
+ mp_limb_t divisor_limb_inverted;
+
+ invert_limb (divisor_limb_inverted, divisor_limb);
+
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ {
+ quot_ptr[i] = 0;
+ i--;
+ }
+
+ for (; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd_preinv (quot_ptr[i], r, r,
+ n0, divisor_limb, divisor_limb_inverted);
+ }
+ return r;
+ }
+ }
+ else
+ {
+ if (UDIV_NEEDS_NORMALIZATION)
+ {
+ int normalization_steps;
+
+ count_leading_zeros (normalization_steps, divisor_limb);
+ if (normalization_steps != 0)
+ {
+ divisor_limb <<= normalization_steps;
+
+ n1 = dividend_ptr[dividend_size - 1];
+ r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
+
+ /* Possible optimization:
+ if (r == 0
+ && divisor_limb > ((n1 << normalization_steps)
+ | (dividend_ptr[dividend_size - 2] >> ...)))
+ ...one division less... */
+
+ for (i = dividend_size - 2; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd (quot_ptr[i + 1], r, r,
+ ((n1 << normalization_steps)
+ | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
+ divisor_limb);
+ n1 = n0;
+ }
+ udiv_qrnnd (quot_ptr[0], r, r,
+ n1 << normalization_steps,
+ divisor_limb);
+ return r >> normalization_steps;
+ }
+ }
+ /* No normalization needed, either because udiv_qrnnd doesn't require
+ it, or because DIVISOR_LIMB is already normalized. */
+
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ {
+ quot_ptr[i] = 0;
+ i--;
+ }
+
+ for (; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd (quot_ptr[i], r, r, n0, divisor_limb);
+ }
+ return r;
+ }
+}
+
+
+
+mp_limb_t
+#if __STDC__
+mpn_divrem_1 (mp_ptr qp, mp_size_t qxn,
+ mp_srcptr np, mp_size_t nn,
+ mp_limb_t d)
+#else
+mpn_divrem_1 (qp, qxn, np, nn, d)
+ mp_ptr qp;
+ mp_size_t qxn;
+ mp_srcptr np;
+ mp_size_t nn;
+ mp_limb_t d;
+#endif
+{
+ mp_limb_t rlimb;
+ mp_size_t i;
+
+ /* Develop integer part of quotient. */
+ rlimb = __gmpn_divmod_1_internal (qp + qxn, np, nn, d);
+
+ /* Develop fraction part of quotient. This is not as fast as it should;
+ the preinvert stuff from __gmpn_divmod_1_internal ought to be used here
+ too. */
+ if (UDIV_NEEDS_NORMALIZATION)
+ {
+ int normalization_steps;
+
+ count_leading_zeros (normalization_steps, d);
+ if (normalization_steps != 0)
+ {
+ d <<= normalization_steps;
+ rlimb <<= normalization_steps;
+
+ for (i = qxn - 1; i >= 0; i--)
+ udiv_qrnnd (qp[i], rlimb, rlimb, 0, d);
+
+ return rlimb >> normalization_steps;
+ }
+ else
+ /* fall out */
+ ;
+ }
+
+ for (i = qxn - 1; i >= 0; i--)
+ udiv_qrnnd (qp[i], rlimb, rlimb, 0, d);
+
+ return rlimb;
+}
diff --git a/rts/gmp/mpn/generic/divrem_2.c b/rts/gmp/mpn/generic/divrem_2.c
new file mode 100644
index 0000000000..0bc31ae2e7
--- /dev/null
+++ b/rts/gmp/mpn/generic/divrem_2.c
@@ -0,0 +1,151 @@
+/* mpn_divrem_2 -- Divide natural numbers, producing both remainder and
+ quotient. The divisor is two limbs.
+
+ THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS
+ ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP
+ RELEASE.
+
+
+Copyright (C) 1993, 1994, 1995, 1996, 1999, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Divide num (NP/NSIZE) by den (DP/2) and write
+ the NSIZE-2 least significant quotient limbs at QP
+ and the 2 long remainder at NP. If QEXTRA_LIMBS is
+ non-zero, generate that many fraction bits and append them after the
+ other quotient limbs.
+ Return the most significant limb of the quotient, this is always 0 or 1.
+
+ Preconditions:
+ 0. NSIZE >= 2.
+ 1. The most significant bit of the divisor must be set.
+ 2. QP must either not overlap with the input operands at all, or
+ QP + 2 >= NP must hold true. (This means that it's
+ possible to put the quotient in the high part of NUM, right after the
+ remainder in NUM.
+ 3. NSIZE >= 2, even if QEXTRA_LIMBS is non-zero. */
+
+/* Divide {np,nsize} by the two-limb divisor {dp,2} (which must be
+   normalized, i.e. have its most significant bit set -- see the
+   preconditions above).  The quotient limbs are written to qp (plus qxn
+   fraction limbs), the 2-limb remainder is left at np, and the return
+   value is the most significant quotient limb, always 0 or 1.  */
+mp_limb_t
+#if __STDC__
+mpn_divrem_2 (mp_ptr qp, mp_size_t qxn,
+ mp_ptr np, mp_size_t nsize,
+ mp_srcptr dp)
+#else
+mpn_divrem_2 (qp, qxn, np, nsize, dp)
+ mp_ptr qp;
+ mp_size_t qxn;
+ mp_ptr np;
+ mp_size_t nsize;
+ mp_srcptr dp;
+#endif
+{
+ mp_limb_t most_significant_q_limb = 0;
+ mp_size_t i;
+ mp_limb_t n1, n0, n2;
+ mp_limb_t d1, d0;
+ mp_limb_t d1inv;
+ int have_preinv;
+
+ /* Point np at the two most significant dividend limbs. */
+ np += nsize - 2;
+ d1 = dp[1];
+ d0 = dp[0];
+ n1 = np[1];
+ n0 = np[0];
+
+ /* If the top two dividend limbs are >= the divisor, the leading
+ quotient limb is 1; reduce once up front. */
+ if (n1 >= d1 && (n1 > d1 || n0 >= d0))
+ {
+ sub_ddmmss (n1, n0, n1, n0, d1, d0);
+ most_significant_q_limb = 1;
+ }
+
+ /* If multiplication is much faster than division, preinvert the most
+ significant divisor limb before entering the loop. */
+ if (UDIV_TIME > 2 * UMUL_TIME + 6)
+ {
+ have_preinv = 0;
+ if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - 2) > UDIV_TIME)
+ {
+ invert_limb (d1inv, d1);
+ have_preinv = 1;
+ }
+ }
+
+ for (i = qxn + nsize - 2 - 1; i >= 0; i--)
+ {
+ mp_limb_t q;
+ mp_limb_t r;
+
+ /* Integer-part steps consume a real dividend limb; fraction-part
+ steps (i < qxn) use an appended zero limb. */
+ if (i >= qxn)
+ np--;
+ else
+ np[0] = 0;
+
+ if (n1 == d1)
+ {
+ /* Q should be either 111..111 or 111..110. Need special treatment
+ of this rare case as normal division would give overflow. */
+ q = ~(mp_limb_t) 0;
+
+ r = n0 + d1;
+ if (r < d1) /* Carry in the addition? */
+ {
+ add_ssaaaa (n1, n0, r - d0, np[0], 0, d0);
+ qp[i] = q;
+ continue;
+ }
+ n1 = d0 - (d0 != 0);
+ n0 = -d0;
+ }
+ else
+ {
+ if (UDIV_TIME > 2 * UMUL_TIME + 6 && have_preinv)
+ udiv_qrnnd_preinv (q, r, n1, n0, d1, d1inv);
+ else
+ udiv_qrnnd (q, r, n1, n0, d1);
+ umul_ppmm (n1, n0, d0, q);
+ }
+
+ n2 = np[0];
+
+ q_test:
+ if (n1 > r || (n1 == r && n0 > n2))
+ {
+ /* The estimated Q was too large. */
+ q--;
+
+ sub_ddmmss (n1, n0, n1, n0, 0, d0);
+ r += d1;
+ if (r >= d1) /* If not carry, test Q again. */
+ goto q_test;
+ }
+
+ qp[i] = q;
+ sub_ddmmss (n1, n0, r, n2, n1, n0);
+ }
+ /* Store the 2-limb remainder back at the (adjusted) np. */
+ np[1] = n1;
+ np[0] = n0;
+
+ return most_significant_q_limb;
+}
diff --git a/rts/gmp/mpn/generic/dump.c b/rts/gmp/mpn/generic/dump.c
new file mode 100644
index 0000000000..66f375c74b
--- /dev/null
+++ b/rts/gmp/mpn/generic/dump.c
@@ -0,0 +1,76 @@
+/* THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS NOT SAFE TO
+ CALL THIS FUNCTION DIRECTLY. IN FACT, IT IS ALMOST GUARANTEED THAT THIS
+ FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Debug helper: print the limb vector {ptr,size} to stdout as uppercase
+   hex, most significant limb first, followed by a newline.  When a limb
+   is wider than a long, each limb is printed as two half-limb pieces so
+   only "unsigned long" printf conversions are needed.  */
+void
+#if __STDC__
+mpn_dump (mp_srcptr ptr, mp_size_t size)
+#else
+mpn_dump (ptr, size)
+ mp_srcptr ptr;
+ mp_size_t size;
+#endif
+{
+ /* Strip high zero limbs so leading zeros aren't printed. */
+ MPN_NORMALIZE (ptr, size);
+
+ if (size == 0)
+ printf ("0\n");
+ else
+ {
+ size--;
+ if (BYTES_PER_MP_LIMB > sizeof (long))
+ {
+ /* Most significant limb: no zero-padding on the top half. */
+ if ((ptr[size] >> BITS_PER_MP_LIMB/2) != 0)
+ {
+ printf ("%lX",
+ (unsigned long) (ptr[size] >> BITS_PER_MP_LIMB/2));
+ printf ("%0*lX", (int) (BYTES_PER_MP_LIMB),
+ (unsigned long) ptr[size]);
+ }
+ else
+ printf ("%lX", (unsigned long) ptr[size]);
+ }
+ else
+ printf ("%lX", ptr[size]);
+
+ /* Remaining limbs are zero-padded to full width. */
+ while (size)
+ {
+ size--;
+ if (BYTES_PER_MP_LIMB > sizeof (long))
+ {
+ printf ("%0*lX", (int) (BYTES_PER_MP_LIMB),
+ (unsigned long) (ptr[size] >> BITS_PER_MP_LIMB/2));
+ printf ("%0*lX", (int) (BYTES_PER_MP_LIMB),
+ (unsigned long) ptr[size]);
+ }
+ else
+ printf ("%0*lX", (int) (2 * BYTES_PER_MP_LIMB), ptr[size]);
+ }
+ printf ("\n");
+ }
+}
diff --git a/rts/gmp/mpn/generic/gcd.c b/rts/gmp/mpn/generic/gcd.c
new file mode 100644
index 0000000000..059e219a06
--- /dev/null
+++ b/rts/gmp/mpn/generic/gcd.c
@@ -0,0 +1,414 @@
+/* mpn/gcd.c: mpn_gcd for gcd of two odd integers.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* Integer greatest common divisor of two unsigned integers, using
+ the accelerated algorithm (see reference below).
+
+ mp_size_t mpn_gcd (up, usize, vp, vsize).
+
+ Preconditions [U = (up, usize) and V = (vp, vsize)]:
+
+ 1. V is odd.
+ 2. numbits(U) >= numbits(V).
+
+ Both U and V are destroyed by the operation. The result is left at vp,
+ and its size is returned.
+
+ Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
+
+ Funding for this work has been partially provided by Conselho Nacional
+ de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant
+ 301314194-2, and was done while I was a visiting researcher in the Instituto
+ de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS).
+
+ Refer to
+ K. Weber, The accelerated integer GCD algorithm, ACM Transactions on
+ Mathematical Software, v. 21 (March), 1995, pp. 111-122. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* If MIN (usize, vsize) >= GCD_ACCEL_THRESHOLD, then the accelerated
+ algorithm is used, otherwise the binary algorithm is used. This may be
+ adjusted for different architectures. */
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 5
+#endif
+
+/* When U and V differ in size by more than BMOD_THRESHOLD, the accelerated
+ algorithm reduces using the bmod operation. Otherwise, the k-ary reduction
+ is used. 0 <= BMOD_THRESHOLD < BITS_PER_MP_LIMB. */
+enum
+ {
+ BMOD_THRESHOLD = BITS_PER_MP_LIMB/2
+ };
+
+
+/* Use binary algorithm to compute V <-- GCD (V, U) for usize, vsize == 2.
+ Both U and V must be odd. */
+static __gmp_inline mp_size_t
+#if __STDC__
+gcd_2 (mp_ptr vp, mp_srcptr up)
+#else
+gcd_2 (vp, up)
+ mp_ptr vp;
+ mp_srcptr up;
+#endif
+{
+ mp_limb_t u0, u1, v0, v1;
+ mp_size_t vsize;
+
+ u0 = up[0], u1 = up[1], v0 = vp[0], v1 = vp[1];
+
+ /* Binary GCD on two double-limb values: subtract the smaller from the
+ larger and strip trailing zero bits (both operands are odd, so the
+ difference is even and count_trailing_zeros gets r >= 1). */
+ while (u1 != v1 && u0 != v0)
+ {
+ unsigned long int r;
+ if (u1 > v1)
+ {
+ u1 -= v1 + (u0 < v0), u0 -= v0;
+ count_trailing_zeros (r, u0);
+ u0 = u1 << (BITS_PER_MP_LIMB - r) | u0 >> r;
+ u1 >>= r;
+ }
+ else /* u1 < v1. */
+ {
+ v1 -= u1 + (v0 < u0), v0 -= u0;
+ count_trailing_zeros (r, v0);
+ v0 = v1 << (BITS_PER_MP_LIMB - r) | v0 >> r;
+ v1 >>= r;
+ }
+ }
+
+ vp[0] = v0, vp[1] = v1, vsize = 1 + (v1 != 0);
+
+ /* If U == V == GCD, done. Otherwise, compute GCD (V, |U - V|). */
+ if (u1 == v1 && u0 == v0)
+ return vsize;
+
+ /* The loop exited with exactly one of the limb pairs equal; the other
+ pair's difference fits in a single limb. */
+ v0 = (u0 == v0) ? (u1 > v1) ? u1-v1 : v1-u1 : (u0 > v0) ? u0-v0 : v0-u0;
+ vp[0] = mpn_gcd_1 (vp, vsize, v0);
+
+ return 1;
+}
+
+/* The function find_a finds 0 < N < 2^BITS_PER_MP_LIMB such that there exists
+ 0 < |D| < 2^BITS_PER_MP_LIMB, and N == D * C mod 2^(2*BITS_PER_MP_LIMB).
+ In the reference article, D was computed along with N, but it is better to
+ compute D separately as D <-- N / C mod 2^(BITS_PER_MP_LIMB + 1), treating
+ the result as a twos' complement signed integer.
+
+ Initialize N1 to C mod 2^(2*BITS_PER_MP_LIMB). According to the reference
+ article, N2 should be initialized to 2^(2*BITS_PER_MP_LIMB), but we use
+ 2^(2*BITS_PER_MP_LIMB) - N1 to start the calculations within double
+ precision. If N2 > N1 initially, the first iteration of the while loop
+ will swap them. In all other situations, N1 >= N2 is maintained. */
+
+static
+#if ! defined (__i386__)
+__gmp_inline /* don't inline this for the x86 */
+#endif
+mp_limb_t
+#if __STDC__
+find_a (mp_srcptr cp)
+#else
+find_a (cp)
+ mp_srcptr cp;
+#endif
+{
+ unsigned long int leading_zero_bits = 0;
+
+ mp_limb_t n1_l = cp[0]; /* N1 == n1_h * 2^BITS_PER_MP_LIMB + n1_l. */
+ mp_limb_t n1_h = cp[1];
+
+ mp_limb_t n2_l = -n1_l; /* N2 == n2_h * 2^BITS_PER_MP_LIMB + n2_l. */
+ mp_limb_t n2_h = ~n1_h;
+
+ /* Main loop. */
+ while (n2_h) /* While N2 >= 2^BITS_PER_MP_LIMB. */
+ {
+ /* N1 <-- N1 % N2, done by double-limb shift-and-subtract division
+ since N2 may be much smaller than N1. */
+ if ((MP_LIMB_T_HIGHBIT >> leading_zero_bits & n2_h) == 0)
+ {
+ unsigned long int i;
+ /* Align N2's top bit with N1's known top bit, then subtract
+ while shifting N2 back down one bit at a time. */
+ count_leading_zeros (i, n2_h);
+ i -= leading_zero_bits, leading_zero_bits += i;
+ n2_h = n2_h<<i | n2_l>>(BITS_PER_MP_LIMB - i), n2_l <<= i;
+ do
+ {
+ if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
+ n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l;
+ n2_l = n2_l>>1 | n2_h<<(BITS_PER_MP_LIMB - 1), n2_h >>= 1;
+ i -= 1;
+ }
+ while (i);
+ }
+ if (n1_h > n2_h || (n1_h == n2_h && n1_l >= n2_l))
+ n1_h -= n2_h + (n1_l < n2_l), n1_l -= n2_l;
+
+ /* Swap so N1 >= N2 is re-established for the next round. */
+ MP_LIMB_T_SWAP (n1_h, n2_h);
+ MP_LIMB_T_SWAP (n1_l, n2_l);
+ }
+
+ return n2_l;
+}
+
+/* Compute GCD (U, V) per the preconditions in the file header (V odd,
+   numbits(U) >= numbits(V)); both operands are clobbered, the result is
+   left at vp (copied to gp if distinct) and its size returned.  Uses
+   Weber's accelerated algorithm (bmod / k-ary reductions) above
+   GCD_ACCEL_THRESHOLD, finishing with binary GCD.  */
+mp_size_t
+#if __STDC__
+mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t vsize)
+#else
+mpn_gcd (gp, up, usize, vp, vsize)
+ mp_ptr gp;
+ mp_ptr up;
+ mp_size_t usize;
+ mp_ptr vp;
+ mp_size_t vsize;
+#endif
+{
+ mp_ptr orig_vp = vp;
+ mp_size_t orig_vsize = vsize;
+ int binary_gcd_ctr; /* Number of times binary gcd will execute. */
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* Use accelerated algorithm if vsize is over GCD_ACCEL_THRESHOLD.
+ Two EXTRA limbs for U and V are required for kary reduction. */
+ if (vsize >= GCD_ACCEL_THRESHOLD)
+ {
+ unsigned long int vbitsize, d;
+ mp_ptr orig_up = up;
+ mp_size_t orig_usize = usize;
+ mp_ptr anchor_up = (mp_ptr) TMP_ALLOC ((usize + 2) * BYTES_PER_MP_LIMB);
+
+ MPN_COPY (anchor_up, orig_up, usize);
+ up = anchor_up;
+
+ /* d = numbits(U) - numbits(V) + 1. */
+ count_leading_zeros (d, up[usize-1]);
+ d = usize * BITS_PER_MP_LIMB - d;
+ count_leading_zeros (vbitsize, vp[vsize-1]);
+ vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
+ d = d - vbitsize + 1;
+
+ /* Use bmod reduction to quickly discover whether V divides U. */
+ up[usize++] = 0; /* Insert leading zero. */
+ mpn_bdivmod (up, up, usize, vp, vsize, d);
+
+ /* Now skip U/V mod 2^d and any low zero limbs. */
+ d /= BITS_PER_MP_LIMB, up += d, usize -= d;
+ while (usize != 0 && up[0] == 0)
+ up++, usize--;
+
+ if (usize == 0) /* GCD == ORIG_V. */
+ goto done;
+
+ vp = (mp_ptr) TMP_ALLOC ((vsize + 2) * BYTES_PER_MP_LIMB);
+ MPN_COPY (vp, orig_vp, vsize);
+
+ do /* Main loop. */
+ {
+ /* mpn_com_n can't be used here because anchor_up and up may
+ partially overlap */
+ if (up[usize-1] & MP_LIMB_T_HIGHBIT) /* U < 0; take twos' compl. */
+ {
+ mp_size_t i;
+ anchor_up[0] = -up[0];
+ for (i = 1; i < usize; i++)
+ anchor_up[i] = ~up[i];
+ up = anchor_up;
+ }
+
+ MPN_NORMALIZE_NOT_ZERO (up, usize);
+
+ if ((up[0] & 1) == 0) /* Result even; remove twos. */
+ {
+ unsigned int r;
+ count_trailing_zeros (r, up[0]);
+ mpn_rshift (anchor_up, up, usize, r);
+ usize -= (anchor_up[usize-1] == 0);
+ }
+ else if (anchor_up != up)
+ MPN_COPY_INCR (anchor_up, up, usize);
+
+ MPN_PTR_SWAP (anchor_up,usize, vp,vsize);
+ up = anchor_up;
+
+ if (vsize <= 2) /* Kary can't handle < 2 limbs and */
+ break; /* isn't efficient for == 2 limbs. */
+
+ d = vbitsize;
+ count_leading_zeros (vbitsize, vp[vsize-1]);
+ vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
+ d = d - vbitsize + 1;
+
+ if (d > BMOD_THRESHOLD) /* Bmod reduction. */
+ {
+ up[usize++] = 0;
+ mpn_bdivmod (up, up, usize, vp, vsize, d);
+ d /= BITS_PER_MP_LIMB, up += d, usize -= d;
+ }
+ else /* Kary reduction. */
+ {
+ mp_limb_t bp[2], cp[2];
+
+ /* C <-- V/U mod 2^(2*BITS_PER_MP_LIMB). */
+ {
+ mp_limb_t u_inv, hi, lo;
+ modlimb_invert (u_inv, up[0]);
+ cp[0] = vp[0] * u_inv;
+ umul_ppmm (hi, lo, cp[0], up[0]);
+ cp[1] = (vp[1] - hi - cp[0] * up[1]) * u_inv;
+ }
+
+ /* U <-- find_a (C) * U. */
+ up[usize] = mpn_mul_1 (up, up, usize, find_a (cp));
+ usize++;
+
+ /* B <-- A/C == U/V mod 2^(BITS_PER_MP_LIMB + 1).
+ bp[0] <-- U/V mod 2^BITS_PER_MP_LIMB and
+ bp[1] <-- ( (U - bp[0] * V)/2^BITS_PER_MP_LIMB ) / V mod 2
+
+ Like V/U above, but simplified because only the low bit of
+ bp[1] is wanted. */
+ {
+ mp_limb_t v_inv, hi, lo;
+ modlimb_invert (v_inv, vp[0]);
+ bp[0] = up[0] * v_inv;
+ umul_ppmm (hi, lo, bp[0], vp[0]);
+ bp[1] = (up[1] + hi + (bp[0]&vp[1])) & 1;
+ }
+
+ up[usize++] = 0;
+ if (bp[1]) /* B < 0: U <-- U + (-B) * V. */
+ {
+ mp_limb_t c = mpn_addmul_1 (up, vp, vsize, -bp[0]);
+ mpn_add_1 (up + vsize, up + vsize, usize - vsize, c);
+ }
+ else /* B >= 0: U <-- U - B * V. */
+ {
+ mp_limb_t b = mpn_submul_1 (up, vp, vsize, bp[0]);
+ mpn_sub_1 (up + vsize, up + vsize, usize - vsize, b);
+ }
+
+ up += 2, usize -= 2; /* At least two low limbs are zero. */
+ }
+
+ /* Must remove low zero limbs before complementing. */
+ while (usize != 0 && up[0] == 0)
+ up++, usize--;
+ }
+ while (usize);
+
+ /* Compute GCD (ORIG_V, GCD (ORIG_U, V)). Binary will execute twice. */
+ up = orig_up, usize = orig_usize;
+ binary_gcd_ctr = 2;
+ }
+ else
+ binary_gcd_ctr = 1;
+
+ /* Finish up with the binary algorithm. Executes once or twice. */
+ for ( ; binary_gcd_ctr--; up = orig_vp, usize = orig_vsize)
+ {
+ if (usize > 2) /* First make U close to V in size. */
+ {
+ unsigned long int vbitsize, d;
+ count_leading_zeros (d, up[usize-1]);
+ d = usize * BITS_PER_MP_LIMB - d;
+ count_leading_zeros (vbitsize, vp[vsize-1]);
+ vbitsize = vsize * BITS_PER_MP_LIMB - vbitsize;
+ d = d - vbitsize - 1;
+ if (d != -(unsigned long int)1 && d > 2)
+ {
+ mpn_bdivmod (up, up, usize, vp, vsize, d); /* Result > 0. */
+ d /= (unsigned long int)BITS_PER_MP_LIMB, up += d, usize -= d;
+ }
+ }
+
+ /* Start binary GCD. */
+ do
+ {
+ mp_size_t zeros;
+
+ /* Make sure U is odd. */
+ MPN_NORMALIZE (up, usize);
+ while (up[0] == 0)
+ up += 1, usize -= 1;
+ if ((up[0] & 1) == 0)
+ {
+ unsigned int r;
+ count_trailing_zeros (r, up[0]);
+ mpn_rshift (up, up, usize, r);
+ usize -= (up[usize-1] == 0);
+ }
+
+ /* Keep usize >= vsize. */
+ if (usize < vsize)
+ MPN_PTR_SWAP (up, usize, vp, vsize);
+
+ if (usize <= 2) /* Double precision. */
+ {
+ if (vsize == 1)
+ vp[0] = mpn_gcd_1 (up, usize, vp[0]);
+ else
+ vsize = gcd_2 (vp, up);
+ break; /* Binary GCD done. */
+ }
+
+ /* Count number of low zero limbs of U - V. */
+ for (zeros = 0; up[zeros] == vp[zeros] && ++zeros != vsize; )
+ continue;
+
+ /* If U < V, swap U and V; in any case, subtract V from U. */
+ if (zeros == vsize) /* Subtract done. */
+ up += zeros, usize -= zeros;
+ else if (usize == vsize)
+ {
+ mp_size_t size = vsize;
+ do
+ size--;
+ while (up[size] == vp[size]);
+ if (up[size] < vp[size]) /* usize == vsize. */
+ MP_PTR_SWAP (up, vp);
+ up += zeros, usize = size + 1 - zeros;
+ mpn_sub_n (up, up, vp + zeros, usize);
+ }
+ else
+ {
+ mp_size_t size = vsize - zeros;
+ up += zeros, usize -= zeros;
+ if (mpn_sub_n (up, up, vp + zeros, size))
+ {
+ while (up[size] == 0) /* Propagate borrow. */
+ up[size++] = -(mp_limb_t)1;
+ up[size] -= 1;
+ }
+ }
+ }
+ while (usize); /* End binary GCD. */
+ }
+
+done:
+ if (vp != gp)
+ MPN_COPY (gp, vp, vsize);
+ TMP_FREE (marker);
+ return vsize;
+}
diff --git a/rts/gmp/mpn/generic/gcd_1.c b/rts/gmp/mpn/generic/gcd_1.c
new file mode 100644
index 0000000000..1832636636
--- /dev/null
+++ b/rts/gmp/mpn/generic/gcd_1.c
@@ -0,0 +1,77 @@
+/* mpn_gcd_1 --
+
+Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Does not work for U == 0 or V == 0. It would be tough to make it work for
+ V == 0 since gcd(x,0) = x, and U does not generally fit in an mp_limb_t. */
+
+/* GCD of the multi-limb value {up,size} and the single limb vlimb.
+   Reduces U modulo V first when U has more than one limb, then runs the
+   binary (subtract-and-shift) algorithm on single limbs.  Per the note
+   above, neither operand may be zero.  */
+mp_limb_t
+#if __STDC__
+mpn_gcd_1 (mp_srcptr up, mp_size_t size, mp_limb_t vlimb)
+#else
+mpn_gcd_1 (up, size, vlimb)
+ mp_srcptr up;
+ mp_size_t size;
+ mp_limb_t vlimb;
+#endif
+{
+ mp_limb_t ulimb;
+ unsigned long int u_low_zero_bits, v_low_zero_bits;
+
+ if (size > 1)
+ {
+ ulimb = mpn_mod_1 (up, size, vlimb);
+ if (ulimb == 0)
+ return vlimb;
+ }
+ else
+ ulimb = up[0];
+
+ /* Need to eliminate low zero bits. */
+ count_trailing_zeros (u_low_zero_bits, ulimb);
+ ulimb >>= u_low_zero_bits;
+
+ count_trailing_zeros (v_low_zero_bits, vlimb);
+ vlimb >>= v_low_zero_bits;
+
+ /* Binary GCD on the now-odd values: subtract smaller from larger and
+ strip the trailing zeros reintroduced by the subtraction. */
+ while (ulimb != vlimb)
+ {
+ if (ulimb > vlimb)
+ {
+ ulimb -= vlimb;
+ do
+ ulimb >>= 1;
+ while ((ulimb & 1) == 0);
+ }
+ else /* vlimb > ulimb. */
+ {
+ vlimb -= ulimb;
+ do
+ vlimb >>= 1;
+ while ((vlimb & 1) == 0);
+ }
+ }
+
+ /* Restore the common factor of two removed above. */
+ return ulimb << MIN (u_low_zero_bits, v_low_zero_bits);
+}
diff --git a/rts/gmp/mpn/generic/gcdext.c b/rts/gmp/mpn/generic/gcdext.c
new file mode 100644
index 0000000000..fe22d779a6
--- /dev/null
+++ b/rts/gmp/mpn/generic/gcdext.c
@@ -0,0 +1,700 @@
+/* mpn_gcdext -- Extended Greatest Common Divisor.
+
+Copyright (C) 1996, 1998, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 17
+#endif
+
+#ifndef EXTEND
+#define EXTEND 1
+#endif
+
+#if STAT
+int arr[BITS_PER_MP_LIMB];
+#endif
+
+
+/* mpn_gcdext (GP, SP, SSIZE, UP, USIZE, VP, VSIZE)
+
+ Compute the extended GCD of {UP,USIZE} and {VP,VSIZE} and store the
+ greatest common divisor at GP (unless it is 0), and the first cofactor at
+ SP. Write the size of the cofactor through the pointer SSIZE. Return the
+ size of the value at GP. Note that SP might be a negative number; this is
+ denoted by storing the negative of the size through SSIZE.
+
+ {UP,USIZE} and {VP,VSIZE} are both clobbered.
+
+ The space allocation for all four areas needs to be USIZE+1.
+
+ Preconditions: 1) U >= V.
+ 2) V > 0. */
+
+/* We use Lehmer's algorithm. The idea is to extract the most significant
+ bits of the operands, and compute the continued fraction for them. We then
+ apply the gathered cofactors to the full operands.
+
+ Idea 1: After we have performed a full division, don't shift operands back,
+ but instead account for the extra factors-of-2 thus introduced.
+ Idea 2: Simple generalization to use divide-and-conquer would give us an
+ algorithm that runs faster than O(n^2).
+ Idea 3: The input numbers need less space as the computation progresses,
+ while the s0 and s1 variables need more space. To save memory, we
+ could make them share space, and have the latter variables grow
+ into the former.
+ Idea 4: We should not do double-limb arithmetic from the start. Instead,
+ do things in single-limb arithmetic until the quotients differ,
+ and then switch to double-limb arithmetic. */
+
+
+/* Division optimized for small quotients. If the quotient is more than one limb,
+ store 1 in *qh and return 0. */
+static mp_limb_t
+#if __STDC__
+div2 (mp_limb_t *qh, mp_limb_t n1, mp_limb_t n0, mp_limb_t d1, mp_limb_t d0)
+#else
+div2 (qh, n1, n0, d1, d0)
+ mp_limb_t *qh;
+ mp_limb_t n1;
+ mp_limb_t n0;
+ mp_limb_t d1;
+ mp_limb_t d0;
+#endif
+{
+ /* d1 == 0 means N/D cannot fit one limb; signal via *qh (see the
+ comment above the function). */
+ if (d1 == 0)
+ {
+ *qh = 1;
+ return 0;
+ }
+
+ if ((mp_limb_signed_t) n1 < 0)
+ {
+ mp_limb_t q;
+ int cnt;
+ /* N's top bit is set: align D by shifting until its top bit is
+ set too, then do shift-and-subtract division. */
+ for (cnt = 1; (mp_limb_signed_t) d1 >= 0; cnt++)
+ {
+ d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1));
+ d0 = d0 << 1;
+ }
+
+ q = 0;
+ while (cnt)
+ {
+ q <<= 1;
+ if (n1 > d1 || (n1 == d1 && n0 >= d0))
+ {
+ sub_ddmmss (n1, n0, n1, n0, d1, d0);
+ q |= 1;
+ }
+ d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1);
+ d1 = d1 >> 1;
+ cnt--;
+ }
+
+ *qh = 0;
+ return q;
+ }
+ else
+ {
+ mp_limb_t q;
+ int cnt;
+ /* Shift D up until it exceeds N, then divide while shifting it
+ back down; note the shift happens before the compare here,
+ unlike the branch above. */
+ for (cnt = 0; n1 > d1 || (n1 == d1 && n0 >= d0); cnt++)
+ {
+ d1 = (d1 << 1) | (d0 >> (BITS_PER_MP_LIMB - 1));
+ d0 = d0 << 1;
+ }
+
+ q = 0;
+ while (cnt)
+ {
+ d0 = (d1 << (BITS_PER_MP_LIMB - 1)) | (d0 >> 1);
+ d1 = d1 >> 1;
+ q <<= 1;
+ if (n1 > d1 || (n1 == d1 && n0 >= d0))
+ {
+ sub_ddmmss (n1, n0, n1, n0, d1, d0);
+ q |= 1;
+ }
+ cnt--;
+ }
+
+ *qh = 0;
+ return q;
+ }
+}
+
+/* Extended GCD by Lehmer's algorithm (see the block comment above):
+   repeatedly extract the top limbs (or top two limbs, above
+   GCDEXT_THRESHOLD) of U and V, run the continued-fraction recurrence on
+   them to get the 2x2 cofactor matrix (A,B;C,D), and apply that matrix
+   to the full operands and (when EXTEND) to the cofactor accumulators
+   s0/s1.  `sign` tracks the alternating sign of the first cofactor.  */
+mp_size_t
+#if EXTEND
+#if __STDC__
+mpn_gcdext (mp_ptr gp, mp_ptr s0p, mp_size_t *s0size,
+ mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize)
+#else
+mpn_gcdext (gp, s0p, s0size, up, size, vp, vsize)
+ mp_ptr gp;
+ mp_ptr s0p;
+ mp_size_t *s0size;
+ mp_ptr up;
+ mp_size_t size;
+ mp_ptr vp;
+ mp_size_t vsize;
+#endif
+#else
+#if __STDC__
+mpn_gcd (mp_ptr gp,
+ mp_ptr up, mp_size_t size, mp_ptr vp, mp_size_t vsize)
+#else
+mpn_gcd (gp, up, size, vp, vsize)
+ mp_ptr gp;
+ mp_ptr up;
+ mp_size_t size;
+ mp_ptr vp;
+ mp_size_t vsize;
+#endif
+#endif
+{
+ mp_limb_t A, B, C, D;
+ int cnt;
+ mp_ptr tp, wp;
+#if RECORD
+ mp_limb_t max = 0;
+#endif
+#if EXTEND
+ mp_ptr s1p;
+ mp_ptr orig_s0p = s0p;
+ mp_size_t ssize;
+ int sign = 1;
+#endif
+ int use_double_flag;
+ TMP_DECL (mark);
+
+ TMP_MARK (mark);
+
+ use_double_flag = (size >= GCDEXT_THRESHOLD);
+
+ tp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
+ wp = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
+#if EXTEND
+ s1p = (mp_ptr) TMP_ALLOC ((size + 1) * BYTES_PER_MP_LIMB);
+
+ MPN_ZERO (s0p, size);
+ MPN_ZERO (s1p, size);
+
+ /* Cofactor accumulators start as (1, 0). */
+ s0p[0] = 1;
+ s1p[0] = 0;
+ ssize = 1;
+#endif
+
+ if (size > vsize)
+ {
+ /* Normalize V (and shift up U the same amount). */
+ count_leading_zeros (cnt, vp[vsize - 1]);
+ if (cnt != 0)
+ {
+ mp_limb_t cy;
+ mpn_lshift (vp, vp, vsize, cnt);
+ cy = mpn_lshift (up, up, size, cnt);
+ up[size] = cy;
+ size += cy != 0;
+ }
+
+ mpn_divmod (up + vsize, up, size, vp, vsize);
+#if EXTEND
+ /* This is really what it boils down to in this case... */
+ s0p[0] = 0;
+ s1p[0] = 1;
+ sign = -sign;
+#endif
+ size = vsize;
+ if (cnt != 0)
+ {
+ mpn_rshift (up, up, size, cnt);
+ mpn_rshift (vp, vp, size, cnt);
+ }
+ MP_PTR_SWAP (up, vp);
+ }
+
+ for (;;)
+ {
+ mp_limb_t asign;
+ /* Figure out exact size of V. */
+ vsize = size;
+ MPN_NORMALIZE (vp, vsize);
+ if (vsize <= 1)
+ break;
+
+ if (use_double_flag)
+ {
+ mp_limb_t uh, vh, ul, vl;
+ /* Let UH,UL be the most significant limbs of U, and let VH,VL be
+ the corresponding bits from V. */
+ uh = up[size - 1];
+ vh = vp[size - 1];
+ ul = up[size - 2];
+ vl = vp[size - 2];
+ count_leading_zeros (cnt, uh);
+ if (cnt != 0)
+ {
+ uh = (uh << cnt) | (ul >> (BITS_PER_MP_LIMB - cnt));
+ vh = (vh << cnt) | (vl >> (BITS_PER_MP_LIMB - cnt));
+ vl <<= cnt;
+ ul <<= cnt;
+ if (size >= 3)
+ {
+ ul |= (up[size - 3] >> (BITS_PER_MP_LIMB - cnt));
+ vl |= (vp[size - 3] >> (BITS_PER_MP_LIMB - cnt));
+ }
+ }
+
+ /* Identity matrix: cofactors start fresh for this round. */
+ A = 1;
+ B = 0;
+ C = 0;
+ D = 1;
+
+ asign = 0;
+ for (;;)
+ {
+ mp_limb_t T;
+ mp_limb_t qh, q1, q2;
+ mp_limb_t nh, nl, dh, dl;
+ mp_limb_t t1, t0;
+ mp_limb_t Th, Tl;
+
+ /* Two continued-fraction steps per iteration; the quotient is
+ accepted only if both bounding quotients q1 and q2 agree
+ (Lehmer's condition). */
+ sub_ddmmss (dh, dl, vh, vl, 0, C);
+ if ((dl | dh) == 0)
+ break;
+ add_ssaaaa (nh, nl, uh, ul, 0, A);
+ q1 = div2 (&qh, nh, nl, dh, dl);
+ if (qh != 0)
+ break; /* could handle this */
+
+ add_ssaaaa (dh, dl, vh, vl, 0, D);
+ if ((dl | dh) == 0)
+ break;
+ sub_ddmmss (nh, nl, uh, ul, 0, B);
+ q2 = div2 (&qh, nh, nl, dh, dl);
+ if (qh != 0)
+ break; /* could handle this */
+
+ if (q1 != q2)
+ break;
+
+ asign = ~asign;
+
+ T = A + q1 * C;
+ A = C;
+ C = T;
+ T = B + q1 * D;
+ B = D;
+ D = T;
+ umul_ppmm (t1, t0, q1, vl);
+ t1 += q1 * vh;
+ sub_ddmmss (Th, Tl, uh, ul, t1, t0);
+ uh = vh, ul = vl;
+ vh = Th, vl = Tl;
+
+ add_ssaaaa (dh, dl, vh, vl, 0, C);
+ sub_ddmmss (nh, nl, uh, ul, 0, A);
+ q1 = div2 (&qh, nh, nl, dh, dl);
+ if (qh != 0)
+ break; /* could handle this */
+
+ sub_ddmmss (dh, dl, vh, vl, 0, D);
+ if ((dl | dh) == 0)
+ break;
+ add_ssaaaa (nh, nl, uh, ul, 0, B);
+ q2 = div2 (&qh, nh, nl, dh, dl);
+ if (qh != 0)
+ break; /* could handle this */
+
+ if (q1 != q2)
+ break;
+
+ asign = ~asign;
+
+ T = A + q1 * C;
+ A = C;
+ C = T;
+ T = B + q1 * D;
+ B = D;
+ D = T;
+ umul_ppmm (t1, t0, q1, vl);
+ t1 += q1 * vh;
+ sub_ddmmss (Th, Tl, uh, ul, t1, t0);
+ uh = vh, ul = vl;
+ vh = Th, vl = Tl;
+ }
+#if EXTEND
+ if (asign)
+ sign = -sign;
+#endif
+ }
+ else /* Same, but using single-limb calculations. */
+ {
+ mp_limb_t uh, vh;
+ /* Make UH be the most significant limb of U, and make VH be
+ corresponding bits from V. */
+ uh = up[size - 1];
+ vh = vp[size - 1];
+ count_leading_zeros (cnt, uh);
+ if (cnt != 0)
+ {
+ uh = (uh << cnt) | (up[size - 2] >> (BITS_PER_MP_LIMB - cnt));
+ vh = (vh << cnt) | (vp[size - 2] >> (BITS_PER_MP_LIMB - cnt));
+ }
+
+ A = 1;
+ B = 0;
+ C = 0;
+ D = 1;
+
+ asign = 0;
+ for (;;)
+ {
+ mp_limb_t q, T;
+ if (vh - C == 0 || vh + D == 0)
+ break;
+
+ q = (uh + A) / (vh - C);
+ if (q != (uh - B) / (vh + D))
+ break;
+
+ asign = ~asign;
+
+ T = A + q * C;
+ A = C;
+ C = T;
+ T = B + q * D;
+ B = D;
+ D = T;
+ T = uh - q * vh;
+ uh = vh;
+ vh = T;
+
+ if (vh - D == 0)
+ break;
+
+ q = (uh - A) / (vh + C);
+ if (q != (uh + B) / (vh - D))
+ break;
+
+ asign = ~asign;
+
+ T = A + q * C;
+ A = C;
+ C = T;
+ T = B + q * D;
+ B = D;
+ D = T;
+ T = uh - q * vh;
+ uh = vh;
+ vh = T;
+ }
+#if EXTEND
+ if (asign)
+ sign = -sign;
+#endif
+ }
+
+#if RECORD
+ max = MAX (A, max); max = MAX (B, max);
+ max = MAX (C, max); max = MAX (D, max);
+#endif
+
+ if (B == 0)
+ {
+ mp_limb_t qh;
+ mp_size_t i;
+ /* This is quite rare. I.e., optimize something else! */
+
+ /* Normalize V (and shift up U the same amount). */
+ count_leading_zeros (cnt, vp[vsize - 1]);
+ if (cnt != 0)
+ {
+ mp_limb_t cy;
+ mpn_lshift (vp, vp, vsize, cnt);
+ cy = mpn_lshift (up, up, size, cnt);
+ up[size] = cy;
+ size += cy != 0;
+ }
+
+ qh = mpn_divmod (up + vsize, up, size, vp, vsize);
+#if EXTEND
+ /* Fold the full quotient into the cofactors:
+ t = s0 + q * s1, then rotate (s0, s1) <- (s1, t). */
+ MPN_COPY (tp, s0p, ssize);
+ {
+ mp_size_t qsize;
+
+ qsize = size - vsize; /* size of stored quotient from division */
+ if (ssize < qsize)
+ {
+ MPN_ZERO (tp + ssize, qsize - ssize);
+ MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
+ for (i = 0; i < ssize; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + i, up + vsize, qsize, s1p[i]);
+ tp[qsize + i] = cy;
+ }
+ if (qh != 0)
+ {
+ mp_limb_t cy;
+ cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize);
+ if (cy != 0)
+ abort ();
+ }
+ }
+ else
+ {
+ MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
+ for (i = 0; i < qsize; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + i, s1p, ssize, up[vsize + i]);
+ tp[ssize + i] = cy;
+ }
+ if (qh != 0)
+ {
+ mp_limb_t cy;
+ cy = mpn_add_n (tp + qsize, tp + qsize, s1p, ssize);
+ if (cy != 0)
+ {
+ tp[qsize + ssize] = cy;
+ s1p[qsize + ssize] = 0;
+ ssize++;
+ }
+ }
+ }
+ ssize += qsize;
+ ssize -= tp[ssize - 1] == 0;
+ }
+
+ sign = -sign;
+ MP_PTR_SWAP (s0p, s1p);
+ MP_PTR_SWAP (s1p, tp);
+#endif
+ size = vsize;
+ if (cnt != 0)
+ {
+ mpn_rshift (up, up, size, cnt);
+ mpn_rshift (vp, vp, size, cnt);
+ }
+ MP_PTR_SWAP (up, vp);
+ }
+ else
+ {
+#if EXTEND
+ mp_size_t tsize, wsize;
+#endif
+ /* T = U*A + V*B
+ W = U*C + V*D
+ U = T
+ V = W */
+
+#if STAT
+ { mp_limb_t x; x = A | B | C | D; count_leading_zeros (cnt, x);
+ arr[BITS_PER_MP_LIMB - cnt]++; }
+#endif
+ if (A == 0)
+ {
+ /* B == 1 and C == 1 (D is arbitrary) */
+ mp_limb_t cy;
+ MPN_COPY (tp, vp, size);
+ MPN_COPY (wp, up, size);
+ mpn_submul_1 (wp, vp, size, D);
+ MP_PTR_SWAP (tp, up);
+ MP_PTR_SWAP (wp, vp);
+#if EXTEND
+ MPN_COPY (tp, s1p, ssize);
+ tsize = ssize;
+ tp[ssize] = 0; /* must zero since wp might spill below */
+ MPN_COPY (wp, s0p, ssize);
+ cy = mpn_addmul_1 (wp, s1p, ssize, D);
+ wp[ssize] = cy;
+ wsize = ssize + (cy != 0);
+ MP_PTR_SWAP (tp, s0p);
+ MP_PTR_SWAP (wp, s1p);
+ ssize = MAX (wsize, tsize);
+#endif
+ }
+ else
+ {
+ /* asign picks which operand of each difference is larger so
+ all mpn results stay non-negative. */
+ if (asign)
+ {
+ mp_limb_t cy;
+ mpn_mul_1 (tp, vp, size, B);
+ mpn_submul_1 (tp, up, size, A);
+ mpn_mul_1 (wp, up, size, C);
+ mpn_submul_1 (wp, vp, size, D);
+ MP_PTR_SWAP (tp, up);
+ MP_PTR_SWAP (wp, vp);
+#if EXTEND
+ cy = mpn_mul_1 (tp, s1p, ssize, B);
+ cy += mpn_addmul_1 (tp, s0p, ssize, A);
+ tp[ssize] = cy;
+ tsize = ssize + (cy != 0);
+ cy = mpn_mul_1 (wp, s0p, ssize, C);
+ cy += mpn_addmul_1 (wp, s1p, ssize, D);
+ wp[ssize] = cy;
+ wsize = ssize + (cy != 0);
+ MP_PTR_SWAP (tp, s0p);
+ MP_PTR_SWAP (wp, s1p);
+ ssize = MAX (wsize, tsize);
+#endif
+ }
+ else
+ {
+ mp_limb_t cy;
+ mpn_mul_1 (tp, up, size, A);
+ mpn_submul_1 (tp, vp, size, B);
+ mpn_mul_1 (wp, vp, size, D);
+ mpn_submul_1 (wp, up, size, C);
+ MP_PTR_SWAP (tp, up);
+ MP_PTR_SWAP (wp, vp);
+#if EXTEND
+ cy = mpn_mul_1 (tp, s0p, ssize, A);
+ cy += mpn_addmul_1 (tp, s1p, ssize, B);
+ tp[ssize] = cy;
+ tsize = ssize + (cy != 0);
+ cy = mpn_mul_1 (wp, s1p, ssize, D);
+ cy += mpn_addmul_1 (wp, s0p, ssize, C);
+ wp[ssize] = cy;
+ wsize = ssize + (cy != 0);
+ MP_PTR_SWAP (tp, s0p);
+ MP_PTR_SWAP (wp, s1p);
+ ssize = MAX (wsize, tsize);
+#endif
+ }
+ }
+
+ size -= up[size - 1] == 0;
+ }
+ }
+
+#if RECORD
+ printf ("max: %lx\n", max);
+#endif
+
+#if STAT
+ {int i; for (i = 0; i < BITS_PER_MP_LIMB; i++) printf ("%d:%d\n", i, arr[i]);}
+#endif
+
+ if (vsize == 0)
+ {
+ /* V reached zero at a multi-limb size: U holds the GCD. */
+ if (gp != up && gp != 0)
+ MPN_COPY (gp, up, size);
+#if EXTEND
+ MPN_NORMALIZE (s0p, ssize);
+ if (orig_s0p != s0p)
+ MPN_COPY (orig_s0p, s0p, ssize);
+ *s0size = sign >= 0 ? ssize : -ssize;
+#endif
+ TMP_FREE (mark);
+ return size;
+ }
+ else
+ {
+ /* V is a single limb: finish with a limb-sized Euclid loop,
+ still updating the cofactors when EXTEND. */
+ mp_limb_t vl, ul, t;
+#if EXTEND
+ mp_size_t qsize, i;
+#endif
+ vl = vp[0];
+#if EXTEND
+ t = mpn_divmod_1 (wp, up, size, vl);
+
+ MPN_COPY (tp, s0p, ssize);
+
+ qsize = size - (wp[size - 1] == 0); /* size of quotient from division */
+ if (ssize < qsize)
+ {
+ MPN_ZERO (tp + ssize, qsize - ssize);
+ MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
+ for (i = 0; i < ssize; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + i, wp, qsize, s1p[i]);
+ tp[qsize + i] = cy;
+ }
+ }
+ else
+ {
+ MPN_ZERO (s1p + ssize, qsize); /* zero s1 too */
+ for (i = 0; i < qsize; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + i, s1p, ssize, wp[i]);
+ tp[ssize + i] = cy;
+ }
+ }
+ ssize += qsize;
+ ssize -= tp[ssize - 1] == 0;
+
+ sign = -sign;
+ MP_PTR_SWAP (s0p, s1p);
+ MP_PTR_SWAP (s1p, tp);
+#else
+ t = mpn_mod_1 (up, size, vl);
+#endif
+ ul = vl;
+ vl = t;
+ while (vl != 0)
+ {
+ mp_limb_t t;
+#if EXTEND
+ mp_limb_t q;
+ q = ul / vl;
+ t = ul - q * vl;
+
+ MPN_COPY (tp, s0p, ssize);
+
+ MPN_ZERO (s1p + ssize, 1); /* zero s1 too */
+
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp, s1p, ssize, q);
+ tp[ssize] = cy;
+ }
+
+ ssize += 1;
+ ssize -= tp[ssize - 1] == 0;
+
+ sign = -sign;
+ MP_PTR_SWAP (s0p, s1p);
+ MP_PTR_SWAP (s1p, tp);
+#else
+ t = ul % vl;
+#endif
+ ul = vl;
+ vl = t;
+ }
+ if (gp != 0)
+ gp[0] = ul;
+#if EXTEND
+ MPN_NORMALIZE (s0p, ssize);
+ if (orig_s0p != s0p)
+ MPN_COPY (orig_s0p, s0p, ssize);
+ *s0size = sign >= 0 ? ssize : -ssize;
+#endif
+ TMP_FREE (mark);
+ return 1;
+ }
+}
diff --git a/rts/gmp/mpn/generic/get_str.c b/rts/gmp/mpn/generic/get_str.c
new file mode 100644
index 0000000000..a713b61825
--- /dev/null
+++ b/rts/gmp/mpn/generic/get_str.c
@@ -0,0 +1,216 @@
+/* mpn_get_str -- Convert a MSIZE long limb vector pointed to by MPTR
+ to a printable string in STR in base BASE.
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Convert the limb vector pointed to by MPTR and MSIZE long to a
+ char array, using base BASE for the result array. Store the
+ result in the character array STR. STR must point to an array with
+ space for the largest possible number represented by a MSIZE long
+ limb vector + 1 extra character.
+
+ The result is NOT in Ascii, to convert it to printable format, add
+ '0' or 'A' depending on the base and range.
+
+ Return the number of digits in the result string.
+ This may include some leading zeros.
+
+ The limb vector pointed to by MPTR is clobbered. */
+
+size_t
+#if __STDC__
+mpn_get_str (unsigned char *str, int base, mp_ptr mptr, mp_size_t msize)
+#else
+mpn_get_str (str, base, mptr, msize)
+     unsigned char *str;
+     int base;
+     mp_ptr mptr;
+     mp_size_t msize;
+#endif
+{
+  mp_limb_t big_base;
+#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME
+  int normalization_steps;
+#endif
+#if UDIV_TIME > 2 * UMUL_TIME
+  mp_limb_t big_base_inverted;
+#endif
+  unsigned int dig_per_u;
+  mp_size_t out_len;
+  register unsigned char *s;
+
+  big_base = __mp_bases[base].big_base;
+
+  s = str;
+
+  /* Special case zero, as the code below doesn't handle it. */
+  if (msize == 0)
+    {
+      s[0] = 0;
+      return 1;
+    }
+
+  if ((base & (base - 1)) == 0)
+    {
+      /* The base is a power of 2.  Make conversion from most
+	 significant side.  Each digit is simply a fixed-width bit field
+	 of the input, so no division is needed.  */
+      mp_limb_t n1, n0;
+      register int bits_per_digit = big_base;
+      register int x;
+      register int bit_pos;
+      register int i;
+
+      n1 = mptr[msize - 1];
+      count_leading_zeros (x, n1);
+
+      /* BIT_POS should be R when input ends in least sign. nibble,
+	 R + bits_per_digit * n when input ends in n:th least significant
+	 nibble. */
+
+      {
+	int bits;
+
+	/* Round the total significant bit count up to a whole number of
+	   digits, then express the starting position relative to the top
+	   limb.  */
+	bits = BITS_PER_MP_LIMB * msize - x;
+	x = bits % bits_per_digit;
+	if (x != 0)
+	  bits += bits_per_digit - x;
+	bit_pos = bits - (msize - 1) * BITS_PER_MP_LIMB;
+      }
+
+      /* Fast loop for bit output.  */
+      i = msize - 1;
+      for (;;)
+	{
+	  bit_pos -= bits_per_digit;
+	  while (bit_pos >= 0)
+	    {
+	      *s++ = (n1 >> bit_pos) & ((1 << bits_per_digit) - 1);
+	      bit_pos -= bits_per_digit;
+	    }
+	  i--;
+	  if (i < 0)
+	    break;
+	  /* This digit straddles a limb boundary: take the low bits of the
+	     current limb and the high bits of the next.  */
+	  n0 = (n1 << -bit_pos) & ((1 << bits_per_digit) - 1);
+	  n1 = mptr[i];
+	  bit_pos += BITS_PER_MP_LIMB;
+	  *s++ = n0 | (n1 >> bit_pos);
+	}
+
+      *s = 0;
+
+      return s - str;
+    }
+  else
+    {
+      /* General case.  The base is not a power of 2.  Make conversion
+	 from least significant end.  */
+
+      /* If udiv_qrnnd only handles divisors with the most significant bit
+	 set, prepare BIG_BASE for being a divisor by shifting it to the
+	 left exactly enough to set the most significant bit. */
+#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME
+      count_leading_zeros (normalization_steps, big_base);
+      big_base <<= normalization_steps;
+#if UDIV_TIME > 2 * UMUL_TIME
+      /* Get the fixed-point approximation to 1/(BIG_BASE << NORMALIZATION_STEPS). */
+      big_base_inverted = __mp_bases[base].big_base_inverted;
+#endif
+#endif
+
+      /* Digits come out least significant first, so start at the far end
+	 of the output area and store them backwards.  */
+      dig_per_u = __mp_bases[base].chars_per_limb;
+      out_len = ((size_t) msize * BITS_PER_MP_LIMB
+		 * __mp_bases[base].chars_per_bit_exactly) + 1;
+      s += out_len;
+
+      while (msize != 0)
+	{
+	  int i;
+	  mp_limb_t n0, n1;
+
+#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME
+	  /* If we shifted BIG_BASE above, shift the dividend too, to get
+	     the right quotient.  We need to do this every loop,
+	     since the intermediate quotients are OK, but the quotient from
+	     one turn in the loop is going to be the dividend in the
+	     next turn, and the dividend needs to be up-shifted. */
+	  if (normalization_steps != 0)
+	    {
+	      n0 = mpn_lshift (mptr, mptr, msize, normalization_steps);
+
+	      /* If the shifting gave a carry out limb, store it and
+		 increase the length. */
+	      if (n0 != 0)
+		{
+		  mptr[msize] = n0;
+		  msize++;
+		}
+	    }
+#endif
+
+	  /* Divide the number at TP with BIG_BASE to get a quotient and a
+	     remainder.  The remainder is our new digit in base BIG_BASE. */
+	  i = msize - 1;
+	  n1 = mptr[i];
+
+	  /* If the top limb can't serve as the initial partial remainder
+	     (it is >= the divisor), start from 0 and let the loop divide
+	     it; otherwise consume it here and shrink the number.  */
+	  if (n1 >= big_base)
+	    n1 = 0;
+	  else
+	    {
+	      msize--;
+	      i--;
+	    }
+
+	  for (; i >= 0; i--)
+	    {
+	      n0 = mptr[i];
+#if UDIV_TIME > 2 * UMUL_TIME
+	      udiv_qrnnd_preinv (mptr[i], n1, n1, n0, big_base, big_base_inverted);
+#else
+	      udiv_qrnnd (mptr[i], n1, n1, n0, big_base);
+#endif
+	    }
+
+#if UDIV_NEEDS_NORMALIZATION || UDIV_TIME > 2 * UMUL_TIME
+	  /* If we shifted above (at previous UDIV_NEEDS_NORMALIZATION tests)
+	     the remainder will be up-shifted here.  Compensate. */
+	  n1 >>= normalization_steps;
+#endif
+
+	  /* Convert N1 from BIG_BASE to a string of digits in BASE
+	     using single precision operations. */
+	  for (i = dig_per_u - 1; i >= 0; i--)
+	    {
+	      *--s = n1 % base;
+	      n1 /= base;
+	      if (n1 == 0 && msize == 0)
+		break;
+	    }
+	}
+
+      /* Zero-fill any remaining slack so exactly OUT_LEN digits (possibly
+	 with leading zeros, as documented above) are returned.  */
+      while (s != str)
+	*--s = 0;
+      return out_len;
+    }
+}
diff --git a/rts/gmp/mpn/generic/gmp-mparam.h b/rts/gmp/mpn/generic/gmp-mparam.h
new file mode 100644
index 0000000000..14bcaece83
--- /dev/null
+++ b/rts/gmp/mpn/generic/gmp-mparam.h
@@ -0,0 +1,27 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* Generic fallback parameters describing a plain 32-bit machine.
+   NOTE(review): presumably CPU-specific gmp-mparam.h files shadow this
+   generic one via the build's include path — confirm against the
+   configure machinery.  */
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
diff --git a/rts/gmp/mpn/generic/hamdist.c b/rts/gmp/mpn/generic/hamdist.c
new file mode 100644
index 0000000000..35c10e8450
--- /dev/null
+++ b/rts/gmp/mpn/generic/hamdist.c
@@ -0,0 +1,94 @@
+/* mpn_hamdist --
+
+Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#if defined __GNUC__
+/* No processor claiming to be SPARC v9 compliant seem to
+ implement the POPC instruction. Disable pattern for now. */
+#if 0 && defined __sparc_v9__ && BITS_PER_MP_LIMB == 64
+#define popc_limb(a) \
+ ({ \
+ DItype __res; \
+ asm ("popc %1,%0" : "=r" (__res) : "rI" (a)); \
+ __res; \
+ })
+#endif
+#endif
+
+#ifndef popc_limb
+
+/* Cool population count of a mp_limb_t.
+   You have to figure out how this works, I won't tell you! */
+
+/* (It is the classic SWAR popcount: each step sums adjacent bit fields in
+   parallel, doubling the field width, until one field holds the count of
+   set bits for the whole limb.)  */
+
+static inline unsigned int
+#if __STDC__
+popc_limb (mp_limb_t x)
+#else
+popc_limb (x)
+     mp_limb_t x;
+#endif
+{
+#if BITS_PER_MP_LIMB == 64
+  /* We have to go into some trouble to define these constants.
+     (For mp_limb_t being `long long'.) */
+  mp_limb_t cnst;
+  cnst = 0xaaaaaaaaL | ((mp_limb_t) 0xaaaaaaaaL << BITS_PER_MP_LIMB/2);
+  x -= (x & cnst) >> 1;                 /* 2-bit fields hold their popcount */
+  cnst = 0x33333333L | ((mp_limb_t) 0x33333333L << BITS_PER_MP_LIMB/2);
+  x = ((x & ~cnst) >> 2) + (x & cnst);  /* fold into 4-bit fields */
+  cnst = 0x0f0f0f0fL | ((mp_limb_t) 0x0f0f0f0fL << BITS_PER_MP_LIMB/2);
+  x = ((x >> 4) + x) & cnst;            /* 8-bit fields */
+  x = ((x >> 8) + x);
+  x = ((x >> 16) + x);
+  x = ((x >> 32) + x) & 0xff;           /* final sum fits in one byte */
+#endif
+#if BITS_PER_MP_LIMB == 32
+  x -= (x & 0xaaaaaaaa) >> 1;           /* 2-bit fields hold their popcount */
+  x = ((x >> 2) & 0x33333333L) + (x & 0x33333333L);  /* 4-bit fields */
+  x = ((x >> 4) + x) & 0x0f0f0f0fL;     /* 8-bit fields */
+  x = ((x >> 8) + x);
+  x = ((x >> 16) + x) & 0xff;           /* final sum fits in one byte */
+#endif
+  return x;
+}
+#endif
+
+/* Return the Hamming distance between {up,size} and {vp,size}: the total
+   number of bit positions in which the two operands differ, i.e. the
+   population count of the limb-wise XOR.  */
+unsigned long int
+#if __STDC__
+mpn_hamdist (mp_srcptr up, mp_srcptr vp, mp_size_t size)
+#else
+mpn_hamdist (up, vp, size)
+     register mp_srcptr up;
+     register mp_srcptr vp;
+     register mp_size_t size;
+#endif
+{
+  unsigned long int hamdist;
+  mp_size_t i;
+
+  hamdist = 0;
+  for (i = 0; i < size; i++)
+    hamdist += popc_limb (up[i] ^ vp[i]);
+
+  return hamdist;
+}
diff --git a/rts/gmp/mpn/generic/inlines.c b/rts/gmp/mpn/generic/inlines.c
new file mode 100644
index 0000000000..9487e58cf2
--- /dev/null
+++ b/rts/gmp/mpn/generic/inlines.c
@@ -0,0 +1,24 @@
+/*
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+/* NOTE(review): defining _FORCE_INLINES with an empty _EXTERN_INLINE before
+   including gmp.h presumably makes this translation unit emit out-of-line
+   definitions of gmp.h's inline functions, for callers/compilers that do
+   not inline them — confirm against gmp.h's use of these macros.  */
+#define _FORCE_INLINES
+#define _EXTERN_INLINE /* empty */
+#include "gmp.h"
diff --git a/rts/gmp/mpn/generic/jacbase.c b/rts/gmp/mpn/generic/jacbase.c
new file mode 100644
index 0000000000..dd437f1ac1
--- /dev/null
+++ b/rts/gmp/mpn/generic/jacbase.c
@@ -0,0 +1,136 @@
+/* mpn_jacobi_base -- limb/limb Jacobi symbol with restricted arguments.
+
+ THIS INTERFACE IS PRELIMINARY AND MIGHT DISAPPEAR OR BE SUBJECT TO
+ INCOMPATIBLE CHANGES IN A FUTURE RELEASE OF GMP. */
+
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+#if COUNT_TRAILING_ZEROS_TIME <= 7
+/* If count_trailing_zeros is fast, use it.
+   K7 at 7 cycles and P6 at 2 are good here.  K6 at 12-27 and P5 at 18-42
+   are not.  The default 15 in longlong.h is meant to mean not good here. */
+
+/* Strip all factors of 2 from "a" in one step, folding the accumulated
+   (2/b) factors into result_bit1.  */
+#define PROCESS_TWOS_ANY                                \
+  {                                                     \
+    mp_limb_t twos;                                     \
+    count_trailing_zeros (twos, a);                     \
+    result_bit1 ^= JACOBI_TWOS_U_BIT1 (twos, b);        \
+    a >>= twos;                                         \
+  }
+
+#define PROCESS_TWOS_EVEN PROCESS_TWOS_ANY
+
+#else
+/* Use a loop instead.  With "a" uniformly distributed there will usually be
+   only a few trailing zeros.
+
+   Unfortunately the branch for the while loop here will be on a 50/50
+   chance of a 1 or 0, which is bad for branch prediction. */
+
+/* Variant for when "a" is known even: strips one factor of 2 per
+   iteration, applying the (2/b) sign each time.  */
+#define PROCESS_TWOS_EVEN               \
+  {                                     \
+    int two;                            \
+    two = JACOBI_TWO_U_BIT1 (b);        \
+    do                                  \
+      {                                 \
+        a >>= 1;                        \
+        result_bit1 ^= two;             \
+        ASSERT (a != 0);                \
+      }                                 \
+    while ((a & 1) == 0);               \
+  }
+
+#define PROCESS_TWOS_ANY                \
+  if ((a & 1) == 0)                     \
+    PROCESS_TWOS_EVEN;
+
+#endif
+
+
+/* Calculate the value of the Jacobi symbol (a/b) of two mp_limb_t's, but
+   with a restricted range of inputs accepted, namely b>1, b odd, and a<=b.
+
+   The initial result_bit1 is taken as a parameter for the convenience of
+   mpz_kronecker_zi_ui() et al.  The sign changes both here and in those
+   routines accumulate nicely in bit 1, see the JACOBI macros.
+
+   The return value here is the normal +1, 0, or -1.  Note that +1 and -1
+   have bit 1 in the "BIT1" sense, which could be useful if the caller is
+   accumulating it into some extended calculation.
+
+   Duplicating the loop body to avoid the MP_LIMB_T_SWAP(a,b) would be
+   possible, but a couple of tests suggest it's not a significant speedup,
+   and may even be a slowdown, so what's here is good enough for now.
+
+   Future: The code doesn't demand a<=b actually, so maybe this could be
+   relaxed.  All the places this is used currently call with a<=b though. */
+
+int
+#if __STDC__
+mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int result_bit1)
+#else
+mpn_jacobi_base (a, b, result_bit1)
+     mp_limb_t a;
+     mp_limb_t b;
+     int result_bit1;
+#endif
+{
+  ASSERT (b & 1);  /* b odd */
+  ASSERT (b != 1);
+  ASSERT (a <= b);
+
+  if (a == 0)
+    return 0;
+
+  PROCESS_TWOS_ANY;
+  if (a == 1)
+    goto done;
+
+  /* Binary-GCD style reduction: subtract, strip twos, and apply quadratic
+     reciprocity (JACOBI_RECIP_UU_BIT1) whenever a and b are swapped.  */
+  for (;;)
+    {
+      result_bit1 ^= JACOBI_RECIP_UU_BIT1 (a, b);
+      MP_LIMB_T_SWAP (a, b);
+
+      do
+	{
+	  /* working on (a/b), a,b odd, a>=b */
+	  ASSERT (a & 1);
+	  ASSERT (b & 1);
+	  ASSERT (a >= b);
+
+	  /* a == b here means gcd > 1, hence symbol 0.  */
+	  if ((a -= b) == 0)
+	    return 0;
+
+	  PROCESS_TWOS_EVEN;
+	  if (a == 1)
+	    goto done;
+	}
+      while (a >= b);
+    }
+
+ done:
+  return JACOBI_BIT1_TO_PN (result_bit1);
+}
diff --git a/rts/gmp/mpn/generic/lshift.c b/rts/gmp/mpn/generic/lshift.c
new file mode 100644
index 0000000000..0b58389658
--- /dev/null
+++ b/rts/gmp/mpn/generic/lshift.c
@@ -0,0 +1,87 @@
+/* mpn_lshift -- Shift left low level.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Shift U (pointed to by UP and USIZE digits long) CNT bits to the left
+ and store the USIZE least significant digits of the result at WP.
+ Return the bits shifted out from the most significant digit.
+
+ Argument constraints:
+ 1. 0 < CNT < BITS_PER_MP_LIMB
+ 2. If the result is to be written over the input, WP must be >= UP.
+*/
+
+mp_limb_t
+#if __STDC__
+mpn_lshift (register mp_ptr wp,
+	    register mp_srcptr up, mp_size_t usize,
+	    register unsigned int cnt)
+#else
+mpn_lshift (wp, up, usize, cnt)
+     register mp_ptr wp;
+     register mp_srcptr up;
+     mp_size_t usize;
+     register unsigned int cnt;
+#endif
+{
+  register mp_limb_t high_limb, low_limb;
+  register unsigned sh_1, sh_2;
+  register mp_size_t i;
+  mp_limb_t retval;
+
+#ifdef DEBUG
+  if (usize == 0 || cnt == 0)
+    abort ();
+#endif
+
+  sh_1 = cnt;
+#if 0
+  /* Disabled cnt==0 handling; the documented precondition is 0 < cnt.  */
+  if (sh_1 == 0)
+    {
+      if (wp != up)
+	{
+	  /* Copy from high end to low end, to allow specified input/output
+	     overlapping. */
+	  for (i = usize - 1; i >= 0; i--)
+	    wp[i] = up[i];
+	}
+      return 0;
+    }
+#endif
+
+  /* Offset wp by one so wp[i] receives the high part of up[i] combined
+     with the low part of up[i+1].  sh_2 is the complementary right-shift
+     count; note 0 < cnt < BITS_PER_MP_LIMB keeps both shifts defined.  */
+  wp += 1;
+  sh_2 = BITS_PER_MP_LIMB - sh_1;
+  /* Work downward from the most significant limb so that overlapping
+     operands with wp >= up (documented constraint 2) are handled.  */
+  i = usize - 1;
+  low_limb = up[i];
+  retval = low_limb >> sh_2;  /* bits shifted out the top, returned */
+  high_limb = low_limb;
+  while (--i >= 0)
+    {
+      low_limb = up[i];
+      wp[i] = (high_limb << sh_1) | (low_limb >> sh_2);
+      high_limb = low_limb;
+    }
+  /* i is -1 here, so wp[i] is the original wp[0]: the least significant
+     output limb gets zeros shifted in from below.  */
+  wp[i] = high_limb << sh_1;
+
+  return retval;
+}
diff --git a/rts/gmp/mpn/generic/mod_1.c b/rts/gmp/mpn/generic/mod_1.c
new file mode 100644
index 0000000000..168ec9df49
--- /dev/null
+++ b/rts/gmp/mpn/generic/mod_1.c
@@ -0,0 +1,175 @@
+/* mpn_mod_1(dividend_ptr, dividend_size, divisor_limb) --
+ Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
+ Return the single-limb remainder.
+ There are no constraints on the value of the divisor.
+
+Copyright (C) 1991, 1993, 1994, 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 1
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME UMUL_TIME
+#endif
+
+mp_limb_t
+#if __STDC__
+mpn_mod_1 (mp_srcptr dividend_ptr, mp_size_t dividend_size,
+	   mp_limb_t divisor_limb)
+#else
+mpn_mod_1 (dividend_ptr, dividend_size, divisor_limb)
+     mp_srcptr dividend_ptr;
+     mp_size_t dividend_size;
+     mp_limb_t divisor_limb;
+#endif
+{
+  mp_size_t i;
+  mp_limb_t n1, n0, r;
+  int dummy;
+
+  /* Botch: Should this be handled at all?  Rely on callers? */
+  if (dividend_size == 0)
+    return 0;
+
+  /* If multiplication is much faster than division, and the
+     dividend is large, pre-invert the divisor, and use
+     only multiplications in the inner loop. */
+
+  /* This test should be read:
+       Does it ever help to use udiv_qrnnd_preinv?
+	 && Does what we save compensate for the inversion overhead? */
+  if (UDIV_TIME > (2 * UMUL_TIME + 6)
+      && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME)
+    {
+      int normalization_steps;
+
+      count_leading_zeros (normalization_steps, divisor_limb);
+      if (normalization_steps != 0)
+	{
+	  /* Divisor not normalized: shift divisor (and conceptually the
+	     dividend) left so the divisor's top bit is set, then shift the
+	     final remainder back down.  */
+	  mp_limb_t divisor_limb_inverted;
+
+	  divisor_limb <<= normalization_steps;
+	  invert_limb (divisor_limb_inverted, divisor_limb);
+
+	  n1 = dividend_ptr[dividend_size - 1];
+	  r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
+
+	  /* Possible optimization:
+	     if (r == 0
+	     && divisor_limb > ((n1 << normalization_steps)
+			     | (dividend_ptr[dividend_size - 2] >> ...)))
+	     ...one division less... */
+
+	  /* Each step divides the two-limb window formed from the shifted
+	     adjacent dividend limbs; only the remainder r is kept.  */
+	  for (i = dividend_size - 2; i >= 0; i--)
+	    {
+	      n0 = dividend_ptr[i];
+	      udiv_qrnnd_preinv (dummy, r, r,
+				 ((n1 << normalization_steps)
+				  | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
+				 divisor_limb, divisor_limb_inverted);
+	      n1 = n0;
+	    }
+	  udiv_qrnnd_preinv (dummy, r, r,
+			     n1 << normalization_steps,
+			     divisor_limb, divisor_limb_inverted);
+	  return r >> normalization_steps;
+	}
+      else
+	{
+	  mp_limb_t divisor_limb_inverted;
+
+	  invert_limb (divisor_limb_inverted, divisor_limb);
+
+	  i = dividend_size - 1;
+	  r = dividend_ptr[i];
+
+	  /* If the top limb is >= the divisor it can't serve directly as
+	     the initial remainder; start from 0 and let the loop's first
+	     division absorb it.  Otherwise consume it here.  */
+	  if (r >= divisor_limb)
+	    r = 0;
+	  else
+	    i--;
+
+	  for (; i >= 0; i--)
+	    {
+	      n0 = dividend_ptr[i];
+	      udiv_qrnnd_preinv (dummy, r, r,
+				 n0, divisor_limb, divisor_limb_inverted);
+	    }
+	  return r;
+	}
+    }
+  else
+    {
+      if (UDIV_NEEDS_NORMALIZATION)
+	{
+	  int normalization_steps;
+
+	  count_leading_zeros (normalization_steps, divisor_limb);
+	  if (normalization_steps != 0)
+	    {
+	      divisor_limb <<= normalization_steps;
+
+	      n1 = dividend_ptr[dividend_size - 1];
+	      r = n1 >> (BITS_PER_MP_LIMB - normalization_steps);
+
+	      /* Possible optimization:
+		 if (r == 0
+		 && divisor_limb > ((n1 << normalization_steps)
+				 | (dividend_ptr[dividend_size - 2] >> ...)))
+		 ...one division less... */
+
+	      for (i = dividend_size - 2; i >= 0; i--)
+		{
+		  n0 = dividend_ptr[i];
+		  udiv_qrnnd (dummy, r, r,
+			      ((n1 << normalization_steps)
+			       | (n0 >> (BITS_PER_MP_LIMB - normalization_steps))),
+			      divisor_limb);
+		  n1 = n0;
+		}
+	      udiv_qrnnd (dummy, r, r,
+			  n1 << normalization_steps,
+			  divisor_limb);
+	      return r >> normalization_steps;
+	    }
+	}
+      /* No normalization needed, either because udiv_qrnnd doesn't require
+	 it, or because DIVISOR_LIMB is already normalized. */
+
+      i = dividend_size - 1;
+      r = dividend_ptr[i];
+
+      /* Same top-limb trick as in the pre-inverted branch above.  */
+      if (r >= divisor_limb)
+	r = 0;
+      else
+	i--;
+
+      for (; i >= 0; i--)
+	{
+	  n0 = dividend_ptr[i];
+	  udiv_qrnnd (dummy, r, r, n0, divisor_limb);
+	}
+      return r;
+    }
+}
diff --git a/rts/gmp/mpn/generic/mod_1_rs.c b/rts/gmp/mpn/generic/mod_1_rs.c
new file mode 100644
index 0000000000..62aaa94b92
--- /dev/null
+++ b/rts/gmp/mpn/generic/mod_1_rs.c
@@ -0,0 +1,111 @@
+/* mpn_mod_1_rshift -- mpn remainder under hypothetical right shift.
+
+ THE FUNCTION IN THIS FILE IS FOR INTERNAL USE AND HAS A MUTABLE
+ INTERFACE. IT IS ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.
+ IT'S ALMOST GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP
+ RELEASE. */
+
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* When testing on a CPU with UDIV_NEEDS_NORMALIZATION equal to 0, it can be
+ changed to 1 temporarily to test the code under that case too. */
+#if 0
+#undef UDIV_NEEDS_NORMALIZATION
+#define UDIV_NEEDS_NORMALIZATION 1
+#endif
+
+
+/* Calculate the remainder "(ptr,size >> shift) % divisor". Note ptr,size
+ is unchanged, the shift is only for its effect on the remainder.
+ The shift doesn't even need to be considered until the last limb.
+
+ This function has the normal size!=0 restriction, unlike the basic
+ mpn_mod_1. */
+
+mp_limb_t
+#if __STDC__
+mpn_mod_1_rshift (mp_srcptr ptr, mp_size_t size, unsigned shift,
+		  mp_limb_t divisor)
+#else
+mpn_mod_1_rshift (ptr, size, shift, divisor)
+     mp_srcptr ptr;
+     mp_size_t size;
+     unsigned shift;
+     mp_limb_t divisor;
+#endif
+{
+  mp_limb_t quot, rem;
+
+  ASSERT (shift >= 1);
+  ASSERT (shift < BITS_PER_MP_LIMB);
+  ASSERT (size >= 1);
+
+  if (size == 1)
+    return (ptr[0] >> shift) % divisor;
+
+#if UDIV_NEEDS_NORMALIZATION
+  {
+    int norm;
+    int delta;
+
+    count_leading_zeros (norm, divisor);
+    divisor <<= norm;
+
+    /* delta is the part of the hypothetical right shift remaining after
+       the divisor's normalizing left shift has been accounted for.  */
+    delta = shift - norm;
+    if (delta == 0)
+      return mpn_mod_1 (ptr, size, divisor) >> norm;
+
+    if (delta > 0)
+      {
+	/* Reduce limbs 1..size-1 first, then fold in the surviving high
+	   bits of ptr[0] with one extra division step.  */
+	rem = mpn_mod_1 (ptr+1, size-1, divisor);
+	udiv_qrnnd (quot, rem,
+		    rem >> delta,
+		    (rem << (BITS_PER_MP_LIMB-delta)) | (ptr[0] >> delta),
+		    divisor);
+	return rem >> norm;
+      }
+    else
+      {
+	/* delta < 0: the whole operand is reduced, then shifted up by
+	   -delta via one division step.  */
+	rem = mpn_mod_1 (ptr, size, divisor);
+	udiv_qrnnd (quot, rem,
+		    rem >> (BITS_PER_MP_LIMB+delta),
+		    rem << -delta,
+		    divisor);
+	return rem >> norm;
+      }
+  }
+
+#else /* !UDIV_NEEDS_NORMALIZATION */
+
+  /* No normalization: reduce limbs 1..size-1, then one division step
+     folds in ptr[0]'s bits above the shift point.  */
+  rem = mpn_mod_1 (ptr+1, size-1, divisor);
+  udiv_qrnnd (quot, rem,
+	      rem >> shift,
+	      (rem << (BITS_PER_MP_LIMB-shift)) | (ptr[0] >> shift),
+	      divisor);
+  return rem;
+
+#endif
+}
diff --git a/rts/gmp/mpn/generic/mul.c b/rts/gmp/mpn/generic/mul.c
new file mode 100644
index 0000000000..cecfa19ca1
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul.c
@@ -0,0 +1,190 @@
+/* mpn_mul -- Multiply two natural numbers.
+
+ THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul)
+ ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH
+ THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
+ THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
+ (pointed to by VP, with VN limbs), and store the result at PRODP. The
+ result is UN + VN limbs. Return the most significant limb of the result.
+
+ NOTE: The space pointed to by PRODP is overwritten before finished with U
+ and V, so overlap is an error.
+
+ Argument constraints:
+ 1. UN >= VN.
+ 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
+ the multiplier and the multiplicand. */
+
+/* Square the natural number {up,un}, storing the 2*un-limb result at
+   PRODP.  Chooses the algorithm by operand size: schoolbook
+   (mpn_sqr_basecase), Karatsuba (mpn_kara_sqr_n), Toom-3
+   (mpn_toom3_sqr_n), and — when WANT_FFT is compiled in — an FFT multiply
+   (mpn_mul_fft_full).  PRODP must not overlap UP.  */
+void
+#if __STDC__
+mpn_sqr_n (mp_ptr prodp,
+	   mp_srcptr up, mp_size_t un)
+#else
+mpn_sqr_n (prodp, up, un)
+     mp_ptr prodp;
+     mp_srcptr up;
+     mp_size_t un;
+#endif
+{
+  if (un < KARATSUBA_SQR_THRESHOLD)
+    { /* plain schoolbook multiplication */
+      if (un == 0)
+	return;
+      mpn_sqr_basecase (prodp, up, un);
+    }
+  else if (un < TOOM3_SQR_THRESHOLD)
+    { /* karatsuba multiplication */
+      mp_ptr tspace;
+      TMP_DECL (marker);
+      TMP_MARK (marker);
+      /* Scratch space for the recursion, sized per the Karatsuba/Toom
+	 requirements.  */
+      tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB);
+      mpn_kara_sqr_n (prodp, up, un, tspace);
+      TMP_FREE (marker);
+    }
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+  else if (un < FFT_SQR_THRESHOLD)
+#else
+  else
+#endif
+    { /* toom3 multiplication */
+      mp_ptr tspace;
+      TMP_DECL (marker);
+      TMP_MARK (marker);
+      tspace = (mp_ptr) TMP_ALLOC (2 * (un + BITS_PER_MP_LIMB) * BYTES_PER_MP_LIMB);
+      mpn_toom3_sqr_n (prodp, up, un, tspace);
+      TMP_FREE (marker);
+    }
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+  else
+    {
+      /* schoenhage multiplication */
+      mpn_mul_fft_full (prodp, up, un, up, un);
+    }
+#endif
+}
+
+mp_limb_t
+#if __STDC__
+mpn_mul (mp_ptr prodp,
+	 mp_srcptr up, mp_size_t un,
+	 mp_srcptr vp, mp_size_t vn)
+#else
+mpn_mul (prodp, up, un, vp, vn)
+     mp_ptr prodp;
+     mp_srcptr up;
+     mp_size_t un;
+     mp_srcptr vp;
+     mp_size_t vn;
+#endif
+{
+  mp_size_t l;
+  mp_limb_t c;
+
+  /* Identical operands: squaring has faster special-case code.  */
+  if (up == vp && un == vn)
+    {
+      mpn_sqr_n (prodp, up, un);
+      return prodp[2 * un - 1];
+    }
+
+  if (vn < KARATSUBA_MUL_THRESHOLD)
+    { /* long multiplication */
+      mpn_mul_basecase (prodp, up, un, vp, vn);
+      return prodp[un + vn - 1];
+    }
+
+  /* Multiply the low vn limbs of u by v in place, then handle the rest of
+     u in blocks of (at most) vn limbs, accumulating each partial product
+     into the result.  */
+  mpn_mul_n (prodp, up, vp, vn);
+  if (un != vn)
+    { mp_limb_t t;
+      mp_ptr ws;
+      TMP_DECL (marker);
+      TMP_MARK (marker);
+
+      prodp += vn;
+      l = vn;                     /* limbs of prodp still carrying data */
+      up += vn;
+      un -= vn;
+
+      if (un < vn)
+	{
+	  /* Swap u's and v's. */
+	  MPN_SRCPTR_SWAP (up,un, vp,vn);
+	}
+
+      ws = (mp_ptr) TMP_ALLOC (((vn >= KARATSUBA_MUL_THRESHOLD ? vn : un) + vn)
+			       * BYTES_PER_MP_LIMB);
+
+      /* t accumulates the carry out of the region summed so far.  */
+      t = 0;
+      while (vn >= KARATSUBA_MUL_THRESHOLD)
+	{
+	  mpn_mul_n (ws, up, vp, vn);
+	  if (l <= 2*vn)
+	    {
+	      t += mpn_add_n (prodp, prodp, ws, l);
+	      if (l != 2*vn)
+		{
+		  t = mpn_add_1 (prodp + l, ws + l, 2*vn - l, t);
+		  l = 2*vn;
+		}
+	    }
+	  else
+	    {
+	      c = mpn_add_n (prodp, prodp, ws, 2*vn);
+	      t += mpn_add_1 (prodp + 2*vn, prodp + 2*vn, l - 2*vn, c);
+	    }
+	  prodp += vn;
+	  l -= vn;
+	  up += vn;
+	  un -= vn;
+	  if (un < vn)
+	    {
+	      /* Swap u's and v's. */
+	      MPN_SRCPTR_SWAP (up,un, vp,vn);
+	    }
+	}
+
+      /* Final block, too small for Karatsuba: use the basecase.  */
+      if (vn)
+	{
+	  mpn_mul_basecase (ws, up, un, vp, vn);
+	  if (l <= un + vn)
+	    {
+	      t += mpn_add_n (prodp, prodp, ws, l);
+	      if (l != un + vn)
+		t = mpn_add_1 (prodp + l, ws + l, un + vn - l, t);
+	    }
+	  else
+	    {
+	      c = mpn_add_n (prodp, prodp, ws, un + vn);
+	      t += mpn_add_1 (prodp + un + vn, prodp + un + vn, l - un - vn, c);
+	    }
+	}
+
+      TMP_FREE (marker);
+    }
+  return prodp[un + vn - 1];
+}
diff --git a/rts/gmp/mpn/generic/mul_1.c b/rts/gmp/mpn/generic/mul_1.c
new file mode 100644
index 0000000000..1c36b5fb1f
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul_1.c
@@ -0,0 +1,59 @@
+/* mpn_mul_1 -- Multiply a limb vector with a single limb and
+ store the product in a second limb vector.
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Multiply {s1_ptr,s1_size} by the single limb s2_limb, store the low
+   s1_size limbs of the product at res_ptr, and return the most
+   significant (carry-out) limb.  res_ptr may equal s1_ptr.
+   NOTE: this is the K&R-only variant; s1_size must be >= 1 since the
+   do/while body executes at least once.  */
+mp_limb_t
+mpn_mul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+     register mp_ptr res_ptr;
+     register mp_srcptr s1_ptr;
+     mp_size_t s1_size;
+     register mp_limb_t s2_limb;
+{
+  register mp_limb_t cy_limb;
+  register mp_size_t j;
+  register mp_limb_t prod_high, prod_low;
+
+  /* The loop counter and index J goes from -S1_SIZE to -1.  This way
+     the loop becomes faster. */
+  j = -s1_size;
+
+  /* Offset the base pointers to compensate for the negative indices. */
+  s1_ptr -= j;
+  res_ptr -= j;
+
+  cy_limb = 0;
+  do
+    {
+      umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb);
+
+      /* Add the previous carry into the low product limb; if that wraps,
+	 (prod_low < cy_limb) contributes the ripple into the new carry. */
+      prod_low += cy_limb;
+      cy_limb = (prod_low < cy_limb) + prod_high;
+
+      res_ptr[j] = prod_low;
+    }
+  while (++j != 0);
+
+  return cy_limb;
+}
diff --git a/rts/gmp/mpn/generic/mul_basecase.c b/rts/gmp/mpn/generic/mul_basecase.c
new file mode 100644
index 0000000000..00c06aa5c4
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul_basecase.c
@@ -0,0 +1,87 @@
+/* mpn_mul_basecase -- Internal routine to multiply two natural numbers
+ of length m and n.
+
+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
+ SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Handle simple cases with traditional multiplication.
+
+   This is the most critical code of multiplication.  All multiplies rely
+   on this, both small and huge.  Small ones arrive here immediately, huge
+   ones arrive here as this is the base case for Karatsuba's recursive
+   algorithm.  */
+
+void
+#if __STDC__
+mpn_mul_basecase (mp_ptr prodp,
+		  mp_srcptr up, mp_size_t usize,
+		  mp_srcptr vp, mp_size_t vsize)
+#else
+mpn_mul_basecase (prodp, up, usize, vp, vsize)
+     mp_ptr prodp;
+     mp_srcptr up;
+     mp_size_t usize;
+     mp_srcptr vp;
+     mp_size_t vsize;
+#endif
+{
+  /* Multiply U by the low limb (or limb pair) of V first: those products
+     can be stored directly into PROD rather than added, which also avoids
+     an explicit zeroing loop.  */
+#if HAVE_NATIVE_mpn_mul_2
+  if (vsize < 2)
+    {
+      prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]);
+      return;
+    }
+  prodp[usize + 1] = mpn_mul_2 (prodp, up, usize, vp[0], vp[1]);
+  prodp += 2;
+  vp += 2;
+  vsize -= 2;
+#else
+  prodp[usize] = mpn_mul_1 (prodp, up, usize, vp[0]);
+  prodp += 1;
+  vp += 1;
+  vsize -= 1;
+#endif
+
+  /* Accumulate the remaining partial products into PROD, one limb (or
+     limb pair) of V per iteration.  */
+#if HAVE_NATIVE_mpn_addmul_2
+  for (; vsize >= 2; vsize -= 2)
+    {
+      prodp[usize + 1] = mpn_addmul_2 (prodp, up, usize, vp[0], vp[1]);
+      prodp += 2;
+      vp += 2;
+    }
+  if (vsize != 0)
+    prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]);
+#else
+  for (; vsize != 0; vsize -= 1)
+    {
+      prodp[usize] = mpn_addmul_1 (prodp, up, usize, vp[0]);
+      prodp += 1;
+      vp += 1;
+    }
+#endif
+}
diff --git a/rts/gmp/mpn/generic/mul_fft.c b/rts/gmp/mpn/generic/mul_fft.c
new file mode 100644
index 0000000000..00fd6d72de
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul_fft.c
@@ -0,0 +1,772 @@
+/* An implementation in GMP of Scho"nhage's fast multiplication algorithm
+ modulo 2^N+1, by Paul Zimmermann, INRIA Lorraine, February 1998.
+
+ THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND THE FUNCTIONS HAVE
+ MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
+ INTERFACES. IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN
+ A FUTURE GNU MP RELEASE.
+
+Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+/* References:
+
+ Schnelle Multiplikation grosser Zahlen, by Arnold Scho"nhage and Volker
+ Strassen, Computing 7, p. 281-292, 1971.
+
+ Asymptotically fast algorithms for the numerical multiplication
+ and division of polynomials with complex coefficients, by Arnold Scho"nhage,
+ Computer Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982.
+
+ Tapes versus Pointers, a study in implementing fast algorithms,
+ by Arnold Scho"nhage, Bulletin of the EATCS, 30, p. 23-32, 1986.
+
+ See also http://www.loria.fr/~zimmerma/bignum
+
+
+ Future:
+
+ K==2 isn't needed in the current uses of this code and the bits specific
+ for that could be dropped.
+
+ It might be possible to avoid a small number of MPN_COPYs by using a
+ rotating temporary or two.
+
+ Multiplications of unequal sized operands can be done with this code, but
+ it needs a tighter test for identifying squaring (same sizes as well as
+ same pointers). */
+
+
+#include <stdio.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+/* Change this to "#define TRACE(x) x" for some traces. */
+#define TRACE(x)
+
+
+
+/* Size thresholds consulted by mpn_fft_best_k to choose the FFT size
+   parameter k; row 0 is for multiplication, row 1 for squaring.  */
+FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] = {
+  FFT_MUL_TABLE,
+  FFT_SQR_TABLE
+};
+
+
+static void mpn_mul_fft_internal
+_PROTO ((mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
+ int k, int K,
+ mp_limb_t **Ap, mp_limb_t **Bp,
+ mp_limb_t *A, mp_limb_t *B,
+ mp_size_t nprime, mp_size_t l, mp_size_t Mp, int **_fft_l,
+ mp_limb_t *T, int rec));
+
+
+/* Find the best k to use for a mod 2^(n*BITS_PER_MP_LIMB)+1 FFT.
+   sqr==0 if for a multiply, sqr==1 for a square.  */
+int
+#if __STDC__
+mpn_fft_best_k (mp_size_t n, int sqr)
+#else
+mpn_fft_best_k (n, sqr)
+     mp_size_t n;
+     int sqr;
+#endif
+{
+  int i;
+
+  /* Scan the tuning table; the first threshold exceeding n fixes k.  */
+  i = 0;
+  while (mpn_fft_table[sqr][i] != 0)
+    {
+      if (n < mpn_fft_table[sqr][i])
+        return i + FFT_FIRST_K;
+      i++;
+    }
+
+  /* Past the end of the table: treat 4*last entry as one further entry.  */
+  if (i == 0 || n < 4 * mpn_fft_table[sqr][i - 1])
+    return i + FFT_FIRST_K;
+  return i + FFT_FIRST_K + 1;
+}
+
+
+/* Return the smallest possible number of limbs >= pl for an FFT of size
+   2^k: pl is rounded up so that 2^k evenly divides the bit count and each
+   of the 2^k pieces is a whole number of limbs.  */
+
+mp_size_t
+#if __STDC__
+mpn_fft_next_size (mp_size_t pl, int k)
+#else
+mpn_fft_next_size (pl, k)
+     mp_size_t pl;
+     int k;
+#endif
+{
+  mp_size_t N, M;
+  int K;
+
+  N = pl * BITS_PER_MP_LIMB;
+  K = 1 << k;
+  /* Round the total bit count up to a multiple of K.  */
+  if (N % K != 0)
+    N = (N / K + 1) * K;
+  /* Each of the K pieces must be limb-aligned as well.  */
+  M = N / K;
+  if (M % BITS_PER_MP_LIMB != 0)
+    N = (M / BITS_PER_MP_LIMB + 1) * BITS_PER_MP_LIMB * K;
+  return N / BITS_PER_MP_LIMB;
+}
+
+
+/* Fill l[0..k] with index permutation tables for the FFT butterflies;
+   table i has 2^i entries and is built from table i-1.  */
+static void
+#if __STDC__
+mpn_fft_initl (int **l, int k)
+#else
+mpn_fft_initl (l, k)
+     int **l;
+     int k;
+#endif
+{
+  int i, j, K;
+
+  l[0][0] = 0;
+  /* Each table doubles the previous one: the first half holds the doubled
+     entries, the second half the doubled entries plus one.  */
+  for (i = 1, K = 2; i <= k; i++, K *= 2)
+    {
+      for (j = 0; j < K / 2; j++)
+        {
+          l[i][j] = 2 * l[i - 1][j];
+          l[i][K / 2 + j] = 1 + l[i][j];
+        }
+    }
+}
+
+
+/* a <- -a mod 2^(n*BITS_PER_MP_LIMB)+1 */
+static void
+#if __STDC__
+mpn_fft_neg_modF (mp_limb_t *ap, mp_size_t n)
+#else
+mpn_fft_neg_modF (ap, n)
+     mp_limb_t *ap;
+     mp_size_t n;
+#endif
+{
+  mp_limb_t cy;
+
+  /* Complement the n low limbs, clear the top limb, then add back
+     ap[n]+2 to complete the negation modulo 2^N+1.  */
+  cy = ap[n] + 2;
+  mpn_com_n (ap, ap, n);
+  ap[n] = 0;
+  mpn_incr_u (ap, cy);
+}
+
+
+/* a <- a*2^e mod 2^(n*BITS_PER_MP_LIMB)+1 */
+/* ap holds n+1 limbs; tp is n+1 limbs of scratch.  e may exceed
+   n*BITS_PER_MP_LIMB; the final negation handles the sign flip from
+   full wrap-arounds (2^(n*BITS_PER_MP_LIMB) == -1 here).  */
+static void
+#if __STDC__
+mpn_fft_mul_2exp_modF(mp_limb_t *ap, int e, mp_size_t n, mp_limb_t *tp)
+#else
+mpn_fft_mul_2exp_modF(ap, e, n, tp)
+     mp_limb_t *ap;
+     int e;
+     mp_size_t n;
+     mp_limb_t *tp;
+#endif
+{
+  int d, sh, i; mp_limb_t cc;
+
+  d = e%(n*BITS_PER_MP_LIMB); /* 2^e = (+/-) 2^d */
+  sh = d % BITS_PER_MP_LIMB;
+  /* First do the sub-limb part of the shift into the scratch area.  */
+  if (sh) mpn_lshift(tp, ap, n+1, sh); /* no carry here */
+  else MPN_COPY(tp, ap, n+1);
+  d /= BITS_PER_MP_LIMB; /* now shift of d limbs to the left */
+  if (d) {
+    /* ap[d..n-1] = tp[0..n-d-1], ap[0..d-1] = -tp[n-d..n-1] */
+    /* mpn_xor would be more efficient here */
+    for (i=d-1;i>=0;i--) ap[i] = ~tp[n-d+i];
+    cc = 1-mpn_add_1(ap, ap, d, 1);
+    if (cc) cc=mpn_sub_1(ap+d, tp, n-d, 1);
+    else MPN_COPY(ap+d, tp, n-d);
+    /* Fold in the top limb of the shifted value, propagating the
+       borrow/carry so the result stays reduced mod 2^N+1.  */
+    if (cc+=mpn_sub_1(ap+d, ap+d, n-d, tp[n]))
+      ap[n]=mpn_add_1(ap, ap, n, cc);
+    else ap[n]=0;
+  }
+  else if ((ap[n]=mpn_sub_1(ap, tp, n, tp[n]))) {
+    ap[n]=mpn_add_1(ap, ap, n, 1);
+  }
+  /* An odd number of full wrap-arounds negates the result.  */
+  if ((e/(n*BITS_PER_MP_LIMB))%2) mpn_fft_neg_modF(ap, n);
+}
+
+
+/* a <- a+b mod 2^(n*BITS_PER_MP_LIMB)+1 */
+static void
+#if __STDC__
+mpn_fft_add_modF (mp_limb_t *ap, mp_limb_t *bp, int n)
+#else
+mpn_fft_add_modF (ap, bp, n)
+     mp_limb_t *ap,*bp;
+     int n;
+#endif
+{
+  mp_limb_t cy;
+
+  /* Add the full (n+1)-limb values; cy collects the top-limb sum plus
+     the carry out of the low limbs.  */
+  cy = ap[n] + bp[n] + mpn_add_n (ap, ap, bp, n);
+  if (cy > 1)
+    {
+      /* Wrap the overflow around: subtract 2^N+1.  */
+      cy -= 1 + mpn_sub_1 (ap, ap, n, 1);
+    }
+  ap[n] = cy;
+}
+
+
+/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
+   N=n*BITS_PER_MP_LIMB
+   2^omega is a primitive root mod 2^N+1
+   output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1
+   Squaring variant: transforms a single coefficient array in place.
+   tp is n+1 limbs of scratch.  */
+
+static void
+#if __STDC__
+mpn_fft_fft_sqr (mp_limb_t **Ap, mp_size_t K, int **ll,
+                 mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp)
+#else
+mpn_fft_fft_sqr(Ap,K,ll,omega,n,inc,tp)
+mp_limb_t **Ap,*tp;
+mp_size_t K,omega,n,inc;
+int **ll;
+#endif
+{
+  if (K==2) {
+    /* Size-2 butterfly: (A[0],A[inc]) <- (A[0]+A[inc], A[0]-A[inc]).
+       NOTE: the `if' condition is inside the #ifdef; the statement after
+       #endif is the shared borrow fix-up body for both variants.  */
+#ifdef ADDSUB
+    if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1)
+#else
+    MPN_COPY(tp, Ap[0], n+1);
+    mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1);
+    if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1))
+#endif
+      Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1);
+  }
+  else {
+    int j, inc2=2*inc;
+    int *lk = *ll;
+    mp_limb_t *tmp;
+    TMP_DECL(marker);
+
+    TMP_MARK(marker);
+    tmp = TMP_ALLOC_LIMBS (n+1);
+    /* Recurse on the even- and odd-indexed halves with doubled stride
+       and doubled root exponent.  */
+    mpn_fft_fft_sqr(Ap, K/2,ll-1,2*omega,n,inc2, tp);
+    mpn_fft_fft_sqr(Ap+inc, K/2,ll-1,2*omega,n,inc2, tp);
+    /* A[2*j*inc]   <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
+       A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
+    for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc) {
+      MPN_COPY(tp, Ap[inc], n+1);
+      mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp);
+      mpn_fft_add_modF(Ap[inc], Ap[0], n);
+      mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
+      mpn_fft_add_modF(Ap[0], tp, n);
+    }
+    TMP_FREE(marker);
+  }
+}
+
+
+/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
+   N=n*BITS_PER_MP_LIMB
+   2^omega is a primitive root mod 2^N+1
+   output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1
+   Two-operand variant: applies the same transform to both the A and B
+   coefficient arrays in one pass.  tp is n+1 limbs of scratch.  */
+
+static void
+#if __STDC__
+mpn_fft_fft (mp_limb_t **Ap, mp_limb_t **Bp, mp_size_t K, int **ll,
+             mp_size_t omega, mp_size_t n, mp_size_t inc, mp_limb_t *tp)
+#else
+mpn_fft_fft(Ap,Bp,K,ll,omega,n,inc,tp)
+  mp_limb_t **Ap,**Bp,*tp;
+  mp_size_t K,omega,n,inc;
+  int **ll;
+#endif
+{
+  if (K==2) {
+    /* Size-2 butterflies on A and then B.  NOTE: in each pair the `if'
+       condition is inside the #ifdef and the statement after #endif is
+       the shared borrow fix-up body for both variants.  */
+#ifdef ADDSUB
+    if (mpn_addsub_n(Ap[0], Ap[inc], Ap[0], Ap[inc], n+1) & 1)
+#else
+    MPN_COPY(tp, Ap[0], n+1);
+    mpn_add_n(Ap[0], Ap[0], Ap[inc],n+1);
+    if (mpn_sub_n(Ap[inc], tp, Ap[inc],n+1))
+#endif
+      Ap[inc][n] = mpn_add_1(Ap[inc], Ap[inc], n, 1);
+#ifdef ADDSUB
+    if (mpn_addsub_n(Bp[0], Bp[inc], Bp[0], Bp[inc], n+1) & 1)
+#else
+    MPN_COPY(tp, Bp[0], n+1);
+    mpn_add_n(Bp[0], Bp[0], Bp[inc],n+1);
+    if (mpn_sub_n(Bp[inc], tp, Bp[inc],n+1))
+#endif
+      Bp[inc][n] = mpn_add_1(Bp[inc], Bp[inc], n, 1);
+  }
+  else {
+    int j, inc2=2*inc;
+    int *lk=*ll;
+    mp_limb_t *tmp;
+    TMP_DECL(marker);
+
+    TMP_MARK(marker);
+    tmp = TMP_ALLOC_LIMBS (n+1);
+    /* Recurse on even- and odd-indexed halves of both arrays.  */
+    mpn_fft_fft(Ap, Bp, K/2,ll-1,2*omega,n,inc2, tp);
+    mpn_fft_fft(Ap+inc, Bp+inc, K/2,ll-1,2*omega,n,inc2, tp);
+    /* A[2*j*inc]   <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
+       A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
+    for (j=0;j<K/2;j++,lk+=2,Ap+=2*inc,Bp+=2*inc) {
+      MPN_COPY(tp, Ap[inc], n+1);
+      mpn_fft_mul_2exp_modF(Ap[inc], lk[1]*omega, n, tmp);
+      mpn_fft_add_modF(Ap[inc], Ap[0], n);
+      mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
+      mpn_fft_add_modF(Ap[0], tp, n);
+      MPN_COPY(tp, Bp[inc], n+1);
+      mpn_fft_mul_2exp_modF(Bp[inc], lk[1]*omega, n, tmp);
+      mpn_fft_add_modF(Bp[inc], Bp[0], n);
+      mpn_fft_mul_2exp_modF(tp,lk[0]*omega, n, tmp);
+      mpn_fft_add_modF(Bp[0], tp, n);
+    }
+    TMP_FREE(marker);
+  }
+}
+
+
+/* a[i] <- a[i]*b[i] mod 2^(n*BITS_PER_MP_LIMB)+1 for 0 <= i < K */
+/* Each a[i]/b[i] is n+1 limbs.  When ap == bp the products are squares.
+   Large n recurses into mpn_mul_fft_internal; small n uses a plain
+   product followed by an explicit reduction mod 2^N+1.  */
+static void
+#if __STDC__
+mpn_fft_mul_modF_K (mp_limb_t **ap, mp_limb_t **bp, mp_size_t n, int K)
+#else
+mpn_fft_mul_modF_K(ap, bp, n, K)
+     mp_limb_t **ap, **bp;
+     mp_size_t n;
+     int K;
+#endif
+{
+  int i;
+  int sqr = (ap == bp);
+  TMP_DECL(marker);
+
+  TMP_MARK(marker);
+
+  if (n >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) {
+    /* Recursive case: set up a smaller FFT (size 2^k) and use it for
+       each of the K pointwise products.  */
+    int k, K2,nprime2,Nprime2,M2,maxLK,l,Mp2;
+    int **_fft_l;
+    mp_limb_t **Ap,**Bp,*A,*B,*T;
+
+    k = mpn_fft_best_k (n, sqr);
+    K2 = 1<<k;
+    maxLK = (K2>BITS_PER_MP_LIMB) ? K2 : BITS_PER_MP_LIMB;
+    M2 = n*BITS_PER_MP_LIMB/K2;
+    l = n/K2;
+    Nprime2 = ((2*M2+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M2+k+3)/maxLK)*maxLK*/
+    nprime2 = Nprime2/BITS_PER_MP_LIMB;
+    Mp2 = Nprime2/K2;
+
+    Ap = TMP_ALLOC_MP_PTRS (K2);
+    Bp = TMP_ALLOC_MP_PTRS (K2);
+    A = TMP_ALLOC_LIMBS (2*K2*(nprime2+1));
+    T = TMP_ALLOC_LIMBS (nprime2+1);
+    B = A + K2*(nprime2+1);
+    _fft_l = TMP_ALLOC_TYPE (k+1, int*);
+    for (i=0;i<=k;i++)
+      _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
+    mpn_fft_initl(_fft_l, k);
+
+    TRACE (printf("recurse: %dx%d limbs -> %d times %dx%d (%1.2f)\n", n,
+                  n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2));
+
+    for (i=0;i<K;i++,ap++,bp++)
+      mpn_mul_fft_internal(*ap, *ap, *bp, n, k, K2, Ap, Bp, A, B, nprime2,
+                           l, Mp2, _fft_l, T, 1);
+  }
+  else {
+    /* Base case: full product into 2n limbs of scratch, then reduce
+       mod 2^N+1, taking the top limbs a[n]/b[n] into account.  */
+    mp_limb_t *a, *b, cc, *tp, *tpn; int n2=2*n;
+    tp = TMP_ALLOC_LIMBS (n2);
+    tpn = tp+n;
+    TRACE (printf (" mpn_mul_n %d of %d limbs\n", K, n));
+    for (i=0;i<K;i++) {
+      a = *ap++; b=*bp++;
+      if (sqr)
+        mpn_sqr_n(tp, a, n);
+      else
+        mpn_mul_n(tp, b, a, n);
+      if (a[n]) cc=mpn_add_n(tpn, tpn, b, n); else cc=0;
+      if (b[n]) cc += mpn_add_n(tpn, tpn, a, n) + a[n];
+      if (cc) {
+        cc = mpn_add_1(tp, tp, n2, cc);
+        ASSERT_NOCARRY (mpn_add_1(tp, tp, n2, cc));
+      }
+      /* a <- low - high mod 2^N+1, with wrap-around on borrow.  */
+      a[n] = mpn_sub_n(a, tp, tpn, n) && mpn_add_1(a, a, n, 1);
+    }
+  }
+  TMP_FREE(marker);
+}
+
+
+/* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]]
+   output: K*A[0] K*A[K-1] ... K*A[1]
+   Inverse transform; the factor K is divided out later by
+   mpn_fft_div_2exp_modF.  tp is n+1 limbs of scratch.  */
+
+static void
+#if __STDC__
+mpn_fft_fftinv (mp_limb_t **Ap, int K, mp_size_t omega, mp_size_t n,
+                mp_limb_t *tp)
+#else
+mpn_fft_fftinv(Ap,K,omega,n,tp)
+     mp_limb_t **Ap, *tp;
+     int K;
+     mp_size_t omega, n;
+#endif
+{
+  if (K==2) {
+    /* Size-2 butterfly.  NOTE: the `if' condition is inside the #ifdef;
+       the statement after #endif is the shared borrow fix-up body.  */
+#ifdef ADDSUB
+    if (mpn_addsub_n(Ap[0], Ap[1], Ap[0], Ap[1], n+1) & 1)
+#else
+    MPN_COPY(tp, Ap[0], n+1);
+    mpn_add_n(Ap[0], Ap[0], Ap[1], n+1);
+    if (mpn_sub_n(Ap[1], tp, Ap[1], n+1))
+#endif
+      Ap[1][n] = mpn_add_1(Ap[1], Ap[1], n, 1);
+  }
+  else {
+    int j, K2=K/2; mp_limb_t **Bp=Ap+K2, *tmp;
+    TMP_DECL(marker);
+
+    TMP_MARK(marker);
+    tmp = TMP_ALLOC_LIMBS (n+1);
+    mpn_fft_fftinv(Ap, K2, 2*omega, n, tp);
+    mpn_fft_fftinv(Bp, K2, 2*omega, n, tp);
+    /* A[j]     <- A[j] + omega^j A[j+K/2]
+       A[j+K/2] <- A[j] + omega^(j+K/2) A[j+K/2] */
+    for (j=0;j<K2;j++,Ap++,Bp++) {
+      MPN_COPY(tp, Bp[0], n+1);
+      mpn_fft_mul_2exp_modF(Bp[0], (j+K2)*omega, n, tmp);
+      mpn_fft_add_modF(Bp[0], Ap[0], n);
+      mpn_fft_mul_2exp_modF(tp, j*omega, n, tmp);
+      mpn_fft_add_modF(Ap[0], tp, n);
+    }
+    TMP_FREE(marker);
+  }
+}
+
+
+/* A <- A/2^k mod 2^(n*BITS_PER_MP_LIMB)+1 */
+/* Division by 2^k is done as multiplication by its modular inverse,
+   which is a power of two; tp is n+1 limbs of scratch.  */
+static void
+#if __STDC__
+mpn_fft_div_2exp_modF (mp_limb_t *ap, int k, mp_size_t n, mp_limb_t *tp)
+#else
+mpn_fft_div_2exp_modF(ap,k,n,tp)
+     mp_limb_t *ap,*tp;
+     int k;
+     mp_size_t n;
+#endif
+{
+  int i;
+
+  i = 2*n*BITS_PER_MP_LIMB;
+  i = (i-k) % i;
+  mpn_fft_mul_2exp_modF(ap,i,n,tp);
+  /* 1/2^k = 2^(2nL-k) mod 2^(n*BITS_PER_MP_LIMB)+1 */
+  /* normalize so that A < 2^(n*BITS_PER_MP_LIMB)+1 */
+  if (ap[n]==1) {
+    /* Top limb set: if any low limb is non-zero the value is above the
+       modulus, so subtract 2^N+1 (clear top, decrement low part).  */
+    for (i=0;i<n && ap[i]==0;i++);
+    if (i<n) {
+      ap[n]=0;
+      mpn_sub_1(ap, ap, n, 1);
+    }
+  }
+}
+
+
+/* R <- A mod 2^(n*BITS_PER_MP_LIMB)+1, n<=an<=3*n */
+/* rp gets n+1 limbs.  The reduction uses 2^(n*BITS_PER_MP_LIMB) == -1:
+   limbs an..2n-1 are subtracted and limbs 2n.. are added back.  */
+static void
+#if __STDC__
+mpn_fft_norm_modF(mp_limb_t *rp, mp_limb_t *ap, mp_size_t n, mp_size_t an)
+#else
+mpn_fft_norm_modF(rp, ap, n, an)
+     mp_limb_t *rp;
+     mp_limb_t *ap;
+     mp_size_t n;
+     mp_size_t an;
+#endif
+{
+  mp_size_t l;
+
+  if (an>2*n) {
+    l = n;
+    /* rp <- low n limbs plus the an-2n limbs above 2n.  */
+    rp[n] = mpn_add_1(rp+an-2*n, ap+an-2*n, 3*n-an,
+                      mpn_add_n(rp,ap,ap+2*n,an-2*n));
+  }
+  else {
+    l = an-n;
+    MPN_COPY(rp, ap, n);
+    rp[n]=0;
+  }
+  /* Subtract the middle l limbs; on borrow wrap around mod 2^N+1.  */
+  if (mpn_sub_n(rp,rp,ap+n,l)) {
+    if (mpn_sub_1(rp+l,rp+l,n+1-l,1))
+      rp[n]=mpn_add_1(rp,rp,n,1);
+  }
+}
+
+
+/* Core of the FFT multiply: op <- n*m mod 2^(pl*BITS_PER_MP_LIMB)+1,
+   using a length-K (= 2^k) transform with l-limb input pieces and
+   (nprime+1)-limb transform elements.  Ap/Bp/A/B/T are caller-supplied
+   work areas; when rec is non-zero the inputs are first decomposed into
+   the Ap[i]/Bp[i] pieces here (the top-level caller decomposes itself).  */
+static void
+#if __STDC__
+mpn_mul_fft_internal(mp_limb_t *op, mp_srcptr n, mp_srcptr m, mp_size_t pl,
+                     int k, int K,
+                     mp_limb_t **Ap, mp_limb_t **Bp,
+                     mp_limb_t *A, mp_limb_t *B,
+                     mp_size_t nprime, mp_size_t l, mp_size_t Mp,
+                     int **_fft_l,
+                     mp_limb_t *T, int rec)
+#else
+mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,rec)
+     mp_limb_t *op;
+     mp_srcptr n, m;
+     mp_limb_t **Ap,**Bp,*A,*B,*T;
+     mp_size_t pl,nprime;
+     int **_fft_l;
+     int k,K,l,Mp,rec;
+#endif
+{
+  int i, sqr, pla, lo, sh, j;
+  mp_limb_t *p;
+
+  sqr = (n==m);
+
+  TRACE (printf ("pl=%d k=%d K=%d np=%d l=%d Mp=%d rec=%d sqr=%d\n",
+                 pl,k,K,nprime,l,Mp,rec,sqr));
+
+  /* decomposition of inputs into arrays Ap[i] and Bp[i] */
+  if (rec) for (i=0;i<K;i++) {
+    Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1);
+    /* store the next M bits of n into A[i] */
+    /* supposes that M is a multiple of BITS_PER_MP_LIMB */
+    MPN_COPY(Ap[i], n, l); n+=l; MPN_ZERO(Ap[i]+l, nprime+1-l);
+    /* set most significant bits of n and m (important in recursive calls) */
+    if (i==K-1) Ap[i][l]=n[0];
+    mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T);
+    if (!sqr) {
+      MPN_COPY(Bp[i], m, l); m+=l; MPN_ZERO(Bp[i]+l, nprime+1-l);
+      if (i==K-1) Bp[i][l]=m[0];
+      mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T);
+    }
+  }
+
+  /* direct fft's */
+  if (sqr) mpn_fft_fft_sqr(Ap,K,_fft_l+k,2*Mp,nprime,1, T);
+  else mpn_fft_fft(Ap,Bp,K,_fft_l+k,2*Mp,nprime,1, T);
+
+  /* term to term multiplications */
+  mpn_fft_mul_modF_K(Ap, (sqr) ? Ap : Bp, nprime, K);
+
+  /* inverse fft's */
+  mpn_fft_fftinv(Ap, K, 2*Mp, nprime, T);
+
+  /* division of terms after inverse fft */
+  for (i=0;i<K;i++) mpn_fft_div_2exp_modF(Ap[i],k+((K-i)%K)*Mp,nprime, T);
+
+  /* addition of terms in result p */
+  MPN_ZERO(T,nprime+1);
+  pla = l*(K-1)+nprime+1; /* number of required limbs for p */
+  p = B; /* B has K*(n'+1) limbs, which is >= pla, i.e. enough */
+  MPN_ZERO(p, pla);
+  /* NOTE: from here on `sqr' is reused as the (signed) carry
+     accumulator, no longer the squaring flag.  */
+  sqr=0; /* will accumulate the (signed) carry at p[pla] */
+  for (i=K-1,lo=l*i+nprime,sh=l*i;i>=0;i--,lo-=l,sh-=l) {
+    mp_ptr n = p+sh;
+    j = (K-i)%K;
+    if (mpn_add_n(n,n,Ap[j],nprime+1))
+      sqr += mpn_add_1(n+nprime+1,n+nprime+1,pla-sh-nprime-1,1);
+    T[2*l]=i+1; /* T = (i+1)*2^(2*M) */
+    if (mpn_cmp(Ap[j],T,nprime+1)>0) { /* subtract 2^N'+1 */
+      sqr -= mpn_sub_1(n,n,pla-sh,1);
+      sqr -= mpn_sub_1(p+lo,p+lo,pla-lo,1);
+    }
+  }
+  /* Resolve the accumulated carry (-1, 0 or +1) into p.  */
+  if (sqr==-1) {
+    if ((sqr=mpn_add_1(p+pla-pl,p+pla-pl,pl,1))) {
+      /* p[pla-pl]...p[pla-1] are all zero */
+      mpn_sub_1(p+pla-pl-1,p+pla-pl-1,pl+1,1);
+      mpn_sub_1(p+pla-1,p+pla-1,1,1);
+    }
+  }
+  else if (sqr==1) {
+    if (pla>=2*pl)
+      while ((sqr=mpn_add_1(p+pla-2*pl,p+pla-2*pl,2*pl,sqr)));
+    else {
+      sqr = mpn_sub_1(p+pla-pl,p+pla-pl,pl,sqr);
+      ASSERT (sqr == 0);
+    }
+  }
+  else
+    ASSERT (sqr == 0);
+
+  /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ]
+     < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ]
+     < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */
+  mpn_fft_norm_modF(op,p,pl,pla);
+}
+
+
+/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*BITS_PER_MP_LIMB
+   n and m have respectively nl and ml limbs
+   op must have space for pl+1 limbs
+   One must have pl = mpn_fft_next_size(pl, k).
+*/
+
+void
+#if __STDC__
+mpn_mul_fft (mp_ptr op, mp_size_t pl,
+             mp_srcptr n, mp_size_t nl,
+             mp_srcptr m, mp_size_t ml,
+             int k)
+#else
+mpn_mul_fft (op, pl, n, nl, m, ml, k)
+     mp_ptr op;
+     mp_size_t pl;
+     mp_srcptr n;
+     mp_size_t nl;
+     mp_srcptr m;
+     mp_size_t ml;
+     int k;
+#endif
+{
+  int K,maxLK,i,j;
+  mp_size_t N,Nprime,nprime,M,Mp,l;
+  mp_limb_t **Ap,**Bp,*A,*T,*B;
+  int **_fft_l;
+  int sqr = (n==m && nl==ml);
+  TMP_DECL(marker);
+
+  TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n",
+                 pl, nl, ml, k));
+  ASSERT_ALWAYS (mpn_fft_next_size(pl, k) == pl);
+
+  TMP_MARK(marker);
+  N = pl*BITS_PER_MP_LIMB;
+  _fft_l = TMP_ALLOC_TYPE (k+1, int*);
+  for (i=0;i<=k;i++)
+    _fft_l[i] = TMP_ALLOC_TYPE (1<<i, int);
+  mpn_fft_initl(_fft_l, k);
+  K = 1<<k;
+  M = N/K; /* N = 2^k M */
+  l = M/BITS_PER_MP_LIMB;
+  maxLK = (K>BITS_PER_MP_LIMB) ? K : BITS_PER_MP_LIMB;
+
+  /* Element size N' for the pointwise products: large enough to hold a
+     2M-bit product plus carries, rounded to a usable FFT/limb size.  */
+  Nprime = ((2*M+k+2+maxLK)/maxLK)*maxLK; /* ceil((2*M+k+3)/maxLK)*maxLK; */
+  nprime = Nprime/BITS_PER_MP_LIMB;
+  TRACE (printf ("N=%d K=%d, M=%d, l=%d, maxLK=%d, Np=%d, np=%d\n",
+                 N, K, M, l, maxLK, Nprime, nprime));
+  if (nprime >= (sqr ? FFT_MODF_SQR_THRESHOLD : FFT_MODF_MUL_THRESHOLD)) {
+    /* The pointwise products will recurse: round N' up so the recursive
+       level's constraints are met as well.  */
+    maxLK = (1<<mpn_fft_best_k(nprime,n==m))*BITS_PER_MP_LIMB;
+    if (Nprime % maxLK) {
+      Nprime=((Nprime/maxLK)+1)*maxLK;
+      nprime = Nprime/BITS_PER_MP_LIMB;
+    }
+    TRACE (printf ("new maxLK=%d, Np=%d, np=%d\n", maxLK, Nprime, nprime));
+  }
+
+  T = TMP_ALLOC_LIMBS (nprime+1);
+  Mp = Nprime/K;
+
+  TRACE (printf("%dx%d limbs -> %d times %dx%d limbs (%1.2f)\n",
+                pl,pl,K,nprime,nprime,2.0*(double)N/Nprime/K);
+         printf(" temp space %ld\n", 2*K*(nprime+1)));
+
+  A = _MP_ALLOCATE_FUNC_LIMBS (2*K*(nprime+1));
+  B = A+K*(nprime+1);
+  Ap = TMP_ALLOC_MP_PTRS (K);
+  Bp = TMP_ALLOC_MP_PTRS (K);
+  /* special decomposition for main call */
+  for (i=0;i<K;i++) {
+    Ap[i] = A+i*(nprime+1); Bp[i] = B+i*(nprime+1);
+    /* store the next M bits of n into A[i] */
+    /* supposes that M is a multiple of BITS_PER_MP_LIMB */
+    if (nl>0) {
+      j = (nl>=l) ? l : nl; /* limbs to store in Ap[i] */
+      MPN_COPY(Ap[i], n, j); n+=l; MPN_ZERO(Ap[i]+j, nprime+1-j);
+      mpn_fft_mul_2exp_modF(Ap[i], i*Mp, nprime, T);
+    }
+    else MPN_ZERO(Ap[i], nprime+1);
+    nl -= l;
+    if (n!=m) {
+      if (ml>0) {
+        j = (ml>=l) ? l : ml; /* limbs to store in Bp[i] */
+        MPN_COPY(Bp[i], m, j); m+=l; MPN_ZERO(Bp[i]+j, nprime+1-j);
+        mpn_fft_mul_2exp_modF(Bp[i], i*Mp, nprime, T);
+      }
+      else MPN_ZERO(Bp[i], nprime+1);
+    }
+    ml -= l;
+  }
+  /* rec==0: the decomposition above replaces the one done internally.  */
+  mpn_mul_fft_internal(op,n,m,pl,k,K,Ap,Bp,A,B,nprime,l,Mp,_fft_l,T,0);
+  TMP_FREE(marker);
+  _MP_FREE_FUNC_LIMBS (A, 2*K*(nprime+1));
+}
+
+
+#if WANT_ASSERT
+/* Return non-zero iff the n limbs at p are all zero.
+   Compiled only for assertion checking.  */
+static int
+#if __STDC__
+mpn_zero_p (mp_ptr p, mp_size_t n)
+#else
+mpn_zero_p (p, n)
+     mp_ptr p;
+     mp_size_t n;
+#endif
+{
+  mp_size_t i;
+
+  for (i = 0; i < n; i++)
+    if (p[i] != 0)
+      return 0;
+
+  return 1;
+}
+#endif
+
+
+/* Multiply {n,nl}*{m,ml} and write the result to {op,nl+ml}.
+
+   The operands are padded up to a length suitable for mpn_mul_fft and
+   the significant part of the product is copied back out.
+
+   FIXME: Duplicating the result like this is wasteful, do something better
+   perhaps at the norm_modF stage above. */
+
+void
+#if __STDC__
+mpn_mul_fft_full (mp_ptr op,
+		  mp_srcptr n, mp_size_t nl,
+		  mp_srcptr m, mp_size_t ml)
+#else
+mpn_mul_fft_full (op, n, nl, m, ml)
+     mp_ptr op;
+     mp_srcptr n;
+     mp_size_t nl;
+     mp_srcptr m;
+     mp_size_t ml;
+#endif
+{
+  mp_ptr pad_op;
+  mp_size_t pl;
+  int k;
+  int sqr = (n == m && nl == ml);
+
+  /* Choose the FFT size parameter and the padded product length.  */
+  k = mpn_fft_best_k (nl + ml, sqr);
+  pl = mpn_fft_next_size (nl + ml, k);
+
+  TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl=%ld k=%d\n",
+                 nl, ml, pl, k));
+
+  /* Since pl >= nl+ml, the product mod 2^(pl*BITS_PER_MP_LIMB)+1 is the
+     full product, unreduced; the limbs above nl+ml must come out zero.  */
+  pad_op = _MP_ALLOCATE_FUNC_LIMBS (pl + 1);
+  mpn_mul_fft (pad_op, pl, n, nl, m, ml, k);
+
+  ASSERT (mpn_zero_p (pad_op + nl + ml, pl + 1 - (nl + ml)));
+  MPN_COPY (op, pad_op, nl + ml);
+
+  _MP_FREE_FUNC_LIMBS (pad_op, pl + 1);
+}
diff --git a/rts/gmp/mpn/generic/mul_n.c b/rts/gmp/mpn/generic/mul_n.c
new file mode 100644
index 0000000000..b7563be2d3
--- /dev/null
+++ b/rts/gmp/mpn/generic/mul_n.c
@@ -0,0 +1,1343 @@
+/* mpn_mul_n and helper function -- Multiply/square natural numbers.
+
+ THE HELPER FUNCTIONS IN THIS FILE (meaning everything except mpn_mul_n)
+ ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY SAFE TO REACH
+ THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
+ THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Multiplicative inverse of 3, modulo 2^BITS_PER_MP_LIMB.
+ 0xAAAAAAAB for 32 bits, 0xAAAAAAAAAAAAAAAB for 64 bits. */
+#define INVERSE_3 ((MP_LIMB_T_MAX / 3) * 2 + 1)
+
+#if !defined (__alpha) && !defined (__mips)
+/* For all other machines, we want to call mpn functions for the compund
+ operations instead of open-coding them. */
+#define USE_MORE_MPN
+#endif
+
+/*== Function declarations =================================================*/
+
+static void evaluate3 _PROTO ((mp_ptr, mp_ptr, mp_ptr,
+ mp_ptr, mp_ptr, mp_ptr,
+ mp_srcptr, mp_srcptr, mp_srcptr,
+ mp_size_t, mp_size_t));
+static void interpolate3 _PROTO ((mp_srcptr,
+ mp_ptr, mp_ptr, mp_ptr,
+ mp_srcptr,
+ mp_ptr, mp_ptr, mp_ptr,
+ mp_size_t, mp_size_t));
+static mp_limb_t add2Times _PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
+
+
+/*-- mpn_kara_mul_n ---------------------------------------------------------------*/
+
+/* Multiplies using 3 half-sized mults and so on recursively.
+ * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
+ * No overlap of p[...] with a[...] or b[...].
+ * ws is workspace.
+ */
+
+/* Karatsuba multiply: p[0..2n-1] <- a[0..n-1]*b[0..n-1].  The absolute
+   half-differences |a_hi-a_lo| and |b_hi-b_lo| are formed first (their
+   combined sign tracked in `sign'), the three half-sized products are
+   computed (recursively or via mpn_mul_basecase), and the results are
+   interpolated with explicit carry propagation.  */
+void
+#if __STDC__
+mpn_kara_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
+#else
+mpn_kara_mul_n(p, a, b, n, ws)
+     mp_ptr p;
+     mp_srcptr a;
+     mp_srcptr b;
+     mp_size_t n;
+     mp_ptr ws;
+#endif
+{
+  mp_limb_t i, sign, w, w0, w1;
+  mp_size_t n2;
+  mp_srcptr x, y;
+
+  n2 = n >> 1;
+  ASSERT (n2 > 0);
+
+  if (n & 1)
+    {
+      /* Odd length. */
+      mp_size_t n1, n3, nm1;
+
+      n3 = n - n2;
+
+      /* p[0..n2] <- |a_lo - a_hi| with the high limb handled apart.  */
+      sign = 0;
+      w = a[n2];
+      if (w != 0)
+	w -= mpn_sub_n (p, a, a + n3, n2);
+      else
+	{
+	  /* Equal top limb: compare from the top down to find which
+	     half is larger, so the subtraction cannot borrow.  */
+	  i = n2;
+	  do
+	    {
+	      --i;
+	      w0 = a[i];
+	      w1 = a[n3+i];
+	    }
+	  while (w0 == w1 && i != 0);
+	  if (w0 < w1)
+	    {
+	      x = a + n3;
+	      y = a;
+	      sign = 1;
+	    }
+	  else
+	    {
+	      x = a;
+	      y = a + n3;
+	    }
+	  mpn_sub_n (p, x, y, n2);
+	}
+      p[n2] = w;
+
+      /* Same for b, folding its sign into `sign'.  */
+      w = b[n2];
+      if (w != 0)
+	w -= mpn_sub_n (p + n3, b, b + n3, n2);
+      else
+	{
+	  i = n2;
+	  do
+	    {
+	      --i;
+	      w0 = b[i];
+	      w1 = b[n3+i];
+	    }
+	  while (w0 == w1 && i != 0);
+	  if (w0 < w1)
+	    {
+	      x = b + n3;
+	      y = b;
+	      sign ^= 1;
+	    }
+	  else
+	    {
+	      x = b;
+	      y = b + n3;
+	    }
+	  mpn_sub_n (p + n3, x, y, n2);
+	}
+      p[n] = w;
+
+      /* The three products: middle one into ws, outer two into p.  */
+      n1 = n + 1;
+      if (n2 < KARATSUBA_MUL_THRESHOLD)
+	{
+	  if (n3 < KARATSUBA_MUL_THRESHOLD)
+	    {
+	      mpn_mul_basecase (ws, p, n3, p + n3, n3);
+	      mpn_mul_basecase (p, a, n3, b, n3);
+	    }
+	  else
+	    {
+	      mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
+	      mpn_kara_mul_n (p, a, b, n3, ws + n1);
+	    }
+	  mpn_mul_basecase (p + n1, a + n3, n2, b + n3, n2);
+	}
+      else
+	{
+	  mpn_kara_mul_n (ws, p, p + n3, n3, ws + n1);
+	  mpn_kara_mul_n (p, a, b, n3, ws + n1);
+	  mpn_kara_mul_n (p + n1, a + n3, b + n3, n2, ws + n1);
+	}
+
+      /* Interpolation with carry propagation by hand.  */
+      if (sign)
+	mpn_add_n (ws, p, ws, n1);
+      else
+	mpn_sub_n (ws, p, ws, n1);
+
+      nm1 = n - 1;
+      if (mpn_add_n (ws, p + n1, ws, nm1))
+	{
+	  mp_limb_t x = ws[nm1] + 1;
+	  ws[nm1] = x;
+	  if (x == 0)
+	    ++ws[n];
+	}
+      if (mpn_add_n (p + n3, p + n3, ws, n1))
+	{
+	  mp_limb_t x;
+	  i = n1 + n3;
+	  do
+	    {
+	      x = p[i] + 1;
+	      p[i] = x;
+	      ++i;
+	    } while (x == 0);
+	}
+    }
+  else
+    {
+      /* Even length. */
+      mp_limb_t t;
+
+      /* |a_lo - a_hi|: compare from the top to pick the larger half.  */
+      i = n2;
+      do
+	{
+	  --i;
+	  w0 = a[i];
+	  w1 = a[n2+i];
+	}
+      while (w0 == w1 && i != 0);
+      sign = 0;
+      if (w0 < w1)
+	{
+	  x = a + n2;
+	  y = a;
+	  sign = 1;
+	}
+      else
+	{
+	  x = a;
+	  y = a + n2;
+	}
+      mpn_sub_n (p, x, y, n2);
+
+      /* |b_lo - b_hi| likewise.  */
+      i = n2;
+      do
+	{
+	  --i;
+	  w0 = b[i];
+	  w1 = b[n2+i];
+	}
+      while (w0 == w1 && i != 0);
+      if (w0 < w1)
+	{
+	  x = b + n2;
+	  y = b;
+	  sign ^= 1;
+	}
+      else
+	{
+	  x = b;
+	  y = b + n2;
+	}
+      mpn_sub_n (p + n2, x, y, n2);
+
+      /* Pointwise products. */
+      if (n2 < KARATSUBA_MUL_THRESHOLD)
+	{
+	  mpn_mul_basecase (ws, p, n2, p + n2, n2);
+	  mpn_mul_basecase (p, a, n2, b, n2);
+	  mpn_mul_basecase (p + n, a + n2, n2, b + n2, n2);
+	}
+      else
+	{
+	  mpn_kara_mul_n (ws, p, p + n2, n2, ws + n);
+	  mpn_kara_mul_n (p, a, b, n2, ws + n);
+	  mpn_kara_mul_n (p + n, a + n2, b + n2, n2, ws + n);
+	}
+
+      /* Interpolate. */
+      if (sign)
+	w = mpn_add_n (ws, p, ws, n);
+      else
+	w = -mpn_sub_n (ws, p, ws, n);
+      w += mpn_add_n (ws, p + n, ws, n);
+      w += mpn_add_n (p + n2, p + n2, ws, n);
+      /* TO DO: could put "if (w) { ... }" here.
+       * Less work but badly predicted branch.
+       * No measurable difference in speed on Alpha.
+       */
+      i = n + n2;
+      t = p[i] + w;
+      p[i] = t;
+      if (t < w)
+	{
+	  do
+	    {
+	      ++i;
+	      w = p[i] + 1;
+	      p[i] = w;
+	    }
+	  while (w == 0);
+	}
+    }
+}
+
+/* Karatsuba square: p[0..2n-1] <- a[0..n-1]^2.  Same structure as
+   mpn_kara_mul_n with b == a; the `sign' bookkeeping is kept although
+   the middle term of a square is always non-negative.  */
+void
+#if __STDC__
+mpn_kara_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
+#else
+mpn_kara_sqr_n (p, a, n, ws)
+     mp_ptr p;
+     mp_srcptr a;
+     mp_size_t n;
+     mp_ptr ws;
+#endif
+{
+  mp_limb_t i, sign, w, w0, w1;
+  mp_size_t n2;
+  mp_srcptr x, y;
+
+  n2 = n >> 1;
+  ASSERT (n2 > 0);
+
+  if (n & 1)
+    {
+      /* Odd length. */
+      mp_size_t n1, n3, nm1;
+
+      n3 = n - n2;
+
+      /* p[0..n2] <- |a_lo - a_hi|, high limb handled apart.  */
+      sign = 0;
+      w = a[n2];
+      if (w != 0)
+	w -= mpn_sub_n (p, a, a + n3, n2);
+      else
+	{
+	  /* Equal top limb: compare downwards to avoid a borrow.  */
+	  i = n2;
+	  do
+	    {
+	      --i;
+	      w0 = a[i];
+	      w1 = a[n3+i];
+	    }
+	  while (w0 == w1 && i != 0);
+	  if (w0 < w1)
+	    {
+	      x = a + n3;
+	      y = a;
+	      sign = 1;
+	    }
+	  else
+	    {
+	      x = a;
+	      y = a + n3;
+	    }
+	  mpn_sub_n (p, x, y, n2);
+	}
+      p[n2] = w;
+
+      /* Second copy of the same difference at p+n3.  */
+      w = a[n2];
+      if (w != 0)
+	w -= mpn_sub_n (p + n3, a, a + n3, n2);
+      else
+	{
+	  i = n2;
+	  do
+	    {
+	      --i;
+	      w0 = a[i];
+	      w1 = a[n3+i];
+	    }
+	  while (w0 == w1 && i != 0);
+	  if (w0 < w1)
+	    {
+	      x = a + n3;
+	      y = a;
+	      sign ^= 1;
+	    }
+	  else
+	    {
+	      x = a;
+	      y = a + n3;
+	    }
+	  mpn_sub_n (p + n3, x, y, n2);
+	}
+      p[n] = w;
+
+      /* The three squares: middle one into ws, outer two into p.  */
+      n1 = n + 1;
+      if (n2 < KARATSUBA_SQR_THRESHOLD)
+	{
+	  if (n3 < KARATSUBA_SQR_THRESHOLD)
+	    {
+	      mpn_sqr_basecase (ws, p, n3);
+	      mpn_sqr_basecase (p, a, n3);
+	    }
+	  else
+	    {
+	      mpn_kara_sqr_n (ws, p, n3, ws + n1);
+	      mpn_kara_sqr_n (p, a, n3, ws + n1);
+	    }
+	  mpn_sqr_basecase (p + n1, a + n3, n2);
+	}
+      else
+	{
+	  mpn_kara_sqr_n (ws, p, n3, ws + n1);
+	  mpn_kara_sqr_n (p, a, n3, ws + n1);
+	  mpn_kara_sqr_n (p + n1, a + n3, n2, ws + n1);
+	}
+
+      /* Interpolation with manual carry propagation.  */
+      if (sign)
+	mpn_add_n (ws, p, ws, n1);
+      else
+	mpn_sub_n (ws, p, ws, n1);
+
+      nm1 = n - 1;
+      if (mpn_add_n (ws, p + n1, ws, nm1))
+	{
+	  mp_limb_t x = ws[nm1] + 1;
+	  ws[nm1] = x;
+	  if (x == 0)
+	    ++ws[n];
+	}
+      if (mpn_add_n (p + n3, p + n3, ws, n1))
+	{
+	  mp_limb_t x;
+	  i = n1 + n3;
+	  do
+	    {
+	      x = p[i] + 1;
+	      p[i] = x;
+	      ++i;
+	    } while (x == 0);
+	}
+    }
+  else
+    {
+      /* Even length. */
+      mp_limb_t t;
+
+      /* |a_lo - a_hi|: compare from the top to pick the larger half.  */
+      i = n2;
+      do
+	{
+	  --i;
+	  w0 = a[i];
+	  w1 = a[n2+i];
+	}
+      while (w0 == w1 && i != 0);
+      sign = 0;
+      if (w0 < w1)
+	{
+	  x = a + n2;
+	  y = a;
+	  sign = 1;
+	}
+      else
+	{
+	  x = a;
+	  y = a + n2;
+	}
+      mpn_sub_n (p, x, y, n2);
+
+      /* Second copy of the same difference at p+n2.  */
+      i = n2;
+      do
+	{
+	  --i;
+	  w0 = a[i];
+	  w1 = a[n2+i];
+	}
+      while (w0 == w1 && i != 0);
+      if (w0 < w1)
+	{
+	  x = a + n2;
+	  y = a;
+	  sign ^= 1;
+	}
+      else
+	{
+	  x = a;
+	  y = a + n2;
+	}
+      mpn_sub_n (p + n2, x, y, n2);
+
+      /* Pointwise products. */
+      if (n2 < KARATSUBA_SQR_THRESHOLD)
+	{
+	  mpn_sqr_basecase (ws, p, n2);
+	  mpn_sqr_basecase (p, a, n2);
+	  mpn_sqr_basecase (p + n, a + n2, n2);
+	}
+      else
+	{
+	  mpn_kara_sqr_n (ws, p, n2, ws + n);
+	  mpn_kara_sqr_n (p, a, n2, ws + n);
+	  mpn_kara_sqr_n (p + n, a + n2, n2, ws + n);
+	}
+
+      /* Interpolate. */
+      if (sign)
+	w = mpn_add_n (ws, p, ws, n);
+      else
+	w = -mpn_sub_n (ws, p, ws, n);
+      w += mpn_add_n (ws, p + n, ws, n);
+      w += mpn_add_n (p + n2, p + n2, ws, n);
+      /* TO DO: could put "if (w) { ... }" here.
+       * Less work but badly predicted branch.
+       * No measurable difference in speed on Alpha.
+       */
+      i = n + n2;
+      t = p[i] + w;
+      p[i] = t;
+      if (t < w)
+	{
+	  do
+	    {
+	      ++i;
+	      w = p[i] + 1;
+	      p[i] = w;
+	    }
+	  while (w == 0);
+	}
+    }
+}
+
+/*-- add2Times -------------------------------------------------------------*/
+
+/* z[] = x[] + 2 * y[]
+ Note that z and x might point to the same vectors. */
+#ifdef USE_MORE_MPN
+/* mpn-primitive variant: t = 2*y via a 1-bit left shift into a temporary,
+   then z = x + t.  c accumulates the carry out of the top limb. */
+static inline mp_limb_t
+#if __STDC__
+add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
+#else
+add2Times (z, x, y, n)
+ mp_ptr z;
+ mp_srcptr x;
+ mp_srcptr y;
+ mp_size_t n;
+#endif
+{
+ mp_ptr t;
+ mp_limb_t c;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ t = (mp_ptr) TMP_ALLOC (n * BYTES_PER_MP_LIMB);
+ c = mpn_lshift (t, y, n, 1);
+ c += mpn_add_n (z, x, t, n);
+ TMP_FREE (marker);
+ return c;
+}
+#else
+
+/* Fused single-pass variant: per limb, fold in the incoming carry c,
+   add 2*y[i] (capturing the bit shifted out of the top of y[i]), and
+   record the outgoing carry.  The unsigned comparisons "sum < addend"
+   detect wraparound; the returned carry can be 0, 1 or 2. */
+static mp_limb_t
+#if __STDC__
+add2Times (mp_ptr z, mp_srcptr x, mp_srcptr y, mp_size_t n)
+#else
+add2Times (z, x, y, n)
+ mp_ptr z;
+ mp_srcptr x;
+ mp_srcptr y;
+ mp_size_t n;
+#endif
+{
+ mp_limb_t c, v, w;
+
+ ASSERT (n > 0);
+ v = *x; w = *y;
+ c = w >> (BITS_PER_MP_LIMB - 1);
+ w <<= 1;
+ v += w;
+ c += v < w;
+ *z = v;
+ ++x; ++y; ++z;
+ while (--n)
+ {
+ v = *x;
+ w = *y;
+ v += c;
+ c = v < c;
+ c += w >> (BITS_PER_MP_LIMB - 1);
+ w <<= 1;
+ v += w;
+ c += v < w;
+ *z = v;
+ ++x; ++y; ++z;
+ }
+
+ return c;
+}
+#endif
+
+/*-- evaluate3 -------------------------------------------------------------*/
+
+/* Evaluates:
+ * ph := 4*A+2*B+C
+ * p1 := A+B+C
+ * p2 := A+2*B+4*C
+ * where:
+ * ph[], p1[], p2[], A[] and B[] all have length len,
+ * C[] has length len2 with len-len2 = 0, 1 or 2.
+ * Returns top words (overflow) at pth, pt1 and pt2 respectively.
+ */
+#ifdef USE_MORE_MPN
+/* mpn-primitive variant: builds each evaluation with shift/add calls;
+   c, d, e collect the top-word overflow returned through pth/pt1/pt2. */
+static void
+#if __STDC__
+evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
+ mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t len, mp_size_t len2)
+#else
+evaluate3 (ph, p1, p2, pth, pt1, pt2,
+ A, B, C, len, len2)
+ mp_ptr ph;
+ mp_ptr p1;
+ mp_ptr p2;
+ mp_ptr pth;
+ mp_ptr pt1;
+ mp_ptr pt2;
+ mp_srcptr A;
+ mp_srcptr B;
+ mp_srcptr C;
+ mp_size_t len;
+ mp_size_t len2;
+#endif
+{
+ mp_limb_t c, d, e;
+
+ ASSERT (len - len2 <= 2);
+
+ /* p1 temporarily holds 2*B; e is the bit shifted out of its top. */
+ e = mpn_lshift (p1, B, len, 1);
+
+ /* ph := 4*A + 2*B + C. */
+ c = mpn_lshift (ph, A, len, 2);
+ c += e + mpn_add_n (ph, ph, p1, len);
+ d = mpn_add_n (ph, ph, C, len2);
+ if (len2 == len) c += d; else c += mpn_add_1 (ph + len2, ph + len2, len-len2, d);
+ ASSERT (c < 7);
+ *pth = c;
+
+ /* p2 := A + 2*B + 4*C (C is the short operand; pad with zero limbs). */
+ c = mpn_lshift (p2, C, len2, 2);
+#if 1
+ if (len2 != len) { p2[len-1] = 0; p2[len2] = c; c = 0; }
+ c += e + mpn_add_n (p2, p2, p1, len);
+#else
+ d = mpn_add_n (p2, p2, p1, len2);
+ c += d;
+ if (len2 != len) c = mpn_add_1 (p2+len2, p1+len2, len-len2, c);
+ c += e;
+#endif
+ c += mpn_add_n (p2, p2, A, len);
+ ASSERT (c < 7);
+ *pt2 = c;
+
+ /* p1 := A + B + C (overwrites the 2*B scratch last, once it is done). */
+ c = mpn_add_n (p1, A, B, len);
+ d = mpn_add_n (p1, p1, C, len2);
+ if (len2 == len) c += d;
+ else c += mpn_add_1 (p1+len2, p1+len2, len-len2, d);
+ ASSERT (c < 3);
+ *pt1 = c;
+
+}
+
+#else
+
+/* Single-pass variant: walks the operands once, computing all three
+   evaluations per limb and carrying th/t1/t2 across limbs by hand. */
+static void
+#if __STDC__
+evaluate3 (mp_ptr ph, mp_ptr p1, mp_ptr p2, mp_ptr pth, mp_ptr pt1, mp_ptr pt2,
+ mp_srcptr A, mp_srcptr B, mp_srcptr C, mp_size_t l, mp_size_t ls)
+#else
+evaluate3 (ph, p1, p2, pth, pt1, pt2,
+ A, B, C, l, ls)
+ mp_ptr ph;
+ mp_ptr p1;
+ mp_ptr p2;
+ mp_ptr pth;
+ mp_ptr pt1;
+ mp_ptr pt2;
+ mp_srcptr A;
+ mp_srcptr B;
+ mp_srcptr C;
+ mp_size_t l;
+ mp_size_t ls;
+#endif
+{
+ mp_limb_t a,b,c, i, t, th,t1,t2, vh,v1,v2;
+
+ ASSERT (l - ls <= 2);
+
+ th = t1 = t2 = 0;
+ for (i = 0; i < l; ++i)
+ {
+ a = *A;
+ b = *B;
+ c = i < ls ? *C : 0; /* C is shorter; treat missing limbs as 0. */
+
+ /* TO DO: choose one of the following alternatives. */
+#if 0
+ t = a << 2;
+ vh = th + t;
+ th = vh < t;
+ th += a >> (BITS_PER_MP_LIMB - 2);
+ t = b << 1;
+ vh += t;
+ th += vh < t;
+ th += b >> (BITS_PER_MP_LIMB - 1);
+ vh += c;
+ th += vh < c;
+#else
+ vh = th + c;
+ th = vh < c;
+ t = b << 1;
+ vh += t;
+ th += vh < t;
+ th += b >> (BITS_PER_MP_LIMB - 1);
+ t = a << 2;
+ vh += t;
+ th += vh < t;
+ th += a >> (BITS_PER_MP_LIMB - 2);
+#endif
+
+ v1 = t1 + a;
+ t1 = v1 < a;
+ v1 += b;
+ t1 += v1 < b;
+ v1 += c;
+ t1 += v1 < c;
+
+ v2 = t2 + a;
+ t2 = v2 < a;
+ t = b << 1;
+ v2 += t;
+ t2 += v2 < t;
+ t2 += b >> (BITS_PER_MP_LIMB - 1);
+ t = c << 2;
+ v2 += t;
+ t2 += v2 < t;
+ t2 += c >> (BITS_PER_MP_LIMB - 2);
+
+ *ph = vh;
+ *p1 = v1;
+ *p2 = v2;
+
+ ++A; ++B; ++C;
+ ++ph; ++p1; ++p2;
+ }
+
+ ASSERT (th < 7);
+ ASSERT (t1 < 3);
+ ASSERT (t2 < 7);
+
+ *pth = th;
+ *pt1 = t1;
+ *pt2 = t2;
+}
+#endif
+
+
+/*-- interpolate3 ----------------------------------------------------------*/
+
+/* Interpolates B, C, D (in-place) from:
+ * 16*A+8*B+4*C+2*D+E
+ * A+B+C+D+E
+ * A+2*B+4*C+8*D+16*E
+ * where:
+ * A[], B[], C[] and D[] all have length l,
+ * E[] has length ls with l-ls = 0, 2 or 4.
+ *
+ * Reads top words (from earlier overflow) from ptb, ptc and ptd,
+ * and returns new top words there.
+ */
+
+#ifdef USE_MORE_MPN
+/* mpn-primitive variant.  tb/tc/td hold the (signed, two's-complement)
+   top words of b, c, d throughout; statement order is significant, as
+   several steps reuse ws and the operand vectors in place. */
+static void
+#if __STDC__
+interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
+ mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t len, mp_size_t len2)
+#else
+interpolate3 (A, B, C, D, E,
+ ptb, ptc, ptd, len, len2)
+ mp_srcptr A;
+ mp_ptr B;
+ mp_ptr C;
+ mp_ptr D;
+ mp_srcptr E;
+ mp_ptr ptb;
+ mp_ptr ptc;
+ mp_ptr ptd;
+ mp_size_t len;
+ mp_size_t len2;
+#endif
+{
+ mp_ptr ws;
+ mp_limb_t t, tb,tc,td;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+
+ ASSERT (len - len2 == 0 || len - len2 == 2 || len - len2 == 4);
+
+ /* Let x1, x2, x3 be the values to interpolate. We have:
+ * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
+ * c = a + x1 + x2 + x3 + e
+ * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
+ */
+
+ ws = (mp_ptr) TMP_ALLOC (len * BYTES_PER_MP_LIMB);
+
+ tb = *ptb; tc = *ptc; td = *ptd;
+
+
+ /* b := b - 16*a - e
+ * c := c - a - e
+ * d := d - a - 16*e
+ */
+
+ t = mpn_lshift (ws, A, len, 4);
+ tb -= t + mpn_sub_n (B, B, ws, len);
+ t = mpn_sub_n (B, B, E, len2);
+ if (len2 == len) tb -= t;
+ else tb -= mpn_sub_1 (B+len2, B+len2, len-len2, t);
+
+ tc -= mpn_sub_n (C, C, A, len);
+ t = mpn_sub_n (C, C, E, len2);
+ if (len2 == len) tc -= t;
+ else tc -= mpn_sub_1 (C+len2, C+len2, len-len2, t);
+
+ t = mpn_lshift (ws, E, len2, 4);
+ t += mpn_add_n (ws, ws, A, len2);
+#if 1
+ if (len2 != len) t = mpn_add_1 (ws+len2, A+len2, len-len2, t);
+ td -= t + mpn_sub_n (D, D, ws, len);
+#else
+ t += mpn_sub_n (D, D, ws, len2);
+ if (len2 != len) {
+ t = mpn_sub_1 (D+len2, D+len2, len-len2, t);
+ t += mpn_sub_n (D+len2, D+len2, A+len2, len-len2);
+ } /* end if/else */
+ td -= t;
+#endif
+
+
+ /* b, d := b + d, b - d */
+
+#ifdef HAVE_MPN_ADD_SUB_N
+ /* #error TO DO ... */
+#else
+ t = tb + td + mpn_add_n (ws, B, D, len);
+ td = tb - td - mpn_sub_n (D, B, D, len);
+ tb = t;
+ MPN_COPY (B, ws, len);
+#endif
+
+ /* b := b-8*c */
+ t = 8 * tc + mpn_lshift (ws, C, len, 3);
+ tb -= t + mpn_sub_n (B, B, ws, len);
+
+ /* c := 2*c - b */
+ tc = 2 * tc + mpn_lshift (C, C, len, 1);
+ tc -= tb + mpn_sub_n (C, C, B, len);
+
+ /* d := d/3 (exact; INVERSE_3 corrects the returned top word) */
+ td = (td - mpn_divexact_by3 (D, D, len)) * INVERSE_3;
+
+ /* b, d := b + d, b - d */
+#ifdef HAVE_MPN_ADD_SUB_N
+ /* #error TO DO ... */
+#else
+ t = tb + td + mpn_add_n (ws, B, D, len);
+ td = tb - td - mpn_sub_n (D, B, D, len);
+ tb = t;
+ MPN_COPY (B, ws, len);
+#endif
+
+ /* Now:
+ * b = 4*x1
+ * c = 2*x2
+ * d = 4*x3
+ */
+
+ /* Divide out the factors of 2; the asserts check exactness. */
+ ASSERT(!(*B & 3));
+ mpn_rshift (B, B, len, 2);
+ B[len-1] |= tb<<(BITS_PER_MP_LIMB-2);
+ ASSERT((long)tb >= 0);
+ tb >>= 2;
+
+ ASSERT(!(*C & 1));
+ mpn_rshift (C, C, len, 1);
+ C[len-1] |= tc<<(BITS_PER_MP_LIMB-1);
+ ASSERT((long)tc >= 0);
+ tc >>= 1;
+
+ ASSERT(!(*D & 3));
+ mpn_rshift (D, D, len, 2);
+ D[len-1] |= td<<(BITS_PER_MP_LIMB-2);
+ ASSERT((long)td >= 0);
+ td >>= 2;
+
+#if WANT_ASSERT
+ ASSERT (tb < 2);
+ if (len == len2)
+ {
+ ASSERT (tc < 3);
+ ASSERT (td < 2);
+ }
+ else
+ {
+ ASSERT (tc < 2);
+ ASSERT (!td);
+ }
+#endif
+
+ *ptb = tb;
+ *ptc = tc;
+ *ptd = td;
+
+ TMP_FREE (marker);
+}
+
+#else
+
+/* Single-pass variant: interpolates limb by limb.  sb/sc/sd carry
+   signed spill between limbs (the "period" comments record how many
+   limbs a borrow can propagate -- NOTE(review): taken from the original
+   authors' analysis, not re-derived here).  ob/oc/od buffer each limb's
+   low bits until the next limb's high bits are known. */
+static void
+#if __STDC__
+interpolate3 (mp_srcptr A, mp_ptr B, mp_ptr C, mp_ptr D, mp_srcptr E,
+ mp_ptr ptb, mp_ptr ptc, mp_ptr ptd, mp_size_t l, mp_size_t ls)
+#else
+interpolate3 (A, B, C, D, E,
+ ptb, ptc, ptd, l, ls)
+ mp_srcptr A;
+ mp_ptr B;
+ mp_ptr C;
+ mp_ptr D;
+ mp_srcptr E;
+ mp_ptr ptb;
+ mp_ptr ptc;
+ mp_ptr ptd;
+ mp_size_t l;
+ mp_size_t ls;
+#endif
+{
+ mp_limb_t a,b,c,d,e,t, i, sb,sc,sd, ob,oc,od;
+ const mp_limb_t maskOffHalf = (~(mp_limb_t) 0) << (BITS_PER_MP_LIMB >> 1);
+
+#if WANT_ASSERT
+ t = l - ls;
+ ASSERT (t == 0 || t == 2 || t == 4);
+#endif
+
+ sb = sc = sd = 0;
+ for (i = 0; i < l; ++i)
+ {
+ mp_limb_t tb, tc, td, tt;
+
+ a = *A;
+ b = *B;
+ c = *C;
+ d = *D;
+ e = i < ls ? *E : 0; /* E is shorter; missing limbs are 0. */
+
+ /* Let x1, x2, x3 be the values to interpolate. We have:
+ * b = 16*a + 8*x1 + 4*x2 + 2*x3 + e
+ * c = a + x1 + x2 + x3 + e
+ * d = a + 2*x1 + 4*x2 + 8*x3 + 16*e
+ */
+
+ /* b := b - 16*a - e
+ * c := c - a - e
+ * d := d - a - 16*e
+ */
+ t = a << 4;
+ tb = -(a >> (BITS_PER_MP_LIMB - 4)) - (b < t);
+ b -= t;
+ tb -= b < e;
+ b -= e;
+ tc = -(c < a);
+ c -= a;
+ tc -= c < e;
+ c -= e;
+ td = -(d < a);
+ d -= a;
+ t = e << 4;
+ td = td - (e >> (BITS_PER_MP_LIMB - 4)) - (d < t);
+ d -= t;
+
+ /* b, d := b + d, b - d */
+ t = b + d;
+ tt = tb + td + (t < b);
+ td = tb - td - (b < d);
+ d = b - d;
+ b = t;
+ tb = tt;
+
+ /* b := b-8*c */
+ t = c << 3;
+ tb = tb - (tc << 3) - (c >> (BITS_PER_MP_LIMB - 3)) - (b < t);
+ b -= t;
+
+ /* c := 2*c - b */
+ t = c << 1;
+ tc = (tc << 1) + (c >> (BITS_PER_MP_LIMB - 1)) - tb - (t < b);
+ c = t - b;
+
+ /* d := d/3 (multiply by the modular inverse of 3) */
+ d *= INVERSE_3;
+ td = td - (d >> (BITS_PER_MP_LIMB - 1)) - (d*3 < d);
+ td *= INVERSE_3;
+
+ /* b, d := b + d, b - d */
+ t = b + d;
+ tt = tb + td + (t < b);
+ td = tb - td - (b < d);
+ d = b - d;
+ b = t;
+ tb = tt;
+
+ /* Now:
+ * b = 4*x1
+ * c = 2*x2
+ * d = 4*x3
+ */
+
+ /* sb has period 2. */
+ b += sb;
+ tb += b < sb;
+ sb &= maskOffHalf;
+ sb |= sb >> (BITS_PER_MP_LIMB >> 1);
+ sb += tb;
+
+ /* sc has period 1. */
+ c += sc;
+ tc += c < sc;
+ /* TO DO: choose one of the following alternatives. */
+#if 1
+ sc = (mp_limb_t)((long)sc >> (BITS_PER_MP_LIMB - 1));
+ sc += tc;
+#else
+ sc = tc - ((long)sc < 0L);
+#endif
+
+ /* sd has period 2. */
+ d += sd;
+ td += d < sd;
+ sd &= maskOffHalf;
+ sd |= sd >> (BITS_PER_MP_LIMB >> 1);
+ sd += td;
+
+ if (i != 0)
+ {
+ B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
+ C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
+ D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
+ }
+ ob = b >> 2;
+ oc = c >> 1;
+ od = d >> 2;
+
+ ++A; ++B; ++C; ++D; ++E;
+ }
+
+ /* Handle top words. */
+ b = *ptb;
+ c = *ptc;
+ d = *ptd;
+
+ t = b + d;
+ d = b - d;
+ b = t;
+ b -= c << 3;
+ c = (c << 1) - b;
+ d *= INVERSE_3;
+ t = b + d;
+ d = b - d;
+ b = t;
+
+ b += sb;
+ c += sc;
+ d += sd;
+
+ B[-1] = ob | b << (BITS_PER_MP_LIMB - 2);
+ C[-1] = oc | c << (BITS_PER_MP_LIMB - 1);
+ D[-1] = od | d << (BITS_PER_MP_LIMB - 2);
+
+ b >>= 2;
+ c >>= 1;
+ d >>= 2;
+
+#if WANT_ASSERT
+ ASSERT (b < 2);
+ if (l == ls)
+ {
+ ASSERT (c < 3);
+ ASSERT (d < 2);
+ }
+ else
+ {
+ ASSERT (c < 2);
+ ASSERT (!d);
+ }
+#endif
+
+ *ptb = b;
+ *ptc = c;
+ *ptd = d;
+}
+#endif
+
+
+/*-- mpn_toom3_mul_n --------------------------------------------------------------*/
+
+/* Multiplies using 5 mults of one third size and so on recursively.
+ * p[0..2*n-1] := product of a[0..n-1] and b[0..n-1].
+ * No overlap of p[...] with a[...] or b[...].
+ * ws is workspace.
+ */
+
+/* TO DO: If TOOM3_MUL_THRESHOLD is much bigger than KARATSUBA_MUL_THRESHOLD then the
+ * recursion in mpn_toom3_mul_n() will always bottom out with mpn_kara_mul_n()
+ * because the "n < KARATSUBA_MUL_THRESHOLD" test here will always be false.
+ */
+
+#define TOOM3_MUL_REC(p, a, b, n, ws) \
+ do { \
+ if (n < KARATSUBA_MUL_THRESHOLD) \
+ mpn_mul_basecase (p, a, n, b, n); \
+ else if (n < TOOM3_MUL_THRESHOLD) \
+ mpn_kara_mul_n (p, a, b, n, ws); \
+ else \
+ mpn_toom3_mul_n (p, a, b, n, ws); \
+ } while (0)
+
+/* Toom-3 multiplication: p[0..2n-1] := a[0..n-1] * b[0..n-1] using five
+   pointwise multiplies of roughly n/3 limbs.  p must not overlap a or b;
+   ws is caller-supplied workspace (sized by mpn_mul_n). */
+void
+#if __STDC__
+mpn_toom3_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n, mp_ptr ws)
+#else
+mpn_toom3_mul_n (p, a, b, n, ws)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_srcptr b;
+ mp_size_t n;
+ mp_ptr ws;
+#endif
+{
+ mp_limb_t cB,cC,cD, dB,dC,dD, tB,tC,tD;
+ mp_limb_t *A,*B,*C,*D,*E, *W;
+ mp_size_t l,l2,l3,l4,l5,ls;
+
+ /* Break n words into chunks of size l, l and ls.
+ * n = 3*k => l = k, ls = k
+ * n = 3*k+1 => l = k+1, ls = k-1
+ * n = 3*k+2 => l = k+1, ls = k
+ */
+ {
+ mp_limb_t m;
+
+ ASSERT (n >= TOOM3_MUL_THRESHOLD);
+ l = ls = n / 3;
+ m = n - l * 3;
+ if (m != 0)
+ ++l;
+ if (m == 1)
+ --ls;
+
+ l2 = l * 2;
+ l3 = l * 3;
+ l4 = l * 4;
+ l5 = l * 5;
+ /* Coefficient buffers A..E alternate between p and ws; W is the
+    recursion workspace. */
+ A = p;
+ B = ws;
+ C = p + l2;
+ D = ws + l2;
+ E = p + l4;
+ W = ws + l4;
+ }
+
+ /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
+ evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
+ evaluate3 (A + l, B + l, C + l, &dB, &dC, &dD, b, b + l, b + l2, l, ls);
+
+ /** Second stage: pointwise multiplies.  cX/dX are the evaluations'
+     top words; their cross terms are folded in with addmul_1. **/
+ TOOM3_MUL_REC(D, C, C + l, l, W);
+ tD = cD*dD;
+ if (cD) tD += mpn_addmul_1 (D + l, C + l, l, cD);
+ if (dD) tD += mpn_addmul_1 (D + l, C, l, dD);
+ ASSERT (tD < 49);
+ TOOM3_MUL_REC(C, B, B + l, l, W);
+ tC = cC*dC;
+ /* TO DO: choose one of the following alternatives. */
+#if 0
+ if (cC) tC += mpn_addmul_1 (C + l, B + l, l, cC);
+ if (dC) tC += mpn_addmul_1 (C + l, B, l, dC);
+#else
+ if (cC)
+ {
+ if (cC == 1) tC += mpn_add_n (C + l, C + l, B + l, l);
+ else tC += add2Times (C + l, C + l, B + l, l);
+ }
+ if (dC)
+ {
+ if (dC == 1) tC += mpn_add_n (C + l, C + l, B, l);
+ else tC += add2Times (C + l, C + l, B, l);
+ }
+#endif
+ ASSERT (tC < 9);
+ TOOM3_MUL_REC(B, A, A + l, l, W);
+ tB = cB*dB;
+ if (cB) tB += mpn_addmul_1 (B + l, A + l, l, cB);
+ if (dB) tB += mpn_addmul_1 (B + l, A, l, dB);
+ ASSERT (tB < 49);
+ TOOM3_MUL_REC(A, a, b, l, W);
+ TOOM3_MUL_REC(E, a + l2, b + l2, ls, W);
+
+ /** Third stage: interpolation. **/
+ interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
+
+ /** Final stage: add up the coefficients.
+     (Removed unused locals i, x, y that shadowed nothing and were
+     never referenced.) **/
+ {
+ tB += mpn_add_n (p + l, p + l, B, l2);
+ tD += mpn_add_n (p + l3, p + l3, D, l2);
+ mpn_incr_u (p + l3, tB);
+ mpn_incr_u (p + l4, tC);
+ mpn_incr_u (p + l5, tD);
+ }
+}
+
+/*-- mpn_toom3_sqr_n --------------------------------------------------------------*/
+
+/* Like previous function but for squaring */
+
+#define TOOM3_SQR_REC(p, a, n, ws) \
+ do { \
+ if (n < KARATSUBA_SQR_THRESHOLD) \
+ mpn_sqr_basecase (p, a, n); \
+ else if (n < TOOM3_SQR_THRESHOLD) \
+ mpn_kara_sqr_n (p, a, n, ws); \
+ else \
+ mpn_toom3_sqr_n (p, a, n, ws); \
+ } while (0)
+
+/* Toom-3 squaring: p[0..2n-1] := a[0..n-1]^2.  Same structure as
+   mpn_toom3_mul_n but with one evaluation and squared pointwise
+   products.  p must not overlap a; ws is caller-supplied workspace. */
+void
+#if __STDC__
+mpn_toom3_sqr_n (mp_ptr p, mp_srcptr a, mp_size_t n, mp_ptr ws)
+#else
+mpn_toom3_sqr_n (p, a, n, ws)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_size_t n;
+ mp_ptr ws;
+#endif
+{
+ mp_limb_t cB,cC,cD, tB,tC,tD;
+ mp_limb_t *A,*B,*C,*D,*E, *W;
+ mp_size_t l,l2,l3,l4,l5,ls;
+
+ /* Break n words into chunks of size l, l and ls.
+ * n = 3*k => l = k, ls = k
+ * n = 3*k+1 => l = k+1, ls = k-1
+ * n = 3*k+2 => l = k+1, ls = k
+ */
+ {
+ mp_limb_t m;
+
+ /* Fixed: previously asserted against TOOM3_MUL_THRESHOLD, a
+    copy/paste from mpn_toom3_mul_n; callers (see TOOM3_SQR_REC)
+    gate this routine on the SQR threshold. */
+ ASSERT (n >= TOOM3_SQR_THRESHOLD);
+ l = ls = n / 3;
+ m = n - l * 3;
+ if (m != 0)
+ ++l;
+ if (m == 1)
+ --ls;
+
+ l2 = l * 2;
+ l3 = l * 3;
+ l4 = l * 4;
+ l5 = l * 5;
+ A = p;
+ B = ws;
+ C = p + l2;
+ D = ws + l2;
+ E = p + l4;
+ W = ws + l4;
+ }
+
+ /** First stage: evaluation at points 0, 1/2, 1, 2, oo. **/
+ evaluate3 (A, B, C, &cB, &cC, &cD, a, a + l, a + l2, l, ls);
+
+ /** Second stage: pointwise multiplies (squares). **/
+ TOOM3_SQR_REC(D, C, l, W);
+ tD = cD*cD;
+ if (cD) tD += mpn_addmul_1 (D + l, C, l, 2*cD);
+ ASSERT (tD < 49);
+ TOOM3_SQR_REC(C, B, l, W);
+ tC = cC*cC;
+ /* TO DO: choose one of the following alternatives. */
+#if 0
+ if (cC) tC += mpn_addmul_1 (C + l, B, l, 2*cC);
+#else
+ if (cC >= 1)
+ {
+ tC += add2Times (C + l, C + l, B, l);
+ if (cC == 2)
+ tC += add2Times (C + l, C + l, B, l);
+ }
+#endif
+ ASSERT (tC < 9);
+ TOOM3_SQR_REC(B, A, l, W);
+ tB = cB*cB;
+ if (cB) tB += mpn_addmul_1 (B + l, A, l, 2*cB);
+ ASSERT (tB < 49);
+ TOOM3_SQR_REC(A, a, l, W);
+ TOOM3_SQR_REC(E, a + l2, ls, W);
+
+ /** Third stage: interpolation. **/
+ interpolate3 (A, B, C, D, E, &tB, &tC, &tD, l2, ls << 1);
+
+ /** Final stage: add up the coefficients.
+     (Removed unused locals i, x, y.) **/
+ {
+ tB += mpn_add_n (p + l, p + l, B, l2);
+ tD += mpn_add_n (p + l3, p + l3, D, l2);
+ mpn_incr_u (p + l3, tB);
+ mpn_incr_u (p + l4, tC);
+ mpn_incr_u (p + l5, tD);
+ }
+}
+
+/* Multiply two equal-length operands, dispatching on size:
+   basecase -> Karatsuba -> Toom-3 -> FFT (FFT arm compiled only when
+   WANT_FFT or TUNE_PROGRAM_BUILD). */
+void
+#if __STDC__
+mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
+#else
+mpn_mul_n (p, a, b, n)
+ mp_ptr p;
+ mp_srcptr a;
+ mp_srcptr b;
+ mp_size_t n;
+#endif
+{
+ if (n < KARATSUBA_MUL_THRESHOLD)
+ mpn_mul_basecase (p, a, n, b, n);
+ else if (n < TOOM3_MUL_THRESHOLD)
+ {
+ /* Allocate workspace of fixed size on stack: fast! */
+#if TUNE_PROGRAM_BUILD
+ mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD_LIMIT-1) + 2 * BITS_PER_MP_LIMB];
+#else
+ mp_limb_t ws[2 * (TOOM3_MUL_THRESHOLD-1) + 2 * BITS_PER_MP_LIMB];
+#endif
+ mpn_kara_mul_n (p, a, b, n, ws);
+ }
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+ else if (n < FFT_MUL_THRESHOLD)
+#else
+ else
+#endif
+ {
+ /* Use workspace of unknown size in heap, as stack space may
+ * be limited. Since n is at least TOOM3_MUL_THRESHOLD, the
+ * multiplication will take much longer than malloc()/free(). */
+ mp_limb_t wsLen, *ws;
+ wsLen = 2 * n + 3 * BITS_PER_MP_LIMB;
+ ws = (mp_ptr) (*_mp_allocate_func) ((size_t) wsLen * sizeof (mp_limb_t));
+ mpn_toom3_mul_n (p, a, b, n, ws);
+ (*_mp_free_func) (ws, (size_t) wsLen * sizeof (mp_limb_t));
+ }
+#if WANT_FFT || TUNE_PROGRAM_BUILD
+ else
+ {
+ mpn_mul_fft_full (p, a, n, b, n);
+ }
+#endif
+}
diff --git a/rts/gmp/mpn/generic/perfsqr.c b/rts/gmp/mpn/generic/perfsqr.c
new file mode 100644
index 0000000000..42ee3405d7
--- /dev/null
+++ b/rts/gmp/mpn/generic/perfsqr.c
@@ -0,0 +1,123 @@
+/* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square,
+ zero otherwise.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* sq_res_0x100[x mod 0x100] == 1 iff x mod 0x100 is a quadratic residue
+ modulo 0x100. */
+/* Lookup table backing the O(1) pre-test in mpn_perfect_square_p: a
+   perfect square's low byte must be a quadratic residue mod 256. */
+static unsigned char const sq_res_0x100[0x100] =
+{
+ 1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+};
+
+/* Return non-zero iff (up, usize) is a perfect square.  Three filters:
+   an O(1) table lookup on the low byte, an optional O(n) residue test
+   modulo the prime product PP, and finally an exact square root. */
+int
+#if __STDC__
+mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
+#else
+mpn_perfect_square_p (up, usize)
+ mp_srcptr up;
+ mp_size_t usize;
+#endif
+{
+ mp_limb_t rem;
+ mp_ptr root_ptr;
+ int res;
+ TMP_DECL (marker);
+
+ /* The first test excludes 55/64 (85.9%) of the perfect square candidates
+ in O(1) time. */
+ if ((sq_res_0x100[(unsigned int) up[0] % 0x100] & 1) == 0)
+ return 0;
+
+#if defined (PP)
+ /* The second test excludes 30652543/30808063 (99.5%) of the remaining
+ perfect square candidates in O(n) time. */
+
+ /* Firstly, compute REM = A mod PP. */
+ if (UDIV_TIME > (2 * UMUL_TIME + 6))
+ rem = mpn_preinv_mod_1 (up, usize, (mp_limb_t) PP, (mp_limb_t) PP_INVERTED);
+ else
+ rem = mpn_mod_1 (up, usize, (mp_limb_t) PP);
+
+ /* Now decide if REM is a quadratic residue modulo the factors in PP. */
+
+ /* If A is just a few limbs, computing the square root does not take long
+ time, so things might run faster if we limit this loop according to the
+ size of A. */
+
+ /* Each mask below encodes the set of quadratic residues modulo one
+    prime p; bit (rem % p) being clear proves A is not a square. */
+#if BITS_PER_MP_LIMB == 64
+ if (((CNST_LIMB(0x12DD703303AED3) >> rem % 53) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x4351B2753DF) >> rem % 47) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x35883A3EE53) >> rem % 43) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x1B382B50737) >> rem % 41) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x165E211E9B) >> rem % 37) & 1) == 0)
+ return 0;
+ if (((CNST_LIMB(0x121D47B7) >> rem % 31) & 1) == 0)
+ return 0;
+#endif
+ if (((0x13D122F3L >> rem % 29) & 1) == 0)
+ return 0;
+ if (((0x5335FL >> rem % 23) & 1) == 0)
+ return 0;
+ if (((0x30AF3L >> rem % 19) & 1) == 0)
+ return 0;
+ if (((0x1A317L >> rem % 17) & 1) == 0)
+ return 0;
+ if (((0x161BL >> rem % 13) & 1) == 0)
+ return 0;
+ if (((0x23BL >> rem % 11) & 1) == 0)
+ return 0;
+ if (((0x017L >> rem % 7) & 1) == 0)
+ return 0;
+ if (((0x13L >> rem % 5) & 1) == 0)
+ return 0;
+ if (((0x3L >> rem % 3) & 1) == 0)
+ return 0;
+#endif
+
+ TMP_MARK (marker);
+
+ /* For the third and last test, we finally compute the square root,
+ to make sure we've really got a perfect square. */
+ root_ptr = (mp_ptr) TMP_ALLOC ((usize + 1) / 2 * BYTES_PER_MP_LIMB);
+
+ /* Iff mpn_sqrtrem returns zero, the square is perfect. */
+ res = ! mpn_sqrtrem (root_ptr, NULL, up, usize);
+ TMP_FREE (marker);
+ return res;
+}
diff --git a/rts/gmp/mpn/generic/popcount.c b/rts/gmp/mpn/generic/popcount.c
new file mode 100644
index 0000000000..387be9536d
--- /dev/null
+++ b/rts/gmp/mpn/generic/popcount.c
@@ -0,0 +1,93 @@
+/* popcount.c
+
+Copyright (C) 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#if defined __GNUC__
+/* No processor claiming to be SPARC v9 compliant seem to
+ implement the POPC instruction. Disable pattern for now. */
+#if 0 && defined __sparc_v9__ && BITS_PER_MP_LIMB == 64
+#define popc_limb(a) \
+ ({ \
+ DItype __res; \
+ asm ("popc %1,%0" : "=r" (__res) : "rI" (a)); \
+ __res; \
+ })
+#endif
+#endif
+
+#ifndef popc_limb
+
+/* Cool population count of a mp_limb_t.
+ You have to figure out how this works, I won't tell you! */
+/* (It is the classic SWAR bit count: fold 2-bit pair sums, then 4-bit
+   nibble sums masked to avoid inter-field carries, then accumulate the
+   byte counts with shifted adds; the final mask keeps the low byte.) */
+static inline unsigned int
+#if __STDC__
+popc_limb (mp_limb_t x)
+#else
+popc_limb (x)
+ mp_limb_t x;
+#endif
+{
+#if BITS_PER_MP_LIMB == 64
+ /* We have to go into some trouble to define these constants.
+ (For mp_limb_t being `long long'.) */
+ mp_limb_t cnst;
+ cnst = 0xaaaaaaaaL | ((mp_limb_t) 0xaaaaaaaaL << BITS_PER_MP_LIMB/2);
+ x -= (x & cnst) >> 1;
+ cnst = 0x33333333L | ((mp_limb_t) 0x33333333L << BITS_PER_MP_LIMB/2);
+ x = ((x & ~cnst) >> 2) + (x & cnst);
+ cnst = 0x0f0f0f0fL | ((mp_limb_t) 0x0f0f0f0fL << BITS_PER_MP_LIMB/2);
+ x = ((x >> 4) + x) & cnst;
+ x = ((x >> 8) + x);
+ x = ((x >> 16) + x);
+ x = ((x >> 32) + x) & 0xff;
+#endif
+#if BITS_PER_MP_LIMB == 32
+ x -= (x & 0xaaaaaaaa) >> 1;
+ x = ((x >> 2) & 0x33333333L) + (x & 0x33333333L);
+ x = ((x >> 4) + x) & 0x0f0f0f0fL;
+ x = ((x >> 8) + x);
+ x = ((x >> 16) + x) & 0xff;
+#endif
+ return x;
+}
+#endif
+
+/* Count the set bits across all SIZE limbs at P. */
+unsigned long int
+#if __STDC__
+mpn_popcount (register mp_srcptr p, register mp_size_t size)
+#else
+mpn_popcount (p, size)
+ register mp_srcptr p;
+ register mp_size_t size;
+#endif
+{
+ unsigned long int total;
+ mp_size_t remaining;
+
+ total = 0;
+ for (remaining = size; remaining > 0; remaining--)
+ total += popc_limb (*p++);
+
+ return total;
+}
diff --git a/rts/gmp/mpn/generic/pre_mod_1.c b/rts/gmp/mpn/generic/pre_mod_1.c
new file mode 100644
index 0000000000..27179683b3
--- /dev/null
+++ b/rts/gmp/mpn/generic/pre_mod_1.c
@@ -0,0 +1,69 @@
+/* mpn_preinv_mod_1 (dividend_ptr, dividend_size, divisor_limb,
+ divisor_limb_inverted) --
+ Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by the normalized DIVISOR_LIMB.
+ DIVISOR_LIMB_INVERTED should be 2^(2*BITS_PER_MP_LIMB) / DIVISOR_LIMB +
+ - 2^BITS_PER_MP_LIMB.
+ Return the single-limb remainder.
+
+Copyright (C) 1991, 1993, 1994, Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 1
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME UMUL_TIME
+#endif
+
+/* Remainder of (dividend_ptr, dividend_size) modulo divisor_limb, using
+   the precomputed inverse (see the file header for its definition; the
+   divisor is assumed normalized -- high bit set -- per that contract). */
+mp_limb_t
+#if __STDC__
+mpn_preinv_mod_1 (mp_srcptr dividend_ptr, mp_size_t dividend_size,
+ mp_limb_t divisor_limb, mp_limb_t divisor_limb_inverted)
+#else
+mpn_preinv_mod_1 (dividend_ptr, dividend_size, divisor_limb, divisor_limb_inverted)
+ mp_srcptr dividend_ptr;
+ mp_size_t dividend_size;
+ mp_limb_t divisor_limb;
+ mp_limb_t divisor_limb_inverted;
+#endif
+{
+ mp_size_t i;
+ mp_limb_t n0, r;
+ int dummy;
+
+ i = dividend_size - 1;
+ r = dividend_ptr[i];
+
+ /* Seed r: if the top limb is already >= the divisor it must go through
+    the reduction loop (with r = 0); otherwise it becomes the initial
+    partial remainder and the loop starts one limb lower. */
+ if (r >= divisor_limb)
+ r = 0;
+ else
+ i--;
+
+ for (; i >= 0; i--)
+ {
+ n0 = dividend_ptr[i];
+ udiv_qrnnd_preinv (dummy, r, r, n0, divisor_limb, divisor_limb_inverted);
+ }
+ return r;
+}
diff --git a/rts/gmp/mpn/generic/random.c b/rts/gmp/mpn/generic/random.c
new file mode 100644
index 0000000000..dea4e20e56
--- /dev/null
+++ b/rts/gmp/mpn/generic/random.c
@@ -0,0 +1,43 @@
+/* mpn_random -- Generate random numbers.
+
+Copyright (C) 1996, 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "urandom.h"
+
+/* Fill (res_ptr, size) with random limbs, guaranteeing that the most
+   significant limb ends up non-zero. */
+void
+#if __STDC__
+mpn_random (mp_ptr res_ptr, mp_size_t size)
+#else
+mpn_random (res_ptr, size)
+ mp_ptr res_ptr;
+ mp_size_t size;
+#endif
+{
+ mp_size_t j;
+ mp_limb_t top;
+
+ for (j = 0; j < size; j++)
+ res_ptr[j] = urandom ();
+
+ /* Redraw the top limb until it is non-zero. */
+ top = res_ptr[size - 1];
+ while (top == 0)
+ top = urandom ();
+ res_ptr[size - 1] = top;
+}
diff --git a/rts/gmp/mpn/generic/random2.c b/rts/gmp/mpn/generic/random2.c
new file mode 100644
index 0000000000..86682f81fa
--- /dev/null
+++ b/rts/gmp/mpn/generic/random2.c
@@ -0,0 +1,105 @@
+/* mpn_random2 -- Generate random numbers with relatively long strings
+ of ones and zeroes. Suitable for border testing.
+
+Copyright (C) 1992, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#if defined (__hpux) || defined (__alpha) || defined (__svr4__) || defined (__SVR4)
+/* HPUX lacks random(). DEC OSF/1 1.2 random() returns a double. */
+long mrand48 ();
+static inline long
+random ()
+{
+ return mrand48 ();
+}
+#elif defined(_WIN32) && !(defined(__CYGWIN__) || defined(__CYGWIN32__))
+/* MS CRT supplies just the poxy rand(), with an upper bound of 0x7fff.
+   Glue three 15-bit draws together.  The previous expression used
+   "rand () << 32", which is undefined behaviour when int is 32 bits
+   (C99 6.5.7p3) and contributed nothing on this platform; 15/30-bit
+   shifts cover every bit of a 32-bit unsigned long. */
+static inline unsigned long
+random ()
+{
+ return (unsigned long) rand () ^ ((unsigned long) rand () << 15)
+ ^ ((unsigned long) rand () << 30);
+}
+
+#else
+long random ();
+#endif
+
+/* It's a bit tricky to get this right, so please test the code well
+ if you hack with it. Some early versions of the function produced
+ random numbers with the leading limb == 0, and some versions never
+ made the most significant bit set. */
+
+void
+#if __STDC__
+mpn_random2 (mp_ptr res_ptr, mp_size_t size)
+#else
+mpn_random2 (res_ptr, size)
+ mp_ptr res_ptr;
+ mp_size_t size;
+#endif
+{
+ int n_bits;
+ int bit_pos;
+ mp_size_t limb_pos;
+ unsigned int ran;
+ mp_limb_t limb;
+
+ limb = 0;
+
+ /* Start off in a random bit position in the most significant limb. */
+ bit_pos = random () & (BITS_PER_MP_LIMB - 1);
+
+ /* Least significant bit of RAN chooses string of ones/string of zeroes.
+ Make most significant limb be non-zero by setting bit 0 of RAN. */
+ ran = random () | 1;
+
+ /* Walk from the top limb downward, emitting alternating runs of ones
+    and zeroes whose lengths come from RAN.  LIMB accumulates the limb
+    under construction; it is stored (and limb_pos decremented) each
+    time a run crosses a limb boundary. */
+ for (limb_pos = size - 1; limb_pos >= 0; )
+ {
+ n_bits = (ran >> 1) % BITS_PER_MP_LIMB + 1;
+ if ((ran & 1) != 0)
+ {
+ /* Generate a string of ones. */
+ if (n_bits >= bit_pos)
+ {
+ res_ptr[limb_pos--] = limb | ((((mp_limb_t) 2) << bit_pos) - 1);
+ bit_pos += BITS_PER_MP_LIMB;
+ limb = (~(mp_limb_t) 0) << (bit_pos - n_bits);
+ }
+ else
+ {
+ limb |= ((((mp_limb_t) 1) << n_bits) - 1) << (bit_pos - n_bits + 1);
+ }
+ }
+ else
+ {
+ /* Generate a string of zeroes. */
+ if (n_bits >= bit_pos)
+ {
+ res_ptr[limb_pos--] = limb;
+ limb = 0;
+ bit_pos += BITS_PER_MP_LIMB;
+ }
+ }
+ bit_pos -= n_bits;
+ ran = random ();
+ }
+}
diff --git a/rts/gmp/mpn/generic/rshift.c b/rts/gmp/mpn/generic/rshift.c
new file mode 100644
index 0000000000..59caf73529
--- /dev/null
+++ b/rts/gmp/mpn/generic/rshift.c
@@ -0,0 +1,88 @@
+/* mpn_rshift -- Shift right a low-level natural-number integer.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Shift U (pointed to by UP and USIZE limbs long) CNT bits to the right
+ and store the USIZE least significant limbs of the result at WP.
+ The bits shifted out to the right are returned.
+
+ Argument constraints:
+ 1. 0 < CNT < BITS_PER_MP_LIMB
+ 2. If the result is to be written over the input, WP must be <= UP.
+*/
+
+mp_limb_t
+#if __STDC__
+mpn_rshift (register mp_ptr wp,
+ register mp_srcptr up, mp_size_t usize,
+ register unsigned int cnt)
+#else
+mpn_rshift (wp, up, usize, cnt)
+ register mp_ptr wp;
+ register mp_srcptr up;
+ mp_size_t usize;
+ register unsigned int cnt;
+#endif
+{
+ register mp_limb_t high_limb, low_limb;
+ register unsigned sh_1, sh_2;
+ register mp_size_t i;
+ mp_limb_t retval;
+
+#ifdef DEBUG
+ if (usize == 0 || cnt == 0)
+ abort ();
+#endif
+
+ sh_1 = cnt;
+
+ /* Dead code: a CNT == 0 copy path, disabled because the argument
+ constraints above guarantee 0 < CNT < BITS_PER_MP_LIMB. */
+#if 0
+ if (sh_1 == 0)
+ {
+ if (wp != up)
+ {
+ /* Copy from low end to high end, to allow specified input/output
+ overlapping. */
+ for (i = 0; i < usize; i++)
+ wp[i] = up[i];
+ }
+ return usize;
+ }
+#endif
+
+ /* Offset WP so that the store wp[i] below targets the limb one
+ position lower than up[i]. */
+ wp -= 1;
+ sh_2 = BITS_PER_MP_LIMB - sh_1;
+ high_limb = up[0];
+ /* The CNT bits shifted out of the low end, returned left-justified
+ in a limb. */
+ retval = high_limb << sh_2;
+ low_limb = high_limb;
+
+ /* Each result limb combines the high bits of up[i-1] with the low
+ bits of up[i]. */
+ for (i = 1; i < usize; i++)
+ {
+ high_limb = up[i];
+ wp[i] = (low_limb >> sh_1) | (high_limb << sh_2);
+ low_limb = high_limb;
+ }
+ wp[i] = low_limb >> sh_1;
+
+ return retval;
+}
diff --git a/rts/gmp/mpn/generic/sb_divrem_mn.c b/rts/gmp/mpn/generic/sb_divrem_mn.c
new file mode 100644
index 0000000000..a269e34f5f
--- /dev/null
+++ b/rts/gmp/mpn/generic/sb_divrem_mn.c
@@ -0,0 +1,201 @@
+/* mpn_sb_divrem_mn -- Divide natural numbers, producing both remainder and
+ quotient.
+
+ THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE
+ INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
+ IN FACT, IT IS ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A
+ FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1993, 1994, 1995, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Divide num (NP/NSIZE) by den (DP/DSIZE) and write
+ the NSIZE-DSIZE least significant quotient limbs at QP
+ and the DSIZE long remainder at NP. If QEXTRA_LIMBS is
+ non-zero, generate that many fraction bits and append them after the
+ other quotient limbs.
+ Return the most significant limb of the quotient, this is always 0 or 1.
+
+ Preconditions:
+ 0. NSIZE >= DSIZE.
+ 1. The most significant bit of the divisor must be set.
+ 2. QP must either not overlap with the input operands at all, or
+ QP + DSIZE >= NP must hold true. (This means that it's
+ possible to put the quotient in the high part of NUM, right after the
+ remainder in NUM.)
+ 3. NSIZE >= DSIZE, even if QEXTRA_LIMBS is non-zero.
+ 4. DSIZE >= 2. */
+
+
+#define PREINVERT_VIABLE \
+ (UDIV_TIME > 2 * UMUL_TIME + 6 /* && ! TARGET_REGISTER_STARVED */)
+
+mp_limb_t
+#if __STDC__
+mpn_sb_divrem_mn (mp_ptr qp,
+ mp_ptr np, mp_size_t nsize,
+ mp_srcptr dp, mp_size_t dsize)
+#else
+mpn_sb_divrem_mn (qp, np, nsize, dp, dsize)
+ mp_ptr qp;
+ mp_ptr np;
+ mp_size_t nsize;
+ mp_srcptr dp;
+ mp_size_t dsize;
+#endif
+{
+ mp_limb_t most_significant_q_limb = 0;
+ mp_size_t i;
+ mp_limb_t dx, d1, n0;
+ mp_limb_t dxinv;
+ int have_preinv;
+
+ /* NOTE(review): the file-header preconditions say DSIZE >= 2, but this
+ assertion requires DSIZE > 2 -- confirm which is the intended
+ contract. */
+ ASSERT_ALWAYS (dsize > 2);
+
+ /* Work on the high DSIZE limbs of the numerator first. */
+ np += nsize - dsize;
+ dx = dp[dsize - 1]; /* most significant divisor limb (normalized) */
+ d1 = dp[dsize - 2]; /* second most significant divisor limb */
+ n0 = np[dsize - 1];
+
+ /* If the high part of the numerator is >= the divisor, subtract the
+ divisor once and record a leading quotient limb of 1. */
+ if (n0 >= dx)
+ {
+ if (n0 > dx || mpn_cmp (np, dp, dsize - 1) >= 0)
+ {
+ mpn_sub_n (np, np, dp, dsize);
+ most_significant_q_limb = 1;
+ }
+ }
+
+ /* If multiplication is much faster than division, preinvert the
+ most significant divisor limb before entering the loop. */
+ /* HAVE_PREINV is only read below under PREINVERT_VIABLE && have_preinv;
+ PREINVERT_VIABLE is a compile-time constant, so the variable is never
+ used uninitialized. */
+ if (PREINVERT_VIABLE)
+ {
+ have_preinv = 0;
+ if ((UDIV_TIME - (2 * UMUL_TIME + 6)) * (nsize - dsize) > UDIV_TIME)
+ {
+ invert_limb (dxinv, dx);
+ have_preinv = 1;
+ }
+ }
+
+ /* Schoolbook loop: one quotient limb per iteration, most significant
+ first. */
+ for (i = nsize - dsize - 1; i >= 0; i--)
+ {
+ mp_limb_t q;
+ mp_limb_t nx;
+ mp_limb_t cy_limb;
+
+ nx = np[dsize - 1];
+ np--;
+
+ /* Numerator's high limb equals the divisor's high limb: the
+ quotient digit is (at most) the maximum limb value. */
+ if (nx == dx)
+ {
+ /* This might over-estimate q, but it's probably not worth
+ the extra code here to find out. */
+ q = ~(mp_limb_t) 0;
+
+#if 1
+ cy_limb = mpn_submul_1 (np, dp, dsize, q);
+#else
+ /* This should be faster on many machines */
+ cy_limb = mpn_sub_n (np + 1, np + 1, dp, dsize);
+ cy = mpn_add_n (np, np, dp, dsize);
+ np[dsize] += cy;
+#endif
+
+ /* Correct a one-off over-estimate by adding the divisor back. */
+ if (nx != cy_limb)
+ {
+ mpn_add_n (np, np, dp, dsize);
+ q--;
+ }
+
+ qp[i] = q;
+ }
+ else
+ {
+ mp_limb_t rx, r1, r0, p1, p0;
+
+ /* "workaround" avoids a problem with gcc 2.7.2.3 i386 register
+ usage when np[dsize-1] is used in an asm statement like
+ umul_ppmm in udiv_qrnnd_preinv. The symptom is seg faults due
+ to registers being clobbered. gcc 2.95 i386 doesn't have the
+ problem. */
+ {
+ mp_limb_t workaround = np[dsize - 1];
+ if (PREINVERT_VIABLE && have_preinv)
+ udiv_qrnnd_preinv (q, r1, nx, workaround, dx, dxinv);
+ else
+ udiv_qrnnd (q, r1, nx, workaround, dx);
+ }
+ /* Refine q using the second divisor limb (3x2 division step). */
+ umul_ppmm (p1, p0, d1, q);
+
+ r0 = np[dsize - 2];
+ rx = 0;
+ if (r1 < p1 || (r1 == p1 && r0 < p0))
+ {
+ p1 -= p0 < d1;
+ p0 -= d1;
+ q--;
+ r1 += dx;
+ rx = r1 < dx;
+ }
+
+ p1 += r0 < p0; /* cannot carry! */
+ rx -= r1 < p1; /* may become 11..1 if q is still too large */
+ r1 -= p1;
+ r0 -= p0;
+
+ /* Subtract q times the remaining divisor limbs from the
+ numerator. */
+ cy_limb = mpn_submul_1 (np, dp, dsize - 2, q);
+
+ {
+ mp_limb_t cy1, cy2;
+ cy1 = r0 < cy_limb;
+ r0 -= cy_limb;
+ cy2 = r1 < cy1;
+ r1 -= cy1;
+ np[dsize - 1] = r1;
+ np[dsize - 2] = r0;
+ /* Final fix-up: if the borrow doesn't match rx, q was one too
+ large; add the divisor back. */
+ if (cy2 != rx)
+ {
+ mpn_add_n (np, np, dp, dsize);
+ q--;
+ }
+ }
+ qp[i] = q;
+ }
+ }
+
+ /* ______ ______ ______
+ |__rx__|__r1__|__r0__| partial remainder
+ ______ ______
+ - |__p1__|__p0__| partial product to subtract
+ ______ ______
+ - |______|cylimb|
+
+ rx is -1, 0 or 1. If rx=1, then q is correct (it should match
+ carry out). If rx=-1 then q is too large. If rx=0, then q might
+ be too large, but it is most likely correct.
+ */
+
+ return most_significant_q_limb;
+}
diff --git a/rts/gmp/mpn/generic/scan0.c b/rts/gmp/mpn/generic/scan0.c
new file mode 100644
index 0000000000..96f05ce854
--- /dev/null
+++ b/rts/gmp/mpn/generic/scan0.c
@@ -0,0 +1,62 @@
+/* mpn_scan0 -- Scan from a given bit position for the next clear bit.
+
+Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Design issues:
+ 1. What if starting_bit is not within U? Caller's problem?
+ 2. Bit index should be 'unsigned'?
+
+ Argument constraints:
+ 1. U must sooner or later have a limb with a clear bit.
+ */
+
+unsigned long int
+#if __STDC__
+mpn_scan0 (register mp_srcptr up,
+ register unsigned long int starting_bit)
+#else
+mpn_scan0 (up, starting_bit)
+ register mp_srcptr up;
+ register unsigned long int starting_bit;
+#endif
+{
+ mp_size_t starting_word;
+ mp_limb_t alimb;
+ int cnt;
+ mp_srcptr p;
+
+ /* Start at the word implied by STARTING_BIT. */
+ starting_word = starting_bit / BITS_PER_MP_LIMB;
+ p = up + starting_word;
+ /* Complement the limb so that scanning for a clear bit in U becomes
+ scanning for a set bit in the complement. */
+ alimb = ~*p++;
+
+ /* Mask off any bits before STARTING_BIT in the first limb. */
+ alimb &= - (mp_limb_t) 1 << (starting_bit % BITS_PER_MP_LIMB);
+
+ while (alimb == 0)
+ alimb = ~*p++;
+
+ /* alimb & -alimb isolates the least significant set bit; its leading
+ zero count converts to the bit index within the limb. P has advanced
+ one past the limb holding the hit, hence the -1 adjustment. */
+ count_leading_zeros (cnt, alimb & -alimb);
+ return (p - up) * BITS_PER_MP_LIMB - 1 - cnt;
+}
diff --git a/rts/gmp/mpn/generic/scan1.c b/rts/gmp/mpn/generic/scan1.c
new file mode 100644
index 0000000000..98e2e0dcc0
--- /dev/null
+++ b/rts/gmp/mpn/generic/scan1.c
@@ -0,0 +1,62 @@
+/* mpn_scan1 -- Scan from a given bit position for the next set bit.
+
+Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Design issues:
+ 1. What if starting_bit is not within U? Caller's problem?
+ 2. Bit index should be 'unsigned'?
+
+ Argument constraints:
+ 1. U must sooner or later have a limb != 0.
+ */
+
+unsigned long int
+#if __STDC__
+mpn_scan1 (register mp_srcptr up,
+ register unsigned long int starting_bit)
+#else
+mpn_scan1 (up, starting_bit)
+ register mp_srcptr up;
+ register unsigned long int starting_bit;
+#endif
+{
+ mp_size_t starting_word;
+ mp_limb_t alimb;
+ int cnt;
+ mp_srcptr p;
+
+ /* Start at the word implied by STARTING_BIT. */
+ starting_word = starting_bit / BITS_PER_MP_LIMB;
+ p = up + starting_word;
+ alimb = *p++;
+
+ /* Mask off any bits before STARTING_BIT in the first limb. */
+ alimb &= - (mp_limb_t) 1 << (starting_bit % BITS_PER_MP_LIMB);
+
+ while (alimb == 0)
+ alimb = *p++;
+
+ /* alimb & -alimb isolates the least significant set bit; its leading
+ zero count converts to the bit index within the limb. P has advanced
+ one past the limb holding the hit, hence the -1 adjustment. */
+ count_leading_zeros (cnt, alimb & -alimb);
+ return (p - up) * BITS_PER_MP_LIMB - 1 - cnt;
+}
diff --git a/rts/gmp/mpn/generic/set_str.c b/rts/gmp/mpn/generic/set_str.c
new file mode 100644
index 0000000000..e6ccc92154
--- /dev/null
+++ b/rts/gmp/mpn/generic/set_str.c
@@ -0,0 +1,159 @@
+/* mpn_set_str (mp_ptr res_ptr, const char *str, size_t str_len, int base)
+ -- Convert a STR_LEN long base BASE byte string pointed to by STR to a
+ limb vector pointed to by RES_PTR. Return the number of limbs in
+ RES_PTR.
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_size_t
+#if __STDC__
+mpn_set_str (mp_ptr xp, const unsigned char *str, size_t str_len, int base)
+#else
+mpn_set_str (xp, str, str_len, base)
+ mp_ptr xp;
+ const unsigned char *str;
+ size_t str_len;
+ int base;
+#endif
+{
+ mp_size_t size;
+ mp_limb_t big_base;
+ int indigits_per_limb;
+ mp_limb_t res_digit;
+
+ big_base = __mp_bases[base].big_base;
+ indigits_per_limb = __mp_bases[base].chars_per_limb;
+
+/* size = str_len / indigits_per_limb + 1; */
+
+ size = 0;
+
+ if ((base & (base - 1)) == 0)
+ {
+ /* The base is a power of 2. Read the input string from
+ least to most significant character/digit. */
+
+ const unsigned char *s;
+ int next_bitpos;
+ /* Presumably, for power-of-2 bases the __mp_bases table stores the
+ number of bits per digit in big_base -- that is how it is used
+ here. */
+ int bits_per_indigit = big_base;
+
+ res_digit = 0;
+ next_bitpos = 0;
+
+ /* Pack digits into limbs, flushing a limb whenever it fills up. */
+ for (s = str + str_len - 1; s >= str; s--)
+ {
+ int inp_digit = *s;
+
+ res_digit |= (mp_limb_t) inp_digit << next_bitpos;
+ next_bitpos += bits_per_indigit;
+ if (next_bitpos >= BITS_PER_MP_LIMB)
+ {
+ xp[size++] = res_digit;
+ next_bitpos -= BITS_PER_MP_LIMB;
+ /* Carry the digit bits that did not fit into the next limb. */
+ res_digit = inp_digit >> (bits_per_indigit - next_bitpos);
+ }
+ }
+
+ if (res_digit != 0)
+ xp[size++] = res_digit;
+ }
+ else
+ {
+ /* General case. The base is not a power of 2. */
+
+ size_t i;
+ int j;
+ mp_limb_t cy_limb;
+
+ /* Process full groups of INDIGITS_PER_LIMB characters: convert each
+ group to a limb-sized digit, then fold it in with
+ x = x * big_base + digit. */
+ for (i = indigits_per_limb; i < str_len; i += indigits_per_limb)
+ {
+ res_digit = *str++;
+ if (base == 10)
+ { /* This is a common case.
+ Help the compiler to avoid multiplication. */
+ for (j = 1; j < indigits_per_limb; j++)
+ res_digit = res_digit * 10 + *str++;
+ }
+ else
+ {
+ for (j = 1; j < indigits_per_limb; j++)
+ res_digit = res_digit * base + *str++;
+ }
+
+ if (size == 0)
+ {
+ if (res_digit != 0)
+ {
+ xp[0] = res_digit;
+ size = 1;
+ }
+ }
+ else
+ {
+ cy_limb = mpn_mul_1 (xp, xp, size, big_base);
+ cy_limb += mpn_add_1 (xp, xp, size, res_digit);
+ if (cy_limb != 0)
+ xp[size++] = cy_limb;
+ }
+ }
+
+ /* Handle the final, possibly partial, group of characters. BIG_BASE
+ is rebuilt as base**(number of remaining digits). */
+ big_base = base;
+ res_digit = *str++;
+ if (base == 10)
+ { /* This is a common case.
+ Help the compiler to avoid multiplication. */
+ for (j = 1; j < str_len - (i - indigits_per_limb); j++)
+ {
+ res_digit = res_digit * 10 + *str++;
+ big_base *= 10;
+ }
+ }
+ else
+ {
+ for (j = 1; j < str_len - (i - indigits_per_limb); j++)
+ {
+ res_digit = res_digit * base + *str++;
+ big_base *= base;
+ }
+ }
+
+ if (size == 0)
+ {
+ if (res_digit != 0)
+ {
+ xp[0] = res_digit;
+ size = 1;
+ }
+ }
+ else
+ {
+ cy_limb = mpn_mul_1 (xp, xp, size, big_base);
+ cy_limb += mpn_add_1 (xp, xp, size, res_digit);
+ if (cy_limb != 0)
+ xp[size++] = cy_limb;
+ }
+ }
+
+ return size;
+}
diff --git a/rts/gmp/mpn/generic/sqr_basecase.c b/rts/gmp/mpn/generic/sqr_basecase.c
new file mode 100644
index 0000000000..760258a3e0
--- /dev/null
+++ b/rts/gmp/mpn/generic/sqr_basecase.c
@@ -0,0 +1,83 @@
+/* mpn_sqr_basecase -- Internal routine to square two natural numbers
+ of length m and n.
+
+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
+ SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+
+Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+#if __STDC__
+mpn_sqr_basecase (mp_ptr prodp, mp_srcptr up, mp_size_t n)
+#else
+mpn_sqr_basecase (prodp, up, n)
+ mp_ptr prodp;
+ mp_srcptr up;
+ mp_size_t n;
+#endif
+{
+ mp_size_t i;
+
+ /* Square of the low limb goes into prodp[0..1]. */
+ {
+ /* N.B.! We need the superfluous indirection through argh to work around
+ a reloader bug in GCC 2.7.*. */
+ mp_limb_t x;
+ mp_limb_t argh;
+ x = up[0];
+ umul_ppmm (argh, prodp[0], x, x);
+ prodp[1] = argh;
+ }
+ if (n > 1)
+ {
+ mp_limb_t tarr[2 * KARATSUBA_SQR_THRESHOLD];
+ mp_ptr tp = tarr;
+ mp_limb_t cy;
+
+ /* must fit 2*n limbs in tarr */
+ ASSERT (n <= KARATSUBA_SQR_THRESHOLD);
+
+ /* Accumulate the off-diagonal products u[i]*u[j], i < j, in TP. */
+ cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
+ tp[n - 1] = cy;
+ for (i = 2; i < n; i++)
+ {
+ mp_limb_t cy;
+ cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
+ tp[n + i - 2] = cy;
+ }
+ /* Squares of the remaining limbs go on the diagonal of PRODP. */
+ for (i = 1; i < n; i++)
+ {
+ mp_limb_t x;
+ x = up[i];
+ umul_ppmm (prodp[2 * i + 1], prodp[2 * i], x, x);
+ }
+ /* Double the off-diagonal sum (each cross product occurs twice in a
+ square) and add it into the diagonal terms. */
+ {
+ mp_limb_t cy;
+ cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
+ cy += mpn_add_n (prodp + 1, prodp + 1, tp, 2 * n - 2);
+ prodp[2 * n - 1] += cy;
+ }
+ }
+}
diff --git a/rts/gmp/mpn/generic/sqrtrem.c b/rts/gmp/mpn/generic/sqrtrem.c
new file mode 100644
index 0000000000..ee3b5144dd
--- /dev/null
+++ b/rts/gmp/mpn/generic/sqrtrem.c
@@ -0,0 +1,509 @@
+/* mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
+
+ Write the square root of {OP_PTR, OP_SIZE} at ROOT_PTR.
+ Write the remainder at REM_PTR, if REM_PTR != NULL.
+ Return the size of the remainder.
+ (The size of the root is always half of the size of the operand.)
+
+ OP_PTR and ROOT_PTR may not point to the same object.
+ OP_PTR and REM_PTR may point to the same object.
+
+ If REM_PTR is NULL, only the root is computed and the return value of
+ the function is 0 if OP is a perfect square, and *any* non-zero number
+ otherwise.
+
+Copyright (C) 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* This code is just correct if "unsigned char" has at least 8 bits. It
+ doesn't help to use CHAR_BIT from limits.h, as the real problem is
+ the static arrays. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Square root algorithm:
+
+ 1. Shift OP (the input) to the left an even number of bits s.t. there
+ are an even number of words and either (or both) of the most
+ significant bits are set. This way, sqrt(OP) has exactly half as
+ many words as OP, and has its most significant bit set.
+
+ 2. Get a 9-bit approximation to sqrt(OP) using the pre-computed tables.
+ This approximation is used for the first single-precision
+ iterations of Newton's method, yielding a full-word approximation
+ to sqrt(OP).
+
+ 3. Perform multiple-precision Newton iteration until we have the
+ exact result. Only about half of the input operand is used in
+ this calculation, as the square root is perfectly determinable
+ from just the higher half of a number. */
+
+/* Define this macro for IEEE P854 machines with a fast sqrt instruction. */
+#if defined __GNUC__ && ! defined __SOFT_FLOAT
+
+#if defined (__sparc__) && BITS_PER_MP_LIMB == 32
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrtd %1,%0" : "=f" (__sqrt_res) : "f" (a)); \
+ __sqrt_res; \
+ })
+#endif
+
+#if defined (__HAVE_68881__)
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrtx %1,%0" : "=f" (__sqrt_res) : "f" (a)); \
+ __sqrt_res; \
+ })
+#endif
+
+#if defined (__hppa) && BITS_PER_MP_LIMB == 32
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrt,dbl %1,%0" : "=fx" (__sqrt_res) : "fx" (a)); \
+ __sqrt_res; \
+ })
+#endif
+
+#if defined (_ARCH_PWR2) && BITS_PER_MP_LIMB == 32
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrt %0,%1" : "=f" (__sqrt_res) : "f" (a)); \
+ __sqrt_res; \
+ })
+#endif
+
+#if 0
+#if defined (__i386__) || defined (__i486__)
+#define SQRT(a) \
+ ({ \
+ double __sqrt_res; \
+ asm ("fsqrt" : "=t" (__sqrt_res) : "0" (a)); \
+ __sqrt_res; \
+ })
+#endif
+#endif
+
+#endif
+
+#ifndef SQRT
+
+/* Tables for initial approximation of the square root. These are
+ indexed with bits 1-8 of the operand for which the square root is
+ calculated, where bit 0 is the most significant non-zero bit. I.e.
+ the most significant one-bit is not used, since that per definition
+ is one. Likewise, the tables don't return the highest bit of the
+ result. That bit must be inserted by or:ing the returned value with
+ 0x100. This way, we get a 9-bit approximation from 8-bit tables! */
+
+/* Table to be used for operands with an even total number of bits.
+ (Exactly as in the decimal system there are similarities between the
+ square root of numbers with the same initial digits and an even
+ difference in the total number of digits. Consider the square root
+ of 1, 10, 100, 1000, ...) */
+static const unsigned char even_approx_tab[256] =
+{
+ 0x6a, 0x6a, 0x6b, 0x6c, 0x6c, 0x6d, 0x6e, 0x6e,
+ 0x6f, 0x70, 0x71, 0x71, 0x72, 0x73, 0x73, 0x74,
+ 0x75, 0x75, 0x76, 0x77, 0x77, 0x78, 0x79, 0x79,
+ 0x7a, 0x7b, 0x7b, 0x7c, 0x7d, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x80, 0x81, 0x81, 0x82, 0x83, 0x83, 0x84,
+ 0x85, 0x85, 0x86, 0x87, 0x87, 0x88, 0x89, 0x89,
+ 0x8a, 0x8b, 0x8b, 0x8c, 0x8d, 0x8d, 0x8e, 0x8f,
+ 0x8f, 0x90, 0x90, 0x91, 0x92, 0x92, 0x93, 0x94,
+ 0x94, 0x95, 0x96, 0x96, 0x97, 0x97, 0x98, 0x99,
+ 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9c, 0x9d, 0x9e,
+ 0x9e, 0x9f, 0xa0, 0xa0, 0xa1, 0xa1, 0xa2, 0xa3,
+ 0xa3, 0xa4, 0xa4, 0xa5, 0xa6, 0xa6, 0xa7, 0xa7,
+ 0xa8, 0xa9, 0xa9, 0xaa, 0xaa, 0xab, 0xac, 0xac,
+ 0xad, 0xad, 0xae, 0xaf, 0xaf, 0xb0, 0xb0, 0xb1,
+ 0xb2, 0xb2, 0xb3, 0xb3, 0xb4, 0xb5, 0xb5, 0xb6,
+ 0xb6, 0xb7, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xba,
+ 0xbb, 0xbb, 0xbc, 0xbd, 0xbd, 0xbe, 0xbe, 0xbf,
+ 0xc0, 0xc0, 0xc1, 0xc1, 0xc2, 0xc2, 0xc3, 0xc3,
+ 0xc4, 0xc5, 0xc5, 0xc6, 0xc6, 0xc7, 0xc7, 0xc8,
+ 0xc9, 0xc9, 0xca, 0xca, 0xcb, 0xcb, 0xcc, 0xcc,
+ 0xcd, 0xce, 0xce, 0xcf, 0xcf, 0xd0, 0xd0, 0xd1,
+ 0xd1, 0xd2, 0xd3, 0xd3, 0xd4, 0xd4, 0xd5, 0xd5,
+ 0xd6, 0xd6, 0xd7, 0xd7, 0xd8, 0xd9, 0xd9, 0xda,
+ 0xda, 0xdb, 0xdb, 0xdc, 0xdc, 0xdd, 0xdd, 0xde,
+ 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe1, 0xe2, 0xe2,
+ 0xe3, 0xe3, 0xe4, 0xe4, 0xe5, 0xe5, 0xe6, 0xe6,
+ 0xe7, 0xe7, 0xe8, 0xe8, 0xe9, 0xea, 0xea, 0xeb,
+ 0xeb, 0xec, 0xec, 0xed, 0xed, 0xee, 0xee, 0xef,
+ 0xef, 0xf0, 0xf0, 0xf1, 0xf1, 0xf2, 0xf2, 0xf3,
+ 0xf3, 0xf4, 0xf4, 0xf5, 0xf5, 0xf6, 0xf6, 0xf7,
+ 0xf7, 0xf8, 0xf8, 0xf9, 0xf9, 0xfa, 0xfa, 0xfb,
+ 0xfb, 0xfc, 0xfc, 0xfd, 0xfd, 0xfe, 0xfe, 0xff,
+};
+
+/* Table to be used for operands with an odd total number of bits.
+ (Further comments before previous table.) */
+static const unsigned char odd_approx_tab[256] =
+{
+ 0x00, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
+ 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07,
+ 0x07, 0x08, 0x08, 0x09, 0x09, 0x0a, 0x0a, 0x0b,
+ 0x0b, 0x0c, 0x0c, 0x0d, 0x0d, 0x0e, 0x0e, 0x0f,
+ 0x0f, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12,
+ 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16,
+ 0x16, 0x17, 0x17, 0x18, 0x18, 0x19, 0x19, 0x1a,
+ 0x1a, 0x1b, 0x1b, 0x1b, 0x1c, 0x1c, 0x1d, 0x1d,
+ 0x1e, 0x1e, 0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21,
+ 0x21, 0x22, 0x22, 0x23, 0x23, 0x23, 0x24, 0x24,
+ 0x25, 0x25, 0x26, 0x26, 0x27, 0x27, 0x27, 0x28,
+ 0x28, 0x29, 0x29, 0x2a, 0x2a, 0x2a, 0x2b, 0x2b,
+ 0x2c, 0x2c, 0x2d, 0x2d, 0x2d, 0x2e, 0x2e, 0x2f,
+ 0x2f, 0x30, 0x30, 0x30, 0x31, 0x31, 0x32, 0x32,
+ 0x32, 0x33, 0x33, 0x34, 0x34, 0x35, 0x35, 0x35,
+ 0x36, 0x36, 0x37, 0x37, 0x37, 0x38, 0x38, 0x39,
+ 0x39, 0x39, 0x3a, 0x3a, 0x3b, 0x3b, 0x3b, 0x3c,
+ 0x3c, 0x3d, 0x3d, 0x3d, 0x3e, 0x3e, 0x3f, 0x3f,
+ 0x40, 0x40, 0x40, 0x41, 0x41, 0x41, 0x42, 0x42,
+ 0x43, 0x43, 0x43, 0x44, 0x44, 0x45, 0x45, 0x45,
+ 0x46, 0x46, 0x47, 0x47, 0x47, 0x48, 0x48, 0x49,
+ 0x49, 0x49, 0x4a, 0x4a, 0x4b, 0x4b, 0x4b, 0x4c,
+ 0x4c, 0x4c, 0x4d, 0x4d, 0x4e, 0x4e, 0x4e, 0x4f,
+ 0x4f, 0x50, 0x50, 0x50, 0x51, 0x51, 0x51, 0x52,
+ 0x52, 0x53, 0x53, 0x53, 0x54, 0x54, 0x54, 0x55,
+ 0x55, 0x56, 0x56, 0x56, 0x57, 0x57, 0x57, 0x58,
+ 0x58, 0x59, 0x59, 0x59, 0x5a, 0x5a, 0x5a, 0x5b,
+ 0x5b, 0x5b, 0x5c, 0x5c, 0x5d, 0x5d, 0x5d, 0x5e,
+ 0x5e, 0x5e, 0x5f, 0x5f, 0x60, 0x60, 0x60, 0x61,
+ 0x61, 0x61, 0x62, 0x62, 0x62, 0x63, 0x63, 0x63,
+ 0x64, 0x64, 0x65, 0x65, 0x65, 0x66, 0x66, 0x66,
+ 0x67, 0x67, 0x67, 0x68, 0x68, 0x68, 0x69, 0x69,
+};
+#endif
+
+
+mp_size_t
+#if __STDC__
+mpn_sqrtrem (mp_ptr root_ptr, mp_ptr rem_ptr, mp_srcptr op_ptr, mp_size_t op_size)
+#else
+mpn_sqrtrem (root_ptr, rem_ptr, op_ptr, op_size)
+ mp_ptr root_ptr;
+ mp_ptr rem_ptr;
+ mp_srcptr op_ptr;
+ mp_size_t op_size;
+#endif
+{
+ /* R (root result) */
+ mp_ptr rp; /* Pointer to least significant word */
+ mp_size_t rsize; /* The size in words */
+
+ /* T (OP shifted to the left a.k.a. normalized) */
+ mp_ptr tp; /* Pointer to least significant word */
+ mp_size_t tsize; /* The size in words */
+ mp_ptr t_end_ptr; /* Pointer right beyond most sign. word */
+ mp_limb_t t_high0, t_high1; /* The two most significant words */
+
+ /* TT (temporary for numerator/remainder) */
+ mp_ptr ttp; /* Pointer to least significant word */
+
+ /* X (temporary for quotient in main loop) */
+ mp_ptr xp; /* Pointer to least significant word */
+ mp_size_t xsize; /* The size in words */
+
+ unsigned cnt;
+ mp_limb_t initial_approx; /* Initially made approximation */
+ mp_size_t tsizes[BITS_PER_MP_LIMB]; /* Successive calculation precisions */
+ mp_size_t tmp;
+ mp_size_t i;
+
+ mp_limb_t cy_limb;
+ TMP_DECL (marker);
+
+ /* If OP is zero, both results are zero. */
+ if (op_size == 0)
+ return 0;
+
+ /* CNT counts how far OP must be shifted up; BITS_PER_MP_LIMB is added
+ when a padding limb is needed to make the limb count even. */
+ count_leading_zeros (cnt, op_ptr[op_size - 1]);
+ tsize = op_size;
+ if ((tsize & 1) != 0)
+ {
+ cnt += BITS_PER_MP_LIMB;
+ tsize++;
+ }
+
+ rsize = tsize / 2;
+ rp = root_ptr;
+
+ TMP_MARK (marker);
+
+ /* Shift OP an even number of bits into T, such that either the most or
+ the second most significant bit is set, and such that the number of
+ words in T becomes even. This way, the number of words in R=sqrt(OP)
+ is exactly half as many as in OP, and the most significant bit of R
+ is set.
+
+ Also, the initial approximation is simplified by this up-shifted OP.
+
+ Finally, the Newtonian iteration which is the main part of this
+ program performs division by R. The fast division routine expects
+ the divisor to be "normalized" in exactly the sense of having the
+ most significant bit set. */
+
+ tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+
+ /* (cnt & ~1) rounds the shift down to an even number of bits. */
+ if ((cnt & ~1) % BITS_PER_MP_LIMB != 0)
+ t_high0 = mpn_lshift (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size,
+ (cnt & ~1) % BITS_PER_MP_LIMB);
+ else
+ MPN_COPY (tp + cnt / BITS_PER_MP_LIMB, op_ptr, op_size);
+
+ if (cnt >= BITS_PER_MP_LIMB)
+ tp[0] = 0;
+
+ /* NOTE(review): this overwrites the mpn_lshift result assigned to
+ t_high0 above; that earlier assignment is redundant but harmless. */
+ t_high0 = tp[tsize - 1];
+ t_high1 = tp[tsize - 2]; /* Never stray. TSIZE is >= 2. */
+
+/* Is there a fast sqrt instruction defined for this machine? */
+#ifdef SQRT
+ {
+ initial_approx = SQRT (t_high0 * MP_BASE_AS_DOUBLE + t_high1);
+ /* If t_high0,,t_high1 is big, the result in INITIAL_APPROX might have
+ become incorrect due to overflow in the conversion from double to
+ mp_limb_t above. It will typically be zero in that case, but might be
+ a small number on some machines. The most significant bit of
+ INITIAL_APPROX should be set, so that bit is a good overflow
+ indication. */
+ if ((mp_limb_signed_t) initial_approx >= 0)
+ initial_approx = ~(mp_limb_t)0;
+ }
+#else
+ /* Get a 9 bit approximation from the tables. The tables expect to
+ be indexed with the 8 high bits right below the highest bit.
+ Also, the highest result bit is not returned by the tables, and
+ must be or:ed into the result. The scheme gives 9 bits of start
+ approximation with just 256-entry 8 bit tables. */
+
+ if ((cnt & 1) == 0)
+ {
+ /* The most significant bit of t_high0 is set. */
+ initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 1);
+ initial_approx &= 0xff;
+ initial_approx = even_approx_tab[initial_approx];
+ }
+ else
+ {
+ /* The most significant bit of t_high0 is unset,
+ the second most significant is set. */
+ initial_approx = t_high0 >> (BITS_PER_MP_LIMB - 8 - 2);
+ initial_approx &= 0xff;
+ initial_approx = odd_approx_tab[initial_approx];
+ }
+ initial_approx |= 0x100;
+ initial_approx <<= BITS_PER_MP_LIMB - 8 - 1;
+
+ /* Perform small precision Newtonian iterations to get a full word
+ approximation. For small operands, these iterations will do the
+ entire job. */
+ if (t_high0 == ~(mp_limb_t)0)
+ initial_approx = t_high0;
+ else
+ {
+ mp_limb_t quot;
+
+ if (t_high0 >= initial_approx)
+ initial_approx = t_high0 + 1;
+
+ /* First get about 18 bits with pure C arithmetics. */
+ quot = t_high0 / (initial_approx >> BITS_PER_MP_LIMB/2) << BITS_PER_MP_LIMB/2;
+ initial_approx = (initial_approx + quot) / 2;
+ initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
+
+ /* Now get a full word by one (or for > 36 bit machines) several
+ iterations. */
+ for (i = 18; i < BITS_PER_MP_LIMB; i <<= 1)
+ {
+ mp_limb_t ignored_remainder;
+
+ udiv_qrnnd (quot, ignored_remainder,
+ t_high0, t_high1, initial_approx);
+ initial_approx = (initial_approx + quot) / 2;
+ initial_approx |= (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
+ }
+ }
+#endif
+
+ rp[0] = initial_approx;
+ rsize = 1;
+
+#ifdef SQRT_DEBUG
+ printf ("\n\nT = ");
+ mpn_dump (tp, tsize);
+#endif
+
+ if (tsize > 2)
+ {
+ /* Determine the successive precisions to use in the iteration. We
+ minimize the precisions, beginning with the highest (i.e. last
+ iteration) to the lowest (i.e. first iteration). */
+
+ xp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+ ttp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+
+ t_end_ptr = tp + tsize;
+
+ tmp = tsize / 2;
+ for (i = 0;; i++)
+ {
+ tsize = (tmp + 1) / 2;
+ if (tmp == tsize)
+ break;
+ tsizes[i] = tsize + tmp;
+ tmp = tsize;
+ }
+
+ /* Main Newton iteration loop. For big arguments, most of the
+ time is spent here. */
+
+ /* It is possible to do a great optimization here. The successive
+ divisors in the mpn_divmod call below have more and more leading
+ words equal to its predecessor. Therefore the beginning of
+ each division will repeat the same work as did the last
+ division. If we could guarantee that the leading words of two
+ consecutive divisors are the same (i.e. in this case, a later
+ divisor has just more digits at the end) it would be a simple
+ matter of just using the old remainder of the last division in
+ a subsequent division, to take care of this optimization. This
+ idea would surely make a difference even for small arguments. */
+
+ /* Loop invariants:
+
+ R <= shiftdown_to_same_size(floor(sqrt(OP))) < R + 1.
+ X - 1 < shiftdown_to_same_size(floor(sqrt(OP))) <= X.
+ R <= shiftdown_to_same_size(X). */
+
+ while (--i >= 0)
+ {
+ mp_limb_t cy;
+#ifdef SQRT_DEBUG
+ mp_limb_t old_least_sign_r = rp[0];
+ mp_size_t old_rsize = rsize;
+
+ printf ("R = ");
+ mpn_dump (rp, rsize);
+#endif
+ tsize = tsizes[i];
+
+ /* Need to copy the numerator into temporary space, as
+ mpn_divmod overwrites its numerator argument with the
+ remainder (which we currently ignore). */
+ MPN_COPY (ttp, t_end_ptr - tsize, tsize);
+ cy = mpn_divmod (xp, ttp, tsize, rp, rsize);
+ xsize = tsize - rsize;
+
+#ifdef SQRT_DEBUG
+ printf ("X =%d ", cy);
+ mpn_dump (xp, xsize);
+#endif
+
+ /* Add X and R with the most significant limbs aligned,
+ temporarily ignoring at least one limb at the low end of X. */
+ tmp = xsize - rsize;
+ cy += mpn_add_n (xp + tmp, rp, xp + tmp, rsize);
+
+ /* If T begins with more than 2 x BITS_PER_MP_LIMB of ones, we get
+ intermediate roots that'd need an extra bit. We don't want to
+ handle that since it would make the subsequent divisor
+ non-normalized, so round such roots down to be only ones in the
+ current precision. */
+ if (cy == 2)
+ {
+ mp_size_t j;
+ for (j = xsize; j >= 0; j--)
+ xp[j] = ~(mp_limb_t)0;
+ }
+
+ /* Divide X by 2 and put the result in R. This is the new
+ approximation. Shift in the carry from the addition. */
+ mpn_rshift (rp, xp, xsize, 1);
+ rp[xsize - 1] |= ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1));
+ rsize = xsize;
+#ifdef SQRT_DEBUG
+ if (old_least_sign_r != rp[rsize - old_rsize])
+ printf (">>>>>>>> %d: %0*lX, %0*lX <<<<<<<<\n",
+ i, 2 * BYTES_PER_MP_LIMB, old_least_sign_r,
+ 2 * BYTES_PER_MP_LIMB, rp[rsize - old_rsize]);
+#endif
+ }
+ }
+
+#ifdef SQRT_DEBUG
+ printf ("(final) R = ");
+ mpn_dump (rp, rsize);
+#endif
+
+ /* We computed the square root of OP * 2**(2*floor(cnt/2)).
+ This has resulted in R being 2**floor(cnt/2) too large.
+ Shift it down here to fix that. */
+ if (cnt / 2 != 0)
+ {
+ mpn_rshift (rp, rp, rsize, cnt/2);
+ rsize -= rp[rsize - 1] == 0;
+ }
+
+ /* Calculate the remainder. */
+ mpn_mul_n (tp, rp, rp, rsize);
+ tsize = rsize + rsize;
+ tsize -= tp[tsize - 1] == 0;
+ if (op_size < tsize
+ || (op_size == tsize && mpn_cmp (op_ptr, tp, op_size) < 0))
+ {
+ /* R is too large. Decrement it. */
+
+ /* These operations can't overflow. */
+ cy_limb = mpn_sub_n (tp, tp, rp, rsize);
+ cy_limb += mpn_sub_n (tp, tp, rp, rsize);
+ mpn_decr_u (tp + rsize, cy_limb);
+ mpn_incr_u (tp, (mp_limb_t) 1);
+
+ mpn_decr_u (rp, (mp_limb_t) 1);
+
+#ifdef SQRT_DEBUG
+ printf ("(adjusted) R = ");
+ mpn_dump (rp, rsize);
+#endif
+ }
+
+ if (rem_ptr != NULL)
+ {
+ cy_limb = mpn_sub (rem_ptr, op_ptr, op_size, tp, tsize);
+ MPN_NORMALIZE (rem_ptr, op_size);
+ TMP_FREE (marker);
+ return op_size;
+ }
+ else
+ {
+ /* No remainder wanted: just report whether OP was a perfect
+ square (result 0) or not (any non-zero value). */
+ int res;
+ res = op_size != tsize || mpn_cmp (op_ptr, tp, op_size);
+ TMP_FREE (marker);
+ return res;
+ }
+}
diff --git a/rts/gmp/mpn/generic/sub_n.c b/rts/gmp/mpn/generic/sub_n.c
new file mode 100644
index 0000000000..4f2f06099c
--- /dev/null
+++ b/rts/gmp/mpn/generic/sub_n.c
@@ -0,0 +1,62 @@
+/* mpn_sub_n -- Subtract two limb vectors of equal, non-zero length.
+
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+#if __STDC__
+mpn_sub_n (mp_ptr res_ptr, mp_srcptr s1_ptr, mp_srcptr s2_ptr, mp_size_t size)
+#else
+mpn_sub_n (res_ptr, s1_ptr, s2_ptr, size)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ register mp_srcptr s2_ptr;
+ mp_size_t size;
+#endif
+{
+ register mp_limb_t x, y, cy;
+ register mp_size_t j;
+
+ /* The loop counter and index J goes from -SIZE to -1. This way
+ the loop becomes faster. */
+ j = -size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ s1_ptr -= j;
+ s2_ptr -= j;
+ res_ptr -= j;
+
+ cy = 0;
+ do
+ {
+ y = s2_ptr[j];
+ x = s1_ptr[j];
+ y += cy; /* add previous carry to subtrahend */
+ cy = (y < cy); /* get out carry from that addition */
+ y = x - y; /* main subtract */
+ cy = (y > x) + cy; /* get out carry from the subtract, combine */
+ res_ptr[j] = y;
+ }
+ while (++j != 0);
+
+ return cy;
+}
diff --git a/rts/gmp/mpn/generic/submul_1.c b/rts/gmp/mpn/generic/submul_1.c
new file mode 100644
index 0000000000..c7c08ee4af
--- /dev/null
+++ b/rts/gmp/mpn/generic/submul_1.c
@@ -0,0 +1,65 @@
+/* mpn_submul_1 -- multiply the S1_SIZE long limb vector pointed to by S1_PTR
+ by S2_LIMB, subtract the S1_SIZE least significant limbs of the product
+ from the limb vector pointed to by RES_PTR. Return the most significant
+ limb of the product, adjusted for carry-out from the subtraction.
+
+Copyright (C) 1992, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_submul_1 (res_ptr, s1_ptr, s1_size, s2_limb)
+ register mp_ptr res_ptr;
+ register mp_srcptr s1_ptr;
+ mp_size_t s1_size;
+ register mp_limb_t s2_limb;
+{
+ register mp_limb_t cy_limb;
+ register mp_size_t j;
+ register mp_limb_t prod_high, prod_low;
+ register mp_limb_t x;
+
+ /* The loop counter and index J goes from -SIZE to -1. This way
+ the loop becomes faster. */
+ j = -s1_size;
+
+ /* Offset the base pointers to compensate for the negative indices. */
+ res_ptr -= j;
+ s1_ptr -= j;
+
+ cy_limb = 0;
+ do
+ {
+ umul_ppmm (prod_high, prod_low, s1_ptr[j], s2_limb);
+
+ prod_low += cy_limb;
+ cy_limb = (prod_low < cy_limb) + prod_high;
+
+ x = res_ptr[j];
+ prod_low = x - prod_low;
+ cy_limb += (prod_low > x);
+ res_ptr[j] = prod_low;
+ }
+ while (++j != 0);
+
+ return cy_limb;
+}
diff --git a/rts/gmp/mpn/generic/tdiv_qr.c b/rts/gmp/mpn/generic/tdiv_qr.c
new file mode 100644
index 0000000000..b748b5d810
--- /dev/null
+++ b/rts/gmp/mpn/generic/tdiv_qr.c
@@ -0,0 +1,401 @@
+/* mpn_tdiv_qr -- Divide the numerator (np,nn) by the denominator (dp,dn) and
+ write the nn-dn+1 quotient limbs at qp and the dn remainder limbs at rp. If
+ qxn is non-zero, generate that many fraction limbs and append them after the
+ other quotient limbs, and update the remainder accordningly. The input
+ operands are unaffected.
+
+ Preconditions:
+ 1. The most significant limb of of the divisor must be non-zero.
+ 2. No argument overlap is permitted. (??? relax this ???)
+ 3. nn >= dn, even if qxn is non-zero. (??? relax this ???)
+
+ The time complexity of this is O(qn*qn+M(dn,qn)), where M(m,n) is the time
+ complexity of multiplication.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD (7 * KARATSUBA_MUL_THRESHOLD)
+#endif
+
+/* Extract the middle limb from ((h,,l) << cnt) */
+#define SHL(h,l,cnt) \
+ ((h << cnt) | ((l >> 1) >> ((~cnt) & (BITS_PER_MP_LIMB - 1))))
+
+void
+#if __STDC__
+mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
+ mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)
+#else
+mpn_tdiv_qr (qp, rp, qxn, np, nn, dp, dn)
+ mp_ptr qp;
+ mp_ptr rp;
+ mp_size_t qxn;
+ mp_srcptr np;
+ mp_size_t nn;
+ mp_srcptr dp;
+ mp_size_t dn;
+#endif
+{
+ /* FIXME:
+ 1. qxn
+ 2. pass allocated storage in additional parameter?
+ */
+ if (qxn != 0)
+ abort ();
+
+ switch (dn)
+ {
+ case 0:
+ DIVIDE_BY_ZERO;
+
+ case 1:
+ {
+ rp[0] = mpn_divmod_1 (qp, np, nn, dp[0]);
+ return;
+ }
+
+ case 2:
+ {
+ int cnt;
+ mp_ptr n2p, d2p;
+ mp_limb_t qhl, cy;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ count_leading_zeros (cnt, dp[dn - 1]);
+ if (cnt != 0)
+ {
+ d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+ mpn_lshift (d2p, dp, dn, cnt);
+ n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
+ cy = mpn_lshift (n2p, np, nn, cnt);
+ n2p[nn] = cy;
+ qhl = mpn_divrem_2 (qp, 0L, n2p, nn + (cy != 0), d2p);
+ if (cy == 0)
+ qp[nn - 2] = qhl; /* always store nn-dn+1 quotient limbs */
+ }
+ else
+ {
+ d2p = (mp_ptr) dp;
+ n2p = (mp_ptr) TMP_ALLOC (nn * BYTES_PER_MP_LIMB);
+ MPN_COPY (n2p, np, nn);
+ qhl = mpn_divrem_2 (qp, 0L, n2p, nn, d2p);
+ qp[nn - 2] = qhl; /* always store nn-dn+1 quotient limbs */
+ }
+
+ if (cnt != 0)
+ mpn_rshift (rp, n2p, dn, cnt);
+ else
+ MPN_COPY (rp, n2p, dn);
+ TMP_FREE (marker);
+ return;
+ }
+
+ default:
+ {
+ int adjust;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ adjust = np[nn - 1] >= dp[dn - 1]; /* conservative tests for quotient size */
+ if (nn + adjust >= 2 * dn)
+ {
+ mp_ptr n2p, d2p;
+ mp_limb_t cy;
+ int cnt;
+ count_leading_zeros (cnt, dp[dn - 1]);
+
+ qp[nn - dn] = 0; /* zero high quotient limb */
+ if (cnt != 0) /* normalize divisor if needed */
+ {
+ d2p = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+ mpn_lshift (d2p, dp, dn, cnt);
+ n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
+ cy = mpn_lshift (n2p, np, nn, cnt);
+ n2p[nn] = cy;
+ nn += adjust;
+ }
+ else
+ {
+ d2p = (mp_ptr) dp;
+ n2p = (mp_ptr) TMP_ALLOC ((nn + 1) * BYTES_PER_MP_LIMB);
+ MPN_COPY (n2p, np, nn);
+ n2p[nn] = 0;
+ nn += adjust;
+ }
+
+ if (dn == 2)
+ mpn_divrem_2 (qp, 0L, n2p, nn, d2p);
+ else if (dn < BZ_THRESHOLD)
+ mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn);
+ else
+ {
+ /* Perform 2*dn / dn limb divisions as long as the limbs
+ in np last. */
+ mp_ptr q2p = qp + nn - 2 * dn;
+ n2p += nn - 2 * dn;
+ mpn_bz_divrem_n (q2p, n2p, d2p, dn);
+ nn -= dn;
+ while (nn >= 2 * dn)
+ {
+ mp_limb_t c;
+ q2p -= dn; n2p -= dn;
+ c = mpn_bz_divrem_n (q2p, n2p, d2p, dn);
+ ASSERT_ALWAYS (c == 0);
+ nn -= dn;
+ }
+
+ if (nn != dn)
+ {
+ n2p -= nn - dn;
+ /* In theory, we could fall out to the cute code below
+ since we now have exactly the situation that code
+ is designed to handle. We botch this badly and call
+ the basic mpn_sb_divrem_mn! */
+ if (dn == 2)
+ mpn_divrem_2 (qp, 0L, n2p, nn, d2p);
+ else
+ mpn_sb_divrem_mn (qp, n2p, nn, d2p, dn);
+ }
+ }
+
+
+ if (cnt != 0)
+ mpn_rshift (rp, n2p, dn, cnt);
+ else
+ MPN_COPY (rp, n2p, dn);
+ TMP_FREE (marker);
+ return;
+ }
+
+ /* When we come here, the numerator/partial remainder is less
+ than twice the size of the denominator. */
+
+ {
+ /* Problem:
+
+ Divide a numerator N with nn limbs by a denominator D with dn
+ limbs forming a quotient of nn-dn+1 limbs. When qn is small
+ compared to dn, conventional division algorithms perform poorly.
+ We want an algorithm that has an expected running time that is
+ dependent only on qn. It is assumed that the most significant
+ limb of the numerator is smaller than the most significant limb
+ of the denominator.
+
+ Algorithm (very informally stated):
+
+ 1) Divide the 2 x qn most significant limbs from the numerator
+ by the qn most significant limbs from the denominator. Call
+ the result qest. This is either the correct quotient, but
+ might be 1 or 2 too large. Compute the remainder from the
+ division. (This step is implemented by a mpn_divrem call.)
+
+ 2) Is the most significant limb from the remainder < p, where p
+ is the product of the most significant limb from the quotient
+ and the next(d). (Next(d) denotes the next ignored limb from
+ the denominator.) If it is, decrement qest, and adjust the
+ remainder accordingly.
+
+ 3) Is the remainder >= qest? If it is, qest is the desired
+ quotient. The algorithm terminates.
+
+ 4) Subtract qest x next(d) from the remainder. If there is
+ borrow out, decrement qest, and adjust the remainder
+ accordingly.
+
+ 5) Skip one word from the denominator (i.e., let next(d) denote
+ the next less significant limb. */
+
+ mp_size_t qn;
+ mp_ptr n2p, d2p;
+ mp_ptr tp;
+ mp_limb_t cy;
+ mp_size_t in, rn;
+ mp_limb_t quotient_too_large;
+ int cnt;
+
+ qn = nn - dn;
+ qp[qn] = 0; /* zero high quotient limb */
+ qn += adjust; /* qn cannot become bigger */
+
+ if (qn == 0)
+ {
+ MPN_COPY (rp, np, dn);
+ TMP_FREE (marker);
+ return;
+ }
+
+ in = dn - qn; /* (at least partially) ignored # of limbs in ops */
+ /* Normalize denominator by shifting it to the left such that its
+ most significant bit is set. Then shift the numerator the same
+ amount, to mathematically preserve quotient. */
+ count_leading_zeros (cnt, dp[dn - 1]);
+ if (cnt != 0)
+ {
+ d2p = (mp_ptr) TMP_ALLOC (qn * BYTES_PER_MP_LIMB);
+
+ mpn_lshift (d2p, dp + in, qn, cnt);
+ d2p[0] |= dp[in - 1] >> (BITS_PER_MP_LIMB - cnt);
+
+ n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB);
+ cy = mpn_lshift (n2p, np + nn - 2 * qn, 2 * qn, cnt);
+ if (adjust)
+ {
+ n2p[2 * qn] = cy;
+ n2p++;
+ }
+ else
+ {
+ n2p[0] |= np[nn - 2 * qn - 1] >> (BITS_PER_MP_LIMB - cnt);
+ }
+ }
+ else
+ {
+ d2p = (mp_ptr) dp + in;
+
+ n2p = (mp_ptr) TMP_ALLOC ((2 * qn + 1) * BYTES_PER_MP_LIMB);
+ MPN_COPY (n2p, np + nn - 2 * qn, 2 * qn);
+ if (adjust)
+ {
+ n2p[2 * qn] = 0;
+ n2p++;
+ }
+ }
+
+ /* Get an approximate quotient using the extracted operands. */
+ if (qn == 1)
+ {
+ mp_limb_t q0, r0;
+ mp_limb_t gcc272bug_n1, gcc272bug_n0, gcc272bug_d0;
+ /* Due to a gcc 2.7.2.3 reload pass bug, we have to use some
+ temps here. This doesn't hurt code quality on any machines
+ so we do it unconditionally. */
+ gcc272bug_n1 = n2p[1];
+ gcc272bug_n0 = n2p[0];
+ gcc272bug_d0 = d2p[0];
+ udiv_qrnnd (q0, r0, gcc272bug_n1, gcc272bug_n0, gcc272bug_d0);
+ n2p[0] = r0;
+ qp[0] = q0;
+ }
+ else if (qn == 2)
+ mpn_divrem_2 (qp, 0L, n2p, 4L, d2p);
+ else if (qn < BZ_THRESHOLD)
+ mpn_sb_divrem_mn (qp, n2p, qn * 2, d2p, qn);
+ else
+ mpn_bz_divrem_n (qp, n2p, d2p, qn);
+
+ rn = qn;
+ /* Multiply the first ignored divisor limb by the most significant
+ quotient limb. If that product is > the partial remainder's
+ most significant limb, we know the quotient is too large. This
+ test quickly catches most cases where the quotient is too large;
+ it catches all cases where the quotient is 2 too large. */
+ {
+ mp_limb_t dl, x;
+ mp_limb_t h, l;
+
+ if (in - 2 < 0)
+ dl = 0;
+ else
+ dl = dp[in - 2];
+
+ x = SHL (dp[in - 1], dl, cnt);
+ umul_ppmm (h, l, x, qp[qn - 1]);
+
+ if (n2p[qn - 1] < h)
+ {
+ mp_limb_t cy;
+
+ mpn_decr_u (qp, (mp_limb_t) 1);
+ cy = mpn_add_n (n2p, n2p, d2p, qn);
+ if (cy)
+ {
+ /* The partial remainder is safely large. */
+ n2p[qn] = cy;
+ ++rn;
+ }
+ }
+ }
+
+ quotient_too_large = 0;
+ if (cnt != 0)
+ {
+ mp_limb_t cy1, cy2;
+
+ /* Append partially used numerator limb to partial remainder. */
+ cy1 = mpn_lshift (n2p, n2p, rn, BITS_PER_MP_LIMB - cnt);
+ n2p[0] |= np[in - 1] & (~(mp_limb_t) 0 >> cnt);
+
+ /* Update partial remainder with partially used divisor limb. */
+ cy2 = mpn_submul_1 (n2p, qp, qn, dp[in - 1] & (~(mp_limb_t) 0 >> cnt));
+ if (qn != rn)
+ {
+ if (n2p[qn] < cy2)
+ abort ();
+ n2p[qn] -= cy2;
+ }
+ else
+ {
+ n2p[qn] = cy1 - cy2;
+
+ quotient_too_large = (cy1 < cy2);
+ ++rn;
+ }
+ --in;
+ }
+ /* True: partial remainder now is neutral, i.e., it is not shifted up. */
+
+ tp = (mp_ptr) TMP_ALLOC (dn * BYTES_PER_MP_LIMB);
+
+ if (in < qn)
+ {
+ if (in == 0)
+ {
+ MPN_COPY (rp, n2p, rn);
+ if (rn != dn)
+ abort ();
+ goto foo;
+ }
+ mpn_mul (tp, qp, qn, dp, in);
+ }
+ else
+ mpn_mul (tp, dp, in, qp, qn);
+
+ cy = mpn_sub (n2p, n2p, rn, tp + in, qn);
+ MPN_COPY (rp + in, n2p, dn - in);
+ quotient_too_large |= cy;
+ cy = mpn_sub_n (rp, np, tp, in);
+ cy = mpn_sub_1 (rp + in, rp + in, rn, cy);
+ quotient_too_large |= cy;
+ foo:
+ if (quotient_too_large)
+ {
+ mpn_decr_u (qp, (mp_limb_t) 1);
+ mpn_add_n (rp, rp, dp, dn);
+ }
+ }
+ TMP_FREE (marker);
+ return;
+ }
+ }
+}
diff --git a/rts/gmp/mpn/generic/udiv_w_sdiv.c b/rts/gmp/mpn/generic/udiv_w_sdiv.c
new file mode 100644
index 0000000000..061cce86e1
--- /dev/null
+++ b/rts/gmp/mpn/generic/udiv_w_sdiv.c
@@ -0,0 +1,131 @@
+/* mpn_udiv_w_sdiv -- implement udiv_qrnnd on machines with only signed
+ division.
+
+ Contributed by Peter L. Montgomery.
+
+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY SAFE
+ TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS
+ ALMOST GUARANTEED THAT THIS FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE
+ GNU MP RELEASE.
+
+
+Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_udiv_w_sdiv (rp, a1, a0, d)
+ mp_limb_t *rp, a1, a0, d;
+{
+ mp_limb_t q, r;
+ mp_limb_t c0, c1, b1;
+
+ if ((mp_limb_signed_t) d >= 0)
+ {
+ if (a1 < d - a1 - (a0 >> (BITS_PER_MP_LIMB - 1)))
+ {
+ /* dividend, divisor, and quotient are nonnegative */
+ sdiv_qrnnd (q, r, a1, a0, d);
+ }
+ else
+ {
+ /* Compute c1*2^32 + c0 = a1*2^32 + a0 - 2^31*d */
+ sub_ddmmss (c1, c0, a1, a0, d >> 1, d << (BITS_PER_MP_LIMB - 1));
+ /* Divide (c1*2^32 + c0) by d */
+ sdiv_qrnnd (q, r, c1, c0, d);
+ /* Add 2^31 to quotient */
+ q += (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1);
+ }
+ }
+ else
+ {
+ b1 = d >> 1; /* d/2, between 2^30 and 2^31 - 1 */
+ c1 = a1 >> 1; /* A/2 */
+ c0 = (a1 << (BITS_PER_MP_LIMB - 1)) + (a0 >> 1);
+
+ if (a1 < b1) /* A < 2^32*b1, so A/2 < 2^31*b1 */
+ {
+ sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */
+
+ r = 2*r + (a0 & 1); /* Remainder from A/(2*b1) */
+ if ((d & 1) != 0)
+ {
+ if (r >= q)
+ r = r - q;
+ else if (q - r <= d)
+ {
+ r = r - q + d;
+ q--;
+ }
+ else
+ {
+ r = r - q + 2*d;
+ q -= 2;
+ }
+ }
+ }
+ else if (c1 < b1) /* So 2^31 <= (A/2)/b1 < 2^32 */
+ {
+ c1 = (b1 - 1) - c1;
+ c0 = ~c0; /* logical NOT */
+
+ sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */
+
+ q = ~q; /* (A/2)/b1 */
+ r = (b1 - 1) - r;
+
+ r = 2*r + (a0 & 1); /* A/(2*b1) */
+
+ if ((d & 1) != 0)
+ {
+ if (r >= q)
+ r = r - q;
+ else if (q - r <= d)
+ {
+ r = r - q + d;
+ q--;
+ }
+ else
+ {
+ r = r - q + 2*d;
+ q -= 2;
+ }
+ }
+ }
+ else /* Implies c1 = b1 */
+ { /* Hence a1 = d - 1 = 2*b1 - 1 */
+ if (a0 >= -d)
+ {
+ q = -1;
+ r = a0 + d;
+ }
+ else
+ {
+ q = -2;
+ r = a0 + 2*d;
+ }
+ }
+ }
+
+ *rp = r;
+ return q;
+}
diff --git a/rts/gmp/mpn/hppa/README b/rts/gmp/mpn/hppa/README
new file mode 100644
index 0000000000..97e7abe011
--- /dev/null
+++ b/rts/gmp/mpn/hppa/README
@@ -0,0 +1,91 @@
+This directory contains mpn functions for various HP PA-RISC chips. Code
+that runs faster on the PA7100 and later implementations, is in the pa7100
+directory.
+
+RELEVANT OPTIMIZATION ISSUES
+
+ Load and Store timing
+
+On the PA7000 no memory instructions can issue the two cycles after a store.
+For the PA7100, this is reduced to one cycle.
+
+The PA7100 has a lookup-free cache, so it helps to schedule loads and the
+dependent instruction really far from each other.
+
+STATUS
+
+1. mpn_mul_1 could be improved to 6.5 cycles/limb on the PA7100, using the
+ instructions below (but some sw pipelining is needed to avoid the
+ xmpyu-fstds delay):
+
+ fldds s1_ptr
+
+ xmpyu
+ fstds N(%r30)
+ xmpyu
+ fstds N(%r30)
+
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+
+ addc
+ stws res_ptr
+ addc
+ stws res_ptr
+
+ addib Loop
+
+2. mpn_addmul_1 could be improved from the current 10 to 7.5 cycles/limb
+ (asymptotically) on the PA7100, using the instructions below. With proper
+ sw pipelining and the unrolling level below, the speed becomes 8
+ cycles/limb.
+
+ fldds s1_ptr
+ fldds s1_ptr
+
+ xmpyu
+ fstds N(%r30)
+ xmpyu
+ fstds N(%r30)
+ xmpyu
+ fstds N(%r30)
+ xmpyu
+ fstds N(%r30)
+
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ ldws N(%r30)
+ addc
+ addc
+ addc
+ addc
+ addc %r0,%r0,cy-limb
+
+ ldws res_ptr
+ ldws res_ptr
+ ldws res_ptr
+ ldws res_ptr
+ add
+ stws res_ptr
+ addc
+ stws res_ptr
+ addc
+ stws res_ptr
+ addc
+ stws res_ptr
+
+ addib
+
+3. For the PA8000 we have to stick to using 32-bit limbs before compiler
+ support emerges. But we want to use 64-bit operations whenever possible,
+ in particular for loads and stores. It is possible to handle mpn_add_n
+ efficiently by rotating (when s1/s2 are aligned), masking+bit field
+ inserting when (they are not). The speed should double compared to the
+ code used today.
diff --git a/rts/gmp/mpn/hppa/add_n.s b/rts/gmp/mpn/hppa/add_n.s
new file mode 100644
index 0000000000..c53b2f71b3
--- /dev/null
+++ b/rts/gmp/mpn/hppa/add_n.s
@@ -0,0 +1,58 @@
+; HP-PA __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; One might want to unroll this as for other processors, but it turns
+; out that the data cache contention after a store makes such
+; unrolling useless. We can't come under 5 cycles/limb anyway.
+
+ .code
+ .export __gmpn_add_n
+__gmpn_add_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,= -1,%r23,L$end ; check for (SIZE == 1)
+ add %r20,%r19,%r28 ; add first limbs ignoring cy
+
+L$loop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,<> -1,%r23,L$loop
+ addc %r20,%r19,%r28
+
+L$end stws %r28,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r0,%r28
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/gmp-mparam.h b/rts/gmp/mpn/hppa/gmp-mparam.h
new file mode 100644
index 0000000000..98b6d9ce3c
--- /dev/null
+++ b/rts/gmp/mpn/hppa/gmp-mparam.h
@@ -0,0 +1,63 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values are for the PA7100 using GCC. */
+/* Generated by tuneup.c, 2000-07-25. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 30
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 172
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 59
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 185
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 96
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 122
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 18
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 46
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 33
+#endif
diff --git a/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s b/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s
new file mode 100644
index 0000000000..c7d218f922
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/addmul_1.s
@@ -0,0 +1,102 @@
+; HP-PA-1.1 __gmpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r26
+; s1_ptr r25
+; size r24
+; s2_limb r23
+
+; This runs at 11 cycles/limb on a PA7000. With the used instructions, it
+; can not become faster due to data cache contention after a store. On the
+; PA7100 it runs at 10 cycles/limb, and that can not be improved either,
+; since only the xmpyu does not need the integer pipeline, so the only
+; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb
+; on the PA7100.
+
+; There are some ideas described in mul_1.s that applies to this code too.
+
+ .code
+ .export __gmpn_addmul_1
+__gmpn_addmul_1
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldo 64(%r30),%r30
+ fldws,ma 4(%r25),%fr5
+ stw %r23,-16(%r30) ; move s2_limb ...
+ addib,= -1,%r24,L$just_one_limb
+ fldws -16(%r30),%fr4 ; ... into fr4
+ add %r0,%r0,%r0 ; clear carry
+ xmpyu %fr4,%fr5,%fr6
+ fldws,ma 4(%r25),%fr7
+ fstds %fr6,-16(%r30)
+ xmpyu %fr4,%fr7,%fr8
+ ldw -12(%r30),%r19 ; least significant limb in product
+ ldw -16(%r30),%r28
+
+ fstds %fr8,-16(%r30)
+ addib,= -1,%r24,L$end
+ ldw -12(%r30),%r1
+
+; Main loop
+L$loop ldws 0(%r26),%r29
+ fldws,ma 4(%r25),%fr5
+ add %r29,%r19,%r19
+ stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ xmpyu %fr4,%fr5,%fr6
+ ldw -16(%r30),%r28
+ fstds %fr6,-16(%r30)
+ addc %r0,%r28,%r28
+ addib,<> -1,%r24,L$loop
+ ldw -12(%r30),%r1
+
+L$end ldw 0(%r26),%r29
+ add %r29,%r19,%r19
+ stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ ldw -16(%r30),%r28
+ ldws 0(%r26),%r29
+ addc %r0,%r28,%r28
+ add %r29,%r19,%r19
+ stws,ma %r19,4(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+L$just_one_limb
+ xmpyu %fr4,%fr5,%fr6
+ ldw 0(%r26),%r29
+ fstds %fr6,-16(%r30)
+ ldw -12(%r30),%r1
+ ldw -16(%r30),%r28
+ add %r29,%r1,%r19
+ stw %r19,0(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/mul_1.s b/rts/gmp/mpn/hppa/hppa1_1/mul_1.s
new file mode 100644
index 0000000000..4512fddec9
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/mul_1.s
@@ -0,0 +1,98 @@
+; HP-PA-1.1 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+; the result in a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r26
+; s1_ptr r25
+; size r24
+; s2_limb r23
+
+; This runs at 9 cycles/limb on a PA7000. With the used instructions, it can
+; not become faster due to data cache contention after a store. On the
+; PA7100 it runs at 7 cycles/limb, and that can not be improved either, since
+; only the xmpyu does not need the integer pipeline, so the only dual-issue
+; we will get are addc+xmpyu. Unrolling would not help either CPU.
+
+; We could use fldds to read two limbs at a time from the S1 array, and that
+; could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and
+; PA7100, respectively. We don't do that since it does not seem worth the
+; (alignment) troubles...
+
+; At least the PA7100 is rumored to be able to deal with cache-misses
+; without stalling instruction issue. If this is true, and the cache is
+; actually also lockup-free, we should use a deeper software pipeline, and
+; load from S1 very early! (The loads and stores to -12(sp) will surely be
+; in the cache.)
+
+ .code
+ .export __gmpn_mul_1
+__gmpn_mul_1
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldo 64(%r30),%r30
+ fldws,ma 4(%r25),%fr5
+ stw %r23,-16(%r30) ; move s2_limb ...
+ addib,= -1,%r24,L$just_one_limb
+ fldws -16(%r30),%fr4 ; ... into fr4
+ add %r0,%r0,%r0 ; clear carry
+ xmpyu %fr4,%fr5,%fr6
+ fldws,ma 4(%r25),%fr7
+ fstds %fr6,-16(%r30)
+ xmpyu %fr4,%fr7,%fr8
+ ldw -12(%r30),%r19 ; least significant limb in product
+ ldw -16(%r30),%r28
+
+ fstds %fr8,-16(%r30)
+ addib,= -1,%r24,L$end
+ ldw -12(%r30),%r1
+
+; Main loop
+L$loop fldws,ma 4(%r25),%fr5
+ stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ xmpyu %fr4,%fr5,%fr6
+ ldw -16(%r30),%r28
+ fstds %fr6,-16(%r30)
+ addib,<> -1,%r24,L$loop
+ ldw -12(%r30),%r1
+
+L$end stws,ma %r19,4(%r26)
+ addc %r28,%r1,%r19
+ ldw -16(%r30),%r28
+ stws,ma %r19,4(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+L$just_one_limb
+ xmpyu %fr4,%fr5,%fr6
+ fstds %fr6,-16(%r30)
+ ldw -16(%r30),%r28
+ ldo -64(%r30),%r30
+ bv 0(%r2)
+ fstws %fr6R,0(%r26)
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s
new file mode 100644
index 0000000000..4f4be08b37
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/add_n.s
@@ -0,0 +1,75 @@
+; HP-PA __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+; This is optimized for the PA7100, where is runs at 4.25 cycles/limb
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+ .code
+ .export __gmpn_add_n
+__gmpn_add_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,<= -5,%r23,L$rest
+ add %r20,%r19,%r28 ; add first limbs ignoring cy
+
+L$loop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -4,%r23,L$loop
+ addc %r20,%r19,%r28
+
+L$rest addib,= 4,%r23,L$end
+ nop
+L$eloop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -1,%r23,L$eloop
+ addc %r20,%r19,%r28
+
+L$end stws %r28,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r0,%r28
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S b/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S
new file mode 100644
index 0000000000..04db06822e
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/addmul_1.S
@@ -0,0 +1,189 @@
+; HP-PA 7100/7200 __gmpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define res_ptr %r26
+#define s1_ptr %r25
+#define size %r24
+#define s2_limb %r23
+
+#define cylimb %r28
+#define s0 %r19
+#define s1 %r20
+#define s2 %r3
+#define s3 %r4
+#define lo0 %r21
+#define lo1 %r5
+#define lo2 %r6
+#define lo3 %r7
+#define hi0 %r22
+#define hi1 %r23 /* safe to reuse */
+#define hi2 %r29
+#define hi3 %r1
+
+ .code
+ .export __gmpn_addmul_1
+__gmpn_addmul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+
+ ldo 128(%r30),%r30
+ stws s2_limb,-16(%r30)
+ add %r0,%r0,cylimb ; clear cy and cylimb
+ addib,< -4,size,L$few_limbs
+ fldws -16(%r30),%fr31R
+
+ ldo -112(%r30),%r31
+ stw %r3,-96(%r30)
+ stw %r4,-92(%r30)
+ stw %r5,-88(%r30)
+ stw %r6,-84(%r30)
+ stw %r7,-80(%r30)
+
+ bb,>=,n s1_ptr,29,L$0
+
+ fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r31)
+ ldws -16(%r31),cylimb
+ ldws -12(%r31),lo0
+ add s0,lo0,s0
+ addib,< -1,size,L$few_limbs
+ stws,ma s0,4(res_ptr)
+
+; start software pipeline ----------------------------------------------------
+L$0 fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ xmpyu %fr4L,%fr31R,%fr5
+ xmpyu %fr4R,%fr31R,%fr6
+ xmpyu %fr8L,%fr31R,%fr9
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ fstds %fr6,-8(%r31)
+ fstds %fr9,0(%r31)
+ fstds %fr10,8(%r31)
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ addc lo1,hi0,lo1
+ addc lo2,hi1,lo2
+ addc lo3,hi2,lo3
+
+ addib,< -4,size,L$end
+ addc %r0,hi3,cylimb ; propagate carry into cylimb
+; main loop ------------------------------------------------------------------
+L$loop fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ ldws 0(res_ptr),s0
+ xmpyu %fr4L,%fr31R,%fr5
+ ldws 4(res_ptr),s1
+ xmpyu %fr4R,%fr31R,%fr6
+ ldws 8(res_ptr),s2
+ xmpyu %fr8L,%fr31R,%fr9
+ ldws 12(res_ptr),s3
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ add s0,lo0,s0
+ fstds %fr6,-8(%r31)
+ addc s1,lo1,s1
+ fstds %fr9,0(%r31)
+ addc s2,lo2,s2
+ fstds %fr10,8(%r31)
+ addc s3,lo3,s3
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ stws,ma s0,4(res_ptr)
+ addc lo1,hi0,lo1
+ stws,ma s1,4(res_ptr)
+ addc lo2,hi1,lo2
+ stws,ma s2,4(res_ptr)
+ addc lo3,hi2,lo3
+ stws,ma s3,4(res_ptr)
+
+ addib,>= -4,size,L$loop
+ addc %r0,hi3,cylimb ; propagate carry into cylimb
+; finish software pipeline ---------------------------------------------------
+L$end ldws 0(res_ptr),s0
+ ldws 4(res_ptr),s1
+ ldws 8(res_ptr),s2
+ ldws 12(res_ptr),s3
+
+ add s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ addc s1,lo1,s1
+ stws,ma s1,4(res_ptr)
+ addc s2,lo2,s2
+ stws,ma s2,4(res_ptr)
+ addc s3,lo3,s3
+ stws,ma s3,4(res_ptr)
+
+; restore callee-saves registers ---------------------------------------------
+ ldw -96(%r30),%r3
+ ldw -92(%r30),%r4
+ ldw -88(%r30),%r5
+ ldw -84(%r30),%r6
+ ldw -80(%r30),%r7
+
+L$few_limbs
+ addib,=,n 4,size,L$ret
+L$loop2 fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r30)
+ ldws -16(%r30),hi0
+ ldws -12(%r30),lo0
+ addc lo0,cylimb,lo0
+ addc %r0,hi0,cylimb
+ add s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ addib,<> -1,size,L$loop2
+ nop
+
+L$ret addc %r0,cylimb,cylimb
+ bv 0(%r2)
+ ldo -128(%r30),%r30
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s
new file mode 100644
index 0000000000..31669b1a55
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/lshift.s
@@ -0,0 +1,83 @@
+; HP-PA __gmpn_lshift --
+; This is optimized for the PA7100, where it runs at 3.25 cycles/limb
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s_ptr gr25
+; size gr24
+; cnt gr23
+
+ .code
+ .export __gmpn_lshift
+__gmpn_lshift
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ sh2add %r24,%r25,%r25
+ sh2add %r24,%r26,%r26
+ ldws,mb -4(0,%r25),%r22
+ subi 32,%r23,%r1
+ mtsar %r1
+ addib,= -1,%r24,L$0004
+ vshd %r0,%r22,%r28 ; compute carry out limb
+ ldws,mb -4(0,%r25),%r29
+ addib,<= -5,%r24,L$rest
+ vshd %r22,%r29,%r20
+
+L$loop ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,> -4,%r24,L$loop
+ vshd %r22,%r29,%r20
+
+L$rest addib,= 4,%r24,L$end1
+ nop
+L$eloop ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ addib,<= -1,%r24,L$end2
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,> -1,%r24,L$eloop
+ vshd %r22,%r29,%r20
+
+L$end1 stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+L$end2 stws,mb %r20,-4(0,%r26)
+L$0004 vshd %r22,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s
new file mode 100644
index 0000000000..d32b10b4b1
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/rshift.s
@@ -0,0 +1,80 @@
+; HP-PA __gmpn_rshift --
+; This is optimized for the PA7100, where it runs at 3.25 cycles/limb
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s_ptr gr25
+; size gr24
+; cnt gr23
+
+ .code
+ .export __gmpn_rshift
+__gmpn_rshift
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r22
+ mtsar %r23
+ addib,= -1,%r24,L$0004
+ vshd %r22,%r0,%r28 ; compute carry out limb
+ ldws,ma 4(0,%r25),%r29
+ addib,<= -5,%r24,L$rest
+ vshd %r29,%r22,%r20
+
+L$loop ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,> -4,%r24,L$loop
+ vshd %r29,%r22,%r20
+
+L$rest addib,= 4,%r24,L$end1
+ nop
+L$eloop ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ addib,<= -1,%r24,L$end2
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,> -1,%r24,L$eloop
+ vshd %r29,%r22,%r20
+
+L$end1 stws,ma %r20,4(0,%r26)
+ vshd %r0,%r29,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+L$end2 stws,ma %r20,4(0,%r26)
+L$0004 vshd %r0,%r22,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s b/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s
new file mode 100644
index 0000000000..0eec41c4b3
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/sub_n.s
@@ -0,0 +1,76 @@
+; HP-PA __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+; This is optimized for the PA7100, where it runs at 4.25 cycles/limb
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+ .code
+ .export __gmpn_sub_n
+__gmpn_sub_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,<= -5,%r23,L$rest
+ sub %r20,%r19,%r28 ; subtract first limbs ignoring cy
+
+L$loop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -4,%r23,L$loop
+ subb %r20,%r19,%r28
+
+L$rest addib,= 4,%r23,L$end
+ nop
+L$eloop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -1,%r23,L$eloop
+ subb %r20,%r19,%r28
+
+L$end stws %r28,0(0,%r26)
+ addc %r0,%r0,%r28
+ bv 0(%r2)
+ subi 1,%r28,%r28
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S b/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S
new file mode 100644
index 0000000000..0fba21dcef
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/pa7100/submul_1.S
@@ -0,0 +1,195 @@
+; HP-PA 7100/7200 __gmpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define res_ptr %r26
+#define s1_ptr %r25
+#define size %r24
+#define s2_limb %r23
+
+#define cylimb %r28
+#define s0 %r19
+#define s1 %r20
+#define s2 %r3
+#define s3 %r4
+#define lo0 %r21
+#define lo1 %r5
+#define lo2 %r6
+#define lo3 %r7
+#define hi0 %r22
+#define hi1 %r23 /* safe to reuse */
+#define hi2 %r29
+#define hi3 %r1
+
+ .code
+ .export __gmpn_submul_1
+__gmpn_submul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+
+ ldo 128(%r30),%r30
+ stws s2_limb,-16(%r30)
+ add %r0,%r0,cylimb ; clear cy and cylimb
+ addib,< -4,size,L$few_limbs
+ fldws -16(%r30),%fr31R
+
+ ldo -112(%r30),%r31
+ stw %r3,-96(%r30)
+ stw %r4,-92(%r30)
+ stw %r5,-88(%r30)
+ stw %r6,-84(%r30)
+ stw %r7,-80(%r30)
+
+ bb,>=,n s1_ptr,29,L$0
+
+ fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r31)
+ ldws -16(%r31),cylimb
+ ldws -12(%r31),lo0
+ sub s0,lo0,s0
+ add s0,lo0,%r0 ; invert cy
+ addib,< -1,size,L$few_limbs
+ stws,ma s0,4(res_ptr)
+
+; start software pipeline ----------------------------------------------------
+L$0 fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ xmpyu %fr4L,%fr31R,%fr5
+ xmpyu %fr4R,%fr31R,%fr6
+ xmpyu %fr8L,%fr31R,%fr9
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ fstds %fr6,-8(%r31)
+ fstds %fr9,0(%r31)
+ fstds %fr10,8(%r31)
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ addc lo1,hi0,lo1
+ addc lo2,hi1,lo2
+ addc lo3,hi2,lo3
+
+ addib,< -4,size,L$end
+ addc %r0,hi3,cylimb ; propagate carry into cylimb
+; main loop ------------------------------------------------------------------
+L$loop fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ ldws 0(res_ptr),s0
+ xmpyu %fr4L,%fr31R,%fr5
+ ldws 4(res_ptr),s1
+ xmpyu %fr4R,%fr31R,%fr6
+ ldws 8(res_ptr),s2
+ xmpyu %fr8L,%fr31R,%fr9
+ ldws 12(res_ptr),s3
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ sub s0,lo0,s0
+ fstds %fr6,-8(%r31)
+ subb s1,lo1,s1
+ fstds %fr9,0(%r31)
+ subb s2,lo2,s2
+ fstds %fr10,8(%r31)
+ subb s3,lo3,s3
+ subb %r0,%r0,lo0 ; these two insns ...
+ add lo0,lo0,%r0 ; ... just invert cy
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ stws,ma s0,4(res_ptr)
+ addc lo1,hi0,lo1
+ stws,ma s1,4(res_ptr)
+ addc lo2,hi1,lo2
+ stws,ma s2,4(res_ptr)
+ addc lo3,hi2,lo3
+ stws,ma s3,4(res_ptr)
+
+ addib,>= -4,size,L$loop
+ addc %r0,hi3,cylimb ; propagate carry into cylimb
+; finish software pipeline ---------------------------------------------------
+L$end ldws 0(res_ptr),s0
+ ldws 4(res_ptr),s1
+ ldws 8(res_ptr),s2
+ ldws 12(res_ptr),s3
+
+ sub s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ subb s1,lo1,s1
+ stws,ma s1,4(res_ptr)
+ subb s2,lo2,s2
+ stws,ma s2,4(res_ptr)
+ subb s3,lo3,s3
+ stws,ma s3,4(res_ptr)
+ subb %r0,%r0,lo0 ; these two insns ...
+ add lo0,lo0,%r0 ; ... invert cy
+
+; restore callee-saves registers ---------------------------------------------
+ ldw -96(%r30),%r3
+ ldw -92(%r30),%r4
+ ldw -88(%r30),%r5
+ ldw -84(%r30),%r6
+ ldw -80(%r30),%r7
+
+L$few_limbs
+ addib,=,n 4,size,L$ret
+L$loop2 fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r30)
+ ldws -16(%r30),hi0
+ ldws -12(%r30),lo0
+ addc lo0,cylimb,lo0
+ addc %r0,hi0,cylimb
+ sub s0,lo0,s0
+ add s0,lo0,%r0 ; invert cy
+ stws,ma s0,4(res_ptr)
+ addib,<> -1,size,L$loop2
+ nop
+
+L$ret addc %r0,cylimb,cylimb
+ bv 0(%r2)
+ ldo -128(%r30),%r30
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/submul_1.s b/rts/gmp/mpn/hppa/hppa1_1/submul_1.s
new file mode 100644
index 0000000000..20a5b5ce0a
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/submul_1.s
@@ -0,0 +1,111 @@
+; HP-PA-1.1 __gmpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r26
+; s1_ptr r25
+; size r24
+; s2_limb r23
+
+; This runs at 12 cycles/limb on a PA7000. With the used instructions, it
+; can not become faster due to data cache contention after a store. On the
+; PA7100 it runs at 11 cycles/limb, and that can not be improved either,
+; since only the xmpyu does not need the integer pipeline, so the only
+; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb
+; on the PA7100.
+
+; There are some ideas described in mul_1.s that apply to this code too.
+
+; It seems possible to make this run as fast as __gmpn_addmul_1, if we use
+; sub,>>= %r29,%r19,%r22
+; addi 1,%r28,%r28
+; but that requires reworking the hairy software pipeline...
+
+ .code
+ .export __gmpn_submul_1
+__gmpn_submul_1
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldo 64(%r30),%r30
+ fldws,ma 4(%r25),%fr5
+ stw %r23,-16(%r30) ; move s2_limb ...
+ addib,= -1,%r24,L$just_one_limb
+ fldws -16(%r30),%fr4 ; ... into fr4
+ add %r0,%r0,%r0 ; clear carry
+ xmpyu %fr4,%fr5,%fr6
+ fldws,ma 4(%r25),%fr7
+ fstds %fr6,-16(%r30)
+ xmpyu %fr4,%fr7,%fr8
+ ldw -12(%r30),%r19 ; least significant limb in product
+ ldw -16(%r30),%r28
+
+ fstds %fr8,-16(%r30)
+ addib,= -1,%r24,L$end
+ ldw -12(%r30),%r1
+
+; Main loop
+L$loop ldws 0(%r26),%r29
+ fldws,ma 4(%r25),%fr5
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r28,%r1,%r19
+ xmpyu %fr4,%fr5,%fr6
+ ldw -16(%r30),%r28
+ fstds %fr6,-16(%r30)
+ addc %r0,%r28,%r28
+ addib,<> -1,%r24,L$loop
+ ldw -12(%r30),%r1
+
+L$end ldw 0(%r26),%r29
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r28,%r1,%r19
+ ldw -16(%r30),%r28
+ ldws 0(%r26),%r29
+ addc %r0,%r28,%r28
+ sub %r29,%r19,%r22
+ add %r22,%r19,%r0
+ stws,ma %r22,4(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+L$just_one_limb
+ xmpyu %fr4,%fr5,%fr6
+ ldw 0(%r26),%r29
+ fstds %fr6,-16(%r30)
+ ldw -12(%r30),%r1
+ ldw -16(%r30),%r28
+ sub %r29,%r1,%r22
+ add %r22,%r1,%r0
+ stw %r22,0(%r26)
+ addc %r0,%r28,%r28
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S b/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S
new file mode 100644
index 0000000000..b83d6f4dd2
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/udiv_qrnnd.S
@@ -0,0 +1,80 @@
+; HP-PA __udiv_qrnnd division support, used from longlong.h.
+; This version runs fast on PA 7000 and later.
+
+; Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; rem_ptr gr26
+; n1 gr25
+; n0 gr24
+; d gr23
+
+ .code
+L$0000 .word 0x43f00000 ; 2^64
+ .word 0x0
+ .export __gmpn_udiv_qrnnd
+__gmpn_udiv_qrnnd
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+ ldo 64(%r30),%r30
+
+ stws %r25,-16(0,%r30) ; n_hi
+ stws %r24,-12(0,%r30) ; n_lo
+#ifdef PIC
+ addil LT%L$0000,%r19
+ ldo RT%L$0000(%r1),%r19
+#else
+ ldil L%L$0000,%r19
+ ldo R%L$0000(%r19),%r19
+#endif
+ fldds -16(0,%r30),%fr5
+ stws %r23,-12(0,%r30)
+ comib,<= 0,%r25,L$1
+ fcnvxf,dbl,dbl %fr5,%fr5
+ fldds 0(0,%r19),%fr4
+ fadd,dbl %fr4,%fr5,%fr5
+L$1
+ fcpy,sgl %fr0,%fr6L
+ fldws -12(0,%r30),%fr6R
+ fcnvxf,dbl,dbl %fr6,%fr4
+
+ fdiv,dbl %fr5,%fr4,%fr5
+
+ fcnvfx,dbl,dbl %fr5,%fr4
+ fstws %fr4R,-16(%r30)
+ xmpyu %fr4R,%fr6R,%fr6
+ ldws -16(%r30),%r28
+ fstds %fr6,-16(0,%r30)
+ ldws -12(0,%r30),%r21
+ ldws -16(0,%r30),%r20
+ sub %r24,%r21,%r22
+ subb %r25,%r20,%r19
+ comib,= 0,%r19,L$2
+ ldo -64(%r30),%r30
+
+ add %r22,%r23,%r22
+ ldo -1(%r28),%r28
+L$2 bv 0(%r2)
+ stws %r22,0(0,%r26)
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa1_1/umul.s b/rts/gmp/mpn/hppa/hppa1_1/umul.s
new file mode 100644
index 0000000000..1f1300ac9b
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa1_1/umul.s
@@ -0,0 +1,42 @@
+; Copyright (C) 1999 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+ .code
+ .export __umul_ppmm
+ .align 4
+__umul_ppmm
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldo 64(%r30),%r30
+ stw %r25,-16(0,%r30)
+ fldws -16(0,%r30),%fr22R
+ stw %r24,-16(0,%r30)
+ fldws -16(0,%r30),%fr22L
+ xmpyu %fr22R,%fr22L,%fr22
+ fstds %fr22,-16(0,%r30)
+ ldw -16(0,%r30),%r28
+ ldw -12(0,%r30),%r29
+ stw %r29,0(0,%r26)
+ bv 0(%r2)
+ ldo -64(%r30),%r30
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa2_0/add_n.s b/rts/gmp/mpn/hppa/hppa2_0/add_n.s
new file mode 100644
index 0000000000..6e97278a39
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa2_0/add_n.s
@@ -0,0 +1,88 @@
+; HP-PA 2.0 32-bit __gmpn_add_n -- Add two limb vectors of the same length > 0
+; and store sum in a third limb vector.
+
+; Copyright (C) 1997, 1998, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+ .code
+ .export __gmpn_add_n
+__gmpn_add_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ sub %r0,%r23,%r22
+ zdep %r22,30,3,%r28 ; r28 = 2 * (-n & 7)
+ zdep %r22,29,3,%r22 ; r22 = 4 * (-n & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ sub %r24,%r22,%r24 ; offset s2_ptr
+ sub %r26,%r22,%r26 ; offset res_ptr
+ blr %r28,%r0 ; branch into loop
+ add %r0,%r0,%r0 ; reset carry
+
+L$loop ldw 0(%r25),%r20
+ ldw 0(%r24),%r31
+ addc %r20,%r31,%r20
+ stw %r20,0(%r26)
+L$7 ldw 4(%r25),%r21
+ ldw 4(%r24),%r19
+ addc %r21,%r19,%r21
+ stw %r21,4(%r26)
+L$6 ldw 8(%r25),%r20
+ ldw 8(%r24),%r31
+ addc %r20,%r31,%r20
+ stw %r20,8(%r26)
+L$5 ldw 12(%r25),%r21
+ ldw 12(%r24),%r19
+ addc %r21,%r19,%r21
+ stw %r21,12(%r26)
+L$4 ldw 16(%r25),%r20
+ ldw 16(%r24),%r31
+ addc %r20,%r31,%r20
+ stw %r20,16(%r26)
+L$3 ldw 20(%r25),%r21
+ ldw 20(%r24),%r19
+ addc %r21,%r19,%r21
+ stw %r21,20(%r26)
+L$2 ldw 24(%r25),%r20
+ ldw 24(%r24),%r31
+ addc %r20,%r31,%r20
+ stw %r20,24(%r26)
+L$1 ldw 28(%r25),%r21
+ ldo 32(%r25),%r25
+ ldw 28(%r24),%r19
+ addc %r21,%r19,%r21
+ stw %r21,28(%r26)
+ ldo 32(%r24),%r24
+ addib,> -8,%r23,L$loop
+ ldo 32(%r26),%r26
+
+ bv (%r2)
+ .exit
+ addc %r0,%r0,%r28
+ .procend
diff --git a/rts/gmp/mpn/hppa/hppa2_0/sub_n.s b/rts/gmp/mpn/hppa/hppa2_0/sub_n.s
new file mode 100644
index 0000000000..7d9b50fc27
--- /dev/null
+++ b/rts/gmp/mpn/hppa/hppa2_0/sub_n.s
@@ -0,0 +1,88 @@
+; HP-PA 2.0 32-bit __gmpn_sub_n -- Subtract two limb vectors of the same
+; length > 0 and store difference in a third limb vector.
+
+; Copyright (C) 1997, 1998, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+ .code
+ .export __gmpn_sub_n
+__gmpn_sub_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ sub %r0,%r23,%r22
+ zdep %r22,30,3,%r28 ; r28 = 2 * (-n & 7)
+ zdep %r22,29,3,%r22 ; r22 = 4 * (-n & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ sub %r24,%r22,%r24 ; offset s2_ptr
+ blr %r28,%r0 ; branch into loop
+ sub %r26,%r22,%r26 ; offset res_ptr and set carry
+
+L$loop ldw 0(%r25),%r20
+ ldw 0(%r24),%r31
+ subb %r20,%r31,%r20
+ stw %r20,0(%r26)
+L$7 ldw 4(%r25),%r21
+ ldw 4(%r24),%r19
+ subb %r21,%r19,%r21
+ stw %r21,4(%r26)
+L$6 ldw 8(%r25),%r20
+ ldw 8(%r24),%r31
+ subb %r20,%r31,%r20
+ stw %r20,8(%r26)
+L$5 ldw 12(%r25),%r21
+ ldw 12(%r24),%r19
+ subb %r21,%r19,%r21
+ stw %r21,12(%r26)
+L$4 ldw 16(%r25),%r20
+ ldw 16(%r24),%r31
+ subb %r20,%r31,%r20
+ stw %r20,16(%r26)
+L$3 ldw 20(%r25),%r21
+ ldw 20(%r24),%r19
+ subb %r21,%r19,%r21
+ stw %r21,20(%r26)
+L$2 ldw 24(%r25),%r20
+ ldw 24(%r24),%r31
+ subb %r20,%r31,%r20
+ stw %r20,24(%r26)
+L$1 ldw 28(%r25),%r21
+ ldo 32(%r25),%r25
+ ldw 28(%r24),%r19
+ subb %r21,%r19,%r21
+ stw %r21,28(%r26)
+ ldo 32(%r24),%r24
+ addib,> -8,%r23,L$loop
+ ldo 32(%r26),%r26
+
+ addc %r0,%r0,%r28
+ bv (%r2)
+ .exit
+ subi 1,%r28,%r28
+ .procend
diff --git a/rts/gmp/mpn/hppa/lshift.s b/rts/gmp/mpn/hppa/lshift.s
new file mode 100644
index 0000000000..f5a2daad60
--- /dev/null
+++ b/rts/gmp/mpn/hppa/lshift.s
@@ -0,0 +1,66 @@
+; HP-PA __gmpn_lshift --
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s_ptr gr25
+; size gr24
+; cnt gr23
+
+ .code
+ .export __gmpn_lshift
+__gmpn_lshift
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ sh2add %r24,%r25,%r25
+ sh2add %r24,%r26,%r26
+ ldws,mb -4(0,%r25),%r22
+ subi 32,%r23,%r1
+ mtsar %r1
+ addib,= -1,%r24,L$0004
+ vshd %r0,%r22,%r28 ; compute carry out limb
+ ldws,mb -4(0,%r25),%r29
+ addib,= -1,%r24,L$0002
+ vshd %r22,%r29,%r20
+
+L$loop ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ addib,= -1,%r24,L$0003
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,<> -1,%r24,L$loop
+ vshd %r22,%r29,%r20
+
+L$0002 stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+L$0003 stws,mb %r20,-4(0,%r26)
+L$0004 vshd %r22,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/rshift.s b/rts/gmp/mpn/hppa/rshift.s
new file mode 100644
index 0000000000..e05e2f10b5
--- /dev/null
+++ b/rts/gmp/mpn/hppa/rshift.s
@@ -0,0 +1,63 @@
+; HP-PA __gmpn_rshift --
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s_ptr gr25
+; size gr24
+; cnt gr23
+
+ .code
+ .export __gmpn_rshift
+__gmpn_rshift
+ .proc
+ .callinfo frame=64,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r22
+ mtsar %r23
+ addib,= -1,%r24,L$0004
+ vshd %r22,%r0,%r28 ; compute carry out limb
+ ldws,ma 4(0,%r25),%r29
+ addib,= -1,%r24,L$0002
+ vshd %r29,%r22,%r20
+
+L$loop ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ addib,= -1,%r24,L$0003
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,<> -1,%r24,L$loop
+ vshd %r29,%r22,%r20
+
+L$0002 stws,ma %r20,4(0,%r26)
+ vshd %r0,%r29,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+L$0003 stws,ma %r20,4(0,%r26)
+L$0004 vshd %r0,%r22,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/sub_n.s b/rts/gmp/mpn/hppa/sub_n.s
new file mode 100644
index 0000000000..8f770ad1ad
--- /dev/null
+++ b/rts/gmp/mpn/hppa/sub_n.s
@@ -0,0 +1,59 @@
+; HP-PA __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; One might want to unroll this as for other processors, but it turns
+; out that the data cache contention after a store makes such
+; unrolling useless. We can't come under 5 cycles/limb anyway.
+
+ .code
+ .export __gmpn_sub_n
+__gmpn_sub_n
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,= -1,%r23,L$end ; check for (SIZE == 1)
+ sub %r20,%r19,%r28 ; subtract first limbs ignoring cy
+
+L$loop ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,<> -1,%r23,L$loop
+ subb %r20,%r19,%r28
+
+L$end stws %r28,0(0,%r26)
+ addc %r0,%r0,%r28
+ bv 0(%r2)
+ subi 1,%r28,%r28
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/hppa/udiv_qrnnd.s b/rts/gmp/mpn/hppa/udiv_qrnnd.s
new file mode 100644
index 0000000000..9aa3b8a830
--- /dev/null
+++ b/rts/gmp/mpn/hppa/udiv_qrnnd.s
@@ -0,0 +1,286 @@
+; HP-PA __udiv_qrnnd division support, used from longlong.h.
+; This version runs fast on pre-PA7000 CPUs.
+
+; Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; rem_ptr gr26
+; n1 gr25
+; n0 gr24
+; d gr23
+
+; The code size is a bit excessive. We could merge the last two ds;addc
+; sequences by simply moving the "bb,< Odd" instruction down. The only
+; trouble is the FFFFFFFF code that would need some hacking.
+
+ .code
+ .export __gmpn_udiv_qrnnd
+__gmpn_udiv_qrnnd
+ .proc
+ .callinfo frame=0,no_calls
+ .entry
+
+ comb,< %r23,0,L$largedivisor
+ sub %r0,%r23,%r1 ; clear cy as side-effect
+ ds %r0,%r1,%r0
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r23,%r25
+ addc %r24,%r24,%r28
+ ds %r25,%r23,%r25
+ comclr,>= %r25,%r0,%r0
+ addl %r25,%r23,%r25
+ stws %r25,0(0,%r26)
+ bv 0(%r2)
+ addc %r28,%r28,%r28
+
+L$largedivisor
+ extru %r24,31,1,%r19 ; r19 = n0 & 1
+ bb,< %r23,31,L$odd
+ extru %r23,30,31,%r22 ; r22 = d >> 1
+ shd %r25,%r24,1,%r24 ; r24 = new n0
+ extru %r25,30,31,%r25 ; r25 = new n1
+ sub %r0,%r22,%r21
+ ds %r0,%r21,%r0
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ comclr,>= %r25,%r0,%r0
+ addl %r25,%r22,%r25
+ sh1addl %r25,%r19,%r25
+ stws %r25,0(0,%r26)
+ bv 0(%r2)
+ addc %r24,%r24,%r28
+
+L$odd addib,sv,n 1,%r22,L$FF.. ; r22 = (d / 2 + 1)
+ shd %r25,%r24,1,%r24 ; r24 = new n0
+ extru %r25,30,31,%r25 ; r25 = new n1
+ sub %r0,%r22,%r21
+ ds %r0,%r21,%r0
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r24
+ ds %r25,%r22,%r25
+ addc %r24,%r24,%r28
+ comclr,>= %r25,%r0,%r0
+ addl %r25,%r22,%r25
+ sh1addl %r25,%r19,%r25
+; We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25
+ add,nuv %r28,%r25,%r25
+ addl %r25,%r1,%r25
+ addc %r0,%r28,%r28
+ sub,<< %r25,%r23,%r0
+ addl %r25,%r1,%r25
+ stws %r25,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r28,%r28
+
+; This is just a special case of the code above.
+; We come here when d == 0xFFFFFFFF
+L$FF.. add,uv %r25,%r24,%r24
+ sub,<< %r24,%r23,%r0
+ ldo 1(%r24),%r24
+ stws %r24,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r25,%r28
+
+ .exit
+ .procend
diff --git a/rts/gmp/mpn/i960/README b/rts/gmp/mpn/i960/README
new file mode 100644
index 0000000000..d68a0a83eb
--- /dev/null
+++ b/rts/gmp/mpn/i960/README
@@ -0,0 +1,9 @@
+This directory contains mpn functions for Intel i960 processors.
+
+RELEVANT OPTIMIZATION ISSUES
+
+The code in this directory is not well optimized.
+
+STATUS
+
+The code in this directory has not been tested.
diff --git a/rts/gmp/mpn/i960/add_n.s b/rts/gmp/mpn/i960/add_n.s
new file mode 100644
index 0000000000..387317a397
--- /dev/null
+++ b/rts/gmp/mpn/i960/add_n.s
@@ -0,0 +1,43 @@
+# I960 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 4
+ .globl ___gmpn_add_n
+___gmpn_add_n:
+ mov 0,g6 # clear carry-save register
+ cmpo 1,0 # clear cy
+
+Loop: subo 1,g3,g3 # update loop counter
+ ld (g1),g5 # load from s1_ptr
+ addo 4,g1,g1 # s1_ptr++
+ ld (g2),g4 # load from s2_ptr
+ addo 4,g2,g2 # s2_ptr++
+ cmpo g6,1 # restore cy from g6, relies on cy being 0
+ addc g4,g5,g4 # main add
+ subc 0,0,g6 # save cy in g6
+ st g4,(g0) # store result to res_ptr
+ addo 4,g0,g0 # res_ptr++
+ cmpobne 0,g3,Loop # when branch is taken, clears C bit
+
+ mov g6,g0
+ ret
diff --git a/rts/gmp/mpn/i960/addmul_1.s b/rts/gmp/mpn/i960/addmul_1.s
new file mode 100644
index 0000000000..7df1418356
--- /dev/null
+++ b/rts/gmp/mpn/i960/addmul_1.s
@@ -0,0 +1,48 @@
+# I960 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 4
+	.globl	___gmpn_addmul_1
+___gmpn_addmul_1:
+ subo g2,0,g2
+ shlo 2,g2,g4
+ subo g4,g1,g1
+ subo g4,g0,g13
+ mov 0,g0
+
+ cmpo 1,0 # clear C bit on AC.cc
+
+Loop: ld (g1)[g2*4],g5
+ emul g3,g5,g6
+ ld (g13)[g2*4],g5
+
+ addc g0,g6,g6 # relies on that C bit is clear
+ addc 0,g7,g7
+ addc g5,g6,g6 # relies on that C bit is clear
+ st g6,(g13)[g2*4]
+ addc 0,g7,g0
+
+ addo g2,1,g2
+ cmpobne 0,g2,Loop # when branch is taken, clears C bit
+
+ ret
diff --git a/rts/gmp/mpn/i960/mul_1.s b/rts/gmp/mpn/i960/mul_1.s
new file mode 100644
index 0000000000..5c0c985aa5
--- /dev/null
+++ b/rts/gmp/mpn/i960/mul_1.s
@@ -0,0 +1,45 @@
+# I960 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 4
+ .globl ___gmpn_mul_1
+___gmpn_mul_1:
+ subo g2,0,g2
+ shlo 2,g2,g4
+ subo g4,g1,g1
+ subo g4,g0,g13
+ mov 0,g0
+
+ cmpo 1,0 # clear C bit on AC.cc
+
+Loop: ld (g1)[g2*4],g5
+ emul g3,g5,g6
+
+ addc g0,g6,g6 # relies on that C bit is clear
+ st g6,(g13)[g2*4]
+ addc 0,g7,g0
+
+ addo g2,1,g2
+ cmpobne 0,g2,Loop # when branch is taken, clears C bit
+
+ ret
diff --git a/rts/gmp/mpn/i960/sub_n.s b/rts/gmp/mpn/i960/sub_n.s
new file mode 100644
index 0000000000..2db2d46aad
--- /dev/null
+++ b/rts/gmp/mpn/i960/sub_n.s
@@ -0,0 +1,43 @@
+# I960 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+# store difference in a third limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 4
+ .globl ___gmpn_sub_n
+___gmpn_sub_n:
+ mov 1,g6 # set carry-save register
+ cmpo 1,0 # clear cy
+
+Loop: subo 1,g3,g3 # update loop counter
+ ld (g1),g5 # load from s1_ptr
+ addo 4,g1,g1 # s1_ptr++
+ ld (g2),g4 # load from s2_ptr
+ addo 4,g2,g2 # s2_ptr++
+ cmpo g6,1 # restore cy from g6, relies on cy being 0
+ subc g4,g5,g4 # main subtract
+ subc 0,0,g6 # save cy in g6
+ st g4,(g0) # store result to res_ptr
+ addo 4,g0,g0 # res_ptr++
+ cmpobne 0,g3,Loop # when branch is taken, cy will be 0
+
+ mov g6,g0
+ ret
diff --git a/rts/gmp/mpn/lisp/gmpasm-mode.el b/rts/gmp/mpn/lisp/gmpasm-mode.el
new file mode 100644
index 0000000000..5d9da7fa1f
--- /dev/null
+++ b/rts/gmp/mpn/lisp/gmpasm-mode.el
@@ -0,0 +1,351 @@
+;;; gmpasm-mode.el -- GNU MP asm and m4 editing mode.
+
+
+;; Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+;;
+;; This file is part of the GNU MP Library.
+;;
+;; The GNU MP Library is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU Lesser General Public License as published by
+;; the Free Software Foundation; either version 2.1 of the License, or (at your
+;; option) any later version.
+;;
+;; The GNU MP Library is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+;; License for more details.
+;;
+;; You should have received a copy of the GNU Lesser General Public License
+;; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+;; MA 02111-1307, USA.
+
+
+;;; Commentary:
+;;
+;; gmpasm-mode is an editing mode for m4 processed assembler code and m4
+;; macro files in GMP. It's similar to m4-mode, but has a number of
+;; settings better suited to GMP.
+;;
+;;
+;; Install
+;; -------
+;;
+;; To make M-x gmpasm-mode available, put gmpasm-mode.el somewhere in the
+;; load-path and the following in .emacs
+;;
+;; (autoload 'gmpasm-mode "gmpasm-mode" nil t)
+;;
+;; To use gmpasm-mode automatically on all .asm and .m4 files, put the
+;; following in .emacs
+;;
+;; (add-to-list 'auto-mode-alist '("\\.asm\\'" . gmpasm-mode))
+;; (add-to-list 'auto-mode-alist '("\\.m4\\'" . gmpasm-mode))
+;;
+;; To have gmpasm-mode only on gmp files, try instead something like the
+;; following, which uses it only in a directory starting with "gmp", or a
+;; sub-directory of such.
+;;
+;; (add-to-list 'auto-mode-alist
+;; '("/gmp.*/.*\\.\\(asm\\|m4\\)\\'" . gmpasm-mode))
+;;
+;; Byte compiling will slightly speed up loading. If you want a docstring
+;; in the autoload you can use M-x update-file-autoloads if you set it up
+;; right.
+;;
+;;
+;; Emacsen
+;; -------
+;;
+;; FSF Emacs 20.x - gmpasm-mode is designed for this.
+;; XEmacs 20.x - seems to work.
+;;
+;; FSF Emacs 19.x - should work if replacements for some 20.x-isms are
+;; available. comment-region with "C" won't really do the right thing
+;; though.
+
+
+;;; Code:
+
+(defgroup gmpasm nil
+ "GNU MP m4 and asm editing."
+ :prefix "gmpasm-"
+ :group 'languages)
+
+(defcustom gmpasm-mode-hook nil
+ "*Hook called by `gmpasm-mode'."
+ :type 'hook
+ :group 'gmpasm)
+
+(defcustom gmpasm-comment-start-regexp "[#;!@C]"
+ "*Regexp matching possible comment styles.
+See `gmpasm-mode' docstring for how this is used."
+ :type 'regexp
+ :group 'gmpasm)
+
+
+(defun gmpasm-add-to-list-second (list-var element)
+ "(gmpasm-add-to-list-second LIST-VAR ELEMENT)
+
+Add ELEMENT to LIST-VAR as the second element in the list, if it isn't
+already in the list. If LIST-VAR is nil, then ELEMENT is just added as the
+sole element in the list.
+
+This is like `add-to-list', but it puts the new value second in the list.
+
+The first cons cell is copied rather than changed in-place, so references to
+the list elsewhere won't be affected."
+
+ (if (member element (symbol-value list-var))
+ (symbol-value list-var)
+ (set list-var
+ (if (symbol-value list-var)
+ (cons (car (symbol-value list-var))
+ (cons element
+ (cdr (symbol-value list-var))))
+ (list element)))))
+
+
+(defun gmpasm-delete-from-list (list-var element)
+ "(gmpasm-delete-from-list LIST-VAR ELEMENT)
+
+Delete ELEMENT from LIST-VAR, using `delete'.
+This is like `add-to-list', but the element is deleted from the list.
+The list is copied rather than changed in-place, so references to it elsewhere
+won't be affected."
+
+ (set list-var (delete element (copy-sequence (symbol-value list-var)))))
+
+
+(defvar gmpasm-mode-map
+ (let ((map (make-sparse-keymap)))
+
+ ;; assembler and dnl commenting
+ (define-key map "\C-c\C-c" 'comment-region)
+ (define-key map "\C-c\C-d" 'gmpasm-comment-region-dnl)
+
+ ;; kill an M-x compile, since it's not hard to put m4 into an infinite
+ ;; loop
+ (define-key map "\C-c\C-k" 'kill-compilation)
+
+ map)
+ "Keymap for `gmpasm-mode'.")
+
+
+(defvar gmpasm-mode-syntax-table
+ (let ((table (make-syntax-table)))
+ ;; underscore left as a symbol char, like C mode
+
+ ;; m4 quotes
+ (modify-syntax-entry ?` "('" table)
+ (modify-syntax-entry ?' ")`" table)
+
+ table)
+ "Syntax table used in `gmpasm-mode'.
+
+m4 ignores quote marks in # comments at the top level, but inside quotes #
+isn't special and all quotes are active. There seems no easy way to express
+this in the syntax table, so nothing is done for comments. Usually this is
+best, since it picks up invalid apostrophes in comments inside quotes.")
+
+
+(defvar gmpasm-font-lock-keywords
+ (eval-when-compile
+ (list
+ (cons
+ (concat
+ "\\b"
+ (regexp-opt
+ '("deflit" "defreg" "defframe" "defframe_pushl"
+ "define_not_for_expansion"
+ "ASM_START" "ASM_END" "PROLOGUE" "EPILOGUE"
+ "forloop"
+ "TEXT" "DATA" "ALIGN" "W32"
+ "builtin" "changecom" "changequote" "changeword" "debugfile"
+ "debugmode" "decr" "define" "defn" "divert" "divnum" "dumpdef"
+ "errprint" "esyscmd" "eval" "__file__" "format" "gnu" "ifdef"
+ "ifelse" "include" "incr" "index" "indir" "len" "__line__"
+ "m4exit" "m4wrap" "maketemp" "patsubst" "popdef" "pushdef"
+ "regexp" "shift" "sinclude" "substr" "syscmd" "sysval"
+ "traceoff" "traceon" "translit" "undefine" "undivert" "unix")
+ t)
+ "\\b") 'font-lock-keyword-face)))
+
+ "`font-lock-keywords' for `gmpasm-mode'.
+
+The keywords are m4 builtins and some of the GMP macros used in asm files.
+L and LF don't look good fontified, so they're omitted.
+
+The right assembler comment regexp is added dynamically buffer-local (with
+dnl too).")
+
+
+;; Initialized if gmpasm-mode finds filladapt loaded.
+(defvar gmpasm-filladapt-token-table nil
+ "Filladapt token table used in `gmpasm-mode'.")
+(defvar gmpasm-filladapt-token-match-table nil
+ "Filladapt token match table used in `gmpasm-mode'.")
+(defvar gmpasm-filladapt-token-conversion-table nil
+ "Filladapt token conversion table used in `gmpasm-mode'.")
+
+
+;;;###autoload
+(defun gmpasm-mode ()
+ "A major mode for editing GNU MP asm and m4 files.
+
+\\{gmpasm-mode-map}
+`comment-start' and `comment-end' are set buffer-local to assembler
+commenting appropriate for the CPU by looking for something matching
+`gmpasm-comment-start-regexp' at the start of a line, or \"#\" is used if
+there's no match (if \"#\" isn't what you want, type in a desired comment
+and do \\[gmpasm-mode] to reinitialize).
+
+`adaptive-fill-regexp' is set buffer-local to the standard regexp with
+`comment-start' and dnl added. If filladapt.el has been loaded it similarly
+gets `comment-start' and dnl added as buffer-local fill prefixes.
+
+Font locking has the m4 builtins, some of the GMP macros, m4 dnl commenting,
+and assembler commenting (based on the `comment-start' determined).
+
+Note that `gmpasm-comment-start-regexp' is only matched as a whole word, so
+the `C' in it is only matched as a whole word, not on something that happens
+to start with `C'. Also it's only the particular `comment-start' determined
+that's added for filling etc, not the whole `gmpasm-comment-start-regexp'.
+
+`gmpasm-mode-hook' is run after initializations are complete.
+"
+
+ (interactive)
+ (kill-all-local-variables)
+ (setq major-mode 'gmpasm-mode
+ mode-name "gmpasm")
+ (use-local-map gmpasm-mode-map)
+ (set-syntax-table gmpasm-mode-syntax-table)
+ (setq fill-column 76)
+
+ ;; Short instructions might fit with 32, but anything with labels or
+ ;; expressions soon needs the comments pushed out to column 40.
+ (setq comment-column 40)
+
+ ;; Don't want to find out the hard way which dumb assemblers don't like a
+ ;; missing final newline.
+ (set (make-local-variable 'require-final-newline) t)
+
+ ;; The first match of gmpasm-comment-start-regexp at the start of a line
+ ;; determines comment-start, or "#" if no match.
+ (set (make-local-variable 'comment-start)
+ (save-excursion
+ (goto-char (point-min))
+ (if (re-search-forward
+ (concat "^\\(" gmpasm-comment-start-regexp "\\)\\(\\s-\\|$\\)")
+ nil t)
+ (match-string 1)
+ "#")))
+ (set (make-local-variable 'comment-end) "")
+
+ ;; If comment-start ends in an alphanumeric then \b is used to match it
+ ;; only as a separate word. The test is for an alphanumeric rather than
+ ;; \w since we might try # or ! as \w characters but without wanting \b.
+ (let ((comment-regexp
+ (concat (regexp-quote comment-start)
+ (if (string-match "[a-zA-Z0-9]\\'" comment-start) "\\b"))))
+
+ ;; Whitespace is required before a comment-start so m4 $# doesn't match
+ ;; when comment-start is "#".
+ ;; Only spaces or tabs match after, so newline isn't included in the
+ ;; font lock below.
+ (set (make-local-variable 'comment-start-skip)
+ (concat "\\(^\\|\\s-\\)" comment-regexp "[ \t]*"))
+
+ ;; Comment fontification based on comment-start, matching through to the
+ ;; end of the line.
+ (add-to-list (make-local-variable 'gmpasm-font-lock-keywords)
+ (cons (concat
+ "\\(\\bdnl\\b\\|" comment-start-skip "\\).*$")
+ 'font-lock-comment-face))
+
+ (set (make-local-variable 'font-lock-defaults)
+ '(gmpasm-font-lock-keywords
+ t ; no syntactic fontification (of strings etc)
+ nil ; no case-fold
+ ((?_ . "w")) ; _ part of a word while fontifying
+ ))
+
+ ;; Paragraphs are separated by blank lines, or lines with only dnl or
+ ;; comment-start.
+ (set (make-local-variable 'paragraph-separate)
+ (concat "[ \t\f]*\\(\\(" comment-regexp "\\|dnl\\)[ \t]*\\)*$"))
+ (set (make-local-variable 'paragraph-start)
+ (concat "\f\\|" paragraph-separate))
+
+ ;; Adaptive fill gets dnl and comment-start as comment style prefixes on
+ ;; top of the standard regexp (which has # and ; already actually).
+ (set (make-local-variable 'adaptive-fill-regexp)
+ (concat "[ \t]*\\(\\("
+ comment-regexp
+ "\\|dnl\\|[-|#;>*]+\\|(?[0-9]+[.)]\\)[ \t]*\\)*"))
+ (set (make-local-variable 'adaptive-fill-first-line-regexp)
+ "\\`\\([ \t]*dnl\\)?[ \t]*\\'")
+
+ (when (fboundp 'filladapt-mode)
+ (when (not gmpasm-filladapt-token-table)
+ (setq gmpasm-filladapt-token-table
+ filladapt-token-table)
+ (setq gmpasm-filladapt-token-match-table
+ filladapt-token-match-table)
+ (setq gmpasm-filladapt-token-conversion-table
+ filladapt-token-conversion-table)
+
+ ;; Numbered bullet points like "2.1" get matched at the start of a
+ ;; line when it's really something like "2.1 cycles/limb", so delete
+ ;; this from the list. The regexp for "1.", "2." etc is left
+ ;; though.
+ (gmpasm-delete-from-list 'gmpasm-filladapt-token-table
+ '("[0-9]+\\(\\.[0-9]+\\)+[ \t]"
+ bullet))
+
+ ;; "%" as a comment prefix interferes with x86 register names
+ ;; like %eax, so delete this.
+ (gmpasm-delete-from-list 'gmpasm-filladapt-token-table
+ '("%+" postscript-comment))
+
+ (add-to-list 'gmpasm-filladapt-token-match-table
+ '(gmpasm-comment gmpasm-comment))
+ (add-to-list 'gmpasm-filladapt-token-conversion-table
+ '(gmpasm-comment . exact))
+ )
+
+ (set (make-local-variable 'filladapt-token-table)
+ gmpasm-filladapt-token-table)
+ (set (make-local-variable 'filladapt-token-match-table)
+ gmpasm-filladapt-token-match-table)
+ (set (make-local-variable 'filladapt-token-conversion-table)
+ gmpasm-filladapt-token-conversion-table)
+
+ ;; Add dnl and comment-start as fill prefixes.
+ ;; Comments in filladapt.el say filladapt-token-table must begin
+ ;; with ("^" beginning-of-line), so put our addition second.
+ (gmpasm-add-to-list-second 'filladapt-token-table
+ (list (concat "dnl[ \t]\\|" comment-regexp)
+ 'gmpasm-comment))
+ ))
+
+ (run-hooks 'gmpasm-mode-hook))
+
+
+(defun gmpasm-comment-region-dnl (beg end &optional arg)
+  "(gmpasm-comment-region-dnl BEG END &optional ARG)
+
+Comment or uncomment each line in the region using `dnl'.
+With \\[universal-argument] prefix arg, uncomment each line in region.
+This is `comment-region', but using \"dnl\"."
+
+ (interactive "r\nP")
+ (let ((comment-start "dnl")
+ (comment-end ""))
+ (comment-region beg end arg)))
+
+
+(provide 'gmpasm-mode)
+
+;;; gmpasm-mode.el ends here
diff --git a/rts/gmp/mpn/m68k/add_n.S b/rts/gmp/mpn/m68k/add_n.S
new file mode 100644
index 0000000000..9e1d89d64f
--- /dev/null
+++ b/rts/gmp/mpn/m68k/add_n.S
@@ -0,0 +1,79 @@
+/* mc68020 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+ sum in a third limb vector.
+
+Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s1_ptr (sp + 8)
+ s2_ptr	(sp + 12)
+ size		(sp + 16)
+*/
+
+#include "asm-syntax.h"
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_add_n)
+
+C_SYMBOL_NAME(__gmpn_add_n:)
+PROLOG(__gmpn_add_n)
+/* Save used registers on the stack. */
+ movel R(d2),MEM_PREDEC(sp)
+ movel R(a2),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. Better use movem? */
+ movel MEM_DISP(sp,12),R(a2)
+ movel MEM_DISP(sp,16),R(a0)
+ movel MEM_DISP(sp,20),R(a1)
+ movel MEM_DISP(sp,24),R(d2)
+
+ eorw #1,R(d2)
+ lsrl #1,R(d2)
+ bcc L(L1)
+ subql #1,R(d2) /* clears cy as side effect */
+
+L(Loop:)
+ movel MEM_POSTINC(a0),R(d0)
+ movel MEM_POSTINC(a1),R(d1)
+ addxl R(d1),R(d0)
+ movel R(d0),MEM_POSTINC(a2)
+L(L1:) movel MEM_POSTINC(a0),R(d0)
+ movel MEM_POSTINC(a1),R(d1)
+ addxl R(d1),R(d0)
+ movel R(d0),MEM_POSTINC(a2)
+
+ dbf R(d2),L(Loop) /* loop until 16 lsb of %4 == -1 */
+ subxl R(d0),R(d0) /* d0 <= -cy; save cy as 0 or -1 in d0 */
+ subl #0x10000,R(d2)
+ bcs L(L2)
+ addl R(d0),R(d0) /* restore cy */
+ bra L(Loop)
+
+L(L2:)
+ negl R(d0)
+
+/* Restore used registers from stack frame. */
+ movel MEM_POSTINC(sp),R(a2)
+ movel MEM_POSTINC(sp),R(d2)
+
+ rts
+EPILOG(__gmpn_add_n)
diff --git a/rts/gmp/mpn/m68k/lshift.S b/rts/gmp/mpn/m68k/lshift.S
new file mode 100644
index 0000000000..a539d5d42e
--- /dev/null
+++ b/rts/gmp/mpn/m68k/lshift.S
@@ -0,0 +1,150 @@
+/* mc68020 __gmpn_lshift -- Shift left a low-level natural-number integer.
+
+Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s_ptr (sp + 8)
+ s_size (sp + 16)
+ cnt (sp + 12)
+*/
+
+#include "asm-syntax.h"
+
+#define res_ptr a1
+#define s_ptr a0
+#define s_size d6
+#define cnt d4
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_lshift)
+
+C_SYMBOL_NAME(__gmpn_lshift:)
+PROLOG(__gmpn_lshift)
+
+/* Save used registers on the stack. */
+ moveml R(d2)-R(d6)/R(a2),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. */
+ movel MEM_DISP(sp,28),R(res_ptr)
+ movel MEM_DISP(sp,32),R(s_ptr)
+ movel MEM_DISP(sp,36),R(s_size)
+ movel MEM_DISP(sp,40),R(cnt)
+
+ moveql #1,R(d5)
+ cmpl R(d5),R(cnt)
+ bne L(Lnormal)
+ cmpl R(s_ptr),R(res_ptr)
+ bls L(Lspecial) /* jump if s_ptr >= res_ptr */
+#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
+ lea MEM_INDX1(s_ptr,s_size,l,4),R(a2)
+#else /* not mc68020 */
+ movel R(s_size),R(d0)
+ asll #2,R(d0)
+ lea MEM_INDX(s_ptr,d0,l),R(a2)
+#endif
+ cmpl R(res_ptr),R(a2)
+ bls L(Lspecial) /* jump if res_ptr >= s_ptr + s_size */
+
+L(Lnormal:)
+ moveql #32,R(d5)
+ subl R(cnt),R(d5)
+
+#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
+ lea MEM_INDX1(s_ptr,s_size,l,4),R(s_ptr)
+ lea MEM_INDX1(res_ptr,s_size,l,4),R(res_ptr)
+#else /* not mc68000 */
+ movel R(s_size),R(d0)
+ asll #2,R(d0)
+ addl R(s_size),R(s_ptr)
+ addl R(s_size),R(res_ptr)
+#endif
+ movel MEM_PREDEC(s_ptr),R(d2)
+ movel R(d2),R(d0)
+ lsrl R(d5),R(d0) /* compute carry limb */
+
+ lsll R(cnt),R(d2)
+ movel R(d2),R(d1)
+ subql #1,R(s_size)
+ beq L(Lend)
+ lsrl #1,R(s_size)
+ bcs L(L1)
+ subql #1,R(s_size)
+
+L(Loop:)
+ movel MEM_PREDEC(s_ptr),R(d2)
+ movel R(d2),R(d3)
+ lsrl R(d5),R(d3)
+ orl R(d3),R(d1)
+ movel R(d1),MEM_PREDEC(res_ptr)
+ lsll R(cnt),R(d2)
+L(L1:)
+ movel MEM_PREDEC(s_ptr),R(d1)
+ movel R(d1),R(d3)
+ lsrl R(d5),R(d3)
+ orl R(d3),R(d2)
+ movel R(d2),MEM_PREDEC(res_ptr)
+ lsll R(cnt),R(d1)
+
+ dbf R(s_size),L(Loop)
+ subl #0x10000,R(s_size)
+ bcc L(Loop)
+
+L(Lend:)
+ movel R(d1),MEM_PREDEC(res_ptr) /* store least significant limb */
+
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
+ rts
+
+/* We loop from least significant end of the arrays, which is only
+ permissable if the source and destination don't overlap, since the
+ function is documented to work for overlapping source and destination. */
+
+L(Lspecial:)
+ clrl R(d0) /* initialize carry */
+ eorw #1,R(s_size)
+ lsrl #1,R(s_size)
+ bcc L(LL1)
+ subql #1,R(s_size)
+
+L(LLoop:)
+ movel MEM_POSTINC(s_ptr),R(d2)
+ addxl R(d2),R(d2)
+ movel R(d2),MEM_POSTINC(res_ptr)
+L(LL1:)
+ movel MEM_POSTINC(s_ptr),R(d2)
+ addxl R(d2),R(d2)
+ movel R(d2),MEM_POSTINC(res_ptr)
+
+ dbf R(s_size),L(LLoop)
+ addxl R(d0),R(d0) /* save cy in lsb */
+ subl #0x10000,R(s_size)
+ bcs L(LLend)
+ lsrl #1,R(d0) /* restore cy */
+ bra L(LLoop)
+
+L(LLend:)
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
+ rts
+EPILOG(__gmpn_lshift)
diff --git a/rts/gmp/mpn/m68k/mc68020/addmul_1.S b/rts/gmp/mpn/m68k/mc68020/addmul_1.S
new file mode 100644
index 0000000000..6638115d71
--- /dev/null
+++ b/rts/gmp/mpn/m68k/mc68020/addmul_1.S
@@ -0,0 +1,83 @@
+/* mc68020 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+ the result to a second limb vector.
+
+Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s1_ptr (sp + 8)
+ s1_size (sp + 12)
+ s2_limb (sp + 16)
+*/
+
+#include "asm-syntax.h"
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_addmul_1)
+
+C_SYMBOL_NAME(__gmpn_addmul_1:)
+PROLOG(__gmpn_addmul_1)
+
+#define res_ptr a0
+#define s1_ptr a1
+#define s1_size d2
+#define s2_limb d4
+
+/* Save used registers on the stack. */
+ moveml R(d2)-R(d5),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. Better use movem? */
+ movel MEM_DISP(sp,20),R(res_ptr)
+ movel MEM_DISP(sp,24),R(s1_ptr)
+ movel MEM_DISP(sp,28),R(s1_size)
+ movel MEM_DISP(sp,32),R(s2_limb)
+
+ eorw #1,R(s1_size)
+ clrl R(d1)
+ clrl R(d5)
+ lsrl #1,R(s1_size)
+ bcc L(L1)
+ subql #1,R(s1_size)
+ subl R(d0),R(d0) /* (d0,cy) <= (0,0) */
+
+L(Loop:)
+ movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d1):R(d3)
+ addxl R(d0),R(d3)
+ addxl R(d5),R(d1)
+ addl R(d3),MEM_POSTINC(res_ptr)
+L(L1:) movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d0):R(d3)
+ addxl R(d1),R(d3)
+ addxl R(d5),R(d0)
+ addl R(d3),MEM_POSTINC(res_ptr)
+
+ dbf R(s1_size),L(Loop)
+ addxl R(d5),R(d0)
+ subl #0x10000,R(s1_size)
+ bcc L(Loop)
+
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d5)
+
+ rts
+EPILOG(__gmpn_addmul_1)
diff --git a/rts/gmp/mpn/m68k/mc68020/mul_1.S b/rts/gmp/mpn/m68k/mc68020/mul_1.S
new file mode 100644
index 0000000000..fdd4c39d70
--- /dev/null
+++ b/rts/gmp/mpn/m68k/mc68020/mul_1.S
@@ -0,0 +1,90 @@
+/* mc68020 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+ the result in a second limb vector.
+
+Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s1_ptr (sp + 8)
+ s1_size (sp + 12)
+ s2_limb (sp + 16)
+*/
+
+#include "asm-syntax.h"
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_mul_1)
+
+C_SYMBOL_NAME(__gmpn_mul_1:)
+PROLOG(__gmpn_mul_1)
+
+#define res_ptr a0
+#define s1_ptr a1
+#define s1_size d2
+#define s2_limb d4
+
+/* Save used registers on the stack. */
+ moveml R(d2)-R(d4),MEM_PREDEC(sp)
+#if 0
+ movel R(d2),MEM_PREDEC(sp)
+ movel R(d3),MEM_PREDEC(sp)
+ movel R(d4),MEM_PREDEC(sp)
+#endif
+
+/* Copy the arguments to registers. Better use movem? */
+ movel MEM_DISP(sp,16),R(res_ptr)
+ movel MEM_DISP(sp,20),R(s1_ptr)
+ movel MEM_DISP(sp,24),R(s1_size)
+ movel MEM_DISP(sp,28),R(s2_limb)
+
+ eorw #1,R(s1_size)
+ clrl R(d1)
+ lsrl #1,R(s1_size)
+ bcc L(L1)
+ subql #1,R(s1_size)
+ subl R(d0),R(d0) /* (d0,cy) <= (0,0) */
+
+L(Loop:)
+ movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d1):R(d3)
+ addxl R(d0),R(d3)
+ movel R(d3),MEM_POSTINC(res_ptr)
+L(L1:) movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d0):R(d3)
+ addxl R(d1),R(d3)
+ movel R(d3),MEM_POSTINC(res_ptr)
+
+ dbf R(s1_size),L(Loop)
+ clrl R(d3)
+ addxl R(d3),R(d0)
+ subl #0x10000,R(s1_size)
+ bcc L(Loop)
+
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d4)
+#if 0
+ movel MEM_POSTINC(sp),R(d4)
+ movel MEM_POSTINC(sp),R(d3)
+ movel MEM_POSTINC(sp),R(d2)
+#endif
+ rts
+EPILOG(__gmpn_mul_1)
diff --git a/rts/gmp/mpn/m68k/mc68020/submul_1.S b/rts/gmp/mpn/m68k/mc68020/submul_1.S
new file mode 100644
index 0000000000..3c36b70166
--- /dev/null
+++ b/rts/gmp/mpn/m68k/mc68020/submul_1.S
@@ -0,0 +1,83 @@
+/* mc68020 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+ the result from a second limb vector.
+
+Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s1_ptr (sp + 8)
+ s1_size (sp + 12)
+ s2_limb (sp + 16)
+*/
+
+#include "asm-syntax.h"
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_submul_1)
+
+C_SYMBOL_NAME(__gmpn_submul_1:)
+PROLOG(__gmpn_submul_1)
+
+#define res_ptr a0
+#define s1_ptr a1
+#define s1_size d2
+#define s2_limb d4
+
+/* Save used registers on the stack. */
+ moveml R(d2)-R(d5),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. Better use movem? */
+ movel MEM_DISP(sp,20),R(res_ptr)
+ movel MEM_DISP(sp,24),R(s1_ptr)
+ movel MEM_DISP(sp,28),R(s1_size)
+ movel MEM_DISP(sp,32),R(s2_limb)
+
+ eorw #1,R(s1_size)
+ clrl R(d1)
+ clrl R(d5)
+ lsrl #1,R(s1_size)
+ bcc L(L1)
+ subql #1,R(s1_size)
+ subl R(d0),R(d0) /* (d0,cy) <= (0,0) */
+
+L(Loop:)
+ movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d1):R(d3)
+ addxl R(d0),R(d3)
+ addxl R(d5),R(d1)
+ subl R(d3),MEM_POSTINC(res_ptr)
+L(L1:) movel MEM_POSTINC(s1_ptr),R(d3)
+ mulul R(s2_limb),R(d0):R(d3)
+ addxl R(d1),R(d3)
+ addxl R(d5),R(d0)
+ subl R(d3),MEM_POSTINC(res_ptr)
+
+ dbf R(s1_size),L(Loop)
+ addxl R(d5),R(d0)
+ subl #0x10000,R(s1_size)
+ bcc L(Loop)
+
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d5)
+
+ rts
+EPILOG(__gmpn_submul_1)
diff --git a/rts/gmp/mpn/m68k/mc68020/udiv.S b/rts/gmp/mpn/m68k/mc68020/udiv.S
new file mode 100644
index 0000000000..d00cf13558
--- /dev/null
+++ b/rts/gmp/mpn/m68k/mc68020/udiv.S
@@ -0,0 +1,31 @@
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+.text
+ .even
+.globl ___udiv_qrnnd
+___udiv_qrnnd:
+ movel sp@(4),a0
+ movel sp@(8),d1
+ movel sp@(12),d0
+ divul sp@(16),d1:d0
+ movel d1,a0@
+ rts
diff --git a/rts/gmp/mpn/m68k/mc68020/umul.S b/rts/gmp/mpn/m68k/mc68020/umul.S
new file mode 100644
index 0000000000..a34ae6c543
--- /dev/null
+++ b/rts/gmp/mpn/m68k/mc68020/umul.S
@@ -0,0 +1,31 @@
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+.text
+ .even
+.globl ___umul_ppmm
+___umul_ppmm:
+ movel sp@(4),a0
+ movel sp@(8),d1
+ movel sp@(12),d0
+ mulul d0,d0:d1
+ movel d1,a0@
+ rts
diff --git a/rts/gmp/mpn/m68k/rshift.S b/rts/gmp/mpn/m68k/rshift.S
new file mode 100644
index 0000000000..b47a48e52a
--- /dev/null
+++ b/rts/gmp/mpn/m68k/rshift.S
@@ -0,0 +1,149 @@
+/* mc68020 __gmpn_rshift -- Shift right a low-level natural-number integer.
+
+Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s_ptr (sp + 8)
+ s_size (sp + 16)
+ cnt (sp + 12)
+*/
+
+#include "asm-syntax.h"
+
+#define res_ptr a1
+#define s_ptr a0
+#define s_size d6
+#define cnt d4
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_rshift)
+
+C_SYMBOL_NAME(__gmpn_rshift:)
+PROLOG(__gmpn_rshift)
+/* Save used registers on the stack. */
+ moveml R(d2)-R(d6)/R(a2),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. */
+ movel MEM_DISP(sp,28),R(res_ptr)
+ movel MEM_DISP(sp,32),R(s_ptr)
+ movel MEM_DISP(sp,36),R(s_size)
+ movel MEM_DISP(sp,40),R(cnt)
+
+ moveql #1,R(d5)
+ cmpl R(d5),R(cnt)
+ bne L(Lnormal)
+ cmpl R(res_ptr),R(s_ptr)
+ bls L(Lspecial) /* jump if res_ptr >= s_ptr */
+#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
+ lea MEM_INDX1(res_ptr,s_size,l,4),R(a2)
+#else /* not mc68020 */
+ movel R(s_size),R(d0)
+ asll #2,R(d0)
+ lea MEM_INDX(res_ptr,d0,l),R(a2)
+#endif
+ cmpl R(s_ptr),R(a2)
+ bls L(Lspecial) /* jump if s_ptr >= res_ptr + s_size */
+
+L(Lnormal:)
+ moveql #32,R(d5)
+ subl R(cnt),R(d5)
+ movel MEM_POSTINC(s_ptr),R(d2)
+ movel R(d2),R(d0)
+ lsll R(d5),R(d0) /* compute carry limb */
+
+ lsrl R(cnt),R(d2)
+ movel R(d2),R(d1)
+ subql #1,R(s_size)
+ beq L(Lend)
+ lsrl #1,R(s_size)
+ bcs L(L1)
+ subql #1,R(s_size)
+
+L(Loop:)
+ movel MEM_POSTINC(s_ptr),R(d2)
+ movel R(d2),R(d3)
+ lsll R(d5),R(d3)
+ orl R(d3),R(d1)
+ movel R(d1),MEM_POSTINC(res_ptr)
+ lsrl R(cnt),R(d2)
+L(L1:)
+ movel MEM_POSTINC(s_ptr),R(d1)
+ movel R(d1),R(d3)
+ lsll R(d5),R(d3)
+ orl R(d3),R(d2)
+ movel R(d2),MEM_POSTINC(res_ptr)
+ lsrl R(cnt),R(d1)
+
+ dbf R(s_size),L(Loop)
+ subl #0x10000,R(s_size)
+ bcc L(Loop)
+
+L(Lend:)
+ movel R(d1),MEM(res_ptr) /* store most significant limb */
+
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
+ rts
+
+/* We loop from most significant end of the arrays, which is only
+ permissable if the source and destination don't overlap, since the
+ function is documented to work for overlapping source and destination. */
+
+L(Lspecial:)
+#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020))
+ lea MEM_INDX1(s_ptr,s_size,l,4),R(s_ptr)
+ lea MEM_INDX1(res_ptr,s_size,l,4),R(res_ptr)
+#else /* not mc68000 */
+ movel R(s_size),R(d0)
+ asll #2,R(d0)
+ addl R(s_size),R(s_ptr)
+ addl R(s_size),R(res_ptr)
+#endif
+
+ clrl R(d0) /* initialize carry */
+ eorw #1,R(s_size)
+ lsrl #1,R(s_size)
+ bcc L(LL1)
+ subql #1,R(s_size)
+
+L(LLoop:)
+ movel MEM_PREDEC(s_ptr),R(d2)
+ roxrl #1,R(d2)
+ movel R(d2),MEM_PREDEC(res_ptr)
+L(LL1:)
+ movel MEM_PREDEC(s_ptr),R(d2)
+ roxrl #1,R(d2)
+ movel R(d2),MEM_PREDEC(res_ptr)
+
+ dbf R(s_size),L(LLoop)
+ roxrl #1,R(d0) /* save cy in msb */
+ subl #0x10000,R(s_size)
+ bcs L(LLend)
+ addl R(d0),R(d0) /* restore cy */
+ bra L(LLoop)
+
+L(LLend:)
+/* Restore used registers from stack frame. */
+ moveml MEM_POSTINC(sp),R(d2)-R(d6)/R(a2)
+ rts
+EPILOG(__gmpn_rshift)
diff --git a/rts/gmp/mpn/m68k/sub_n.S b/rts/gmp/mpn/m68k/sub_n.S
new file mode 100644
index 0000000000..ce45b24db5
--- /dev/null
+++ b/rts/gmp/mpn/m68k/sub_n.S
@@ -0,0 +1,79 @@
+/* mc68020 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ store difference in a third limb vector.
+
+Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ INPUT PARAMETERS
+ res_ptr (sp + 4)
+ s1_ptr (sp + 8)
+ s2_ptr (sp + 16)
+ size (sp + 12)
+*/
+
+#include "asm-syntax.h"
+
+ TEXT
+ ALIGN
+ GLOBL C_SYMBOL_NAME(__gmpn_sub_n)
+
+C_SYMBOL_NAME(__gmpn_sub_n:)
+PROLOG(__gmpn_sub_n)
+/* Save used registers on the stack. */
+ movel R(d2),MEM_PREDEC(sp)
+ movel R(a2),MEM_PREDEC(sp)
+
+/* Copy the arguments to registers. Better use movem? */
+ movel MEM_DISP(sp,12),R(a2)
+ movel MEM_DISP(sp,16),R(a0)
+ movel MEM_DISP(sp,20),R(a1)
+ movel MEM_DISP(sp,24),R(d2)
+
+ eorw #1,R(d2)
+ lsrl #1,R(d2)
+ bcc L(L1)
+ subql #1,R(d2) /* clears cy as side effect */
+
+L(Loop:)
+ movel MEM_POSTINC(a0),R(d0)
+ movel MEM_POSTINC(a1),R(d1)
+ subxl R(d1),R(d0)
+ movel R(d0),MEM_POSTINC(a2)
+L(L1:) movel MEM_POSTINC(a0),R(d0)
+ movel MEM_POSTINC(a1),R(d1)
+ subxl R(d1),R(d0)
+ movel R(d0),MEM_POSTINC(a2)
+
+ dbf R(d2),L(Loop) /* loop until 16 lsb of %4 == -1 */
+ subxl R(d0),R(d0) /* d0 <= -cy; save cy as 0 or -1 in d0 */
+ subl #0x10000,R(d2)
+ bcs L(L2)
+ addl R(d0),R(d0) /* restore cy */
+ bra L(Loop)
+
+L(L2:)
+ negl R(d0)
+
+/* Restore used registers from stack frame. */
+ movel MEM_POSTINC(sp),R(a2)
+ movel MEM_POSTINC(sp),R(d2)
+
+ rts
+EPILOG(__gmpn_sub_n)
diff --git a/rts/gmp/mpn/m68k/syntax.h b/rts/gmp/mpn/m68k/syntax.h
new file mode 100644
index 0000000000..9eec279c06
--- /dev/null
+++ b/rts/gmp/mpn/m68k/syntax.h
@@ -0,0 +1,177 @@
+/* asm.h -- Definitions for 68k syntax variations.
+
+Copyright (C) 1992, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#undef ALIGN
+
+#ifdef MIT_SYNTAX
+#define PROLOG(name)
+#define EPILOG(name)
+#define R(r)r
+#define MEM(base)base@
+#define MEM_DISP(base,displacement)base@(displacement)
+#define MEM_INDX(base,idx,size_suffix)base@(idx:size_suffix)
+#define MEM_INDX1(base,idx,size_suffix,scale)base@(idx:size_suffix:scale)
+#define MEM_PREDEC(memory_base)memory_base@-
+#define MEM_POSTINC(memory_base)memory_base@+
+#define L(label) label
+#define TEXT .text
+#define ALIGN .even
+#define GLOBL .globl
+#define moveql moveq
+/* Use variable sized opcodes. */
+#define bcc jcc
+#define bcs jcs
+#define bls jls
+#define beq jeq
+#define bne jne
+#define bra jra
+#endif
+
+#ifdef SONY_SYNTAX
+#define PROLOG(name)
+#define EPILOG(name)
+#define R(r)r
+#define MEM(base)(base)
+#define MEM_DISP(base,displacement)(displacement,base)
+#define MEM_INDX(base,idx,size_suffix)(base,idx.size_suffix)
+#define MEM_INDX1(base,idx,size_suffix,scale)(base,idx.size_suffix*scale)
+#define MEM_PREDEC(memory_base)-(memory_base)
+#define MEM_POSTINC(memory_base)(memory_base)+
+#define L(label) label
+#define TEXT .text
+#define ALIGN .even
+#define GLOBL .globl
+#endif
+
+#ifdef MOTOROLA_SYNTAX
+#define PROLOG(name)
+#define EPILOG(name)
+#define R(r)r
+#define MEM(base)(base)
+#define MEM_DISP(base,displacement)(displacement,base)
+#define MEM_INDX(base,idx,size_suffix)(base,idx.size_suffix)
+#define MEM_INDX1(base,idx,size_suffix,scale)(base,idx.size_suffix*scale)
+#define MEM_PREDEC(memory_base)-(memory_base)
+#define MEM_POSTINC(memory_base)(memory_base)+
+#define L(label) label
+#define TEXT
+#define ALIGN
+#define GLOBL XDEF
+#define lea LEA
+#define movel MOVE.L
+#define moveml MOVEM.L
+#define moveql MOVEQ.L
+#define cmpl CMP.L
+#define orl OR.L
+#define clrl CLR.L
+#define eorw EOR.W
+#define lsrl LSR.L
+#define lsll LSL.L
+#define roxrl ROXR.L
+#define roxll ROXL.L
+#define addl ADD.L
+#define addxl ADDX.L
+#define addql ADDQ.L
+#define subl SUB.L
+#define subxl SUBX.L
+#define subql SUBQ.L
+#define negl NEG.L
+#define mulul MULU.L
+#define bcc BCC
+#define bcs BCS
+#define bls BLS
+#define beq BEQ
+#define bne BNE
+#define bra BRA
+#define dbf DBF
+#define rts RTS
+#define d0 D0
+#define d1 D1
+#define d2 D2
+#define d3 D3
+#define d4 D4
+#define d5 D5
+#define d6 D6
+#define d7 D7
+#define a0 A0
+#define a1 A1
+#define a2 A2
+#define a3 A3
+#define a4 A4
+#define a5 A5
+#define a6 A6
+#define a7 A7
+#define sp SP
+#endif
+
+#ifdef ELF_SYNTAX
+#define PROLOG(name) .type name,@function
+#define EPILOG(name) .size name,.-name
+#define MEM(base)(R(base))
+#define MEM_DISP(base,displacement)(displacement,R(base))
+#define MEM_PREDEC(memory_base)-(R(memory_base))
+#define MEM_POSTINC(memory_base)(R(memory_base))+
+#ifdef __STDC__
+#define R_(r)%##r
+#define R(r)R_(r)
+#define MEM_INDX_(base,idx,size_suffix)(R(base),R(idx##.##size_suffix))
+#define MEM_INDX(base,idx,size_suffix)MEM_INDX_(base,idx,size_suffix)
+#define MEM_INDX1_(base,idx,size_suffix,scale)(R(base),R(idx##.##size_suffix*scale))
+#define MEM_INDX1(base,idx,size_suffix,scale)MEM_INDX1_(base,idx,size_suffix,scale)
+#define L(label) .##label
+#else
+#define R(r)%/**/r
+#define MEM_INDX(base,idx,size_suffix)(R(base),R(idx).size_suffix)
+#define MEM_INDX1(base,idx,size_suffix,scale)(R(base),R(idx).size_suffix*scale)
+#define L(label) ./**/label
+#endif
+#define TEXT .text
+#define ALIGN .align 2
+#define GLOBL .globl
+#define bcc jbcc
+#define bcs jbcs
+#define bls jbls
+#define beq jbeq
+#define bne jbne
+#define bra jbra
+#endif
+
+#if defined (SONY_SYNTAX) || defined (ELF_SYNTAX)
+#define movel move.l
+#define moveml movem.l
+#define moveql moveq.l
+#define cmpl cmp.l
+#define orl or.l
+#define clrl clr.l
+#define eorw eor.w
+#define lsrl lsr.l
+#define lsll lsl.l
+#define roxrl roxr.l
+#define roxll roxl.l
+#define addl add.l
+#define addxl addx.l
+#define addql addq.l
+#define subl sub.l
+#define subxl subx.l
+#define subql subq.l
+#define negl neg.l
+#define mulul mulu.l
+#endif
diff --git a/rts/gmp/mpn/m88k/add_n.s b/rts/gmp/mpn/m88k/add_n.s
new file mode 100644
index 0000000000..0b776c618a
--- /dev/null
+++ b/rts/gmp/mpn/m88k/add_n.s
@@ -0,0 +1,104 @@
+; mc88100 __gmpn_add -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r2
+; s1_ptr r3
+; s2_ptr r4
+; size r5
+
+; This code has been optimized to run one instruction per clock, avoiding
+; load stalls and writeback contention. As a result, the instruction
+; order is not always natural.
+
+; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
+; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
+
+ text
+ align 16
+ global ___gmpn_add_n
+___gmpn_add_n:
+ ld r6,r3,0 ; read first limb from s1_ptr
+ extu r10,r5,3
+ ld r7,r4,0 ; read first limb from s2_ptr
+
+ subu.co r5,r0,r5 ; (clear carry as side effect)
+ mak r5,r5,3<4>
+ bcnd eq0,r5,Lzero
+
+ or r12,r0,lo16(Lbase)
+ or.u r12,r12,hi16(Lbase)
+ addu r12,r12,r5 ; r12 is address for entering in loop
+
+ extu r5,r5,2 ; divide by 4
+ subu r2,r2,r5 ; adjust res_ptr
+ subu r3,r3,r5 ; adjust s1_ptr
+ subu r4,r4,r5 ; adjust s2_ptr
+
+ or r8,r6,r0
+
+ jmp.n r12
+ or r9,r7,r0
+
+Loop: addu r3,r3,32
+ st r8,r2,28
+ addu r4,r4,32
+ ld r6,r3,0
+ addu r2,r2,32
+ ld r7,r4,0
+Lzero: subu r10,r10,1 ; add 0 + 8r limbs (adj loop cnt)
+Lbase: ld r8,r3,4
+ addu.cio r6,r6,r7
+ ld r9,r4,4
+ st r6,r2,0
+ ld r6,r3,8 ; add 7 + 8r limbs
+ addu.cio r8,r8,r9
+ ld r7,r4,8
+ st r8,r2,4
+ ld r8,r3,12 ; add 6 + 8r limbs
+ addu.cio r6,r6,r7
+ ld r9,r4,12
+ st r6,r2,8
+ ld r6,r3,16 ; add 5 + 8r limbs
+ addu.cio r8,r8,r9
+ ld r7,r4,16
+ st r8,r2,12
+ ld r8,r3,20 ; add 4 + 8r limbs
+ addu.cio r6,r6,r7
+ ld r9,r4,20
+ st r6,r2,16
+ ld r6,r3,24 ; add 3 + 8r limbs
+ addu.cio r8,r8,r9
+ ld r7,r4,24
+ st r8,r2,20
+ ld r8,r3,28 ; add 2 + 8r limbs
+ addu.cio r6,r6,r7
+ ld r9,r4,28
+ st r6,r2,24
+ bcnd.n ne0,r10,Loop ; add 1 + 8r limbs
+ addu.cio r8,r8,r9
+
+ st r8,r2,28 ; store most significant limb
+
+ jmp.n r1
+ addu.ci r2,r0,r0 ; return carry-out from most sign. limb
diff --git a/rts/gmp/mpn/m88k/mc88110/add_n.S b/rts/gmp/mpn/m88k/mc88110/add_n.S
new file mode 100644
index 0000000000..843a50dded
--- /dev/null
+++ b/rts/gmp/mpn/m88k/mc88110/add_n.S
@@ -0,0 +1,200 @@
+; mc88110 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+#define res_ptr r2
+#define s1_ptr r3
+#define s2_ptr r4
+#define size r5
+
+#include "sysdep.h"
+
+ text
+ align 16
+ global C_SYMBOL_NAME(__gmpn_add_n)
+C_SYMBOL_NAME(__gmpn_add_n):
+ addu.co r0,r0,r0 ; clear cy flag
+ xor r12,s2_ptr,res_ptr
+ bb1 2,r12,L1
+; ** V1a **
+L0: bb0 2,res_ptr,L_v1 ; branch if res_ptr is aligned?
+/* Add least significant limb separately to align res_ptr and s2_ptr */
+ ld r10,s1_ptr,0
+ addu s1_ptr,s1_ptr,4
+ ld r8,s2_ptr,0
+ addu s2_ptr,s2_ptr,4
+ subu size,size,1
+ addu.co r6,r10,r8
+ st r6,res_ptr,0
+ addu res_ptr,res_ptr,4
+L_v1: cmp r12,size,2
+ bb1 lt,r12,Lend2
+
+ ld r10,s1_ptr,0
+ ld r12,s1_ptr,4
+ ld.d r8,s2_ptr,0
+ subu size,size,10
+ bcnd lt0,size,Lfin1
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+ align 8
+Loop1: subu size,size,8
+ addu.cio r6,r10,r8
+ ld r10,s1_ptr,8
+ addu.cio r7,r12,r9
+ ld r12,s1_ptr,12
+ ld.d r8,s2_ptr,8
+ st.d r6,res_ptr,0
+ addu.cio r6,r10,r8
+ ld r10,s1_ptr,16
+ addu.cio r7,r12,r9
+ ld r12,s1_ptr,20
+ ld.d r8,s2_ptr,16
+ st.d r6,res_ptr,8
+ addu.cio r6,r10,r8
+ ld r10,s1_ptr,24
+ addu.cio r7,r12,r9
+ ld r12,s1_ptr,28
+ ld.d r8,s2_ptr,24
+ st.d r6,res_ptr,16
+ addu.cio r6,r10,r8
+ ld r10,s1_ptr,32
+ addu.cio r7,r12,r9
+ ld r12,s1_ptr,36
+ addu s1_ptr,s1_ptr,32
+ ld.d r8,s2_ptr,32
+ addu s2_ptr,s2_ptr,32
+ st.d r6,res_ptr,24
+ addu res_ptr,res_ptr,32
+ bcnd ge0,size,Loop1
+
+Lfin1: addu size,size,8-2
+ bcnd lt0,size,Lend1
+/* Add blocks of 2 limbs until less than 2 limbs remain */
+Loope1: addu.cio r6,r10,r8
+ ld r10,s1_ptr,8
+ addu.cio r7,r12,r9
+ ld r12,s1_ptr,12
+ ld.d r8,s2_ptr,8
+ st.d r6,res_ptr,0
+ subu size,size,2
+ addu s1_ptr,s1_ptr,8
+ addu s2_ptr,s2_ptr,8
+ addu res_ptr,res_ptr,8
+ bcnd ge0,size,Loope1
+Lend1: addu.cio r6,r10,r8
+ addu.cio r7,r12,r9
+ st.d r6,res_ptr,0
+
+ bb0 0,size,Lret1
+/* Add last limb */
+ ld r10,s1_ptr,8
+ ld r8,s2_ptr,8
+ addu.cio r6,r10,r8
+ st r6,res_ptr,8
+
+Lret1: jmp.n r1
+ addu.ci r2,r0,r0 ; return carry-out from most sign. limb
+
+L1: xor r12,s1_ptr,res_ptr
+ bb1 2,r12,L2
+; ** V1b **
+ or r12,r0,s2_ptr
+ or s2_ptr,r0,s1_ptr
+ or s1_ptr,r0,r12
+ br L0
+
+; ** V2 **
+/* If we come here, the alignment of s1_ptr and res_ptr as well as the
+ alignment of s2_ptr and res_ptr differ. Since there are only two ways
+ things can be aligned (that we care about) we now know that the alignment
+ of s1_ptr and s2_ptr are the same. */
+
+L2: cmp r12,size,1
+ bb1 eq,r12,Ljone
+ bb0 2,s1_ptr,L_v2 ; branch if s1_ptr is aligned
+/* Add least significant limb separately to align res_ptr and s2_ptr */
+ ld r10,s1_ptr,0
+ addu s1_ptr,s1_ptr,4
+ ld r8,s2_ptr,0
+ addu s2_ptr,s2_ptr,4
+ subu size,size,1
+ addu.co r6,r10,r8
+ st r6,res_ptr,0
+ addu res_ptr,res_ptr,4
+
+L_v2: subu size,size,8
+ bcnd lt0,size,Lfin2
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+ align 8
+Loop2: subu size,size,8
+ ld.d r8,s1_ptr,0
+ ld.d r6,s2_ptr,0
+ addu.cio r8,r8,r6
+ st r8,res_ptr,0
+ addu.cio r9,r9,r7
+ st r9,res_ptr,4
+ ld.d r8,s1_ptr,8
+ ld.d r6,s2_ptr,8
+ addu.cio r8,r8,r6
+ st r8,res_ptr,8
+ addu.cio r9,r9,r7
+ st r9,res_ptr,12
+ ld.d r8,s1_ptr,16
+ ld.d r6,s2_ptr,16
+ addu.cio r8,r8,r6
+ st r8,res_ptr,16
+ addu.cio r9,r9,r7
+ st r9,res_ptr,20
+ ld.d r8,s1_ptr,24
+ ld.d r6,s2_ptr,24
+ addu.cio r8,r8,r6
+ st r8,res_ptr,24
+ addu.cio r9,r9,r7
+ st r9,res_ptr,28
+ addu s1_ptr,s1_ptr,32
+ addu s2_ptr,s2_ptr,32
+ addu res_ptr,res_ptr,32
+ bcnd ge0,size,Loop2
+
+Lfin2: addu size,size,8-2
+ bcnd lt0,size,Lend2
+Loope2: ld.d r8,s1_ptr,0
+ ld.d r6,s2_ptr,0
+ addu.cio r8,r8,r6
+ st r8,res_ptr,0
+ addu.cio r9,r9,r7
+ st r9,res_ptr,4
+ subu size,size,2
+ addu s1_ptr,s1_ptr,8
+ addu s2_ptr,s2_ptr,8
+ addu res_ptr,res_ptr,8
+ bcnd ge0,size,Loope2
+Lend2: bb0 0,size,Lret2
+/* Add last limb */
+Ljone: ld r10,s1_ptr,0
+ ld r8,s2_ptr,0
+ addu.cio r6,r10,r8
+ st r6,res_ptr,0
+
+Lret2: jmp.n r1
+ addu.ci r2,r0,r0 ; return carry-out from most sign. limb
diff --git a/rts/gmp/mpn/m88k/mc88110/addmul_1.s b/rts/gmp/mpn/m88k/mc88110/addmul_1.s
new file mode 100644
index 0000000000..7d97c87c79
--- /dev/null
+++ b/rts/gmp/mpn/m88k/mc88110/addmul_1.s
@@ -0,0 +1,61 @@
+; mc88110 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r2
+; s1_ptr r3
+; size r4
+; s2_limb r5
+
+ text
+ align 16
+ global ___gmpn_addmul_1
+___gmpn_addmul_1:
+ lda r3,r3[r4]
+ lda r8,r2[r4] ; RES_PTR in r8 since r2 is retval
+ subu r4,r0,r4
+ addu.co r2,r0,r0 ; r2 = cy = 0
+
+ ld r6,r3[r4]
+ addu r4,r4,1
+ subu r8,r8,4
+ bcnd.n eq0,r4,Lend
+ mulu.d r10,r6,r5
+
+Loop: ld r7,r8[r4]
+ ld r6,r3[r4]
+ addu.cio r9,r11,r2
+ addu.ci r2,r10,r0
+ addu.co r9,r9,r7
+ st r9,r8[r4]
+ addu r4,r4,1
+ mulu.d r10,r6,r5
+ bcnd ne0,r4,Loop
+
+Lend: ld r7,r8,0
+ addu.cio r9,r11,r2
+ addu.ci r2,r10,r0
+ addu.co r9,r9,r7
+ st r9,r8,0
+ jmp.n r1
+ addu.ci r2,r2,r0
diff --git a/rts/gmp/mpn/m88k/mc88110/mul_1.s b/rts/gmp/mpn/m88k/mc88110/mul_1.s
new file mode 100644
index 0000000000..b8483afa91
--- /dev/null
+++ b/rts/gmp/mpn/m88k/mc88110/mul_1.s
@@ -0,0 +1,59 @@
+; mc88110 __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r2
+; s1_ptr r3
+; size r4
+; s2_limb r5
+
+ text
+ align 16
+ global ___gmpn_mul_1
+___gmpn_mul_1:
+ ; Make S1_PTR and RES_PTR point at the end of their blocks
+ ; and negate SIZE.
+ lda r3,r3[r4]
+ lda r8,r2[r4] ; RES_PTR in r8 since r2 is retval
+ subu r4,r0,r4
+
+ addu.co r2,r0,r0 ; r2 = cy = 0
+
+ ld r6,r3[r4]
+ addu r4,r4,1
+ mulu.d r10,r6,r5
+ bcnd.n eq0,r4,Lend
+ subu r8,r8,8
+
+Loop: ld r6,r3[r4]
+ addu.cio r9,r11,r2
+ or r2,r10,r0 ; could be avoided if unrolled
+ addu r4,r4,1
+ mulu.d r10,r6,r5
+ bcnd.n ne0,r4,Loop
+ st r9,r8[r4]
+
+Lend: addu.cio r9,r11,r2
+ st r9,r8,4
+ jmp.n r1
+ addu.ci r2,r10,r0
diff --git a/rts/gmp/mpn/m88k/mc88110/sub_n.S b/rts/gmp/mpn/m88k/mc88110/sub_n.S
new file mode 100644
index 0000000000..715a3faf25
--- /dev/null
+++ b/rts/gmp/mpn/m88k/mc88110/sub_n.S
@@ -0,0 +1,276 @@
+; mc88110 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+#define res_ptr r2
+#define s1_ptr r3
+#define s2_ptr r4
+#define size r5
+
+#include "sysdep.h"
+
+ text
+ align 16
+ global C_SYMBOL_NAME(__gmpn_sub_n)
+C_SYMBOL_NAME(__gmpn_sub_n):
+ subu.co r0,r0,r0 ; set cy flag
+ xor r12,s2_ptr,res_ptr
+ bb1 2,r12,L1
+; ** V1a **
+L0: bb0 2,res_ptr,L_v1 ; branch if res_ptr is aligned
+/* Add least significant limb separately to align res_ptr and s2_ptr */
+ ld r10,s1_ptr,0
+ addu s1_ptr,s1_ptr,4
+ ld r8,s2_ptr,0
+ addu s2_ptr,s2_ptr,4
+ subu size,size,1
+ subu.co r6,r10,r8
+ st r6,res_ptr,0
+ addu res_ptr,res_ptr,4
+L_v1: cmp r12,size,2
+ bb1 lt,r12,Lend2
+
+ ld r10,s1_ptr,0
+ ld r12,s1_ptr,4
+ ld.d r8,s2_ptr,0
+ subu size,size,10
+ bcnd lt0,size,Lfin1
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+ align 8
+Loop1: subu size,size,8
+ subu.cio r6,r10,r8
+ ld r10,s1_ptr,8
+ subu.cio r7,r12,r9
+ ld r12,s1_ptr,12
+ ld.d r8,s2_ptr,8
+ st.d r6,res_ptr,0
+ subu.cio r6,r10,r8
+ ld r10,s1_ptr,16
+ subu.cio r7,r12,r9
+ ld r12,s1_ptr,20
+ ld.d r8,s2_ptr,16
+ st.d r6,res_ptr,8
+ subu.cio r6,r10,r8
+ ld r10,s1_ptr,24
+ subu.cio r7,r12,r9
+ ld r12,s1_ptr,28
+ ld.d r8,s2_ptr,24
+ st.d r6,res_ptr,16
+ subu.cio r6,r10,r8
+ ld r10,s1_ptr,32
+ subu.cio r7,r12,r9
+ ld r12,s1_ptr,36
+ addu s1_ptr,s1_ptr,32
+ ld.d r8,s2_ptr,32
+ addu s2_ptr,s2_ptr,32
+ st.d r6,res_ptr,24
+ addu res_ptr,res_ptr,32
+ bcnd ge0,size,Loop1
+
+Lfin1: addu size,size,8-2
+ bcnd lt0,size,Lend1
+/* Add blocks of 2 limbs until less than 2 limbs remain */
+Loope1: subu.cio r6,r10,r8
+ ld r10,s1_ptr,8
+ subu.cio r7,r12,r9
+ ld r12,s1_ptr,12
+ ld.d r8,s2_ptr,8
+ st.d r6,res_ptr,0
+ subu size,size,2
+ addu s1_ptr,s1_ptr,8
+ addu s2_ptr,s2_ptr,8
+ addu res_ptr,res_ptr,8
+ bcnd ge0,size,Loope1
+Lend1: subu.cio r6,r10,r8
+ subu.cio r7,r12,r9
+ st.d r6,res_ptr,0
+
+ bb0 0,size,Lret1
+/* Add last limb */
+ ld r10,s1_ptr,8
+ ld r8,s2_ptr,8
+ subu.cio r6,r10,r8
+ st r6,res_ptr,8
+
+Lret1: addu.ci r2,r0,r0 ; return carry-out from most sign. limb
+ jmp.n r1
+ xor r2,r2,1
+
+L1: xor r12,s1_ptr,res_ptr
+ bb1 2,r12,L2
+; ** V1b **
+ bb0 2,res_ptr,L_v1b ; branch if res_ptr is aligned
+/* Add least significant limb separately to align res_ptr and s1_ptr */
+ ld r10,s2_ptr,0
+ addu s2_ptr,s2_ptr,4
+ ld r8,s1_ptr,0
+ addu s1_ptr,s1_ptr,4
+ subu size,size,1
+ subu.co r6,r8,r10
+ st r6,res_ptr,0
+ addu res_ptr,res_ptr,4
+L_v1b: cmp r12,size,2
+ bb1 lt,r12,Lend2
+
+ ld r10,s2_ptr,0
+ ld r12,s2_ptr,4
+ ld.d r8,s1_ptr,0
+ subu size,size,10
+ bcnd lt0,size,Lfin1b
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+ align 8
+Loop1b: subu size,size,8
+ subu.cio r6,r8,r10
+ ld r10,s2_ptr,8
+ subu.cio r7,r9,r12
+ ld r12,s2_ptr,12
+ ld.d r8,s1_ptr,8
+ st.d r6,res_ptr,0
+ subu.cio r6,r8,r10
+ ld r10,s2_ptr,16
+ subu.cio r7,r9,r12
+ ld r12,s2_ptr,20
+ ld.d r8,s1_ptr,16
+ st.d r6,res_ptr,8
+ subu.cio r6,r8,r10
+ ld r10,s2_ptr,24
+ subu.cio r7,r9,r12
+ ld r12,s2_ptr,28
+ ld.d r8,s1_ptr,24
+ st.d r6,res_ptr,16
+ subu.cio r6,r8,r10
+ ld r10,s2_ptr,32
+ subu.cio r7,r9,r12
+ ld r12,s2_ptr,36
+ addu s2_ptr,s2_ptr,32
+ ld.d r8,s1_ptr,32
+ addu s1_ptr,s1_ptr,32
+ st.d r6,res_ptr,24
+ addu res_ptr,res_ptr,32
+ bcnd ge0,size,Loop1b
+
+Lfin1b: addu size,size,8-2
+ bcnd lt0,size,Lend1b
+/* Add blocks of 2 limbs until less than 2 limbs remain */
+Loope1b:subu.cio r6,r8,r10
+ ld r10,s2_ptr,8
+ subu.cio r7,r9,r12
+ ld r12,s2_ptr,12
+ ld.d r8,s1_ptr,8
+ st.d r6,res_ptr,0
+ subu size,size,2
+ addu s1_ptr,s1_ptr,8
+ addu s2_ptr,s2_ptr,8
+ addu res_ptr,res_ptr,8
+ bcnd ge0,size,Loope1b
+Lend1b: subu.cio r6,r8,r10
+ subu.cio r7,r9,r12
+ st.d r6,res_ptr,0
+
+ bb0 0,size,Lret1b
+/* Add last limb */
+ ld r10,s2_ptr,8
+ ld r8,s1_ptr,8
+ subu.cio r6,r8,r10
+ st r6,res_ptr,8
+
+Lret1b: addu.ci r2,r0,r0 ; return carry-out from most sign. limb
+ jmp.n r1
+ xor r2,r2,1
+
+; ** V2 **
+/* If we come here, the alignment of s1_ptr and res_ptr as well as the
+ alignment of s2_ptr and res_ptr differ. Since there are only two ways
+ things can be aligned (that we care about) we now know that the alignment
+ of s1_ptr and s2_ptr are the same. */
+
+L2: cmp r12,size,1
+ bb1 eq,r12,Ljone
+ bb0 2,s1_ptr,L_v2 ; branch if s1_ptr is aligned
+/* Add least significant limb separately to align res_ptr and s2_ptr */
+ ld r10,s1_ptr,0
+ addu s1_ptr,s1_ptr,4
+ ld r8,s2_ptr,0
+ addu s2_ptr,s2_ptr,4
+ subu size,size,1
+ subu.co r6,r10,r8
+ st r6,res_ptr,0
+ addu res_ptr,res_ptr,4
+
+L_v2: subu size,size,8
+ bcnd lt0,size,Lfin2
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+ align 8
+Loop2: subu size,size,8
+ ld.d r8,s1_ptr,0
+ ld.d r6,s2_ptr,0
+ subu.cio r8,r8,r6
+ st r8,res_ptr,0
+ subu.cio r9,r9,r7
+ st r9,res_ptr,4
+ ld.d r8,s1_ptr,8
+ ld.d r6,s2_ptr,8
+ subu.cio r8,r8,r6
+ st r8,res_ptr,8
+ subu.cio r9,r9,r7
+ st r9,res_ptr,12
+ ld.d r8,s1_ptr,16
+ ld.d r6,s2_ptr,16
+ subu.cio r8,r8,r6
+ st r8,res_ptr,16
+ subu.cio r9,r9,r7
+ st r9,res_ptr,20
+ ld.d r8,s1_ptr,24
+ ld.d r6,s2_ptr,24
+ subu.cio r8,r8,r6
+ st r8,res_ptr,24
+ subu.cio r9,r9,r7
+ st r9,res_ptr,28
+ addu s1_ptr,s1_ptr,32
+ addu s2_ptr,s2_ptr,32
+ addu res_ptr,res_ptr,32
+ bcnd ge0,size,Loop2
+
+Lfin2: addu size,size,8-2
+ bcnd lt0,size,Lend2
+Loope2: ld.d r8,s1_ptr,0
+ ld.d r6,s2_ptr,0
+ subu.cio r8,r8,r6
+ st r8,res_ptr,0
+ subu.cio r9,r9,r7
+ st r9,res_ptr,4
+ subu size,size,2
+ addu s1_ptr,s1_ptr,8
+ addu s2_ptr,s2_ptr,8
+ addu res_ptr,res_ptr,8
+ bcnd ge0,size,Loope2
+Lend2: bb0 0,size,Lret2
+/* Add last limb */
+Ljone: ld r10,s1_ptr,0
+ ld r8,s2_ptr,0
+ subu.cio r6,r10,r8
+ st r6,res_ptr,0
+
+Lret2: addu.ci r2,r0,r0 ; return carry-out from most sign. limb
+ jmp.n r1
+ xor r2,r2,1
diff --git a/rts/gmp/mpn/m88k/mul_1.s b/rts/gmp/mpn/m88k/mul_1.s
new file mode 100644
index 0000000000..06370837ef
--- /dev/null
+++ b/rts/gmp/mpn/m88k/mul_1.s
@@ -0,0 +1,127 @@
+; mc88100 __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r2
+; s1_ptr r3
+; size r4
+; s2_limb r5
+
+; Common overhead is about 11 cycles/invocation.
+
+; The speed for S2_LIMB >= 0x10000 is approximately 21 cycles/limb. (The
+; pipeline stalls 2 cycles due to WB contention.)
+
+; The speed for S2_LIMB < 0x10000 is approximately 16 cycles/limb. (The
+; pipeline stalls 2 cycles due to WB contention and 1 cycle due to latency.)
+
+; To enhance speed:
+; 1. Unroll main loop 4-8 times.
+; 2. Schedule code to avoid WB contention. It might be tempting to move the
+; ld instruction in the loops down to save 2 cycles (less WB contention),
+; but that looses because the ultimate value will be read from outside
+; the allocated space. But if we handle the ultimate multiplication in
+; the tail, we can do this.
+; 3. Make the multiplication with less instructions. I think the code for
+; (S2_LIMB >= 0x10000) is not minimal.
+; With these techniques the (S2_LIMB >= 0x10000) case would run in 17 or
+; less cycles/limb; the (S2_LIMB < 0x10000) case would run in 11
+; cycles/limb. (Assuming infinite unrolling.)
+
+ text
+ align 16
+ global ___gmpn_mul_1
+___gmpn_mul_1:
+
+ ; Make S1_PTR and RES_PTR point at the end of their blocks
+ ; and negate SIZE.
+ lda r3,r3[r4]
+ lda r6,r2[r4] ; RES_PTR in r6 since r2 is retval
+ subu r4,r0,r4
+
+ addu.co r2,r0,r0 ; r2 = cy = 0
+ ld r9,r3[r4]
+ mask r7,r5,0xffff ; r7 = lo(S2_LIMB)
+ extu r8,r5,16 ; r8 = hi(S2_LIMB)
+ bcnd.n eq0,r8,Lsmall ; jump if (hi(S2_LIMB) == 0)
+ subu r6,r6,4
+
+; General code for any value of S2_LIMB.
+
+ ; Make a stack frame and save r25 and r26
+ subu r31,r31,16
+ st.d r25,r31,8
+
+ ; Enter the loop in the middle
+ br.n L1
+ addu r4,r4,1
+
+Loop: ld r9,r3[r4]
+ st r26,r6[r4]
+; bcnd ne0,r0,0 ; bubble
+ addu r4,r4,1
+L1: mul r26,r9,r5 ; low word of product mul_1 WB ld
+ mask r12,r9,0xffff ; r12 = lo(s1_limb) mask_1
+ mul r11,r12,r7 ; r11 = prod_0 mul_2 WB mask_1
+ mul r10,r12,r8 ; r10 = prod_1a mul_3
+ extu r13,r9,16 ; r13 = hi(s1_limb) extu_1 WB mul_1
+ mul r12,r13,r7 ; r12 = prod_1b mul_4 WB extu_1
+ mul r25,r13,r8 ; r25 = prod_2 mul_5 WB mul_2
+ extu r11,r11,16 ; r11 = hi(prod_0) extu_2 WB mul_3
+ addu r10,r10,r11 ; addu_1 WB extu_2
+; bcnd ne0,r0,0 ; bubble WB addu_1
+ addu.co r10,r10,r12 ; WB mul_4
+ mask.u r10,r10,0xffff ; move the 16 most significant bits...
+ addu.ci r10,r10,r0 ; ...to the low half of the word...
+ rot r10,r10,16 ; ...and put carry in pos 16.
+ addu.co r26,r26,r2 ; add old carry limb
+ bcnd.n ne0,r4,Loop
+ addu.ci r2,r25,r10 ; compute new carry limb
+
+ st r26,r6[r4]
+ ld.d r25,r31,8
+ jmp.n r1
+ addu r31,r31,16
+
+; Fast code for S2_LIMB < 0x10000
+Lsmall:
+ ; Enter the loop in the middle
+ br.n SL1
+ addu r4,r4,1
+
+SLoop: ld r9,r3[r4] ;
+ st r8,r6[r4] ;
+ addu r4,r4,1 ;
+SL1: mul r8,r9,r5 ; low word of product
+ mask r12,r9,0xffff ; r12 = lo(s1_limb)
+ extu r13,r9,16 ; r13 = hi(s1_limb)
+ mul r11,r12,r7 ; r11 = prod_0
+ mul r12,r13,r7 ; r12 = prod_1b
+ addu.cio r8,r8,r2 ; add old carry limb
+ extu r10,r11,16 ; r11 = hi(prod_0)
+ addu r10,r10,r12 ;
+ bcnd.n ne0,r4,SLoop
+ extu r2,r10,16 ; r2 = new carry limb
+
+ jmp.n r1
+ st r8,r6[r4]
diff --git a/rts/gmp/mpn/m88k/sub_n.s b/rts/gmp/mpn/m88k/sub_n.s
new file mode 100644
index 0000000000..2fd345a135
--- /dev/null
+++ b/rts/gmp/mpn/m88k/sub_n.s
@@ -0,0 +1,106 @@
+; mc88100 __gmpn_sub -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr r2
+; s1_ptr r3
+; s2_ptr r4
+; size r5
+
+; This code has been optimized to run one instruction per clock, avoiding
+; load stalls and writeback contention. As a result, the instruction
+; order is not always natural.
+
+; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
+; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
+
+ text
+ align 16
+ global ___gmpn_sub_n
+___gmpn_sub_n:
+ ld r6,r3,0 ; read first limb from s1_ptr
+ extu r10,r5,3
+ ld r7,r4,0 ; read first limb from s2_ptr
+
+ subu r5,r0,r5
+ mak r5,r5,3<4>
+ bcnd.n eq0,r5,Lzero
+ subu.co r0,r0,r0 ; initialize carry
+
+ or r12,r0,lo16(Lbase)
+ or.u r12,r12,hi16(Lbase)
+ addu r12,r12,r5 ; r12 is address for entering in loop
+
+ extu r5,r5,2 ; divide by 4
+ subu r2,r2,r5 ; adjust res_ptr
+ subu r3,r3,r5 ; adjust s1_ptr
+ subu r4,r4,r5 ; adjust s2_ptr
+
+ or r8,r6,r0
+
+ jmp.n r12
+ or r9,r7,r0
+
+Loop: addu r3,r3,32
+ st r8,r2,28
+ addu r4,r4,32
+ ld r6,r3,0
+ addu r2,r2,32
+ ld r7,r4,0
+Lzero: subu r10,r10,1 ; subtract 0 + 8r limbs (adj loop cnt)
+Lbase: ld r8,r3,4
+ subu.cio r6,r6,r7
+ ld r9,r4,4
+ st r6,r2,0
+ ld r6,r3,8 ; subtract 7 + 8r limbs
+ subu.cio r8,r8,r9
+ ld r7,r4,8
+ st r8,r2,4
+ ld r8,r3,12 ; subtract 6 + 8r limbs
+ subu.cio r6,r6,r7
+ ld r9,r4,12
+ st r6,r2,8
+ ld r6,r3,16 ; subtract 5 + 8r limbs
+ subu.cio r8,r8,r9
+ ld r7,r4,16
+ st r8,r2,12
+ ld r8,r3,20 ; subtract 4 + 8r limbs
+ subu.cio r6,r6,r7
+ ld r9,r4,20
+ st r6,r2,16
+ ld r6,r3,24 ; subtract 3 + 8r limbs
+ subu.cio r8,r8,r9
+ ld r7,r4,24
+ st r8,r2,20
+ ld r8,r3,28 ; subtract 2 + 8r limbs
+ subu.cio r6,r6,r7
+ ld r9,r4,28
+ st r6,r2,24
+ bcnd.n ne0,r10,Loop ; subtract 1 + 8r limbs
+ subu.cio r8,r8,r9
+
+ st r8,r2,28 ; store most significant limb
+
+ addu.ci r2,r0,r0 ; return carry-out from most sign. limb
+ jmp.n r1
+ xor r2,r2,1
diff --git a/rts/gmp/mpn/mips2/add_n.s b/rts/gmp/mpn/mips2/add_n.s
new file mode 100644
index 0000000000..5c3c7fc8a1
--- /dev/null
+++ b/rts/gmp/mpn/mips2/add_n.s
@@ -0,0 +1,120 @@
+ # MIPS2 __gmpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # s2_ptr $6
+ # size $7
+
+ .text
+ .align 2
+ .globl __gmpn_add_n
+ .ent __gmpn_add_n
+__gmpn_add_n:
+ .set noreorder
+ .set nomacro
+
+ lw $10,0($5)
+ lw $11,0($6)
+
+ addiu $7,$7,-1
+ and $9,$7,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ move $2,$0
+
+ subu $7,$7,$9
+
+.Loop0: addiu $9,$9,-1
+ lw $12,4($5)
+ addu $11,$11,$2
+ lw $13,4($6)
+ sltu $8,$11,$2
+ addu $11,$10,$11
+ sltu $2,$11,$10
+ sw $11,0($4)
+ or $2,$2,$8
+
+ addiu $5,$5,4
+ addiu $6,$6,4
+ move $10,$12
+ move $11,$13
+ bne $9,$0,.Loop0
+ addiu $4,$4,4
+
+.L0: beq $7,$0,.Lend
+ nop
+
+.Loop: addiu $7,$7,-4
+
+ lw $12,4($5)
+ addu $11,$11,$2
+ lw $13,4($6)
+ sltu $8,$11,$2
+ addu $11,$10,$11
+ sltu $2,$11,$10
+ sw $11,0($4)
+ or $2,$2,$8
+
+ lw $10,8($5)
+ addu $13,$13,$2
+ lw $11,8($6)
+ sltu $8,$13,$2
+ addu $13,$12,$13
+ sltu $2,$13,$12
+ sw $13,4($4)
+ or $2,$2,$8
+
+ lw $12,12($5)
+ addu $11,$11,$2
+ lw $13,12($6)
+ sltu $8,$11,$2
+ addu $11,$10,$11
+ sltu $2,$11,$10
+ sw $11,8($4)
+ or $2,$2,$8
+
+ lw $10,16($5)
+ addu $13,$13,$2
+ lw $11,16($6)
+ sltu $8,$13,$2
+ addu $13,$12,$13
+ sltu $2,$13,$12
+ sw $13,12($4)
+ or $2,$2,$8
+
+ addiu $5,$5,16
+ addiu $6,$6,16
+
+ bne $7,$0,.Loop
+ addiu $4,$4,16
+
+.Lend: addu $11,$11,$2
+ sltu $8,$11,$2
+ addu $11,$10,$11
+ sltu $2,$11,$10
+ sw $11,0($4)
+ j $31
+ or $2,$2,$8
+
+ .end __gmpn_add_n
diff --git a/rts/gmp/mpn/mips2/addmul_1.s b/rts/gmp/mpn/mips2/addmul_1.s
new file mode 100644
index 0000000000..1e5037751b
--- /dev/null
+++ b/rts/gmp/mpn/mips2/addmul_1.s
@@ -0,0 +1,97 @@
+ # MIPS __gmpn_addmul_1 -- Multiply a limb vector with a single limb and
+ # add the product to a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_addmul_1
+ .ent __gmpn_addmul_1
+__gmpn_addmul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ lw $8,0($5)
+
+ # warm up phase 1
+ addiu $5,$5,4
+ multu $8,$7
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC1
+ lw $8,0($5) # load new s1 limb as early as possible
+
+Loop: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addiu $5,$5,4
+ addu $3,$3,$2 # add old carry limb to low product limb
+ multu $8,$7
+ lw $8,0($5) # load new s1 limb as early as possible
+ addiu $6,$6,-1 # decrement loop counter
+ sltu $2,$3,$2 # carry from previous addition -> $2
+ addu $3,$10,$3
+ sltu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ addiu $4,$4,4
+ bne $6,$0,Loop
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addu $3,$3,$2
+ sltu $2,$3,$2
+ multu $8,$7
+ addu $3,$10,$3
+ sltu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ addiu $4,$4,4
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addu $3,$3,$2
+ sltu $2,$3,$2
+ addu $3,$10,$3
+ sltu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ j $31
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_addmul_1
diff --git a/rts/gmp/mpn/mips2/lshift.s b/rts/gmp/mpn/mips2/lshift.s
new file mode 100644
index 0000000000..2ca3a3c800
--- /dev/null
+++ b/rts/gmp/mpn/mips2/lshift.s
@@ -0,0 +1,95 @@
+ # MIPS2 __gmpn_lshift --
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # src_ptr $5
+ # size $6
+ # cnt $7
+
+ .text
+ .align 2
+ .globl __gmpn_lshift
+ .ent __gmpn_lshift
+__gmpn_lshift:
+ .set noreorder
+ .set nomacro
+
+ sll $2,$6,2
+ addu $5,$5,$2 # make r5 point at end of src
+ lw $10,-4($5) # load first limb
+ subu $13,$0,$7
+ addu $4,$4,$2 # make r4 point at end of res
+ addiu $6,$6,-1
+ and $9,$6,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ srl $2,$10,$13 # compute function result
+
+ subu $6,$6,$9
+
+.Loop0: lw $3,-8($5)
+ addiu $4,$4,-4
+ addiu $5,$5,-4
+ addiu $9,$9,-1
+ sll $11,$10,$7
+ srl $12,$3,$13
+ move $10,$3
+ or $8,$11,$12
+ bne $9,$0,.Loop0
+ sw $8,0($4)
+
+.L0: beq $6,$0,.Lend
+ nop
+
+.Loop: lw $3,-8($5)
+ addiu $4,$4,-16
+ addiu $6,$6,-4
+ sll $11,$10,$7
+ srl $12,$3,$13
+
+ lw $10,-12($5)
+ sll $14,$3,$7
+ or $8,$11,$12
+ sw $8,12($4)
+ srl $9,$10,$13
+
+ lw $3,-16($5)
+ sll $11,$10,$7
+ or $8,$14,$9
+ sw $8,8($4)
+ srl $12,$3,$13
+
+ lw $10,-20($5)
+ sll $14,$3,$7
+ or $8,$11,$12
+ sw $8,4($4)
+ srl $9,$10,$13
+
+ addiu $5,$5,-16
+ or $8,$14,$9
+ bgtz $6,.Loop
+ sw $8,0($4)
+
+.Lend: sll $8,$10,$7
+ j $31
+ sw $8,-4($4)
+ .end __gmpn_lshift
diff --git a/rts/gmp/mpn/mips2/mul_1.s b/rts/gmp/mpn/mips2/mul_1.s
new file mode 100644
index 0000000000..ea8aa26809
--- /dev/null
+++ b/rts/gmp/mpn/mips2/mul_1.s
@@ -0,0 +1,85 @@
+ # MIPS __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+ # store the product in a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_mul_1
+ .ent __gmpn_mul_1
+__gmpn_mul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ lw $8,0($5)
+
+ # warm up phase 1
+ addiu $5,$5,4
+ multu $8,$7
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC1
+ lw $8,0($5) # load new s1 limb as early as possible
+
+Loop: mflo $10
+ mfhi $9
+ addiu $5,$5,4
+ addu $10,$10,$2 # add old carry limb to low product limb
+ multu $8,$7
+ lw $8,0($5) # load new s1 limb as early as possible
+ addiu $6,$6,-1 # decrement loop counter
+ sltu $2,$10,$2 # carry from previous addition -> $2
+ sw $10,0($4)
+ addiu $4,$4,4
+ bne $6,$0,Loop
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: mflo $10
+ mfhi $9
+ addu $10,$10,$2
+ sltu $2,$10,$2
+ multu $8,$7
+ sw $10,0($4)
+ addiu $4,$4,4
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: mflo $10
+ mfhi $9
+ addu $10,$10,$2
+ sltu $2,$10,$2
+ sw $10,0($4)
+ j $31
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_mul_1
diff --git a/rts/gmp/mpn/mips2/rshift.s b/rts/gmp/mpn/mips2/rshift.s
new file mode 100644
index 0000000000..37c8f39cb4
--- /dev/null
+++ b/rts/gmp/mpn/mips2/rshift.s
@@ -0,0 +1,92 @@
+ # MIPS2 __gmpn_rshift --
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # src_ptr $5
+ # size $6
+ # cnt $7
+
+ .text
+ .align 2
+ .globl __gmpn_rshift
+ .ent __gmpn_rshift
+__gmpn_rshift:
+ .set noreorder
+ .set nomacro
+
+ lw $10,0($5) # load first limb
+ subu $13,$0,$7
+ addiu $6,$6,-1
+ and $9,$6,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ sll $2,$10,$13 # compute function result
+
+ subu $6,$6,$9
+
+.Loop0: lw $3,4($5)
+ addiu $4,$4,4
+ addiu $5,$5,4
+ addiu $9,$9,-1
+ srl $11,$10,$7
+ sll $12,$3,$13
+ move $10,$3
+ or $8,$11,$12
+ bne $9,$0,.Loop0
+ sw $8,-4($4)
+
+.L0: beq $6,$0,.Lend
+ nop
+
+.Loop: lw $3,4($5)
+ addiu $4,$4,16
+ addiu $6,$6,-4
+ srl $11,$10,$7
+ sll $12,$3,$13
+
+ lw $10,8($5)
+ srl $14,$3,$7
+ or $8,$11,$12
+ sw $8,-16($4)
+ sll $9,$10,$13
+
+ lw $3,12($5)
+ srl $11,$10,$7
+ or $8,$14,$9
+ sw $8,-12($4)
+ sll $12,$3,$13
+
+ lw $10,16($5)
+ srl $14,$3,$7
+ or $8,$11,$12
+ sw $8,-8($4)
+ sll $9,$10,$13
+
+ addiu $5,$5,16
+ or $8,$14,$9
+ bgtz $6,.Loop
+ sw $8,-4($4)
+
+.Lend: srl $8,$10,$7
+ j $31
+ sw $8,0($4)
+ .end __gmpn_rshift
diff --git a/rts/gmp/mpn/mips2/sub_n.s b/rts/gmp/mpn/mips2/sub_n.s
new file mode 100644
index 0000000000..51d34f3ac3
--- /dev/null
+++ b/rts/gmp/mpn/mips2/sub_n.s
@@ -0,0 +1,120 @@
+ # MIPS2 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # s2_ptr $6
+ # size $7
+
+ .text
+ .align 2
+ .globl __gmpn_sub_n
+ .ent __gmpn_sub_n
+__gmpn_sub_n:
+ .set noreorder
+ .set nomacro
+
+ lw $10,0($5)
+ lw $11,0($6)
+
+ addiu $7,$7,-1
+ and $9,$7,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ move $2,$0
+
+ subu $7,$7,$9
+
+.Loop0: addiu $9,$9,-1
+ lw $12,4($5)
+ addu $11,$11,$2
+ lw $13,4($6)
+ sltu $8,$11,$2
+ subu $11,$10,$11
+ sltu $2,$10,$11
+ sw $11,0($4)
+ or $2,$2,$8
+
+ addiu $5,$5,4
+ addiu $6,$6,4
+ move $10,$12
+ move $11,$13
+ bne $9,$0,.Loop0
+ addiu $4,$4,4
+
+.L0: beq $7,$0,.Lend
+ nop
+
+.Loop: addiu $7,$7,-4
+
+ lw $12,4($5)
+ addu $11,$11,$2
+ lw $13,4($6)
+ sltu $8,$11,$2
+ subu $11,$10,$11
+ sltu $2,$10,$11
+ sw $11,0($4)
+ or $2,$2,$8
+
+ lw $10,8($5)
+ addu $13,$13,$2
+ lw $11,8($6)
+ sltu $8,$13,$2
+ subu $13,$12,$13
+ sltu $2,$12,$13
+ sw $13,4($4)
+ or $2,$2,$8
+
+ lw $12,12($5)
+ addu $11,$11,$2
+ lw $13,12($6)
+ sltu $8,$11,$2
+ subu $11,$10,$11
+ sltu $2,$10,$11
+ sw $11,8($4)
+ or $2,$2,$8
+
+ lw $10,16($5)
+ addu $13,$13,$2
+ lw $11,16($6)
+ sltu $8,$13,$2
+ subu $13,$12,$13
+ sltu $2,$12,$13
+ sw $13,12($4)
+ or $2,$2,$8
+
+ addiu $5,$5,16
+ addiu $6,$6,16
+
+ bne $7,$0,.Loop
+ addiu $4,$4,16
+
+.Lend: addu $11,$11,$2
+ sltu $8,$11,$2
+ subu $11,$10,$11
+ sltu $2,$10,$11
+ sw $11,0($4)
+ j $31
+ or $2,$2,$8
+
+ .end __gmpn_sub_n
diff --git a/rts/gmp/mpn/mips2/submul_1.s b/rts/gmp/mpn/mips2/submul_1.s
new file mode 100644
index 0000000000..495dea3ba2
--- /dev/null
+++ b/rts/gmp/mpn/mips2/submul_1.s
@@ -0,0 +1,97 @@
+ # MIPS __gmpn_submul_1 -- Multiply a limb vector with a single limb and
+ # subtract the product from a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_submul_1
+ .ent __gmpn_submul_1
+__gmpn_submul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ lw $8,0($5)
+
+ # warm up phase 1
+ addiu $5,$5,4
+ multu $8,$7
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ addiu $6,$6,-1
+ beq $6,$0,$LC1
+ lw $8,0($5) # load new s1 limb as early as possible
+
+Loop: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addiu $5,$5,4
+ addu $3,$3,$2 # add old carry limb to low product limb
+ multu $8,$7
+ lw $8,0($5) # load new s1 limb as early as possible
+ addiu $6,$6,-1 # decrement loop counter
+ sltu $2,$3,$2 # carry from previous addition -> $2
+ subu $3,$10,$3
+ sgtu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ addiu $4,$4,4
+ bne $6,$0,Loop
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addu $3,$3,$2
+ sltu $2,$3,$2
+ multu $8,$7
+ subu $3,$10,$3
+ sgtu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ addiu $4,$4,4
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: lw $10,0($4)
+ mflo $3
+ mfhi $9
+ addu $3,$3,$2
+ sltu $2,$3,$2
+ subu $3,$10,$3
+ sgtu $10,$3,$10
+ addu $2,$2,$10
+ sw $3,0($4)
+ j $31
+ addu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_submul_1
diff --git a/rts/gmp/mpn/mips2/umul.s b/rts/gmp/mpn/mips2/umul.s
new file mode 100644
index 0000000000..40e847614c
--- /dev/null
+++ b/rts/gmp/mpn/mips2/umul.s
@@ -0,0 +1,30 @@
+ # Copyright (C) 1999 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+ .text
+ .align 2
+ .globl __umul_ppmm
+ .ent __umul_ppmm
+__umul_ppmm:
+ multu $5,$6
+ mflo $3
+ mfhi $2
+ sw $3,0($4)
+ j $31
+ .end __umul_ppmm
diff --git a/rts/gmp/mpn/mips3/README b/rts/gmp/mpn/mips3/README
new file mode 100644
index 0000000000..e94b2c7460
--- /dev/null
+++ b/rts/gmp/mpn/mips3/README
@@ -0,0 +1,23 @@
+This directory contains mpn functions optimized for MIPS3. Example of
+processors that implement MIPS3 are R4000, R4400, R4600, R4700, and R8000.
+
+RELEVANT OPTIMIZATION ISSUES
+
+1. On the R4000 and R4400, branches, both the plain and the "likely" ones,
+ take 3 cycles to execute. (The fastest possible loop will take 4 cycles,
+ because of the delay insn.)
+
+ On the R4600, branches takes a single cycle
+
+ On the R8000, branches often take no noticable cycles, as they are
+ executed in a separate function unit..
+
+2. The R4000 and R4400 have a load latency of 4 cycles.
+
+3. On the R4000 and R4400, multiplies take a data-dependent number of
+ cycles, contrary to the SGI documentation. There seem to be 3 or 4
+ possible latencies.
+
+STATUS
+
+Good...
diff --git a/rts/gmp/mpn/mips3/add_n.s b/rts/gmp/mpn/mips3/add_n.s
new file mode 100644
index 0000000000..adad0beaef
--- /dev/null
+++ b/rts/gmp/mpn/mips3/add_n.s
@@ -0,0 +1,120 @@
+ # MIPS3 __gmpn_add_n -- Add two limb vectors of the same length > 0 and
+ # store sum in a third limb vector.
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # s2_ptr $6
+ # size $7
+
+ .text
+ .align 2
+ .globl __gmpn_add_n
+ .ent __gmpn_add_n
+__gmpn_add_n:
+ .set noreorder
+ .set nomacro
+
+ ld $10,0($5)
+ ld $11,0($6)
+
+ daddiu $7,$7,-1
+ and $9,$7,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ move $2,$0
+
+ dsubu $7,$7,$9
+
+.Loop0: daddiu $9,$9,-1
+ ld $12,8($5)
+ daddu $11,$11,$2
+ ld $13,8($6)
+ sltu $8,$11,$2
+ daddu $11,$10,$11
+ sltu $2,$11,$10
+ sd $11,0($4)
+ or $2,$2,$8
+
+ daddiu $5,$5,8
+ daddiu $6,$6,8
+ move $10,$12
+ move $11,$13
+ bne $9,$0,.Loop0
+ daddiu $4,$4,8
+
+.L0: beq $7,$0,.Lend
+ nop
+
+.Loop: daddiu $7,$7,-4
+
+ ld $12,8($5)
+ daddu $11,$11,$2
+ ld $13,8($6)
+ sltu $8,$11,$2
+ daddu $11,$10,$11
+ sltu $2,$11,$10
+ sd $11,0($4)
+ or $2,$2,$8
+
+ ld $10,16($5)
+ daddu $13,$13,$2
+ ld $11,16($6)
+ sltu $8,$13,$2
+ daddu $13,$12,$13
+ sltu $2,$13,$12
+ sd $13,8($4)
+ or $2,$2,$8
+
+ ld $12,24($5)
+ daddu $11,$11,$2
+ ld $13,24($6)
+ sltu $8,$11,$2
+ daddu $11,$10,$11
+ sltu $2,$11,$10
+ sd $11,16($4)
+ or $2,$2,$8
+
+ ld $10,32($5)
+ daddu $13,$13,$2
+ ld $11,32($6)
+ sltu $8,$13,$2
+ daddu $13,$12,$13
+ sltu $2,$13,$12
+ sd $13,24($4)
+ or $2,$2,$8
+
+ daddiu $5,$5,32
+ daddiu $6,$6,32
+
+ bne $7,$0,.Loop
+ daddiu $4,$4,32
+
+.Lend: daddu $11,$11,$2
+ sltu $8,$11,$2
+ daddu $11,$10,$11
+ sltu $2,$11,$10
+ sd $11,0($4)
+ j $31
+ or $2,$2,$8
+
+ .end __gmpn_add_n
diff --git a/rts/gmp/mpn/mips3/addmul_1.s b/rts/gmp/mpn/mips3/addmul_1.s
new file mode 100644
index 0000000000..d390e2298e
--- /dev/null
+++ b/rts/gmp/mpn/mips3/addmul_1.s
@@ -0,0 +1,97 @@
+ # MIPS3 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and
+ # add the product to a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_addmul_1
+ .ent __gmpn_addmul_1
+__gmpn_addmul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ ld $8,0($5)
+
+ # warm up phase 1
+ daddiu $5,$5,8
+ dmultu $8,$7
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC1
+ ld $8,0($5) # load new s1 limb as early as possible
+
+Loop: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddiu $5,$5,8
+ daddu $3,$3,$2 # add old carry limb to low product limb
+ dmultu $8,$7
+ ld $8,0($5) # load new s1 limb as early as possible
+ daddiu $6,$6,-1 # decrement loop counter
+ sltu $2,$3,$2 # carry from previous addition -> $2
+ daddu $3,$10,$3
+ sltu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ daddiu $4,$4,8
+ bne $6,$0,Loop
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddu $3,$3,$2
+ sltu $2,$3,$2
+ dmultu $8,$7
+ daddu $3,$10,$3
+ sltu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ daddiu $4,$4,8
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddu $3,$3,$2
+ sltu $2,$3,$2
+ daddu $3,$10,$3
+ sltu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ j $31
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_addmul_1
diff --git a/rts/gmp/mpn/mips3/gmp-mparam.h b/rts/gmp/mpn/mips3/gmp-mparam.h
new file mode 100644
index 0000000000..656e90c7b0
--- /dev/null
+++ b/rts/gmp/mpn/mips3/gmp-mparam.h
@@ -0,0 +1,58 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values are for the R10000 usign the system cc. */
+/* Generated by tuneup.c, 2000-07-25. */
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 16
+#endif
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 32
+#endif
+
+/* Supressed the TOOM3 values as they looked absolutely crazy
+ (698 and 21 respectively) */
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 58
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 54
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 82
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 159
+#endif
diff --git a/rts/gmp/mpn/mips3/lshift.s b/rts/gmp/mpn/mips3/lshift.s
new file mode 100644
index 0000000000..372606fddf
--- /dev/null
+++ b/rts/gmp/mpn/mips3/lshift.s
@@ -0,0 +1,95 @@
+ # MIPS3 __gmpn_lshift --
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # src_ptr $5
+ # size $6
+ # cnt $7
+
+ .text
+ .align 2
+ .globl __gmpn_lshift
+ .ent __gmpn_lshift
+__gmpn_lshift:
+ .set noreorder
+ .set nomacro
+
+ dsll $2,$6,3
+ daddu $5,$5,$2 # make r5 point at end of src
+ ld $10,-8($5) # load first limb
+ dsubu $13,$0,$7
+ daddu $4,$4,$2 # make r4 point at end of res
+ daddiu $6,$6,-1
+ and $9,$6,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ dsrl $2,$10,$13 # compute function result
+
+ dsubu $6,$6,$9
+
+.Loop0: ld $3,-16($5)
+ daddiu $4,$4,-8
+ daddiu $5,$5,-8
+ daddiu $9,$9,-1
+ dsll $11,$10,$7
+ dsrl $12,$3,$13
+ move $10,$3
+ or $8,$11,$12
+ bne $9,$0,.Loop0
+ sd $8,0($4)
+
+.L0: beq $6,$0,.Lend
+ nop
+
+.Loop: ld $3,-16($5)
+ daddiu $4,$4,-32
+ daddiu $6,$6,-4
+ dsll $11,$10,$7
+ dsrl $12,$3,$13
+
+ ld $10,-24($5)
+ dsll $14,$3,$7
+ or $8,$11,$12
+ sd $8,24($4)
+ dsrl $9,$10,$13
+
+ ld $3,-32($5)
+ dsll $11,$10,$7
+ or $8,$14,$9
+ sd $8,16($4)
+ dsrl $12,$3,$13
+
+ ld $10,-40($5)
+ dsll $14,$3,$7
+ or $8,$11,$12
+ sd $8,8($4)
+ dsrl $9,$10,$13
+
+ daddiu $5,$5,-32
+ or $8,$14,$9
+ bgtz $6,.Loop
+ sd $8,0($4)
+
+.Lend: dsll $8,$10,$7
+ j $31
+ sd $8,-8($4)
+ .end __gmpn_lshift
diff --git a/rts/gmp/mpn/mips3/mul_1.s b/rts/gmp/mpn/mips3/mul_1.s
new file mode 100644
index 0000000000..6659e2b4eb
--- /dev/null
+++ b/rts/gmp/mpn/mips3/mul_1.s
@@ -0,0 +1,85 @@
+ # MIPS3 __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+ # store the product in a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_mul_1
+ .ent __gmpn_mul_1
+__gmpn_mul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ ld $8,0($5)
+
+ # warm up phase 1
+ daddiu $5,$5,8
+ dmultu $8,$7
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC1
+ ld $8,0($5) # load new s1 limb as early as possible
+
+Loop: mflo $10
+ mfhi $9
+ daddiu $5,$5,8
+ daddu $10,$10,$2 # add old carry limb to low product limb
+ dmultu $8,$7
+ ld $8,0($5) # load new s1 limb as early as possible
+ daddiu $6,$6,-1 # decrement loop counter
+ sltu $2,$10,$2 # carry from previous addition -> $2
+ sd $10,0($4)
+ daddiu $4,$4,8
+ bne $6,$0,Loop
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: mflo $10
+ mfhi $9
+ daddu $10,$10,$2
+ sltu $2,$10,$2
+ dmultu $8,$7
+ sd $10,0($4)
+ daddiu $4,$4,8
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: mflo $10
+ mfhi $9
+ daddu $10,$10,$2
+ sltu $2,$10,$2
+ sd $10,0($4)
+ j $31
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_mul_1
diff --git a/rts/gmp/mpn/mips3/rshift.s b/rts/gmp/mpn/mips3/rshift.s
new file mode 100644
index 0000000000..59c7fd3492
--- /dev/null
+++ b/rts/gmp/mpn/mips3/rshift.s
@@ -0,0 +1,92 @@
+ # MIPS3 __gmpn_rshift --
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # src_ptr $5
+ # size $6
+ # cnt $7
+
+ .text
+ .align 2
+ .globl __gmpn_rshift
+ .ent __gmpn_rshift
+__gmpn_rshift:
+ .set noreorder
+ .set nomacro
+
+ ld $10,0($5) # load first limb
+ dsubu $13,$0,$7
+ daddiu $6,$6,-1
+ and $9,$6,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ dsll $2,$10,$13 # compute function result
+
+ dsubu $6,$6,$9
+
+.Loop0: ld $3,8($5)
+ daddiu $4,$4,8
+ daddiu $5,$5,8
+ daddiu $9,$9,-1
+ dsrl $11,$10,$7
+ dsll $12,$3,$13
+ move $10,$3
+ or $8,$11,$12
+ bne $9,$0,.Loop0
+ sd $8,-8($4)
+
+.L0: beq $6,$0,.Lend
+ nop
+
+.Loop: ld $3,8($5)
+ daddiu $4,$4,32
+ daddiu $6,$6,-4
+ dsrl $11,$10,$7
+ dsll $12,$3,$13
+
+ ld $10,16($5)
+ dsrl $14,$3,$7
+ or $8,$11,$12
+ sd $8,-32($4)
+ dsll $9,$10,$13
+
+ ld $3,24($5)
+ dsrl $11,$10,$7
+ or $8,$14,$9
+ sd $8,-24($4)
+ dsll $12,$3,$13
+
+ ld $10,32($5)
+ dsrl $14,$3,$7
+ or $8,$11,$12
+ sd $8,-16($4)
+ dsll $9,$10,$13
+
+ daddiu $5,$5,32
+ or $8,$14,$9
+ bgtz $6,.Loop
+ sd $8,-8($4)
+
+.Lend: dsrl $8,$10,$7
+ j $31
+ sd $8,0($4)
+ .end __gmpn_rshift
diff --git a/rts/gmp/mpn/mips3/sub_n.s b/rts/gmp/mpn/mips3/sub_n.s
new file mode 100644
index 0000000000..c57c824b04
--- /dev/null
+++ b/rts/gmp/mpn/mips3/sub_n.s
@@ -0,0 +1,120 @@
+ # MIPS3 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ # store difference in a third limb vector.
+
+ # Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # s2_ptr $6
+ # size $7
+
+ .text
+ .align 2
+ .globl __gmpn_sub_n
+ .ent __gmpn_sub_n
+__gmpn_sub_n:
+ .set noreorder
+ .set nomacro
+
+ ld $10,0($5)
+ ld $11,0($6)
+
+ daddiu $7,$7,-1
+ and $9,$7,4-1 # number of limbs in first loop
+ beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop
+ move $2,$0
+
+ dsubu $7,$7,$9
+
+.Loop0: daddiu $9,$9,-1
+ ld $12,8($5)
+ daddu $11,$11,$2
+ ld $13,8($6)
+ sltu $8,$11,$2
+ dsubu $11,$10,$11
+ sltu $2,$10,$11
+ sd $11,0($4)
+ or $2,$2,$8
+
+ daddiu $5,$5,8
+ daddiu $6,$6,8
+ move $10,$12
+ move $11,$13
+ bne $9,$0,.Loop0
+ daddiu $4,$4,8
+
+.L0: beq $7,$0,.Lend
+ nop
+
+.Loop: daddiu $7,$7,-4
+
+ ld $12,8($5)
+ daddu $11,$11,$2
+ ld $13,8($6)
+ sltu $8,$11,$2
+ dsubu $11,$10,$11
+ sltu $2,$10,$11
+ sd $11,0($4)
+ or $2,$2,$8
+
+ ld $10,16($5)
+ daddu $13,$13,$2
+ ld $11,16($6)
+ sltu $8,$13,$2
+ dsubu $13,$12,$13
+ sltu $2,$12,$13
+ sd $13,8($4)
+ or $2,$2,$8
+
+ ld $12,24($5)
+ daddu $11,$11,$2
+ ld $13,24($6)
+ sltu $8,$11,$2
+ dsubu $11,$10,$11
+ sltu $2,$10,$11
+ sd $11,16($4)
+ or $2,$2,$8
+
+ ld $10,32($5)
+ daddu $13,$13,$2
+ ld $11,32($6)
+ sltu $8,$13,$2
+ dsubu $13,$12,$13
+ sltu $2,$12,$13
+ sd $13,24($4)
+ or $2,$2,$8
+
+ daddiu $5,$5,32
+ daddiu $6,$6,32
+
+ bne $7,$0,.Loop
+ daddiu $4,$4,32
+
+.Lend: daddu $11,$11,$2
+ sltu $8,$11,$2
+ dsubu $11,$10,$11
+ sltu $2,$10,$11
+ sd $11,0($4)
+ j $31
+ or $2,$2,$8
+
+ .end __gmpn_sub_n
diff --git a/rts/gmp/mpn/mips3/submul_1.s b/rts/gmp/mpn/mips3/submul_1.s
new file mode 100644
index 0000000000..531f9705a6
--- /dev/null
+++ b/rts/gmp/mpn/mips3/submul_1.s
@@ -0,0 +1,97 @@
+ # MIPS3 __gmpn_submul_1 -- Multiply a limb vector with a single limb and
+ # subtract the product from a second limb vector.
+
+ # Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+ # This file is part of the GNU MP Library.
+
+ # The GNU MP Library is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU Lesser General Public License as published by
+ # the Free Software Foundation; either version 2.1 of the License, or (at your
+ # option) any later version.
+
+ # The GNU MP Library is distributed in the hope that it will be useful, but
+ # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ # License for more details.
+
+ # You should have received a copy of the GNU Lesser General Public License
+ # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+ # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ # MA 02111-1307, USA.
+
+
+ # INPUT PARAMETERS
+ # res_ptr $4
+ # s1_ptr $5
+ # size $6
+ # s2_limb $7
+
+ .text
+ .align 4
+ .globl __gmpn_submul_1
+ .ent __gmpn_submul_1
+__gmpn_submul_1:
+ .set noreorder
+ .set nomacro
+
+ # warm up phase 0
+ ld $8,0($5)
+
+ # warm up phase 1
+ daddiu $5,$5,8
+ dmultu $8,$7
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC0
+ move $2,$0 # zero cy2
+
+ daddiu $6,$6,-1
+ beq $6,$0,$LC1
+ ld $8,0($5) # load new s1 limb as early as possible
+
+Loop: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddiu $5,$5,8
+ daddu $3,$3,$2 # add old carry limb to low product limb
+ dmultu $8,$7
+ ld $8,0($5) # load new s1 limb as early as possible
+ daddiu $6,$6,-1 # decrement loop counter
+ sltu $2,$3,$2 # carry from previous addition -> $2
+ dsubu $3,$10,$3
+ sgtu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ daddiu $4,$4,8
+ bne $6,$0,Loop
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 1
+$LC1: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddu $3,$3,$2
+ sltu $2,$3,$2
+ dmultu $8,$7
+ dsubu $3,$10,$3
+ sgtu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ daddiu $4,$4,8
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ # cool down phase 0
+$LC0: ld $10,0($4)
+ mflo $3
+ mfhi $9
+ daddu $3,$3,$2
+ sltu $2,$3,$2
+ dsubu $3,$10,$3
+ sgtu $10,$3,$10
+ daddu $2,$2,$10
+ sd $3,0($4)
+ j $31
+ daddu $2,$9,$2 # add high product limb and carry from addition
+
+ .end __gmpn_submul_1
diff --git a/rts/gmp/mpn/mp_bases.c b/rts/gmp/mpn/mp_bases.c
new file mode 100644
index 0000000000..011c328c80
--- /dev/null
+++ b/rts/gmp/mpn/mp_bases.c
@@ -0,0 +1,550 @@
+/* __mp_bases -- Structure for conversion between internal binary
+ format and strings in base 2..255. The fields are explained in
+ gmp-impl.h.
+
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+#if BITS_PER_MP_LIMB == 32
+const struct bases __mp_bases[256] =
+{
+ /* 0 */ {0, 0.0, 0, 0},
+ /* 1 */ {0, 1e38, 0, 0},
+ /* 2 */ {32, 1.0000000000000000, 0x1, 0x0},
+ /* 3 */ {20, 0.6309297535714575, 0xcfd41b91, 0x3b563c24},
+ /* 4 */ {16, 0.5000000000000000, 0x2, 0x0},
+ /* 5 */ {13, 0.4306765580733931, 0x48c27395, 0xc25c2684},
+ /* 6 */ {12, 0.3868528072345416, 0x81bf1000, 0xf91bd1b6},
+ /* 7 */ {11, 0.3562071871080222, 0x75db9c97, 0x1607a2cb},
+ /* 8 */ {10, 0.3333333333333334, 0x3, 0x0},
+ /* 9 */ {10, 0.3154648767857287, 0xcfd41b91, 0x3b563c24},
+ /* 10 */ {9, 0.3010299956639811, 0x3b9aca00, 0x12e0be82},
+ /* 11 */ {9, 0.2890648263178878, 0x8c8b6d2b, 0xd24cde04},
+ /* 12 */ {8, 0.2789429456511298, 0x19a10000, 0x3fa39ab5},
+ /* 13 */ {8, 0.2702381544273197, 0x309f1021, 0x50f8ac5f},
+ /* 14 */ {8, 0.2626495350371936, 0x57f6c100, 0x74843b1e},
+ /* 15 */ {8, 0.2559580248098155, 0x98c29b81, 0xad0326c2},
+ /* 16 */ {8, 0.2500000000000000, 0x4, 0x0},
+ /* 17 */ {7, 0.2446505421182260, 0x18754571, 0x4ef0b6bd},
+ /* 18 */ {7, 0.2398124665681315, 0x247dbc80, 0xc0fc48a1},
+ /* 19 */ {7, 0.2354089133666382, 0x3547667b, 0x33838942},
+ /* 20 */ {7, 0.2313782131597592, 0x4c4b4000, 0xad7f29ab},
+ /* 21 */ {7, 0.2276702486969530, 0x6b5a6e1d, 0x313c3d15},
+ /* 22 */ {7, 0.2242438242175754, 0x94ace180, 0xb8cca9e0},
+ /* 23 */ {7, 0.2210647294575037, 0xcaf18367, 0x42ed6de9},
+ /* 24 */ {6, 0.2181042919855316, 0xb640000, 0x67980e0b},
+ /* 25 */ {6, 0.2153382790366965, 0xe8d4a51, 0x19799812},
+ /* 26 */ {6, 0.2127460535533632, 0x1269ae40, 0xbce85396},
+ /* 27 */ {6, 0.2103099178571525, 0x17179149, 0x62c103a9},
+ /* 28 */ {6, 0.2080145976765095, 0x1cb91000, 0x1d353d43},
+ /* 29 */ {6, 0.2058468324604344, 0x23744899, 0xce1decea},
+ /* 30 */ {6, 0.2037950470905062, 0x2b73a840, 0x790fc511},
+ /* 31 */ {6, 0.2018490865820999, 0x34e63b41, 0x35b865a0},
+ /* 32 */ {6, 0.2000000000000000, 0x5, 0x0},
+ /* 33 */ {6, 0.1982398631705605, 0x4cfa3cc1, 0xa9aed1b3},
+ /* 34 */ {6, 0.1965616322328226, 0x5c13d840, 0x63dfc229},
+ /* 35 */ {6, 0.1949590218937863, 0x6d91b519, 0x2b0fee30},
+ /* 36 */ {6, 0.1934264036172708, 0x81bf1000, 0xf91bd1b6},
+ /* 37 */ {6, 0.1919587200065601, 0x98ede0c9, 0xac89c3a9},
+ /* 38 */ {6, 0.1905514124267734, 0xb3773e40, 0x6d2c32fe},
+ /* 39 */ {6, 0.1892003595168700, 0xd1bbc4d1, 0x387907c9},
+ /* 40 */ {6, 0.1879018247091076, 0xf4240000, 0xc6f7a0b},
+ /* 41 */ {5, 0.1866524112389434, 0x6e7d349, 0x28928154},
+ /* 42 */ {5, 0.1854490234153689, 0x7ca30a0, 0x6e8629d},
+ /* 43 */ {5, 0.1842888331487062, 0x8c32bbb, 0xd373dca0},
+ /* 44 */ {5, 0.1831692509136336, 0x9d46c00, 0xa0b17895},
+ /* 45 */ {5, 0.1820879004699383, 0xaffacfd, 0x746811a5},
+ /* 46 */ {5, 0.1810425967800402, 0xc46bee0, 0x4da6500f},
+ /* 47 */ {5, 0.1800313266566926, 0xdab86ef, 0x2ba23582},
+ /* 48 */ {5, 0.1790522317510414, 0xf300000, 0xdb20a88},
+ /* 49 */ {5, 0.1781035935540111, 0x10d63af1, 0xe68d5ce4},
+ /* 50 */ {5, 0.1771838201355579, 0x12a05f20, 0xb7cdfd9d},
+ /* 51 */ {5, 0.1762914343888821, 0x1490aae3, 0x8e583933},
+ /* 52 */ {5, 0.1754250635819545, 0x16a97400, 0x697cc3ea},
+ /* 53 */ {5, 0.1745834300480449, 0x18ed2825, 0x48a5ca6c},
+ /* 54 */ {5, 0.1737653428714400, 0x1b5e4d60, 0x2b52db16},
+ /* 55 */ {5, 0.1729696904450771, 0x1dff8297, 0x111586a6},
+ /* 56 */ {5, 0.1721954337940981, 0x20d38000, 0xf31d2b36},
+ /* 57 */ {5, 0.1714416005739134, 0x23dd1799, 0xc8d76d19},
+ /* 58 */ {5, 0.1707072796637201, 0x271f35a0, 0xa2cb1eb4},
+ /* 59 */ {5, 0.1699916162869140, 0x2a9ce10b, 0x807c3ec3},
+ /* 60 */ {5, 0.1692938075987814, 0x2e593c00, 0x617ec8bf},
+ /* 61 */ {5, 0.1686130986895011, 0x3257844d, 0x45746cbe},
+ /* 62 */ {5, 0.1679487789570419, 0x369b13e0, 0x2c0aa273},
+ /* 63 */ {5, 0.1673001788101741, 0x3b27613f, 0x14f90805},
+ /* 64 */ {5, 0.1666666666666667, 0x6, 0x0},
+ /* 65 */ {5, 0.1660476462159378, 0x4528a141, 0xd9cf0829},
+ /* 66 */ {5, 0.1654425539190583, 0x4aa51420, 0xb6fc4841},
+ /* 67 */ {5, 0.1648508567221604, 0x50794633, 0x973054cb},
+ /* 68 */ {5, 0.1642720499620502, 0x56a94400, 0x7a1dbe4b},
+ /* 69 */ {5, 0.1637056554452156, 0x5d393975, 0x5f7fcd7f},
+ /* 70 */ {5, 0.1631512196835108, 0x642d7260, 0x47196c84},
+ /* 71 */ {5, 0.1626083122716341, 0x6b8a5ae7, 0x30b43635},
+ /* 72 */ {5, 0.1620765243931223, 0x73548000, 0x1c1fa5f6},
+ /* 73 */ {5, 0.1615554674429964, 0x7b908fe9, 0x930634a},
+ /* 74 */ {5, 0.1610447717564445, 0x84435aa0, 0xef7f4a3c},
+ /* 75 */ {5, 0.1605440854340214, 0x8d71d25b, 0xcf5552d2},
+ /* 76 */ {5, 0.1600530732548213, 0x97210c00, 0xb1a47c8e},
+ /* 77 */ {5, 0.1595714156699382, 0xa1563f9d, 0x9634b43e},
+ /* 78 */ {5, 0.1590988078692941, 0xac16c8e0, 0x7cd3817d},
+ /* 79 */ {5, 0.1586349589155960, 0xb768278f, 0x65536761},
+ /* 80 */ {5, 0.1581795909397823, 0xc3500000, 0x4f8b588e},
+ /* 81 */ {5, 0.1577324383928644, 0xcfd41b91, 0x3b563c24},
+ /* 82 */ {5, 0.1572932473495469, 0xdcfa6920, 0x28928154},
+ /* 83 */ {5, 0.1568617748594410, 0xeac8fd83, 0x1721bfb0},
+ /* 84 */ {5, 0.1564377883420716, 0xf9461400, 0x6e8629d},
+ /* 85 */ {4, 0.1560210650222250, 0x31c84b1, 0x491cc17c},
+ /* 86 */ {4, 0.1556113914024940, 0x342ab10, 0x3a11d83b},
+ /* 87 */ {4, 0.1552085627701551, 0x36a2c21, 0x2be074cd},
+ /* 88 */ {4, 0.1548123827357682, 0x3931000, 0x1e7a02e7},
+ /* 89 */ {4, 0.1544226628011101, 0x3bd5ee1, 0x11d10edd},
+ /* 90 */ {4, 0.1540392219542636, 0x3e92110, 0x5d92c68},
+ /* 91 */ {4, 0.1536618862898642, 0x4165ef1, 0xf50dbfb2},
+ /* 92 */ {4, 0.1532904886526781, 0x4452100, 0xdf9f1316},
+ /* 93 */ {4, 0.1529248683028321, 0x4756fd1, 0xcb52a684},
+ /* 94 */ {4, 0.1525648706011593, 0x4a75410, 0xb8163e97},
+ /* 95 */ {4, 0.1522103467132434, 0x4dad681, 0xa5d8f269},
+ /* 96 */ {4, 0.1518611533308632, 0x5100000, 0x948b0fcd},
+ /* 97 */ {4, 0.1515171524096389, 0x546d981, 0x841e0215},
+ /* 98 */ {4, 0.1511782109217764, 0x57f6c10, 0x74843b1e},
+ /* 99 */ {4, 0.1508442006228941, 0x5b9c0d1, 0x65b11e6e},
+ /* 100 */ {4, 0.1505149978319906, 0x5f5e100, 0x5798ee23},
+ /* 101 */ {4, 0.1501904832236879, 0x633d5f1, 0x4a30b99b},
+ /* 102 */ {4, 0.1498705416319474, 0x673a910, 0x3d6e4d94},
+ /* 103 */ {4, 0.1495550618645152, 0x6b563e1, 0x314825b0},
+ /* 104 */ {4, 0.1492439365274121, 0x6f91000, 0x25b55f2e},
+ /* 105 */ {4, 0.1489370618588283, 0x73eb721, 0x1aadaccb},
+ /* 106 */ {4, 0.1486343375718350, 0x7866310, 0x10294ba2},
+ /* 107 */ {4, 0.1483356667053617, 0x7d01db1, 0x620f8f6},
+ /* 108 */ {4, 0.1480409554829326, 0x81bf100, 0xf91bd1b6},
+ /* 109 */ {4, 0.1477501131786861, 0x869e711, 0xe6d37b2a},
+ /* 110 */ {4, 0.1474630519902391, 0x8ba0a10, 0xd55cff6e},
+ /* 111 */ {4, 0.1471796869179852, 0x90c6441, 0xc4ad2db2},
+ /* 112 */ {4, 0.1468999356504447, 0x9610000, 0xb4b985cf},
+ /* 113 */ {4, 0.1466237184553111, 0x9b7e7c1, 0xa5782bef},
+ /* 114 */ {4, 0.1463509580758620, 0xa112610, 0x96dfdd2a},
+ /* 115 */ {4, 0.1460815796324244, 0xa6cc591, 0x88e7e509},
+ /* 116 */ {4, 0.1458155105286054, 0xacad100, 0x7b8813d3},
+ /* 117 */ {4, 0.1455526803620167, 0xb2b5331, 0x6eb8b595},
+ /* 118 */ {4, 0.1452930208392428, 0xb8e5710, 0x627289db},
+ /* 119 */ {4, 0.1450364656948130, 0xbf3e7a1, 0x56aebc07},
+ /* 120 */ {4, 0.1447829506139581, 0xc5c1000, 0x4b66dc33},
+ /* 121 */ {4, 0.1445324131589439, 0xcc6db61, 0x4094d8a3},
+ /* 122 */ {4, 0.1442847926987864, 0xd345510, 0x3632f7a5},
+ /* 123 */ {4, 0.1440400303421672, 0xda48871, 0x2c3bd1f0},
+ /* 124 */ {4, 0.1437980688733775, 0xe178100, 0x22aa4d5f},
+ /* 125 */ {4, 0.1435588526911310, 0xe8d4a51, 0x19799812},
+ /* 126 */ {4, 0.1433223277500932, 0xf05f010, 0x10a523e5},
+ /* 127 */ {4, 0.1430884415049874, 0xf817e01, 0x828a237},
+ /* 128 */ {4, 0.1428571428571428, 0x7, 0x0},
+ /* 129 */ {4, 0.1426283821033600, 0x10818201, 0xf04ec452},
+ /* 130 */ {4, 0.1424021108869747, 0x11061010, 0xe136444a},
+ /* 131 */ {4, 0.1421782821510107, 0x118db651, 0xd2af9589},
+ /* 132 */ {4, 0.1419568500933153, 0x12188100, 0xc4b42a83},
+ /* 133 */ {4, 0.1417377701235801, 0x12a67c71, 0xb73dccf5},
+ /* 134 */ {4, 0.1415209988221527, 0x1337b510, 0xaa4698c5},
+ /* 135 */ {4, 0.1413064939005528, 0x13cc3761, 0x9dc8f729},
+ /* 136 */ {4, 0.1410942141636095, 0x14641000, 0x91bf9a30},
+ /* 137 */ {4, 0.1408841194731412, 0x14ff4ba1, 0x86257887},
+ /* 138 */ {4, 0.1406761707131039, 0x159df710, 0x7af5c98c},
+ /* 139 */ {4, 0.1404703297561400, 0x16401f31, 0x702c01a0},
+ /* 140 */ {4, 0.1402665594314587, 0x16e5d100, 0x65c3ceb1},
+ /* 141 */ {4, 0.1400648234939879, 0x178f1991, 0x5bb91502},
+ /* 142 */ {4, 0.1398650865947379, 0x183c0610, 0x5207ec23},
+ /* 143 */ {4, 0.1396673142523192, 0x18eca3c1, 0x48ac9c19},
+ /* 144 */ {4, 0.1394714728255649, 0x19a10000, 0x3fa39ab5},
+ /* 145 */ {4, 0.1392775294872041, 0x1a592841, 0x36e98912},
+ /* 146 */ {4, 0.1390854521985406, 0x1b152a10, 0x2e7b3140},
+ /* 147 */ {4, 0.1388952096850913, 0x1bd51311, 0x2655840b},
+ /* 148 */ {4, 0.1387067714131417, 0x1c98f100, 0x1e7596ea},
+ /* 149 */ {4, 0.1385201075671774, 0x1d60d1b1, 0x16d8a20d},
+ /* 150 */ {4, 0.1383351890281539, 0x1e2cc310, 0xf7bfe87},
+ /* 151 */ {4, 0.1381519873525671, 0x1efcd321, 0x85d2492},
+ /* 152 */ {4, 0.1379704747522905, 0x1fd11000, 0x179a9f4},
+ /* 153 */ {4, 0.1377906240751463, 0x20a987e1, 0xf59e80eb},
+ /* 154 */ {4, 0.1376124087861776, 0x21864910, 0xe8b768db},
+ /* 155 */ {4, 0.1374358029495937, 0x226761f1, 0xdc39d6d5},
+ /* 156 */ {4, 0.1372607812113589, 0x234ce100, 0xd021c5d1},
+ /* 157 */ {4, 0.1370873187823978, 0x2436d4d1, 0xc46b5e37},
+ /* 158 */ {4, 0.1369153914223921, 0x25254c10, 0xb912f39c},
+ /* 159 */ {4, 0.1367449754241439, 0x26185581, 0xae150294},
+ /* 160 */ {4, 0.1365760475984821, 0x27100000, 0xa36e2eb1},
+ /* 161 */ {4, 0.1364085852596902, 0x280c5a81, 0x991b4094},
+ /* 162 */ {4, 0.1362425662114337, 0x290d7410, 0x8f19241e},
+ /* 163 */ {4, 0.1360779687331669, 0x2a135bd1, 0x8564e6b7},
+ /* 164 */ {4, 0.1359147715670014, 0x2b1e2100, 0x7bfbb5b4},
+ /* 165 */ {4, 0.1357529539050150, 0x2c2dd2f1, 0x72dadcc8},
+ /* 166 */ {4, 0.1355924953769863, 0x2d428110, 0x69ffc498},
+ /* 167 */ {4, 0.1354333760385373, 0x2e5c3ae1, 0x6167f154},
+ /* 168 */ {4, 0.1352755763596663, 0x2f7b1000, 0x5911016e},
+ /* 169 */ {4, 0.1351190772136599, 0x309f1021, 0x50f8ac5f},
+ /* 170 */ {4, 0.1349638598663645, 0x31c84b10, 0x491cc17c},
+ /* 171 */ {4, 0.1348099059658079, 0x32f6d0b1, 0x417b26d8},
+ /* 172 */ {4, 0.1346571975321549, 0x342ab100, 0x3a11d83b},
+ /* 173 */ {4, 0.1345057169479844, 0x3563fc11, 0x32dee622},
+ /* 174 */ {4, 0.1343554469488779, 0x36a2c210, 0x2be074cd},
+ /* 175 */ {4, 0.1342063706143054, 0x37e71341, 0x2514bb58},
+ /* 176 */ {4, 0.1340584713587980, 0x39310000, 0x1e7a02e7},
+ /* 177 */ {4, 0.1339117329233981, 0x3a8098c1, 0x180ea5d0},
+ /* 178 */ {4, 0.1337661393673756, 0x3bd5ee10, 0x11d10edd},
+ /* 179 */ {4, 0.1336216750601996, 0x3d311091, 0xbbfb88e},
+ /* 180 */ {4, 0.1334783246737591, 0x3e921100, 0x5d92c68},
+ /* 181 */ {4, 0.1333360731748201, 0x3ff90031, 0x1c024c},
+ /* 182 */ {4, 0.1331949058177136, 0x4165ef10, 0xf50dbfb2},
+ /* 183 */ {4, 0.1330548081372441, 0x42d8eea1, 0xea30efa3},
+ /* 184 */ {4, 0.1329157659418126, 0x44521000, 0xdf9f1316},
+ /* 185 */ {4, 0.1327777653067443, 0x45d16461, 0xd555c0c9},
+ /* 186 */ {4, 0.1326407925678156, 0x4756fd10, 0xcb52a684},
+ /* 187 */ {4, 0.1325048343149731, 0x48e2eb71, 0xc193881f},
+ /* 188 */ {4, 0.1323698773862368, 0x4a754100, 0xb8163e97},
+ /* 189 */ {4, 0.1322359088617821, 0x4c0e0f51, 0xaed8b724},
+ /* 190 */ {4, 0.1321029160581950, 0x4dad6810, 0xa5d8f269},
+ /* 191 */ {4, 0.1319708865228925, 0x4f535d01, 0x9d15039d},
+ /* 192 */ {4, 0.1318398080287045, 0x51000000, 0x948b0fcd},
+ /* 193 */ {4, 0.1317096685686114, 0x52b36301, 0x8c394d1d},
+ /* 194 */ {4, 0.1315804563506306, 0x546d9810, 0x841e0215},
+ /* 195 */ {4, 0.1314521597928493, 0x562eb151, 0x7c3784f8},
+ /* 196 */ {4, 0.1313247675185968, 0x57f6c100, 0x74843b1e},
+ /* 197 */ {4, 0.1311982683517524, 0x59c5d971, 0x6d02985d},
+ /* 198 */ {4, 0.1310726513121843, 0x5b9c0d10, 0x65b11e6e},
+ /* 199 */ {4, 0.1309479056113158, 0x5d796e61, 0x5e8e5c64},
+ /* 200 */ {4, 0.1308240206478128, 0x5f5e1000, 0x5798ee23},
+ /* 201 */ {4, 0.1307009860033912, 0x614a04a1, 0x50cf7bde},
+ /* 202 */ {4, 0.1305787914387386, 0x633d5f10, 0x4a30b99b},
+ /* 203 */ {4, 0.1304574268895465, 0x65383231, 0x43bb66bd},
+ /* 204 */ {4, 0.1303368824626505, 0x673a9100, 0x3d6e4d94},
+ /* 205 */ {4, 0.1302171484322746, 0x69448e91, 0x374842ee},
+ /* 206 */ {4, 0.1300982152363760, 0x6b563e10, 0x314825b0},
+ /* 207 */ {4, 0.1299800734730872, 0x6d6fb2c1, 0x2b6cde75},
+ /* 208 */ {4, 0.1298627138972530, 0x6f910000, 0x25b55f2e},
+ /* 209 */ {4, 0.1297461274170591, 0x71ba3941, 0x2020a2c5},
+ /* 210 */ {4, 0.1296303050907487, 0x73eb7210, 0x1aadaccb},
+ /* 211 */ {4, 0.1295152381234257, 0x7624be11, 0x155b891f},
+ /* 212 */ {4, 0.1294009178639407, 0x78663100, 0x10294ba2},
+ /* 213 */ {4, 0.1292873358018581, 0x7aafdeb1, 0xb160fe9},
+ /* 214 */ {4, 0.1291744835645007, 0x7d01db10, 0x620f8f6},
+ /* 215 */ {4, 0.1290623529140715, 0x7f5c3a21, 0x14930ef},
+ /* 216 */ {4, 0.1289509357448472, 0x81bf1000, 0xf91bd1b6},
+ /* 217 */ {4, 0.1288402240804449, 0x842a70e1, 0xefdcb0c7},
+ /* 218 */ {4, 0.1287302100711567, 0x869e7110, 0xe6d37b2a},
+ /* 219 */ {4, 0.1286208859913518, 0x891b24f1, 0xddfeb94a},
+ /* 220 */ {4, 0.1285122442369443, 0x8ba0a100, 0xd55cff6e},
+ /* 221 */ {4, 0.1284042773229231, 0x8e2ef9d1, 0xcceced50},
+ /* 222 */ {4, 0.1282969778809442, 0x90c64410, 0xc4ad2db2},
+ /* 223 */ {4, 0.1281903386569819, 0x93669481, 0xbc9c75f9},
+ /* 224 */ {4, 0.1280843525090381, 0x96100000, 0xb4b985cf},
+ /* 225 */ {4, 0.1279790124049077, 0x98c29b81, 0xad0326c2},
+ /* 226 */ {4, 0.1278743114199984, 0x9b7e7c10, 0xa5782bef},
+ /* 227 */ {4, 0.1277702427352035, 0x9e43b6d1, 0x9e1771a9},
+ /* 228 */ {4, 0.1276667996348261, 0xa1126100, 0x96dfdd2a},
+ /* 229 */ {4, 0.1275639755045533, 0xa3ea8ff1, 0x8fd05c41},
+ /* 230 */ {4, 0.1274617638294791, 0xa6cc5910, 0x88e7e509},
+ /* 231 */ {4, 0.1273601581921741, 0xa9b7d1e1, 0x8225759d},
+ /* 232 */ {4, 0.1272591522708010, 0xacad1000, 0x7b8813d3},
+ /* 233 */ {4, 0.1271587398372755, 0xafac2921, 0x750eccf9},
+ /* 234 */ {4, 0.1270589147554692, 0xb2b53310, 0x6eb8b595},
+ /* 235 */ {4, 0.1269596709794558, 0xb5c843b1, 0x6884e923},
+ /* 236 */ {4, 0.1268610025517973, 0xb8e57100, 0x627289db},
+ /* 237 */ {4, 0.1267629036018709, 0xbc0cd111, 0x5c80c07b},
+ /* 238 */ {4, 0.1266653683442337, 0xbf3e7a10, 0x56aebc07},
+ /* 239 */ {4, 0.1265683910770258, 0xc27a8241, 0x50fbb19b},
+ /* 240 */ {4, 0.1264719661804097, 0xc5c10000, 0x4b66dc33},
+ /* 241 */ {4, 0.1263760881150453, 0xc91209c1, 0x45ef7c7c},
+ /* 242 */ {4, 0.1262807514205999, 0xcc6db610, 0x4094d8a3},
+ /* 243 */ {4, 0.1261859507142915, 0xcfd41b91, 0x3b563c24},
+ /* 244 */ {4, 0.1260916806894653, 0xd3455100, 0x3632f7a5},
+ /* 245 */ {4, 0.1259979361142023, 0xd6c16d31, 0x312a60c3},
+ /* 246 */ {4, 0.1259047118299582, 0xda488710, 0x2c3bd1f0},
+ /* 247 */ {4, 0.1258120027502338, 0xdddab5a1, 0x2766aa45},
+ /* 248 */ {4, 0.1257198038592741, 0xe1781000, 0x22aa4d5f},
+ /* 249 */ {4, 0.1256281102107963, 0xe520ad61, 0x1e06233c},
+ /* 250 */ {4, 0.1255369169267456, 0xe8d4a510, 0x19799812},
+ /* 251 */ {4, 0.1254462191960791, 0xec940e71, 0x15041c33},
+ /* 252 */ {4, 0.1253560122735751, 0xf05f0100, 0x10a523e5},
+ /* 253 */ {4, 0.1252662914786691, 0xf4359451, 0xc5c2749},
+ /* 254 */ {4, 0.1251770521943144, 0xf817e010, 0x828a237},
+ /* 255 */ {4, 0.1250882898658681, 0xfc05fc01, 0x40a1423},
+};
+#endif
+#if BITS_PER_MP_LIMB == 64
+const struct bases __mp_bases[256] =
+{
+ /* 0 */ {0, 0.0, 0, 0},
+ /* 1 */ {0, 1e38, 0, 0},
+ /* 2 */ {64, 1.0000000000000000, CNST_LIMB(0x1), CNST_LIMB(0x0)},
+ /* 3 */ {40, 0.6309297535714574, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)},
+ /* 4 */ {32, 0.5000000000000000, CNST_LIMB(0x2), CNST_LIMB(0x0)},
+ /* 5 */ {27, 0.4306765580733931, CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90)},
+ /* 6 */ {24, 0.3868528072345416, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)},
+ /* 7 */ {22, 0.3562071871080222, CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b)},
+ /* 8 */ {21, 0.3333333333333334, CNST_LIMB(0x3), CNST_LIMB(0x0)},
+ /* 9 */ {20, 0.3154648767857287, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)},
+ /* 10 */ {19, 0.3010299956639811, CNST_LIMB(0x8ac7230489e80000), CNST_LIMB(0xd83c94fb6d2ac34a)},
+ /* 11 */ {18, 0.2890648263178878, CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b)},
+ /* 12 */ {17, 0.2789429456511298, CNST_LIMB(0x1eca170c00000000), CNST_LIMB(0xa10c2bec5da8f8f)},
+ /* 13 */ {17, 0.2702381544273197, CNST_LIMB(0x780c7372621bd74d), CNST_LIMB(0x10f4becafe412ec3)},
+ /* 14 */ {16, 0.2626495350371936, CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86)},
+ /* 15 */ {16, 0.2559580248098155, CNST_LIMB(0x5b27ac993df97701), CNST_LIMB(0x6779c7f90dc42f48)},
+ /* 16 */ {16, 0.2500000000000000, CNST_LIMB(0x4), CNST_LIMB(0x0)},
+ /* 17 */ {15, 0.2446505421182260, CNST_LIMB(0x27b95e997e21d9f1), CNST_LIMB(0x9c71e11bab279323)},
+ /* 18 */ {15, 0.2398124665681315, CNST_LIMB(0x5da0e1e53c5c8000), CNST_LIMB(0x5dfaa697ec6f6a1c)},
+ /* 19 */ {15, 0.2354089133666382, CNST_LIMB(0xd2ae3299c1c4aedb), CNST_LIMB(0x3711783f6be7e9ec)},
+ /* 20 */ {14, 0.2313782131597592, CNST_LIMB(0x16bcc41e90000000), CNST_LIMB(0x6849b86a12b9b01e)},
+ /* 21 */ {14, 0.2276702486969530, CNST_LIMB(0x2d04b7fdd9c0ef49), CNST_LIMB(0x6bf097ba5ca5e239)},
+ /* 22 */ {14, 0.2242438242175754, CNST_LIMB(0x5658597bcaa24000), CNST_LIMB(0x7b8015c8d7af8f08)},
+ /* 23 */ {14, 0.2210647294575037, CNST_LIMB(0xa0e2073737609371), CNST_LIMB(0x975a24b3a3151b38)},
+ /* 24 */ {13, 0.2181042919855316, CNST_LIMB(0xc29e98000000000), CNST_LIMB(0x50bd367972689db1)},
+ /* 25 */ {13, 0.2153382790366965, CNST_LIMB(0x14adf4b7320334b9), CNST_LIMB(0x8c240c4aecb13bb5)},
+ /* 26 */ {13, 0.2127460535533632, CNST_LIMB(0x226ed36478bfa000), CNST_LIMB(0xdbd2e56854e118c9)},
+ /* 27 */ {13, 0.2103099178571525, CNST_LIMB(0x383d9170b85ff80b), CNST_LIMB(0x2351ffcaa9c7c4ae)},
+ /* 28 */ {13, 0.2080145976765095, CNST_LIMB(0x5a3c23e39c000000), CNST_LIMB(0x6b24188ca33b0636)},
+ /* 29 */ {13, 0.2058468324604344, CNST_LIMB(0x8e65137388122bcd), CNST_LIMB(0xcc3dceaf2b8ba99d)},
+ /* 30 */ {13, 0.2037950470905062, CNST_LIMB(0xdd41bb36d259e000), CNST_LIMB(0x2832e835c6c7d6b6)},
+ /* 31 */ {12, 0.2018490865820999, CNST_LIMB(0xaee5720ee830681), CNST_LIMB(0x76b6aa272e1873c5)},
+ /* 32 */ {12, 0.2000000000000000, CNST_LIMB(0x5), CNST_LIMB(0x0)},
+ /* 33 */ {12, 0.1982398631705605, CNST_LIMB(0x172588ad4f5f0981), CNST_LIMB(0x61eaf5d402c7bf4f)},
+ /* 34 */ {12, 0.1965616322328226, CNST_LIMB(0x211e44f7d02c1000), CNST_LIMB(0xeeb658123ffb27ec)},
+ /* 35 */ {12, 0.1949590218937863, CNST_LIMB(0x2ee56725f06e5c71), CNST_LIMB(0x5d5e3762e6fdf509)},
+ /* 36 */ {12, 0.1934264036172708, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)},
+ /* 37 */ {12, 0.1919587200065601, CNST_LIMB(0x5b5b57f8a98a5dd1), CNST_LIMB(0x66ae7831762efb6f)},
+ /* 38 */ {12, 0.1905514124267734, CNST_LIMB(0x7dcff8986ea31000), CNST_LIMB(0x47388865a00f544)},
+ /* 39 */ {12, 0.1892003595168700, CNST_LIMB(0xabd4211662a6b2a1), CNST_LIMB(0x7d673c33a123b54c)},
+ /* 40 */ {12, 0.1879018247091076, CNST_LIMB(0xe8d4a51000000000), CNST_LIMB(0x19799812dea11197)},
+ /* 41 */ {11, 0.1866524112389434, CNST_LIMB(0x7a32956ad081b79), CNST_LIMB(0xc27e62e0686feae)},
+ /* 42 */ {11, 0.1854490234153689, CNST_LIMB(0x9f49aaff0e86800), CNST_LIMB(0x9b6e7507064ce7c7)},
+ /* 43 */ {11, 0.1842888331487062, CNST_LIMB(0xce583bb812d37b3), CNST_LIMB(0x3d9ac2bf66cfed94)},
+ /* 44 */ {11, 0.1831692509136336, CNST_LIMB(0x109b79a654c00000), CNST_LIMB(0xed46bc50ce59712a)},
+ /* 45 */ {11, 0.1820879004699383, CNST_LIMB(0x1543beff214c8b95), CNST_LIMB(0x813d97e2c89b8d46)},
+ /* 46 */ {11, 0.1810425967800402, CNST_LIMB(0x1b149a79459a3800), CNST_LIMB(0x2e81751956af8083)},
+ /* 47 */ {11, 0.1800313266566926, CNST_LIMB(0x224edfb5434a830f), CNST_LIMB(0xdd8e0a95e30c0988)},
+ /* 48 */ {11, 0.1790522317510413, CNST_LIMB(0x2b3fb00000000000), CNST_LIMB(0x7ad4dd48a0b5b167)},
+ /* 49 */ {11, 0.1781035935540111, CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b)},
+ /* 50 */ {11, 0.1771838201355579, CNST_LIMB(0x43c33c1937564800), CNST_LIMB(0xe392010175ee5962)},
+ /* 51 */ {11, 0.1762914343888821, CNST_LIMB(0x54411b2441c3cd8b), CNST_LIMB(0x84eaf11b2fe7738e)},
+ /* 52 */ {11, 0.1754250635819545, CNST_LIMB(0x6851455acd400000), CNST_LIMB(0x3a1e3971e008995d)},
+ /* 53 */ {11, 0.1745834300480449, CNST_LIMB(0x80a23b117c8feb6d), CNST_LIMB(0xfd7a462344ffce25)},
+ /* 54 */ {11, 0.1737653428714400, CNST_LIMB(0x9dff7d32d5dc1800), CNST_LIMB(0x9eca40b40ebcef8a)},
+ /* 55 */ {11, 0.1729696904450771, CNST_LIMB(0xc155af6faeffe6a7), CNST_LIMB(0x52fa161a4a48e43d)},
+ /* 56 */ {11, 0.1721954337940981, CNST_LIMB(0xebb7392e00000000), CNST_LIMB(0x1607a2cbacf930c1)},
+ /* 57 */ {10, 0.1714416005739134, CNST_LIMB(0x50633659656d971), CNST_LIMB(0x97a014f8e3be55f1)},
+ /* 58 */ {10, 0.1707072796637201, CNST_LIMB(0x5fa8624c7fba400), CNST_LIMB(0x568df8b76cbf212c)},
+ /* 59 */ {10, 0.1699916162869140, CNST_LIMB(0x717d9faa73c5679), CNST_LIMB(0x20ba7c4b4e6ef492)},
+ /* 60 */ {10, 0.1692938075987814, CNST_LIMB(0x86430aac6100000), CNST_LIMB(0xe81ee46b9ef492f5)},
+ /* 61 */ {10, 0.1686130986895011, CNST_LIMB(0x9e64d9944b57f29), CNST_LIMB(0x9dc0d10d51940416)},
+ /* 62 */ {10, 0.1679487789570419, CNST_LIMB(0xba5ca5392cb0400), CNST_LIMB(0x5fa8ed2f450272a5)},
+ /* 63 */ {10, 0.1673001788101741, CNST_LIMB(0xdab2ce1d022cd81), CNST_LIMB(0x2ba9eb8c5e04e641)},
+ /* 64 */ {10, 0.1666666666666667, CNST_LIMB(0x6), CNST_LIMB(0x0)},
+ /* 65 */ {10, 0.1660476462159378, CNST_LIMB(0x12aeed5fd3e2d281), CNST_LIMB(0xb67759cc00287bf1)},
+ /* 66 */ {10, 0.1654425539190583, CNST_LIMB(0x15c3da1572d50400), CNST_LIMB(0x78621feeb7f4ed33)},
+ /* 67 */ {10, 0.1648508567221604, CNST_LIMB(0x194c05534f75ee29), CNST_LIMB(0x43d55b5f72943bc0)},
+ /* 68 */ {10, 0.1642720499620502, CNST_LIMB(0x1d56299ada100000), CNST_LIMB(0x173decb64d1d4409)},
+ /* 69 */ {10, 0.1637056554452156, CNST_LIMB(0x21f2a089a4ff4f79), CNST_LIMB(0xe29fb54fd6b6074f)},
+ /* 70 */ {10, 0.1631512196835108, CNST_LIMB(0x2733896c68d9a400), CNST_LIMB(0xa1f1f5c210d54e62)},
+ /* 71 */ {10, 0.1626083122716341, CNST_LIMB(0x2d2cf2c33b533c71), CNST_LIMB(0x6aac7f9bfafd57b2)},
+ /* 72 */ {10, 0.1620765243931223, CNST_LIMB(0x33f506e440000000), CNST_LIMB(0x3b563c2478b72ee2)},
+ /* 73 */ {10, 0.1615554674429964, CNST_LIMB(0x3ba43bec1d062211), CNST_LIMB(0x12b536b574e92d1b)},
+ /* 74 */ {10, 0.1610447717564444, CNST_LIMB(0x4455872d8fd4e400), CNST_LIMB(0xdf86c03020404fa5)},
+ /* 75 */ {10, 0.1605440854340214, CNST_LIMB(0x4e2694539f2f6c59), CNST_LIMB(0xa34adf02234eea8e)},
+ /* 76 */ {10, 0.1600530732548213, CNST_LIMB(0x5938006c18900000), CNST_LIMB(0x6f46eb8574eb59dd)},
+ /* 77 */ {10, 0.1595714156699382, CNST_LIMB(0x65ad9912474aa649), CNST_LIMB(0x42459b481df47cec)},
+ /* 78 */ {10, 0.1590988078692941, CNST_LIMB(0x73ae9ff4241ec400), CNST_LIMB(0x1b424b95d80ca505)},
+ /* 79 */ {10, 0.1586349589155960, CNST_LIMB(0x836612ee9c4ce1e1), CNST_LIMB(0xf2c1b982203a0dac)},
+ /* 80 */ {10, 0.1581795909397823, CNST_LIMB(0x9502f90000000000), CNST_LIMB(0xb7cdfd9d7bdbab7d)},
+ /* 81 */ {10, 0.1577324383928644, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)},
+ /* 82 */ {10, 0.1572932473495469, CNST_LIMB(0xbebf59a07dab4400), CNST_LIMB(0x57931eeaf85cf64f)},
+ /* 83 */ {10, 0.1568617748594410, CNST_LIMB(0xd7540d4093bc3109), CNST_LIMB(0x305a944507c82f47)},
+ /* 84 */ {10, 0.1564377883420716, CNST_LIMB(0xf2b96616f1900000), CNST_LIMB(0xe007ccc9c22781a)},
+ /* 85 */ {9, 0.1560210650222250, CNST_LIMB(0x336de62af2bca35), CNST_LIMB(0x3e92c42e000eeed4)},
+ /* 86 */ {9, 0.1556113914024940, CNST_LIMB(0x39235ec33d49600), CNST_LIMB(0x1ebe59130db2795e)},
+ /* 87 */ {9, 0.1552085627701551, CNST_LIMB(0x3f674e539585a17), CNST_LIMB(0x268859e90f51b89)},
+ /* 88 */ {9, 0.1548123827357682, CNST_LIMB(0x4645b6958000000), CNST_LIMB(0xd24cde0463108cfa)},
+ /* 89 */ {9, 0.1544226628011101, CNST_LIMB(0x4dcb74afbc49c19), CNST_LIMB(0xa536009f37adc383)},
+ /* 90 */ {9, 0.1540392219542636, CNST_LIMB(0x56064e1d18d9a00), CNST_LIMB(0x7cea06ce1c9ace10)},
+ /* 91 */ {9, 0.1536618862898642, CNST_LIMB(0x5f04fe2cd8a39fb), CNST_LIMB(0x58db032e72e8ba43)},
+ /* 92 */ {9, 0.1532904886526781, CNST_LIMB(0x68d74421f5c0000), CNST_LIMB(0x388cc17cae105447)},
+ /* 93 */ {9, 0.1529248683028321, CNST_LIMB(0x738df1f6ab4827d), CNST_LIMB(0x1b92672857620ce0)},
+ /* 94 */ {9, 0.1525648706011593, CNST_LIMB(0x7f3afbc9cfb5e00), CNST_LIMB(0x18c6a9575c2ade4)},
+ /* 95 */ {9, 0.1522103467132434, CNST_LIMB(0x8bf187fba88f35f), CNST_LIMB(0xd44da7da8e44b24f)},
+ /* 96 */ {9, 0.1518611533308632, CNST_LIMB(0x99c600000000000), CNST_LIMB(0xaa2f78f1b4cc6794)},
+ /* 97 */ {9, 0.1515171524096389, CNST_LIMB(0xa8ce21eb6531361), CNST_LIMB(0x843c067d091ee4cc)},
+ /* 98 */ {9, 0.1511782109217764, CNST_LIMB(0xb92112c1a0b6200), CNST_LIMB(0x62005e1e913356e3)},
+ /* 99 */ {9, 0.1508442006228941, CNST_LIMB(0xcad7718b8747c43), CNST_LIMB(0x4316eed01dedd518)},
+ /* 100 */ {9, 0.1505149978319906, CNST_LIMB(0xde0b6b3a7640000), CNST_LIMB(0x2725dd1d243aba0e)},
+ /* 101 */ {9, 0.1501904832236879, CNST_LIMB(0xf2d8cf5fe6d74c5), CNST_LIMB(0xddd9057c24cb54f)},
+ /* 102 */ {9, 0.1498705416319474, CNST_LIMB(0x1095d25bfa712600), CNST_LIMB(0xedeee175a736d2a1)},
+ /* 103 */ {9, 0.1495550618645152, CNST_LIMB(0x121b7c4c3698faa7), CNST_LIMB(0xc4699f3df8b6b328)},
+ /* 104 */ {9, 0.1492439365274121, CNST_LIMB(0x13c09e8d68000000), CNST_LIMB(0x9ebbe7d859cb5a7c)},
+ /* 105 */ {9, 0.1489370618588283, CNST_LIMB(0x15876ccb0b709ca9), CNST_LIMB(0x7c828b9887eb2179)},
+ /* 106 */ {9, 0.1486343375718350, CNST_LIMB(0x17723c2976da2a00), CNST_LIMB(0x5d652ab99001adcf)},
+ /* 107 */ {9, 0.1483356667053617, CNST_LIMB(0x198384e9c259048b), CNST_LIMB(0x4114f1754e5d7b32)},
+ /* 108 */ {9, 0.1480409554829326, CNST_LIMB(0x1bbde41dfeec0000), CNST_LIMB(0x274b7c902f7e0188)},
+ /* 109 */ {9, 0.1477501131786861, CNST_LIMB(0x1e241d6e3337910d), CNST_LIMB(0xfc9e0fbb32e210c)},
+ /* 110 */ {9, 0.1474630519902391, CNST_LIMB(0x20b91cee9901ee00), CNST_LIMB(0xf4afa3e594f8ea1f)},
+ /* 111 */ {9, 0.1471796869179852, CNST_LIMB(0x237ff9079863dfef), CNST_LIMB(0xcd85c32e9e4437b0)},
+ /* 112 */ {9, 0.1468999356504447, CNST_LIMB(0x267bf47000000000), CNST_LIMB(0xa9bbb147e0dd92a8)},
+ /* 113 */ {9, 0.1466237184553111, CNST_LIMB(0x29b08039fbeda7f1), CNST_LIMB(0x8900447b70e8eb82)},
+ /* 114 */ {9, 0.1463509580758620, CNST_LIMB(0x2d213df34f65f200), CNST_LIMB(0x6b0a92adaad5848a)},
+ /* 115 */ {9, 0.1460815796324244, CNST_LIMB(0x30d201d957a7c2d3), CNST_LIMB(0x4f990ad8740f0ee5)},
+ /* 116 */ {9, 0.1458155105286054, CNST_LIMB(0x34c6d52160f40000), CNST_LIMB(0x3670a9663a8d3610)},
+ /* 117 */ {9, 0.1455526803620167, CNST_LIMB(0x3903f855d8f4c755), CNST_LIMB(0x1f5c44188057be3c)},
+ /* 118 */ {9, 0.1452930208392428, CNST_LIMB(0x3d8de5c8ec59b600), CNST_LIMB(0xa2bea956c4e4977)},
+ /* 119 */ {9, 0.1450364656948130, CNST_LIMB(0x4269541d1ff01337), CNST_LIMB(0xed68b23033c3637e)},
+ /* 120 */ {9, 0.1447829506139581, CNST_LIMB(0x479b38e478000000), CNST_LIMB(0xc99cf624e50549c5)},
+ /* 121 */ {9, 0.1445324131589439, CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b)},
+ /* 122 */ {9, 0.1442847926987864, CNST_LIMB(0x5317871fa13aba00), CNST_LIMB(0x8a5bc740b1c113e5)},
+ /* 123 */ {9, 0.1440400303421672, CNST_LIMB(0x596d2f44de9fa71b), CNST_LIMB(0x6e6c7efb81cfbb9b)},
+ /* 124 */ {9, 0.1437980688733775, CNST_LIMB(0x602fd125c47c0000), CNST_LIMB(0x54aba5c5cada5f10)},
+ /* 125 */ {9, 0.1435588526911310, CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90)},
+ /* 126 */ {9, 0.1433223277500932, CNST_LIMB(0x6f15be069b847e00), CNST_LIMB(0x26fb43de2c8cd2a8)},
+ /* 127 */ {9, 0.1430884415049874, CNST_LIMB(0x7746b3e82a77047f), CNST_LIMB(0x12b94793db8486a1)},
+ /* 128 */ {9, 0.1428571428571428, CNST_LIMB(0x7), CNST_LIMB(0x0)},
+ /* 129 */ {9, 0.1426283821033600, CNST_LIMB(0x894953f7ea890481), CNST_LIMB(0xdd5deca404c0156d)},
+ /* 130 */ {9, 0.1424021108869747, CNST_LIMB(0x932abffea4848200), CNST_LIMB(0xbd51373330291de0)},
+ /* 131 */ {9, 0.1421782821510107, CNST_LIMB(0x9dacb687d3d6a163), CNST_LIMB(0x9fa4025d66f23085)},
+ /* 132 */ {9, 0.1419568500933153, CNST_LIMB(0xa8d8102a44840000), CNST_LIMB(0x842530ee2db4949d)},
+ /* 133 */ {9, 0.1417377701235801, CNST_LIMB(0xb4b60f9d140541e5), CNST_LIMB(0x6aa7f2766b03dc25)},
+ /* 134 */ {9, 0.1415209988221527, CNST_LIMB(0xc15065d4856e4600), CNST_LIMB(0x53035ba7ebf32e8d)},
+ /* 135 */ {9, 0.1413064939005528, CNST_LIMB(0xceb1363f396d23c7), CNST_LIMB(0x3d12091fc9fb4914)},
+ /* 136 */ {9, 0.1410942141636095, CNST_LIMB(0xdce31b2488000000), CNST_LIMB(0x28b1cb81b1ef1849)},
+ /* 137 */ {9, 0.1408841194731412, CNST_LIMB(0xebf12a24bca135c9), CNST_LIMB(0x15c35be67ae3e2c9)},
+ /* 138 */ {9, 0.1406761707131039, CNST_LIMB(0xfbe6f8dbf88f4a00), CNST_LIMB(0x42a17bd09be1ff0)},
+ /* 139 */ {8, 0.1404703297561400, CNST_LIMB(0x1ef156c084ce761), CNST_LIMB(0x8bf461f03cf0bbf)},
+ /* 140 */ {8, 0.1402665594314587, CNST_LIMB(0x20c4e3b94a10000), CNST_LIMB(0xf3fbb43f68a32d05)},
+ /* 141 */ {8, 0.1400648234939879, CNST_LIMB(0x22b0695a08ba421), CNST_LIMB(0xd84f44c48564dc19)},
+ /* 142 */ {8, 0.1398650865947379, CNST_LIMB(0x24b4f35d7a4c100), CNST_LIMB(0xbe58ebcce7956abe)},
+ /* 143 */ {8, 0.1396673142523192, CNST_LIMB(0x26d397284975781), CNST_LIMB(0xa5fac463c7c134b7)},
+ /* 144 */ {8, 0.1394714728255649, CNST_LIMB(0x290d74100000000), CNST_LIMB(0x8f19241e28c7d757)},
+ /* 145 */ {8, 0.1392775294872041, CNST_LIMB(0x2b63b3a37866081), CNST_LIMB(0x799a6d046c0ae1ae)},
+ /* 146 */ {8, 0.1390854521985406, CNST_LIMB(0x2dd789f4d894100), CNST_LIMB(0x6566e37d746a9e40)},
+ /* 147 */ {8, 0.1388952096850913, CNST_LIMB(0x306a35e51b58721), CNST_LIMB(0x526887dbfb5f788f)},
+ /* 148 */ {8, 0.1387067714131417, CNST_LIMB(0x331d01712e10000), CNST_LIMB(0x408af3382b8efd3d)},
+ /* 149 */ {8, 0.1385201075671774, CNST_LIMB(0x35f14200a827c61), CNST_LIMB(0x2fbb374806ec05f1)},
+ /* 150 */ {8, 0.1383351890281539, CNST_LIMB(0x38e858b62216100), CNST_LIMB(0x1fe7c0f0afce87fe)},
+ /* 151 */ {8, 0.1381519873525671, CNST_LIMB(0x3c03b2c13176a41), CNST_LIMB(0x11003d517540d32e)},
+ /* 152 */ {8, 0.1379704747522905, CNST_LIMB(0x3f44c9b21000000), CNST_LIMB(0x2f5810f98eff0dc)},
+ /* 153 */ {8, 0.1377906240751463, CNST_LIMB(0x42ad23cef3113c1), CNST_LIMB(0xeb72e35e7840d910)},
+ /* 154 */ {8, 0.1376124087861776, CNST_LIMB(0x463e546b19a2100), CNST_LIMB(0xd27de19593dc3614)},
+ /* 155 */ {8, 0.1374358029495937, CNST_LIMB(0x49f9fc3f96684e1), CNST_LIMB(0xbaf391fd3e5e6fc2)},
+ /* 156 */ {8, 0.1372607812113589, CNST_LIMB(0x4de1c9c5dc10000), CNST_LIMB(0xa4bd38c55228c81d)},
+ /* 157 */ {8, 0.1370873187823978, CNST_LIMB(0x51f77994116d2a1), CNST_LIMB(0x8fc5a8de8e1de782)},
+ /* 158 */ {8, 0.1369153914223921, CNST_LIMB(0x563cd6bb3398100), CNST_LIMB(0x7bf9265bea9d3a3b)},
+ /* 159 */ {8, 0.1367449754241439, CNST_LIMB(0x5ab3bb270beeb01), CNST_LIMB(0x69454b325983dccd)},
+ /* 160 */ {8, 0.1365760475984821, CNST_LIMB(0x5f5e10000000000), CNST_LIMB(0x5798ee2308c39df9)},
+ /* 161 */ {8, 0.1364085852596902, CNST_LIMB(0x643dce0ec16f501), CNST_LIMB(0x46e40ba0fa66a753)},
+ /* 162 */ {8, 0.1362425662114337, CNST_LIMB(0x6954fe21e3e8100), CNST_LIMB(0x3717b0870b0db3a7)},
+ /* 163 */ {8, 0.1360779687331669, CNST_LIMB(0x6ea5b9755f440a1), CNST_LIMB(0x2825e6775d11cdeb)},
+ /* 164 */ {8, 0.1359147715670014, CNST_LIMB(0x74322a1c0410000), CNST_LIMB(0x1a01a1c09d1b4dac)},
+ /* 165 */ {8, 0.1357529539050150, CNST_LIMB(0x79fc8b6ae8a46e1), CNST_LIMB(0xc9eb0a8bebc8f3e)},
+ /* 166 */ {8, 0.1355924953769863, CNST_LIMB(0x80072a66d512100), CNST_LIMB(0xffe357ff59e6a004)},
+ /* 167 */ {8, 0.1354333760385373, CNST_LIMB(0x86546633b42b9c1), CNST_LIMB(0xe7dfd1be05fa61a8)},
+ /* 168 */ {8, 0.1352755763596663, CNST_LIMB(0x8ce6b0861000000), CNST_LIMB(0xd11ed6fc78f760e5)},
+ /* 169 */ {8, 0.1351190772136599, CNST_LIMB(0x93c08e16a022441), CNST_LIMB(0xbb8db609dd29ebfe)},
+ /* 170 */ {8, 0.1349638598663645, CNST_LIMB(0x9ae49717f026100), CNST_LIMB(0xa71aec8d1813d532)},
+ /* 171 */ {8, 0.1348099059658079, CNST_LIMB(0xa25577ae24c1a61), CNST_LIMB(0x93b612a9f20fbc02)},
+ /* 172 */ {8, 0.1346571975321549, CNST_LIMB(0xaa15f068e610000), CNST_LIMB(0x814fc7b19a67d317)},
+ /* 173 */ {8, 0.1345057169479844, CNST_LIMB(0xb228d6bf7577921), CNST_LIMB(0x6fd9a03f2e0a4b7c)},
+ /* 174 */ {8, 0.1343554469488779, CNST_LIMB(0xba91158ef5c4100), CNST_LIMB(0x5f4615a38d0d316e)},
+ /* 175 */ {8, 0.1342063706143054, CNST_LIMB(0xc351ad9aec0b681), CNST_LIMB(0x4f8876863479a286)},
+ /* 176 */ {8, 0.1340584713587980, CNST_LIMB(0xcc6db6100000000), CNST_LIMB(0x4094d8a3041b60eb)},
+ /* 177 */ {8, 0.1339117329233981, CNST_LIMB(0xd5e85d09025c181), CNST_LIMB(0x32600b8ed883a09b)},
+ /* 178 */ {8, 0.1337661393673756, CNST_LIMB(0xdfc4e816401c100), CNST_LIMB(0x24df8c6eb4b6d1f1)},
+ /* 179 */ {8, 0.1336216750601996, CNST_LIMB(0xea06b4c72947221), CNST_LIMB(0x18097a8ee151acef)},
+ /* 180 */ {8, 0.1334783246737591, CNST_LIMB(0xf4b139365210000), CNST_LIMB(0xbd48cc8ec1cd8e3)},
+ /* 181 */ {8, 0.1333360731748201, CNST_LIMB(0xffc80497d520961), CNST_LIMB(0x3807a8d67485fb)},
+ /* 182 */ {8, 0.1331949058177136, CNST_LIMB(0x10b4ebfca1dee100), CNST_LIMB(0xea5768860b62e8d8)},
+ /* 183 */ {8, 0.1330548081372441, CNST_LIMB(0x117492de921fc141), CNST_LIMB(0xd54faf5b635c5005)},
+ /* 184 */ {8, 0.1329157659418126, CNST_LIMB(0x123bb2ce41000000), CNST_LIMB(0xc14a56233a377926)},
+ /* 185 */ {8, 0.1327777653067443, CNST_LIMB(0x130a8b6157bdecc1), CNST_LIMB(0xae39a88db7cd329f)},
+ /* 186 */ {8, 0.1326407925678156, CNST_LIMB(0x13e15dede0e8a100), CNST_LIMB(0x9c10bde69efa7ab6)},
+ /* 187 */ {8, 0.1325048343149731, CNST_LIMB(0x14c06d941c0ca7e1), CNST_LIMB(0x8ac36c42a2836497)},
+ /* 188 */ {8, 0.1323698773862368, CNST_LIMB(0x15a7ff487a810000), CNST_LIMB(0x7a463c8b84f5ef67)},
+ /* 189 */ {8, 0.1322359088617821, CNST_LIMB(0x169859ddc5c697a1), CNST_LIMB(0x6a8e5f5ad090fd4b)},
+ /* 190 */ {8, 0.1321029160581950, CNST_LIMB(0x1791c60f6fed0100), CNST_LIMB(0x5b91a2943596fc56)},
+ /* 191 */ {8, 0.1319708865228925, CNST_LIMB(0x18948e8c0e6fba01), CNST_LIMB(0x4d4667b1c468e8f0)},
+ /* 192 */ {8, 0.1318398080287045, CNST_LIMB(0x19a1000000000000), CNST_LIMB(0x3fa39ab547994daf)},
+ /* 193 */ {8, 0.1317096685686114, CNST_LIMB(0x1ab769203dafc601), CNST_LIMB(0x32a0a9b2faee1e2a)},
+ /* 194 */ {8, 0.1315804563506306, CNST_LIMB(0x1bd81ab557f30100), CNST_LIMB(0x26357ceac0e96962)},
+ /* 195 */ {8, 0.1314521597928493, CNST_LIMB(0x1d0367a69fed1ba1), CNST_LIMB(0x1a5a6f65caa5859e)},
+ /* 196 */ {8, 0.1313247675185968, CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86)},
+ /* 197 */ {8, 0.1311982683517524, CNST_LIMB(0x1f7b2a18f29ac3e1), CNST_LIMB(0x4383340615612ca)},
+ /* 198 */ {8, 0.1310726513121843, CNST_LIMB(0x20c850694c2aa100), CNST_LIMB(0xf3c77969ee4be5a2)},
+ /* 199 */ {8, 0.1309479056113158, CNST_LIMB(0x222173cc014980c1), CNST_LIMB(0xe00993cc187c5ec9)},
+ /* 200 */ {8, 0.1308240206478128, CNST_LIMB(0x2386f26fc1000000), CNST_LIMB(0xcd2b297d889bc2b6)},
+ /* 201 */ {8, 0.1307009860033912, CNST_LIMB(0x24f92ce8af296d41), CNST_LIMB(0xbb214d5064862b22)},
+ /* 202 */ {8, 0.1305787914387386, CNST_LIMB(0x2678863cd0ece100), CNST_LIMB(0xa9e1a7ca7ea10e20)},
+ /* 203 */ {8, 0.1304574268895465, CNST_LIMB(0x280563f0a9472d61), CNST_LIMB(0x99626e72b39ea0cf)},
+ /* 204 */ {8, 0.1303368824626505, CNST_LIMB(0x29a02e1406210000), CNST_LIMB(0x899a5ba9c13fafd9)},
+ /* 205 */ {8, 0.1302171484322746, CNST_LIMB(0x2b494f4efe6d2e21), CNST_LIMB(0x7a80a705391e96ff)},
+ /* 206 */ {8, 0.1300982152363760, CNST_LIMB(0x2d0134ef21cbc100), CNST_LIMB(0x6c0cfe23de23042a)},
+ /* 207 */ {8, 0.1299800734730872, CNST_LIMB(0x2ec84ef4da2ef581), CNST_LIMB(0x5e377df359c944dd)},
+ /* 208 */ {8, 0.1298627138972530, CNST_LIMB(0x309f102100000000), CNST_LIMB(0x50f8ac5fc8f53985)},
+ /* 209 */ {8, 0.1297461274170591, CNST_LIMB(0x3285ee02a1420281), CNST_LIMB(0x44497266278e35b7)},
+ /* 210 */ {8, 0.1296303050907487, CNST_LIMB(0x347d6104fc324100), CNST_LIMB(0x382316831f7ee175)},
+ /* 211 */ {8, 0.1295152381234257, CNST_LIMB(0x3685e47dade53d21), CNST_LIMB(0x2c7f377833b8946e)},
+ /* 212 */ {8, 0.1294009178639407, CNST_LIMB(0x389ff6bb15610000), CNST_LIMB(0x2157c761ab4163ef)},
+ /* 213 */ {8, 0.1292873358018581, CNST_LIMB(0x3acc1912ebb57661), CNST_LIMB(0x16a7071803cc49a9)},
+ /* 214 */ {8, 0.1291744835645007, CNST_LIMB(0x3d0acff111946100), CNST_LIMB(0xc6781d80f8224fc)},
+ /* 215 */ {8, 0.1290623529140715, CNST_LIMB(0x3f5ca2e692eaf841), CNST_LIMB(0x294092d370a900b)},
+ /* 216 */ {8, 0.1289509357448472, CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295)},
+ /* 217 */ {8, 0.1288402240804449, CNST_LIMB(0x443bcb714399a5c1), CNST_LIMB(0xe03b98f103fad6d2)},
+ /* 218 */ {8, 0.1287302100711567, CNST_LIMB(0x46ca406c81af2100), CNST_LIMB(0xcee3d32cad2a9049)},
+ /* 219 */ {8, 0.1286208859913518, CNST_LIMB(0x496e106ac22aaae1), CNST_LIMB(0xbe3f9df9277fdada)},
+ /* 220 */ {8, 0.1285122442369443, CNST_LIMB(0x4c27d39fa5410000), CNST_LIMB(0xae46f0d94c05e933)},
+ /* 221 */ {8, 0.1284042773229231, CNST_LIMB(0x4ef825c296e43ca1), CNST_LIMB(0x9ef2280fb437a33d)},
+ /* 222 */ {8, 0.1282969778809442, CNST_LIMB(0x51dfa61f5ad88100), CNST_LIMB(0x9039ff426d3f284b)},
+ /* 223 */ {8, 0.1281903386569819, CNST_LIMB(0x54def7a6d2f16901), CNST_LIMB(0x82178c6d6b51f8f4)},
+ /* 224 */ {8, 0.1280843525090381, CNST_LIMB(0x57f6c10000000000), CNST_LIMB(0x74843b1ee4c1e053)},
+ /* 225 */ {8, 0.1279790124049077, CNST_LIMB(0x5b27ac993df97701), CNST_LIMB(0x6779c7f90dc42f48)},
+ /* 226 */ {8, 0.1278743114199984, CNST_LIMB(0x5e7268b9bbdf8100), CNST_LIMB(0x5af23c74f9ad9fe9)},
+ /* 227 */ {8, 0.1277702427352035, CNST_LIMB(0x61d7a7932ff3d6a1), CNST_LIMB(0x4ee7eae2acdc617e)},
+ /* 228 */ {8, 0.1276667996348261, CNST_LIMB(0x65581f53c8c10000), CNST_LIMB(0x43556aa2ac262a0b)},
+ /* 229 */ {8, 0.1275639755045533, CNST_LIMB(0x68f48a385b8320e1), CNST_LIMB(0x3835949593b8ddd1)},
+ /* 230 */ {8, 0.1274617638294791, CNST_LIMB(0x6cada69ed07c2100), CNST_LIMB(0x2d837fbe78458762)},
+ /* 231 */ {8, 0.1273601581921741, CNST_LIMB(0x70843718cdbf27c1), CNST_LIMB(0x233a7e150a54a555)},
+ /* 232 */ {8, 0.1272591522708010, CNST_LIMB(0x7479027ea1000000), CNST_LIMB(0x19561984a50ff8fe)},
+ /* 233 */ {8, 0.1271587398372755, CNST_LIMB(0x788cd40268f39641), CNST_LIMB(0xfd211159fe3490f)},
+ /* 234 */ {8, 0.1270589147554692, CNST_LIMB(0x7cc07b437ecf6100), CNST_LIMB(0x6aa563e655033e3)},
+ /* 235 */ {8, 0.1269596709794558, CNST_LIMB(0x8114cc6220762061), CNST_LIMB(0xfbb614b3f2d3b14c)},
+ /* 236 */ {8, 0.1268610025517973, CNST_LIMB(0x858aa0135be10000), CNST_LIMB(0xeac0f8837fb05773)},
+ /* 237 */ {8, 0.1267629036018709, CNST_LIMB(0x8a22d3b53c54c321), CNST_LIMB(0xda6e4c10e8615ca5)},
+ /* 238 */ {8, 0.1266653683442337, CNST_LIMB(0x8ede496339f34100), CNST_LIMB(0xcab755a8d01fa67f)},
+ /* 239 */ {8, 0.1265683910770258, CNST_LIMB(0x93bde80aec3a1481), CNST_LIMB(0xbb95a9ae71aa3e0c)},
+ /* 240 */ {8, 0.1264719661804097, CNST_LIMB(0x98c29b8100000000), CNST_LIMB(0xad0326c296b4f529)},
+ /* 241 */ {8, 0.1263760881150453, CNST_LIMB(0x9ded549671832381), CNST_LIMB(0x9ef9f21eed31b7c1)},
+ /* 242 */ {8, 0.1262807514205999, CNST_LIMB(0xa33f092e0b1ac100), CNST_LIMB(0x91747422be14b0b2)},
+ /* 243 */ {8, 0.1261859507142915, CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d)},
+ /* 244 */ {8, 0.1260916806894653, CNST_LIMB(0xae5b564ac3a10000), CNST_LIMB(0x77df79e9a96c06f6)},
+ /* 245 */ {8, 0.1259979361142023, CNST_LIMB(0xb427f4b3be74c361), CNST_LIMB(0x6bc6019636c7d0c2)},
+ /* 246 */ {8, 0.1259047118299582, CNST_LIMB(0xba1f9a938041e100), CNST_LIMB(0x601c4205aebd9e47)},
+ /* 247 */ {8, 0.1258120027502338, CNST_LIMB(0xc0435871d1110f41), CNST_LIMB(0x54ddc59756f05016)},
+ /* 248 */ {8, 0.1257198038592741, CNST_LIMB(0xc694446f01000000), CNST_LIMB(0x4a0648979c838c18)},
+ /* 249 */ {8, 0.1256281102107963, CNST_LIMB(0xcd137a5b57ac3ec1), CNST_LIMB(0x3f91b6e0bb3a053d)},
+ /* 250 */ {8, 0.1255369169267456, CNST_LIMB(0xd3c21bcecceda100), CNST_LIMB(0x357c299a88ea76a5)},
+ /* 251 */ {8, 0.1254462191960791, CNST_LIMB(0xdaa150410b788de1), CNST_LIMB(0x2bc1e517aecc56e3)},
+ /* 252 */ {8, 0.1253560122735751, CNST_LIMB(0xe1b24521be010000), CNST_LIMB(0x225f56ceb3da9f5d)},
+ /* 253 */ {8, 0.1252662914786691, CNST_LIMB(0xe8f62df12777c1a1), CNST_LIMB(0x1951136d53ad63ac)},
+ /* 254 */ {8, 0.1251770521943144, CNST_LIMB(0xf06e445906fc0100), CNST_LIMB(0x1093d504b3cd7d93)},
+ /* 255 */ {8, 0.1250882898658681, CNST_LIMB(0xf81bc845c81bf801), CNST_LIMB(0x824794d1ec1814f)},
+};
+#endif
diff --git a/rts/gmp/mpn/ns32k/add_n.s b/rts/gmp/mpn/ns32k/add_n.s
new file mode 100644
index 0000000000..bd063d07d9
--- /dev/null
+++ b/rts/gmp/mpn/ns32k/add_n.s
@@ -0,0 +1,46 @@
+# ns32000 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+ .align 1
+.globl ___gmpn_add_n
+___gmpn_add_n:
+ save [r3,r4,r5]
+ negd 28(sp),r3
+ movd r3,r0
+ lshd 2,r0
+ movd 24(sp),r4
+ subd r0,r4 # r4 -> to end of S2
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r2
+ subd r0,r2 # r2 -> to end of RES
+ subd r0,r0 # cy = 0
+
+Loop: movd r5[r3:d],r0
+ addcd r4[r3:d],r0
+ movd r0,r2[r3:d]
+ acbd 1,r3,Loop
+
+ scsd r0 # r0 = cy.
+ restore [r5,r4,r3]
+ ret 0
diff --git a/rts/gmp/mpn/ns32k/addmul_1.s b/rts/gmp/mpn/ns32k/addmul_1.s
new file mode 100644
index 0000000000..df0dcdd4af
--- /dev/null
+++ b/rts/gmp/mpn/ns32k/addmul_1.s
@@ -0,0 +1,48 @@
+# ns32000 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+ .align 1
+.globl ___gmpn_addmul_1
+___gmpn_addmul_1:
+ save [r3,r4,r5,r6,r7]
+ negd 24(sp),r4
+ movd r4,r0
+ lshd 2,r0
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r6
+ subd r0,r6 # r6 -> to end of RES
+ subd r0,r0 # r0 = 0, cy = 0
+ movd 28(sp),r7 # r7 = s2_limb
+
+Loop: movd r5[r4:d],r2
+ meid r7,r2 # r2 = low_prod, r3 = high_prod
+ addcd r0,r2 # r2 = low_prod + cy_limb
+ movd r3,r0 # r0 = new cy_limb
+ addcd 0,r0
+ addd r2,r6[r4:d]
+ acbd 1,r4,Loop
+
+ addcd 0,r0
+ restore [r7,r6,r5,r4,r3]
+ ret 0
diff --git a/rts/gmp/mpn/ns32k/mul_1.s b/rts/gmp/mpn/ns32k/mul_1.s
new file mode 100644
index 0000000000..0a77efba29
--- /dev/null
+++ b/rts/gmp/mpn/ns32k/mul_1.s
@@ -0,0 +1,47 @@
+# ns32000 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+ .align 1
+.globl ___gmpn_mul_1
+___gmpn_mul_1:
+ save [r3,r4,r5,r6,r7]
+ negd 24(sp),r4
+ movd r4,r0
+ lshd 2,r0
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r6
+ subd r0,r6 # r6 -> to end of RES
+ subd r0,r0 # r0 = 0, cy = 0
+ movd 28(sp),r7 # r7 = s2_limb
+
+Loop: movd r5[r4:d],r2
+ meid r7,r2 # r2 = low_prod, r3 = high_prod
+ addcd r0,r2 # r2 = low_prod + cy_limb
+ movd r3,r0 # r0 = new cy_limb
+ movd r2,r6[r4:d]
+ acbd 1,r4,Loop
+
+ addcd 0,r0
+ restore [r7,r6,r5,r4,r3]
+ ret 0
diff --git a/rts/gmp/mpn/ns32k/sub_n.s b/rts/gmp/mpn/ns32k/sub_n.s
new file mode 100644
index 0000000000..cd89f4fd3f
--- /dev/null
+++ b/rts/gmp/mpn/ns32k/sub_n.s
@@ -0,0 +1,46 @@
+# ns32000 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+# store difference in a third limb vector.
+
+# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+ .align 1
+.globl ___gmpn_sub_n
+___gmpn_sub_n:
+ save [r3,r4,r5]
+ negd 28(sp),r3
+ movd r3,r0
+ lshd 2,r0
+ movd 24(sp),r4
+ subd r0,r4 # r4 -> to end of S2
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r2
+ subd r0,r2 # r2 -> to end of RES
+ subd r0,r0 # cy = 0
+
+Loop: movd r5[r3:d],r0
+ subcd r4[r3:d],r0
+ movd r0,r2[r3:d]
+ acbd 1,r3,Loop
+
+ scsd r0 # r0 = cy.
+ restore [r5,r4,r3]
+ ret 0
diff --git a/rts/gmp/mpn/ns32k/submul_1.s b/rts/gmp/mpn/ns32k/submul_1.s
new file mode 100644
index 0000000000..f811aedcf1
--- /dev/null
+++ b/rts/gmp/mpn/ns32k/submul_1.s
@@ -0,0 +1,48 @@
+# ns32000 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1992, 1994, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+ .align 1
+.globl ___gmpn_submul_1
+___gmpn_submul_1:
+ save [r3,r4,r5,r6,r7]
+ negd 24(sp),r4
+ movd r4,r0
+ lshd 2,r0
+ movd 20(sp),r5
+ subd r0,r5 # r5 -> to end of S1
+ movd 16(sp),r6
+ subd r0,r6 # r6 -> to end of RES
+ subd r0,r0 # r0 = 0, cy = 0
+ movd 28(sp),r7 # r7 = s2_limb
+
+Loop: movd r5[r4:d],r2
+ meid r7,r2 # r2 = low_prod, r3 = high_prod
+ addcd r0,r2 # r2 = low_prod + cy_limb
+ movd r3,r0 # r0 = new cy_limb
+ addcd 0,r0
+ subd r2,r6[r4:d]
+ acbd 1,r4,Loop
+
+ addcd 0,r0
+ restore [r7,r6,r5,r4,r3]
+ ret 0
diff --git a/rts/gmp/mpn/pa64/README b/rts/gmp/mpn/pa64/README
new file mode 100644
index 0000000000..8d2976dabc
--- /dev/null
+++ b/rts/gmp/mpn/pa64/README
@@ -0,0 +1,38 @@
+This directory contains mpn functions for 64-bit PA-RISC 2.0.
+
+RELEVANT OPTIMIZATION ISSUES
+
+The PA8000 has a multi-issue pipeline with large buffers for instructions
+awaiting pending results. Therefore, no latency scheduling is necessary
+(and might actually be harmful).
+
+Two 64-bit loads can be completed per cycle. One 64-bit store can be
+completed per cycle. A store cannot complete in the same cycle as a load.
+
+STATUS
+
+* mpn_lshift, mpn_rshift, mpn_add_n, mpn_sub_n are all well-tuned and run at
+ the peak cache bandwidth; 1.5 cycles/limb for shifting and 2.0 cycles/limb
+ for add/subtract.
+
+* The multiplication functions run at 11 cycles/limb. The cache bandwidth
+ allows 7.5 cycles/limb. Perhaps it would be possible, using unrolling or
+ better scheduling, to get closer to the cache bandwidth limit.
+
+* xaddmul_1.S contains a quicker method for forming the 128 bit product. It
+ uses some fewer operations, and keep the carry flag live across the loop
+ boundary. But it seems hard to make it run more than 1/4 cycle faster
+ than the old code. Perhaps we really ought to unroll this loop be 2x?
+ 2x should suffice since register latency schedling is never needed,
+ but the unrolling would hide the store-load latency. Here is a sketch:
+
+ 1. A multiply and store 64-bit products
+ 2. B sum 64-bit products 128-bit product
+ 3. B load 64-bit products to integer registers
+ 4. B multiply and store 64-bit products
+ 5. A sum 64-bit products 128-bit product
+ 6. A load 64-bit products to integer registers
+ 7. goto 1
+
+ In practice, adjacent groups (1 and 2, 2 and 3, etc) will be interleaved
+ for better instruction mix.
diff --git a/rts/gmp/mpn/pa64/add_n.s b/rts/gmp/mpn/pa64/add_n.s
new file mode 100644
index 0000000000..22ff19c184
--- /dev/null
+++ b/rts/gmp/mpn/pa64/add_n.s
@@ -0,0 +1,90 @@
+; HP-PA 2.0 __gmpn_add_n -- Add two limb vectors of the same length > 0 and
+; store sum in a third limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+ .level 2.0n
+ .code
+ .export __gmpn_add_n,entry
+__gmpn_add_n
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ sub %r0,%r23,%r22
+ depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7)
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ sub %r24,%r22,%r24 ; offset s2_ptr
+ sub %r26,%r22,%r26 ; offset res_ptr
+ blr %r28,%r0 ; branch into loop
+ add %r0,%r0,%r0 ; reset carry
+
+L$loop ldd 0(%r25),%r20
+ ldd 0(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,0(%r26)
+L$7 ldd 8(%r25),%r21
+ ldd 8(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,8(%r26)
+L$6 ldd 16(%r25),%r20
+ ldd 16(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,16(%r26)
+L$5 ldd 24(%r25),%r21
+ ldd 24(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,24(%r26)
+L$4 ldd 32(%r25),%r20
+ ldd 32(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,32(%r26)
+L$3 ldd 40(%r25),%r21
+ ldd 40(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,40(%r26)
+L$2 ldd 48(%r25),%r20
+ ldd 48(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,48(%r26)
+L$1 ldd 56(%r25),%r21
+ ldo 64(%r25),%r25
+ ldd 56(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,56(%r26)
+ ldo 64(%r24),%r24
+ addib,> -8,%r23,L$loop
+ ldo 64(%r26),%r26
+
+ add,dc %r0,%r0,%r29
+ bve (%r2)
+ .exit
+ ldi 0,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64/addmul_1.S b/rts/gmp/mpn/pa64/addmul_1.S
new file mode 100644
index 0000000000..b1885b432c
--- /dev/null
+++ b/rts/gmp/mpn/pa64/addmul_1.S
@@ -0,0 +1,167 @@
+; HP-PA 2.0 64-bit __gmpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb -56(%r30)
+
+; This runs at 11 cycles/limb on a PA8000. It might be possible to make
+; it faster, but the PA8000 pipeline is not publically documented and it
+; is very complex to reverse engineer
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+ .level 2.0n
+ .code
+ .export __gmpn_addmul_1,entry
+__gmpn_addmul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ fldd -56(%r30),%fr5 ; s2limb passed on stack
+ ldo 128(%r30),%r30
+ add %r0,%r0,cylimb ; clear cy and cylimb
+
+ std %r3,-96(%r30)
+ std %r4,-88(%r30)
+ std %r5,-80(%r30)
+ std %r6,-72(%r30)
+ depdi,z 1,31,1,%r5
+
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ addib,= -1,%r24,L$end1
+ nop
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ addib,= -1,%r24,L$end2
+ nop
+L$loop
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,rlimb,rlimb
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ add t4,rlimb,t3
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ std t3,0(rptr)
+ addib,<> -1,%r24,L$loop
+ ldo 8(rptr),rptr
+L$end2
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m0
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,rlimb,rlimb
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ add t4,rlimb,t3
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+L$end1
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m0
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ add cylimb,rlimb,rlimb
+ add,dc t2,hi,cylimb
+ add t4,rlimb,t3
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+
+ ldd -96(%r30),%r3
+ ldd -88(%r30),%r4
+ ldd -80(%r30),%r5
+ ldd -72(%r30),%r6
+
+ extrd,u cylimb,31,32,%r28
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
diff --git a/rts/gmp/mpn/pa64/gmp-mparam.h b/rts/gmp/mpn/pa64/gmp-mparam.h
new file mode 100644
index 0000000000..847735b987
--- /dev/null
+++ b/rts/gmp/mpn/pa64/gmp-mparam.h
@@ -0,0 +1,65 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values were measured in a PA8000 using the system compiler version
+ A.10.32.30. Presumably the PA8200 and PA8500 have the same timing
+ characteristic, but GCC might give somewhat different results. */
+/* Generated by tuneup.c, 2000-07-25. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 16
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 105
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 40
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 116
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 72
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 94
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 50
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 46
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 1
+#endif
diff --git a/rts/gmp/mpn/pa64/lshift.s b/rts/gmp/mpn/pa64/lshift.s
new file mode 100644
index 0000000000..994bc1c4d6
--- /dev/null
+++ b/rts/gmp/mpn/pa64/lshift.s
@@ -0,0 +1,103 @@
+; HP-PA 2.0 __gmpn_lshift --
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; size gr24
+; cnt gr23
+
+; This runs at 1.5 cycles/limb on PA8000.
+
+ .level 2.0n
+ .code
+ .export __gmpn_lshift,entry
+__gmpn_lshift
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ shladd %r24,3,%r25,%r25
+ shladd %r24,3,%r26,%r26
+ subi 64,%r23,%r23
+ mtsar %r23
+ ldd -8(%r25),%r21
+ addib,= -1,%r24,L$end
+ shrpd %r0,%r21,%sar,%r29 ; compute carry out limb
+ depw,z %r24,31,3,%r28 ; r28 = (size & 7)
+ sub %r0,%r24,%r22
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7)
+ add %r25,%r22,%r25 ; offset s1_ptr
+ blr %r28,%r0 ; branch into jump table
+ add %r26,%r22,%r26 ; offset res_ptr
+ b L$0
+ nop
+ b L$1
+ copy %r21,%r20
+ b L$2
+ nop
+ b L$3
+ copy %r21,%r20
+ b L$4
+ nop
+ b L$5
+ copy %r21,%r20
+ b L$6
+ nop
+ b L$7
+ copy %r21,%r20
+
+L$loop
+L$0 ldd -16(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-8(%r26)
+L$7 ldd -24(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-16(%r26)
+L$6 ldd -32(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-24(%r26)
+L$5 ldd -40(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-32(%r26)
+L$4 ldd -48(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-40(%r26)
+L$3 ldd -56(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-48(%r26)
+L$2 ldd -64(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-56(%r26)
+L$1 ldd -72(%r25),%r21
+ ldo -64(%r25),%r25
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-64(%r26)
+ addib,> -8,%r24,L$loop
+ ldo -64(%r26),%r26
+
+L$end shrpd %r21,%r0,%sar,%r21
+ std %r21,-8(%r26)
+ bve (%r2)
+ .exit
+ extrd,u %r29,31,32,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64/mul_1.S b/rts/gmp/mpn/pa64/mul_1.S
new file mode 100644
index 0000000000..ab310c1264
--- /dev/null
+++ b/rts/gmp/mpn/pa64/mul_1.S
@@ -0,0 +1,158 @@
+; HP-PA 2.0 64-bit __gmpn_mul_1 -- Multiply a limb vector with a limb and
+; store the result in a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb -56(%r30)
+
+; This runs at 11 cycles/limb on a PA8000. It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+ .level 2.0n
+ .code
+ .export __gmpn_mul_1,entry
+__gmpn_mul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ fldd -56(%r30),%fr5 ; s2limb passed on stack
+ ldo 128(%r30),%r30
+ add %r0,%r0,cylimb ; clear cy and cylimb
+
+ std %r3,-96(%r30)
+ std %r4,-88(%r30)
+ std %r5,-80(%r30)
+ std %r6,-72(%r30)
+ depdi,z 1,31,1,%r5
+
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ addib,= -1,%r24,L$end1
+ nop
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ addib,= -1,%r24,L$end2
+ nop
+L$loop
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t3
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ std t3,0(rptr)
+ addib,<> -1,%r24,L$loop
+ ldo 8(rptr),rptr
+L$end2
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t3
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+L$end1
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ add cylimb,t4,t3
+ add,dc t2,hi,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+
+ ldd -96(%r30),%r3
+ ldd -88(%r30),%r4
+ ldd -80(%r30),%r5
+ ldd -72(%r30),%r6
+
+ extrd,u cylimb,31,32,%r28
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
diff --git a/rts/gmp/mpn/pa64/rshift.s b/rts/gmp/mpn/pa64/rshift.s
new file mode 100644
index 0000000000..f0730e2a91
--- /dev/null
+++ b/rts/gmp/mpn/pa64/rshift.s
@@ -0,0 +1,100 @@
+; HP-PA 2.0 __gmpn_rshift --
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; size gr24
+; cnt gr23
+
+; This runs at 1.5 cycles/limb on PA8000.
+
+ .level 2.0n
+ .code
+ .export __gmpn_rshift,entry
+__gmpn_rshift
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ mtsar %r23
+ ldd 0(%r25),%r21
+ addib,= -1,%r24,L$end
+ shrpd %r21,%r0,%sar,%r29 ; compute carry out limb
+ depw,z %r24,31,3,%r28 ; r28 = (size & 7)
+ sub %r0,%r24,%r22
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ blr %r28,%r0 ; branch into jump table
+ sub %r26,%r22,%r26 ; offset res_ptr
+ b L$0
+ nop
+ b L$1
+ copy %r21,%r20
+ b L$2
+ nop
+ b L$3
+ copy %r21,%r20
+ b L$4
+ nop
+ b L$5
+ copy %r21,%r20
+ b L$6
+ nop
+ b L$7
+ copy %r21,%r20
+
+L$loop
+L$0 ldd 8(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,0(%r26)
+L$7 ldd 16(%r25),%r21
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,8(%r26)
+L$6 ldd 24(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,16(%r26)
+L$5 ldd 32(%r25),%r21
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,24(%r26)
+L$4 ldd 40(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,32(%r26)
+L$3 ldd 48(%r25),%r21
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,40(%r26)
+L$2 ldd 56(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,48(%r26)
+L$1 ldd 64(%r25),%r21
+ ldo 64(%r25),%r25
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,56(%r26)
+ addib,> -8,%r24,L$loop
+ ldo 64(%r26),%r26
+
+L$end shrpd %r0,%r21,%sar,%r21
+ std %r21,0(%r26)
+ bve (%r2)
+ .exit
+ extrd,u %r29,31,32,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64/sub_n.s b/rts/gmp/mpn/pa64/sub_n.s
new file mode 100644
index 0000000000..dda1f54b34
--- /dev/null
+++ b/rts/gmp/mpn/pa64/sub_n.s
@@ -0,0 +1,90 @@
+; HP-PA 2.0 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0
+; and store difference in a third limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+ .level 2.0n
+ .code
+ .export __gmpn_sub_n,entry
+__gmpn_sub_n
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ sub %r0,%r23,%r22
+ depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7)
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ sub %r24,%r22,%r24 ; offset s2_ptr
+ blr %r28,%r0 ; branch into loop
+ sub %r26,%r22,%r26 ; offset res_ptr and set carry
+
+L$loop ldd 0(%r25),%r20
+ ldd 0(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,0(%r26)
+L$7 ldd 8(%r25),%r21
+ ldd 8(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,8(%r26)
+L$6 ldd 16(%r25),%r20
+ ldd 16(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,16(%r26)
+L$5 ldd 24(%r25),%r21
+ ldd 24(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,24(%r26)
+L$4 ldd 32(%r25),%r20
+ ldd 32(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,32(%r26)
+L$3 ldd 40(%r25),%r21
+ ldd 40(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,40(%r26)
+L$2 ldd 48(%r25),%r20
+ ldd 48(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,48(%r26)
+L$1 ldd 56(%r25),%r21
+ ldo 64(%r25),%r25
+ ldd 56(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,56(%r26)
+ ldo 64(%r24),%r24
+ addib,> -8,%r23,L$loop
+ ldo 64(%r26),%r26
+
+ add,dc %r0,%r0,%r29
+ subi 1,%r29,%r29
+ bve (%r2)
+ .exit
+ ldi 0,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64/submul_1.S b/rts/gmp/mpn/pa64/submul_1.S
new file mode 100644
index 0000000000..27666b99df
--- /dev/null
+++ b/rts/gmp/mpn/pa64/submul_1.S
@@ -0,0 +1,170 @@
+; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb -56(%r30)
+
+; This runs at 11 cycles/limb on a PA8000. It might be possible to make
+; it faster, but the PA8000 pipeline is not publically documented and it
+; is very complex to reverse engineer
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+ .level 2.0n
+ .code
+ .export __gmpn_submul_1,entry
+__gmpn_submul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ fldd -56(%r30),%fr5 ; s2limb passed on stack
+ ldo 128(%r30),%r30
+ add %r0,%r0,cylimb ; clear cy and cylimb
+
+ std %r3,-96(%r30)
+ std %r4,-88(%r30)
+ std %r5,-80(%r30)
+ std %r6,-72(%r30)
+ depdi,z 1,31,1,%r5
+
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ addib,= -1,%r24,L$end1
+ nop
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ addib,= -1,%r24,L$end2
+ nop
+L$loop
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t4
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ sub rlimb,t4,t3
+ add t4,t3,%r0
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ std t3,0(rptr)
+ addib,<> -1,%r24,L$loop
+ ldo 8(rptr),rptr
+L$end2
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t4
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ sub rlimb,t4,t3
+ add t4,t3,%r0
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+L$end1
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ add cylimb,t4,t4
+ add,dc t2,hi,cylimb
+ sub rlimb,t4,t3
+ add t4,t3,%r0
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+
+ ldd -96(%r30),%r3
+ ldd -88(%r30),%r4
+ ldd -80(%r30),%r5
+ ldd -72(%r30),%r6
+
+ extrd,u cylimb,31,32,%r28
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
diff --git a/rts/gmp/mpn/pa64/udiv_qrnnd.c b/rts/gmp/mpn/pa64/udiv_qrnnd.c
new file mode 100644
index 0000000000..1c9fe084db
--- /dev/null
+++ b/rts/gmp/mpn/pa64/udiv_qrnnd.c
@@ -0,0 +1,111 @@
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#define TWO64 18446744073709551616.0
+
+mp_limb_t
+#if __STDC__
+__MPN(udiv_qrnnd) (mp_limb_t n1, mp_limb_t n0, mp_limb_t d, mp_limb_t *r)
+#else
+__MPN(udiv_qrnnd) (n1, n0, d, r)
+ mp_limb_t n1;
+ mp_limb_t n0;
+ mp_limb_t d;
+ mp_limb_t *r;
+#endif
+{
+ mp_limb_t q1, q2, q;
+ mp_limb_t p1, p0;
+ double di, dq;
+
+ di = 1.0 / d;
+
+ /* Generate upper 53 bits of quotient. Be careful here; the `double'
+ quotient may be rounded to 2^64 which we cannot safely convert back
+ to a 64-bit integer. */
+ dq = (TWO64 * (double) n1 + (double) n0) * di;
+ if (dq >= TWO64)
+ q1 = 0xfffffffffffff800LL;
+ else
+ q1 = (mp_limb_t) dq;
+
+ /* Multiply back in order to compare the product to the dividend. */
+ umul_ppmm (p1, p0, q1, d);
+
+ /* Was the 53-bit quotient greater than our sought quotient? Test the
+ sign of the partial remainder to find out. */
+ if (n1 < p1 || (n1 == p1 && n0 < p0))
+ {
+ /* 53-bit quotient too large. Partial remainder is negative.
+ Compute the absolute value of the remainder in n1,,n0. */
+ n1 = p1 - (n1 + (p0 < n0));
+ n0 = p0 - n0;
+
+ /* Now use the partial remainder as new dividend to compute more bits of
+ quotient. This is an adjustment for the one we got previously. */
+ q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di);
+ umul_ppmm (p1, p0, q2, d);
+
+ q = q1 - q2;
+ if (n1 < p1 || (n1 == p1 && n0 <= p0))
+ {
+ n0 = p0 - n0;
+ }
+ else
+ {
+ n0 = p0 - n0;
+ n0 += d;
+ q--;
+ }
+ }
+ else
+ {
+ n1 = n1 - (p1 + (n0 < p0));
+ n0 = n0 - p0;
+
+ q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di);
+ umul_ppmm (p1, p0, q2, d);
+
+ q = q1 + q2;
+ if (n1 < p1 || (n1 == p1 && n0 < p0))
+ {
+ n0 = n0 - p0;
+ n0 += d;
+ q--;
+ }
+ else
+ {
+ n0 = n0 - p0;
+ if (n0 >= d)
+ {
+ n0 -= d;
+ q++;
+ }
+ }
+ }
+
+ *r = n0;
+ return q;
+}
diff --git a/rts/gmp/mpn/pa64/umul_ppmm.S b/rts/gmp/mpn/pa64/umul_ppmm.S
new file mode 100644
index 0000000000..ceff2d752f
--- /dev/null
+++ b/rts/gmp/mpn/pa64/umul_ppmm.S
@@ -0,0 +1,74 @@
+; Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+#define p0 %r28
+#define p1 %r29
+#define t32 %r19
+#define t0 %r20
+#define t1 %r21
+#define x %r22
+#define m0 %r23
+#define m1 %r24
+ .level 2.0n
+ .code
+ .export __gmpn_umul_ppmm,entry
+__gmpn_umul_ppmm
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ ldo 128(%r30),%r30
+ depd %r25,31,32,%r26
+ std %r26,-64(%r30)
+ depd %r23,31,32,%r24
+ std %r24,-56(%r30)
+
+ ldw -180(%r30),%r31
+
+ fldd -64(%r30),%fr4
+ fldd -56(%r30),%fr5
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+
+ depdi,z 1,31,1,t32 ; t32 = 2^32
+
+ ldd -128(%r30),p0 ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),p1 ; hi = high 64 bit of product
+
+ add,l,*nuv m0,m1,x ; x = m1+m0
+ add,l t32,p1,p1 ; propagate carry to mid of p1
+ depd,z x,31,32,t0 ; lo32(m1+m0)
+ add t0,p0,p0
+ extrd,u x,31,32,t1 ; hi32(m1+m0)
+ add,dc t1,p1,p1
+
+ std p0,0(%r31) ; store low half of product
+ extrd,u p1,31,32,%r28 ; return high half of product
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
diff --git a/rts/gmp/mpn/pa64w/README b/rts/gmp/mpn/pa64w/README
new file mode 100644
index 0000000000..cf590a7b98
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/README
@@ -0,0 +1,2 @@
+This directory contains mpn functions for 64-bit PA-RISC 2.0
+using 64-bit pointers (2.0W).
diff --git a/rts/gmp/mpn/pa64w/add_n.s b/rts/gmp/mpn/pa64w/add_n.s
new file mode 100644
index 0000000000..1bb9e8fbc7
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/add_n.s
@@ -0,0 +1,90 @@
+; HP-PA 2.0 __gmpn_add_n -- Add two limb vectors of the same length > 0 and
+; store sum in a third limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+ .level 2.0w
+ .code
+ .export __gmpn_add_n,entry
+__gmpn_add_n
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ sub %r0,%r23,%r22
+ depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7)
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ sub %r24,%r22,%r24 ; offset s2_ptr
+ sub %r26,%r22,%r26 ; offset res_ptr
+ blr %r28,%r0 ; branch into loop
+ add %r0,%r0,%r0 ; reset carry
+
+L$loop ldd 0(%r25),%r20
+ ldd 0(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,0(%r26)
+L$7 ldd 8(%r25),%r21
+ ldd 8(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,8(%r26)
+L$6 ldd 16(%r25),%r20
+ ldd 16(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,16(%r26)
+L$5 ldd 24(%r25),%r21
+ ldd 24(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,24(%r26)
+L$4 ldd 32(%r25),%r20
+ ldd 32(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,32(%r26)
+L$3 ldd 40(%r25),%r21
+ ldd 40(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,40(%r26)
+L$2 ldd 48(%r25),%r20
+ ldd 48(%r24),%r31
+ add,dc %r20,%r31,%r20
+ std %r20,48(%r26)
+L$1 ldd 56(%r25),%r21
+ ldo 64(%r25),%r25
+ ldd 56(%r24),%r19
+ add,dc %r21,%r19,%r21
+ std %r21,56(%r26)
+ ldo 64(%r24),%r24
+ addib,> -8,%r23,L$loop
+ ldo 64(%r26),%r26
+
+ add,dc %r0,%r0,%r29
+ bve (%r2)
+ .exit
+ copy %r29,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64w/addmul_1.S b/rts/gmp/mpn/pa64w/addmul_1.S
new file mode 100644
index 0000000000..4799f90fc5
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/addmul_1.S
@@ -0,0 +1,168 @@
+; HP-PA 2.0 64-bit __gmpn_addmul_1 -- Multiply a limb vector with a limb and
+; add the result to a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb %r23
+
+; This runs at 11 cycles/limb on a PA8000. It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+ .level 2.0w
+ .code
+ .export __gmpn_addmul_1,entry
+__gmpn_addmul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ std s2limb,-56(%r30)
+ fldd -56(%r30),%fr5
+ ldo 128(%r30),%r30
+ add %r0,%r0,cylimb ; clear cy and cylimb
+
+ std %r3,-96(%r30)
+ std %r4,-88(%r30)
+ std %r5,-80(%r30)
+ std %r6,-72(%r30)
+ depdi,z 1,31,1,%r5
+
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ addib,= -1,%r24,L$end1
+ nop
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ addib,= -1,%r24,L$end2
+ nop
+L$loop
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,rlimb,rlimb
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ add t4,rlimb,t3
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ std t3,0(rptr)
+ addib,<> -1,%r24,L$loop
+ ldo 8(rptr),rptr
+L$end2
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,rlimb,rlimb
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ add t4,rlimb,t3
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+L$end1
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ add cylimb,rlimb,rlimb
+ add,dc t2,hi,cylimb
+ add t4,rlimb,t3
+ add,dc %r0,cylimb,cylimb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+
+ ldd -96(%r30),%r3
+ ldd -88(%r30),%r4
+ ldd -80(%r30),%r5
+ ldd -72(%r30),%r6
+
+ copy cylimb,%r28
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30
+ .procend
diff --git a/rts/gmp/mpn/pa64w/gmp-mparam.h b/rts/gmp/mpn/pa64w/gmp-mparam.h
new file mode 100644
index 0000000000..ee5a0a3ab7
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/gmp-mparam.h
@@ -0,0 +1,65 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values were measured on a PA8500 using the system compiler version
+ A.11.01.02. Presumably the PA8000 and PA8200 have the same timing
+ characteristics, but GCC might give somewhat different results. */
+/* Generated by tuneup.c, 2000-07-25. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 18
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 105
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 46
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 83
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 58
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 134
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 56
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 26
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 1
+#endif
diff --git a/rts/gmp/mpn/pa64w/lshift.s b/rts/gmp/mpn/pa64w/lshift.s
new file mode 100644
index 0000000000..84f925a105
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/lshift.s
@@ -0,0 +1,103 @@
+; HP-PA 2.0 __gmpn_lshift --
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; size gr24
+; cnt gr23
+
+; This runs at 1.5 cycles/limb on PA8000.
+
+ .level 2.0w
+ .code
+ .export __gmpn_lshift,entry
+__gmpn_lshift
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ shladd %r24,3,%r25,%r25
+ shladd %r24,3,%r26,%r26
+ subi 64,%r23,%r23
+ mtsar %r23
+ ldd -8(%r25),%r21
+ addib,= -1,%r24,L$end
+ shrpd %r0,%r21,%sar,%r29 ; compute carry out limb
+ depw,z %r24,31,3,%r28 ; r28 = (size & 7)
+ sub %r0,%r24,%r22
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7)
+ add %r25,%r22,%r25 ; offset s1_ptr
+ blr %r28,%r0 ; branch into jump table
+ add %r26,%r22,%r26 ; offset res_ptr
+ b L$0
+ nop
+ b L$1
+ copy %r21,%r20
+ b L$2
+ nop
+ b L$3
+ copy %r21,%r20
+ b L$4
+ nop
+ b L$5
+ copy %r21,%r20
+ b L$6
+ nop
+ b L$7
+ copy %r21,%r20
+
+L$loop
+L$0 ldd -16(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-8(%r26)
+L$7 ldd -24(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-16(%r26)
+L$6 ldd -32(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-24(%r26)
+L$5 ldd -40(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-32(%r26)
+L$4 ldd -48(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-40(%r26)
+L$3 ldd -56(%r25),%r21
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-48(%r26)
+L$2 ldd -64(%r25),%r20
+ shrpd %r21,%r20,%sar,%r21
+ std %r21,-56(%r26)
+L$1 ldd -72(%r25),%r21
+ ldo -64(%r25),%r25
+ shrpd %r20,%r21,%sar,%r20
+ std %r20,-64(%r26)
+ addib,> -8,%r24,L$loop
+ ldo -64(%r26),%r26
+
+L$end shrpd %r21,%r0,%sar,%r21
+ std %r21,-8(%r26)
+ bve (%r2)
+ .exit
+ copy %r29,%r28
+ .procend
diff --git a/rts/gmp/mpn/pa64w/mul_1.S b/rts/gmp/mpn/pa64w/mul_1.S
new file mode 100644
index 0000000000..48f13fbd1b
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/mul_1.S
@@ -0,0 +1,159 @@
+; HP-PA 2.0 64-bit __gmpn_mul_1 -- Multiply a limb vector with a limb and
+; store the result in a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb %r23
+
+; This runs at 11 cycles/limb on a PA8000. It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+ .level 2.0w ; PA-RISC 2.0, wide (64-bit) mode
+ .code
+ .export __gmpn_mul_1,entry
+__gmpn_mul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ std s2limb,-56(%r30) ; move s2limb into the FPU via memory
+ fldd -56(%r30),%fr5
+ ldo 128(%r30),%r30 ; allocate stack frame
+ add %r0,%r0,cylimb ; clear cy and cylimb
+
+ std %r3,-96(%r30) ; save callee-saved registers
+ std %r4,-88(%r30)
+ std %r5,-80(%r30)
+ std %r6,-72(%r30)
+ depdi,z 1,31,1,%r5 ; r5 = 2^32, used to propagate mid carry
+
+ fldd 0(sptr),%fr4 ; load first source limb
+ ldo 8(sptr),sptr
+
+ xmpyu %fr5R,%fr4R,%fr6 ; four 32x32->64 partial products
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ addib,= -1,%r24,L$end1 ; size == 1?
+ nop
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ addib,= -1,%r24,L$end2 ; size == 2?
+ nop
+L$loop ; software-pipelined: multiply limb i+1 while summing limb i
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t3 ; t3 = result limb; sets cy
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb ; next carry limb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ std t3,0(rptr)
+ addib,<> -1,%r24,L$loop
+ ldo 8(rptr),rptr ; advance res_ptr (delay slot)
+L$end2 ; wind down: sum for the 2:nd last limb
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t3 ; t3 = result limb; sets cy
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb ; next carry limb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+L$end1 ; wind down: sum for the last limb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ add cylimb,t4,t3 ; t3 = result limb; sets cy
+ add,dc t2,hi,cylimb ; final carry limb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+
+ ldd -96(%r30),%r3 ; restore callee-saved registers
+ ldd -88(%r30),%r4
+ ldd -80(%r30),%r5
+ ldd -72(%r30),%r6
+
+ copy cylimb,%r28 ; return value = carry limb
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30 ; deallocate stack frame (delay slot)
+ .procend
diff --git a/rts/gmp/mpn/pa64w/rshift.s b/rts/gmp/mpn/pa64w/rshift.s
new file mode 100644
index 0000000000..2517cb1f87
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/rshift.s
@@ -0,0 +1,100 @@
+; HP-PA 2.0 __gmpn_rshift --
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; size gr24
+; cnt gr23
+
+; This runs at 1.5 cycles/limb on PA8000.
+
+ .level 2.0w ; PA-RISC 2.0, wide (64-bit) mode
+ .code
+ .export __gmpn_rshift,entry
+__gmpn_rshift
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ mtsar %r23 ; load shift amount register with cnt
+ ldd 0(%r25),%r21 ; load least significant limb
+ addib,= -1,%r24,L$end ; single-limb operand: skip loop
+ shrpd %r21,%r0,%sar,%r29 ; compute carry out limb (delay slot)
+ depw,z %r24,31,3,%r28 ; r28 = (size & 7)
+ sub %r0,%r24,%r22 ; r22 = -size
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-size & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ blr %r28,%r0 ; branch into jump table
+ sub %r26,%r22,%r26 ; offset res_ptr (delay slot)
+ b L$0 ; jump table: entry i enters the loop at size%8 == i
+ nop
+ b L$1
+ copy %r21,%r20 ; delay slot: move limb to the register L$1 expects
+ b L$2
+ nop
+ b L$3
+ copy %r21,%r20
+ b L$4
+ nop
+ b L$5
+ copy %r21,%r20
+ b L$6
+ nop
+ b L$7
+ copy %r21,%r20
+
+L$loop ; 8-way unrolled: one shrpd+std per limb
+L$0 ldd 8(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21 ; result limb from adjacent source limb pair
+ std %r21,0(%r26)
+L$7 ldd 16(%r25),%r21
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,8(%r26)
+L$6 ldd 24(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,16(%r26)
+L$5 ldd 32(%r25),%r21
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,24(%r26)
+L$4 ldd 40(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,32(%r26)
+L$3 ldd 48(%r25),%r21
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,40(%r26)
+L$2 ldd 56(%r25),%r20
+ shrpd %r20,%r21,%sar,%r21
+ std %r21,48(%r26)
+L$1 ldd 64(%r25),%r21
+ ldo 64(%r25),%r25 ; advance s1_ptr by 8 limbs
+ shrpd %r21,%r20,%sar,%r20
+ std %r20,56(%r26)
+ addib,> -8,%r24,L$loop ; loop while limbs remain
+ ldo 64(%r26),%r26 ; advance res_ptr (delay slot)
+
+L$end shrpd %r0,%r21,%sar,%r21 ; final (most significant) result limb
+ std %r21,0(%r26)
+ bve (%r2) ; return
+ .exit
+ copy %r29,%r28 ; return value = carry out limb (delay slot)
+ .procend
diff --git a/rts/gmp/mpn/pa64w/sub_n.s b/rts/gmp/mpn/pa64w/sub_n.s
new file mode 100644
index 0000000000..ad01e24aa7
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/sub_n.s
@@ -0,0 +1,90 @@
+; HP-PA 2.0 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0
+; and store difference in a third limb vector.
+
+; Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+
+; INPUT PARAMETERS
+; res_ptr gr26
+; s1_ptr gr25
+; s2_ptr gr24
+; size gr23
+
+; This runs at 2 cycles/limb on PA8000.
+
+ .level 2.0w ; PA-RISC 2.0, wide (64-bit) mode
+ .code
+ .export __gmpn_sub_n,entry
+__gmpn_sub_n
+ .proc
+ .callinfo frame=0,args_saved
+ .entry
+
+ sub %r0,%r23,%r22 ; r22 = -n
+ depw,z %r22,30,3,%r28 ; r28 = 2 * (-n & 7), jump-table offset
+ depw,z %r22,28,3,%r22 ; r22 = 8 * (-n & 7)
+ sub %r25,%r22,%r25 ; offset s1_ptr
+ sub %r24,%r22,%r24 ; offset s2_ptr
+ blr %r28,%r0 ; branch into loop
+ sub %r26,%r22,%r26 ; offset res_ptr and set carry (delay slot)
+
+L$loop ldd 0(%r25),%r20 ; 8-way unrolled subtract-with-borrow
+ ldd 0(%r24),%r31
+ sub,db %r20,%r31,%r20 ; subtract with borrow from previous limb
+ std %r20,0(%r26)
+L$7 ldd 8(%r25),%r21
+ ldd 8(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,8(%r26)
+L$6 ldd 16(%r25),%r20
+ ldd 16(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,16(%r26)
+L$5 ldd 24(%r25),%r21
+ ldd 24(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,24(%r26)
+L$4 ldd 32(%r25),%r20
+ ldd 32(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,32(%r26)
+L$3 ldd 40(%r25),%r21
+ ldd 40(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,40(%r26)
+L$2 ldd 48(%r25),%r20
+ ldd 48(%r24),%r31
+ sub,db %r20,%r31,%r20
+ std %r20,48(%r26)
+L$1 ldd 56(%r25),%r21
+ ldo 64(%r25),%r25 ; advance s1_ptr by 8 limbs
+ ldd 56(%r24),%r19
+ sub,db %r21,%r19,%r21
+ std %r21,56(%r26)
+ ldo 64(%r24),%r24 ; advance s2_ptr
+ addib,> -8,%r23,L$loop ; loop while limbs remain
+ ldo 64(%r26),%r26 ; advance res_ptr (delay slot)
+
+ add,dc %r0,%r0,%r29 ; r29 = final carry flag
+ subi 1,%r29,%r29 ; borrow = 1 - carry
+ bve (%r2) ; return
+ .exit
+ copy %r29,%r28 ; return value = borrow (delay slot)
+ .procend
diff --git a/rts/gmp/mpn/pa64w/submul_1.S b/rts/gmp/mpn/pa64w/submul_1.S
new file mode 100644
index 0000000000..294f6239b2
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/submul_1.S
@@ -0,0 +1,171 @@
+; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and
+; subtract the result from a second limb vector.
+
+; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+; INPUT PARAMETERS
+#define rptr %r26
+#define sptr %r25
+#define size %r24
+#define s2limb %r23
+
+; This runs at 11 cycles/limb on a PA8000. It might be possible to make
+; it faster, but the PA8000 pipeline is not publicly documented and it
+; is very complex to reverse engineer
+
+#define t1 %r19
+#define rlimb %r20
+#define hi %r21
+#define lo %r22
+#define m0 %r28
+#define m1 %r3
+#define cylimb %r29
+#define t3 %r4
+#define t2 %r6
+#define t5 %r23
+#define t4 %r31
+ .level 2.0w ; PA-RISC 2.0, wide (64-bit) mode
+ .code
+ .export __gmpn_submul_1,entry
+__gmpn_submul_1
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ std s2limb,-56(%r30) ; move s2limb into the FPU via memory
+ fldd -56(%r30),%fr5
+ ldo 128(%r30),%r30 ; allocate stack frame
+ add %r0,%r0,cylimb ; clear cy and cylimb
+
+ std %r3,-96(%r30) ; save callee-saved registers
+ std %r4,-88(%r30)
+ std %r5,-80(%r30)
+ std %r6,-72(%r30)
+ depdi,z 1,31,1,%r5 ; r5 = 2^32, used to propagate mid carry
+
+ fldd 0(sptr),%fr4 ; load first source limb
+ ldo 8(sptr),sptr
+
+ xmpyu %fr5R,%fr4R,%fr6 ; four 32x32->64 partial products
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ addib,= -1,%r24,L$end1 ; size == 1?
+ nop
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ addib,= -1,%r24,L$end2 ; size == 2?
+ nop
+L$loop ; software-pipelined: multiply limb i+1 while summing limb i
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb ; limb to subtract from
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t4 ; t4 = low product limb + carry limb
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb ; next carry limb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ sub rlimb,t4,t3 ; t3 = rlimb - t4
+ add t4,t3,%r0 ; set cy iff the subtraction borrowed
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb ; fold borrow into carry limb
+ fldd 0(sptr),%fr4
+ ldo 8(sptr),sptr
+ std t3,0(rptr)
+ addib,<> -1,%r24,L$loop
+ ldo 8(rptr),rptr ; advance res_ptr (delay slot)
+L$end2 ; wind down: handle 2:nd last limb
+ xmpyu %fr5R,%fr4R,%fr6
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ ldd -128(%r30),lo ; lo = low 64 bit of product
+ add cylimb,t4,t4 ; t4 = low product limb + carry limb
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ add,dc t2,hi,cylimb ; next carry limb
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ sub rlimb,t4,t3 ; t3 = rlimb - t4
+ add t4,t3,%r0 ; set cy iff the subtraction borrowed
+ ldd -104(%r30),hi ; hi = high 64 bit of product
+ add,dc %r0,cylimb,cylimb ; fold borrow into carry limb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+L$end1 ; wind down: handle last limb
+ ldd 0(rptr),rlimb
+ extrd,u lo,31,32,t1 ; t1 = hi32(lo)
+ extrd,u lo,63,32,t4 ; t4 = lo32(lo)
+ add,l m0,t1,t1 ; t1 += m0
+ add,l,*nuv m1,t1,t1 ; t1 += m1
+ add,l %r5,hi,hi ; propagate carry
+ extrd,u t1,31,32,t2 ; t2 = hi32(t1)
+ depd,z t1,31,32,t5 ; t5 = lo32(t1)
+ add,l t5,t4,t4 ; t4 += lo32(t1)
+ add cylimb,t4,t4 ; t4 = low product limb + carry limb
+ add,dc t2,hi,cylimb ; next carry limb
+ sub rlimb,t4,t3 ; t3 = rlimb - t4
+ add t4,t3,%r0 ; set cy iff the subtraction borrowed
+ add,dc %r0,cylimb,cylimb ; fold borrow into carry limb
+ std t3,0(rptr)
+ ldo 8(rptr),rptr
+
+ ldd -96(%r30),%r3 ; restore callee-saved registers
+ ldd -88(%r30),%r4
+ ldd -80(%r30),%r5
+ ldd -72(%r30),%r6
+
+ copy cylimb,%r28 ; return value = carry limb
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30 ; deallocate stack frame (delay slot)
+ .procend
diff --git a/rts/gmp/mpn/pa64w/udiv_qrnnd.c b/rts/gmp/mpn/pa64w/udiv_qrnnd.c
new file mode 100644
index 0000000000..1852913000
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/udiv_qrnnd.c
@@ -0,0 +1,117 @@
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#define TWO64 18446744073709551616.0
+#define TWO63 9223372036854775808.0
+
+mp_limb_t
+#if __STDC__
+__MPN(udiv_qrnnd) (mp_limb_t n1, mp_limb_t n0, mp_limb_t d, mp_limb_t *r)
+#else
+__MPN(udiv_qrnnd) (n1, n0, d, r)
+ mp_limb_t n1;
+ mp_limb_t n0;
+ mp_limb_t d;
+ mp_limb_t *r;
+#endif
+{ /* Divide n1,,n0 by d; return the quotient, store the remainder in *r.
+     NOTE(review): presumably requires n1 < d, as for other udiv_qrnnd
+     implementations -- confirm against callers.  */
+ mp_limb_t q1, q2, q;
+ mp_limb_t p1, p0;
+ double di, dq;
+
+ di = 1.0 / d; /* approximate reciprocal of the divisor */
+
+ /* Generate upper 53 bits of quotient. Be careful here; the `double'
+ quotient may be rounded to 2^64 which we cannot safely convert back
+ to a 64-bit integer. */
+ dq = (TWO64 * (double) n1 + (double) n0) * di;
+ if (dq >= TWO64)
+ q1 = 0xfffffffffffff800L; /* 2^64 - 2^11, a safe 53-bit value below 2^64 */
+#ifndef __GNUC__
+ /* Work around HP compiler bug. */
+ else if (dq > TWO63)
+ q1 = (mp_limb_t) (dq - TWO63) + 0x8000000000000000L;
+#endif
+ else
+ q1 = (mp_limb_t) dq;
+
+ /* Multiply back in order to compare the product to the dividend. */
+ umul_ppmm (p1, p0, q1, d);
+
+ /* Was the 53-bit quotient greater than our sought quotient? Test the
+ sign of the partial remainder to find out. */
+ if (n1 < p1 || (n1 == p1 && n0 < p0))
+ {
+ /* 53-bit quotient too large. Partial remainder is negative.
+ Compute the absolute value of the remainder in n1,,n0. */
+ n1 = p1 - (n1 + (p0 < n0)); /* high limb, with borrow from low limb */
+ n0 = p0 - n0;
+
+ /* Now use the partial remainder as new dividend to compute more bits of
+ quotient. This is an adjustment for the one we got previously. */
+ q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di);
+ umul_ppmm (p1, p0, q2, d);
+
+ q = q1 - q2; /* we overshot, so the adjustment is subtracted */
+ if (n1 < p1 || (n1 == p1 && n0 <= p0))
+ {
+ n0 = p0 - n0; /* remainder (sign was flipped above) */
+ }
+ else
+ {
+ n0 = p0 - n0;
+ n0 += d; /* final correction: one quotient step too many */
+ q--;
+ }
+ }
+ else
+ {
+ n1 = n1 - (p1 + (n0 < p0)); /* positive partial remainder in n1,,n0 */
+ n0 = n0 - p0;
+
+ q2 = (mp_limb_t) ((TWO64 * (double) n1 + (double) n0) * di);
+ umul_ppmm (p1, p0, q2, d);
+
+ q = q1 + q2; /* we undershot, so the adjustment is added */
+ if (n1 < p1 || (n1 == p1 && n0 < p0))
+ {
+ n0 = n0 - p0;
+ n0 += d; /* final correction: one quotient step too many */
+ q--;
+ }
+ else
+ {
+ n0 = n0 - p0;
+ if (n0 >= d) /* final correction: one quotient step too few */
+ {
+ n0 -= d;
+ q++;
+ }
+ }
+ }
+
+ *r = n0; /* remainder */
+ return q; /* quotient */
+}
diff --git a/rts/gmp/mpn/pa64w/umul_ppmm.S b/rts/gmp/mpn/pa64w/umul_ppmm.S
new file mode 100644
index 0000000000..d9fb92be8c
--- /dev/null
+++ b/rts/gmp/mpn/pa64w/umul_ppmm.S
@@ -0,0 +1,72 @@
+; Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+; This file is part of the GNU MP Library.
+
+; The GNU MP Library is free software; you can redistribute it and/or modify
+; it under the terms of the GNU Lesser General Public License as published by
+; the Free Software Foundation; either version 2.1 of the License, or (at your
+; option) any later version.
+
+; The GNU MP Library is distributed in the hope that it will be useful, but
+; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+; License for more details.
+
+; You should have received a copy of the GNU Lesser General Public License
+; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+; MA 02111-1307, USA.
+
+#define p0 %r28
+#define p1 %r29
+#define t32 %r19
+#define t0 %r20
+#define t1 %r21
+#define x %r22
+#define m0 %r23
+#define m1 %r24
+ .level 2.0w ; PA-RISC 2.0, wide (64-bit) mode
+ .code
+ .export __gmpn_umul_ppmm,entry
+__gmpn_umul_ppmm
+ .proc
+ .callinfo frame=128,no_calls
+ .entry
+ ldo 128(%r30),%r30 ; allocate stack frame
+ std %r26,-64(%r30) ; move multiplicands into the FPU via memory
+ std %r25,-56(%r30)
+
+ copy %r24,%r31 ; r31 = pointer for the low half of the result
+
+ fldd -64(%r30),%fr4
+ fldd -56(%r30),%fr5
+
+ xmpyu %fr5R,%fr4R,%fr6 ; four 32x32->64 partial products
+ fstd %fr6,-128(%r30)
+ xmpyu %fr5R,%fr4L,%fr7
+ fstd %fr7,-120(%r30)
+ xmpyu %fr5L,%fr4R,%fr8
+ fstd %fr8,-112(%r30)
+ xmpyu %fr5L,%fr4L,%fr9
+ fstd %fr9,-104(%r30)
+
+ depdi,z 1,31,1,t32 ; t32 = 2^32
+
+ ldd -128(%r30),p0 ; lo = low 64 bit of product
+ ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
+ ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
+ ldd -104(%r30),p1 ; hi = high 64 bit of product
+
+ add,l,*nuv m0,m1,x ; x = m1+m0; nullify next insn if no carry out
+ add,l t32,p1,p1 ; propagate carry to mid of p1
+ depd,z x,31,32,t0 ; lo32(m1+m0)
+ add t0,p0,p0 ; sets cy
+ extrd,u x,31,32,t1 ; hi32(m1+m0)
+ add,dc t1,p1,p1 ; add carry into high half
+
+ std p0,0(%r31) ; store low half of product
+ copy p1,%r28 ; return high half of product
+ bve (%r2)
+ .exit
+ ldo -128(%r30),%r30 ; deallocate stack frame (delay slot)
+ .procend
diff --git a/rts/gmp/mpn/power/add_n.s b/rts/gmp/mpn/power/add_n.s
new file mode 100644
index 0000000000..0f9f48f1cc
--- /dev/null
+++ b/rts/gmp/mpn/power/add_n.s
@@ -0,0 +1,79 @@
+# IBM POWER __gmpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+# Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software Foundation,
+# Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+
+ .toc
+ .globl __gmpn_add_n
+ .globl .__gmpn_add_n
+ .csect __gmpn_add_n[DS]
+__gmpn_add_n:
+ .long .__gmpn_add_n, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__gmpn_add_n:
+ andil. 10,6,1 # odd or even number of limbs?
+ l 8,0(4) # load least significant s1 limb
+ l 0,0(5) # load least significant s2 limb
+ cal 3,-4(3) # offset res_ptr, it's updated before it's used
+ sri 10,6,1 # count for unrolled loop
+ a 7,0,8 # add least significant limbs, set cy
+ mtctr 10 # copy count into CTR
+ beq 0,Leven # branch if even # of limbs (# of limbs >= 2)
+
+# We have an odd # of limbs. Add the first limbs separately.
+ cmpi 1,10,0 # is count for unrolled loop zero?
+ bc 4,6,L1 # bne cr1,L1 (misassembled by gas)
+ st 7,4(3) # size == 1: store the single sum limb
+ aze 3,10 # use the fact that r10 is zero...
+ br # ...to return just the carry
+
+# We added least significant limbs. Now reload the next limbs to enter loop.
+L1: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ stu 7,4(3)
+ ae 7,0,8 # add limbs, set cy
+Leven: lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ bdz Lend # If done, skip loop
+
+Loop: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ ae 11,9,10 # add previous limbs with cy, set cy
+ stu 7,4(3) # store result limb
+ lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ ae 7,0,8 # add previous limbs with cy, set cy
+ stu 11,4(3) # store result limb
+ bdn Loop # decrement CTR and loop back
+
+Lend: ae 11,9,10 # add limbs with cy, set cy
+ st 7,4(3) # store 2:nd last result limb
+ st 11,8(3) # store last result limb
+ lil 3,0 # load cy into ...
+ aze 3,3 # ... return value register
+ br
diff --git a/rts/gmp/mpn/power/addmul_1.s b/rts/gmp/mpn/power/addmul_1.s
new file mode 100644
index 0000000000..8ecc651579
--- /dev/null
+++ b/rts/gmp/mpn/power/addmul_1.s
@@ -0,0 +1,122 @@
+# IBM POWER __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+# The POWER architecture has no unsigned 32x32->64 bit multiplication
+# instruction. To obtain that operation, we have to use the 32x32->64 signed
+# multiplication instruction, and add the appropriate compensation to the high
+# limb of the result. We add the multiplicand if the multiplier has its most
+# significant bit set, and we add the multiplier if the multiplicand has its
+# most significant bit set. We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work). Since the POWER architecture has a branch unit we
+# can branch in zero cycles, so that's how we perform the additions.
+
+ .toc
+ .globl __gmpn_addmul_1
+ .globl .__gmpn_addmul_1
+ .csect __gmpn_addmul_1[DS]
+__gmpn_addmul_1:
+ .long .__gmpn_addmul_1, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__gmpn_addmul_1:
+
+ cal 3,-4(3) # offset res_ptr; updated before each use
+ l 0,0(4) # load first s1 limb
+ cmpi 0,6,0 # s2_limb negative as a signed number?
+ mtctr 5 # limb count -> CTR
+ mul 9,0,6 # signed 32x32: high part -> r9, low part -> MQ
+ srai 7,0,31 # r7 = all-ones if s1 limb has MSB set
+ and 7,7,6 # compensation: s2_limb if s1 limb MSB set
+ mfmq 8 # fetch low product limb from MQ
+ cax 9,9,7 # add compensation into high limb
+ l 7,4(3) # load res limb
+ a 8,8,7 # add res_limb
+ blt Lneg # separate loop when s2_limb is negative
+Lpos: bdz Lend
+
+Lploop: lu 0,4(4) # 2-way unrolled; r9/r10 alternate as cy_limb
+ stu 8,4(3)
+ cmpi 0,0,0 # test sign of new s1 limb
+ mul 10,0,6
+ mfmq 0
+ ae 8,0,9 # low limb + old_cy_limb + old cy
+ l 7,4(3)
+ aze 10,10 # propagate cy to new cy_limb
+ a 8,8,7 # add res_limb
+ bge Lp0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Lp0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 0
+ ae 8,0,10 # low limb + old_cy_limb + old cy
+ l 7,4(3)
+ aze 9,9 # propagate cy to new cy_limb
+ a 8,8,7 # add res_limb
+ bge Lp1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Lp1: bdn Lploop
+
+ b Lend
+
+Lneg: cax 9,9,0 # adjust high limb for negative s2_limb
+ bdz Lend
+Lnloop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 7
+ ae 8,7,9 # low limb + old_cy_limb + old cy
+ l 7,4(3)
+ ae 10,10,0 # propagate cy and add s1 limb (s2_limb negative)
+ a 8,8,7 # add res_limb
+ bge Ln0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Ln0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 7
+ ae 8,7,10 # low limb + old_cy_limb + old cy
+ l 7,4(3)
+ ae 9,9,0 # propagate cy and add s1 limb (s2_limb negative)
+ a 8,8,7 # add res_limb
+ bge Ln1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Ln1: bdn Lnloop
+ b Lend
+
+Lend0: cal 9,0(10) # move cy_limb from r10 into r9
+Lend: st 8,4(3) # store final result limb
+ aze 3,9 # return value = cy_limb + cy
+ br
diff --git a/rts/gmp/mpn/power/lshift.s b/rts/gmp/mpn/power/lshift.s
new file mode 100644
index 0000000000..ab71fb7727
--- /dev/null
+++ b/rts/gmp/mpn/power/lshift.s
@@ -0,0 +1,56 @@
+# IBM POWER __gmpn_lshift --
+
+# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s_ptr r4
+# size r5
+# cnt r6
+
+ .toc
+ .globl __gmpn_lshift
+ .globl .__gmpn_lshift
+ .csect __gmpn_lshift[DS]
+__gmpn_lshift:
+ .long .__gmpn_lshift, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__gmpn_lshift:
+ sli 0,5,2 # r0 = 4*size
+ cax 9,3,0 # res_ptr points past end; we work downwards
+ cax 4,4,0 # s_ptr points past end
+ sfi 8,6,32 # r8 = 32 - cnt
+ mtctr 5 # put limb count in CTR loop register
+ lu 0,-4(4) # read most significant limb
+ sre 3,0,8 # compute carry out limb, and init MQ register
+ bdz Lend2 # if just one limb, skip loop
+ lu 0,-4(4) # read 2:nd most significant limb
+ sreq 7,0,8 # compute most significant limb of result
+ bdz Lend # if just two limb, skip loop
+Loop: lu 0,-4(4) # load next lower limb
+ stu 7,-4(9) # store previous result during read latency
+ sreq 7,0,8 # compute result limb
+ bdn Loop # loop back until CTR is zero
+Lend: stu 7,-4(9) # store 2:nd least significant limb
+Lend2: sle 7,0,6 # compute least significant limb
+ st 7,-4(9) # store it
+ br # return carry out limb in r3
diff --git a/rts/gmp/mpn/power/mul_1.s b/rts/gmp/mpn/power/mul_1.s
new file mode 100644
index 0000000000..4e08ade583
--- /dev/null
+++ b/rts/gmp/mpn/power/mul_1.s
@@ -0,0 +1,109 @@
+# IBM POWER __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+# The POWER architecture has no unsigned 32x32->64 bit multiplication
+# instruction. To obtain that operation, we have to use the 32x32->64 signed
+# multiplication instruction, and add the appropriate compensation to the high
+# limb of the result. We add the multiplicand if the multiplier has its most
+# significant bit set, and we add the multiplier if the multiplicand has its
+# most significant bit set. We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work). Since the POWER architecture has a branch unit we
+# can branch in zero cycles, so that's how we perform the additions.
+
+ .toc
+ .globl __gmpn_mul_1
+ .globl .__gmpn_mul_1
+ .csect __gmpn_mul_1[DS]
+__gmpn_mul_1:
+ .long .__gmpn_mul_1, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__gmpn_mul_1:
+
+ cal 3,-4(3) # offset res_ptr; updated before each use
+ l 0,0(4) # load first s1 limb
+ cmpi 0,6,0 # s2_limb negative as a signed number?
+ mtctr 5 # limb count -> CTR
+ mul 9,0,6 # signed 32x32: high part -> r9, low part -> MQ
+ srai 7,0,31 # r7 = all-ones if s1 limb has MSB set
+ and 7,7,6 # compensation: s2_limb if s1 limb MSB set
+ mfmq 8 # fetch low product limb from MQ
+ ai 0,0,0 # reset carry
+ cax 9,9,7 # add compensation into high limb
+ blt Lneg # separate loop when s2_limb is negative
+Lpos: bdz Lend
+Lploop: lu 0,4(4) # 2-way unrolled; r9/r10 alternate as cy_limb
+ stu 8,4(3)
+ cmpi 0,0,0 # test sign of new s1 limb
+ mul 10,0,6
+ mfmq 0
+ ae 8,0,9 # low limb + old_cy_limb + old cy
+ bge Lp0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Lp0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 0
+ ae 8,0,10 # low limb + old_cy_limb + old cy
+ bge Lp1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Lp1: bdn Lploop
+ b Lend
+
+Lneg: cax 9,9,0 # adjust high limb for negative s2_limb
+ bdz Lend
+Lnloop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ cax 10,10,0 # adjust high limb for negative s2_limb
+ mfmq 0
+ ae 8,0,9 # low limb + old_cy_limb + old cy
+ bge Ln0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Ln0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ cax 9,9,0 # adjust high limb for negative s2_limb
+ mfmq 0
+ ae 8,0,10 # low limb + old_cy_limb + old cy
+ bge Ln1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Ln1: bdn Lnloop
+ b Lend
+
+Lend0: cal 9,0(10) # move cy_limb from r10 into r9
+Lend: st 8,4(3) # store final result limb
+ aze 3,9 # return value = cy_limb + cy
+ br
diff --git a/rts/gmp/mpn/power/rshift.s b/rts/gmp/mpn/power/rshift.s
new file mode 100644
index 0000000000..65b3945f8a
--- /dev/null
+++ b/rts/gmp/mpn/power/rshift.s
@@ -0,0 +1,54 @@
+# IBM POWER __gmpn_rshift --
+
+# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s_ptr r4
+# size r5
+# cnt r6
+
+ .toc
+ .globl __gmpn_rshift
+ .globl .__gmpn_rshift
+ .csect __gmpn_rshift[DS]
+__gmpn_rshift:
+ .long .__gmpn_rshift, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__gmpn_rshift:
+ sfi 8,6,32
+ mtctr 5 # put limb count in CTR loop register
+ l 0,0(4) # read least significant limb
+ ai 9,3,-4 # adjust res_ptr since it's offset in the stu:s
+ sle 3,0,8 # compute carry limb, and init MQ register
+ bdz Lend2 # if just one limb, skip loop
+ lu 0,4(4) # read 2:nd least significant limb
+ sleq 7,0,8 # compute least significant limb of result
+ bdz Lend # if just two limb, skip loop
+Loop: lu 0,4(4) # load next higher limb
+ stu 7,4(9) # store previous result during read latency
+ sleq 7,0,8 # compute result limb
+ bdn Loop # loop back until CTR is zero
+Lend: stu 7,4(9) # store 2:nd most significant limb
+Lend2: sre 7,0,6 # compute most significant limb
+	st	7,4(9)		# store it
+ br
diff --git a/rts/gmp/mpn/power/sdiv.s b/rts/gmp/mpn/power/sdiv.s
new file mode 100644
index 0000000000..81da622fbc
--- /dev/null
+++ b/rts/gmp/mpn/power/sdiv.s
@@ -0,0 +1,34 @@
+# Copyright (C) 1999 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+ .toc
+ .globl __sdiv_qrnnd
+ .globl .__sdiv_qrnnd
+ .csect __sdiv_qrnnd[DS]
+__sdiv_qrnnd:
+ .long .__sdiv_qrnnd, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__sdiv_qrnnd:
+ mtmq 5
+ div 0,4,6
+ mfmq 9
+ st 9,0(3)
+ mr 3,0
+ br
diff --git a/rts/gmp/mpn/power/sub_n.s b/rts/gmp/mpn/power/sub_n.s
new file mode 100644
index 0000000000..aa09cf5bc1
--- /dev/null
+++ b/rts/gmp/mpn/power/sub_n.s
@@ -0,0 +1,80 @@
+# IBM POWER __gmpn_sub_n -- Subtract two limb vectors of equal, non-zero length.
+
+# Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software Foundation,
+# Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+
+ .toc
+ .globl __gmpn_sub_n
+ .globl .__gmpn_sub_n
+ .csect __gmpn_sub_n[DS]
+__gmpn_sub_n:
+ .long .__gmpn_sub_n, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__gmpn_sub_n:
+ andil. 10,6,1 # odd or even number of limbs?
+ l 8,0(4) # load least significant s1 limb
+ l 0,0(5) # load least significant s2 limb
+ cal 3,-4(3) # offset res_ptr, it's updated before it's used
+ sri 10,6,1 # count for unrolled loop
+ sf 7,0,8 # subtract least significant limbs, set cy
+ mtctr 10 # copy count into CTR
+ beq 0,Leven # branch if even # of limbs (# of limbs >= 2)
+
+# We have an odd # of limbs. Add the first limbs separately.
+ cmpi 1,10,0 # is count for unrolled loop zero?
+ bc 4,6,L1 # bne cr1,L1 (misassembled by gas)
+ st 7,4(3)
+ sfe 3,0,0 # load !cy into ...
+ sfi 3,3,0 # ... return value register
+ br # return
+
+# We added least significant limbs. Now reload the next limbs to enter loop.
+L1: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ stu 7,4(3)
+ sfe 7,0,8 # subtract limbs, set cy
+Leven: lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ bdz Lend # If done, skip loop
+
+Loop: lu 8,4(4) # load s1 limb and update s1_ptr
+ lu 0,4(5) # load s2 limb and update s2_ptr
+ sfe 11,10,9 # subtract previous limbs with cy, set cy
+ stu 7,4(3) #
+ lu 9,4(4) # load s1 limb and update s1_ptr
+ lu 10,4(5) # load s2 limb and update s2_ptr
+ sfe 7,0,8 # subtract previous limbs with cy, set cy
+ stu 11,4(3) #
+ bdn Loop # decrement CTR and loop back
+
+Lend: sfe 11,10,9 # subtract limbs with cy, set cy
+ st 7,4(3) #
+ st 11,8(3) #
+ sfe 3,0,0 # load !cy into ...
+ sfi 3,3,0 # ... return value register
+ br
diff --git a/rts/gmp/mpn/power/submul_1.s b/rts/gmp/mpn/power/submul_1.s
new file mode 100644
index 0000000000..bc01b7c95d
--- /dev/null
+++ b/rts/gmp/mpn/power/submul_1.s
@@ -0,0 +1,127 @@
+# IBM POWER __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1992, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+# The POWER architecture has no unsigned 32x32->64 bit multiplication
+# instruction. To obtain that operation, we have to use the 32x32->64 signed
+# multiplication instruction, and add the appropriate compensation to the high
+# limb of the result. We add the multiplicand if the multiplier has its most
+# significant bit set, and we add the multiplier if the multiplicand has its
+# most significant bit set. We need to preserve the carry flag between each
+# iteration, so we have to compute the compensation carefully (the natural,
+# srai+and doesn't work). Since the POWER architecture has a branch unit we
+# can branch in zero cycles, so that's how we perform the additions.
+
+ .toc
+ .globl __gmpn_submul_1
+ .globl .__gmpn_submul_1
+ .csect __gmpn_submul_1[DS]
+__gmpn_submul_1:
+ .long .__gmpn_submul_1, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__gmpn_submul_1:
+
+ cal 3,-4(3)
+ l 0,0(4)
+ cmpi 0,6,0
+ mtctr 5
+ mul 9,0,6
+ srai 7,0,31
+ and 7,7,6
+ mfmq 11
+ cax 9,9,7
+ l 7,4(3)
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ blt Lneg
+Lpos: bdz Lend
+
+Lploop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 0
+ ae 11,0,9 # low limb + old_cy_limb + old cy
+ l 7,4(3)
+ aze 10,10 # propagate cy to new cy_limb
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Lp0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Lp0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 0
+ ae 11,0,10
+ l 7,4(3)
+ aze 9,9
+ sf 8,11,7
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Lp1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Lp1: bdn Lploop
+
+ b Lend
+
+Lneg: cax 9,9,0
+ bdz Lend
+Lnloop: lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 10,0,6
+ mfmq 7
+ ae 11,7,9
+ l 7,4(3)
+ ae 10,10,0 # propagate cy to new cy_limb
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Ln0
+ cax 10,10,6 # adjust high limb for negative limb from s1
+Ln0: bdz Lend0
+ lu 0,4(4)
+ stu 8,4(3)
+ cmpi 0,0,0
+ mul 9,0,6
+ mfmq 7
+ ae 11,7,10
+ l 7,4(3)
+ ae 9,9,0 # propagate cy to new cy_limb
+ sf 8,11,7 # add res_limb
+ a 11,8,11 # invert cy (r11 is junk)
+ bge Ln1
+ cax 9,9,6 # adjust high limb for negative limb from s1
+Ln1: bdn Lnloop
+ b Lend
+
+Lend0: cal 9,0(10)
+Lend: st 8,4(3)
+ aze 3,9
+ br
diff --git a/rts/gmp/mpn/power/umul.s b/rts/gmp/mpn/power/umul.s
new file mode 100644
index 0000000000..8c77496380
--- /dev/null
+++ b/rts/gmp/mpn/power/umul.s
@@ -0,0 +1,38 @@
+# Copyright (C) 1999 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+ .toc
+ .globl __umul_ppmm
+ .globl .__umul_ppmm
+ .csect __umul_ppmm[DS]
+__umul_ppmm:
+ .long .__umul_ppmm, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.__umul_ppmm:
+ mul 9,4,5
+ srai 0,4,31
+ and 0,0,5
+ srai 5,5,31
+ and 5,5,4
+ cax 0,0,5
+ mfmq 11
+ st 11,0(3)
+ cax 3,9,0
+ br
diff --git a/rts/gmp/mpn/powerpc32/add_n.asm b/rts/gmp/mpn/powerpc32/add_n.asm
new file mode 100644
index 0000000000..81ed04b162
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/add_n.asm
@@ -0,0 +1,61 @@
+dnl PowerPC-32 mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl s2_ptr r5
+dnl size r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ mtctr r6 C copy size into CTR
+ addic r0,r0,0 C clear cy
+ lwz r8,0(r4) C load least significant s1 limb
+ lwz r0,0(r5) C load least significant s2 limb
+ addi r3,r3,-4 C offset res_ptr, it's updated before it's used
+ bdz .Lend C If done, skip loop
+.Loop: lwz r9,4(r4) C load s1 limb
+ lwz r10,4(r5) C load s2 limb
+ adde r7,r0,r8 C add limbs with cy, set cy
+ stw r7,4(r3) C store result limb
+ bdz .Lexit C decrement CTR and exit if done
+ lwzu r8,8(r4) C load s1 limb and update s1_ptr
+ lwzu r0,8(r5) C load s2 limb and update s2_ptr
+ adde r7,r10,r9 C add limbs with cy, set cy
+ stwu r7,8(r3) C store result limb and update res_ptr
+ bdnz .Loop C decrement CTR and loop back
+
+.Lend: adde r7,r0,r8
+ stw r7,4(r3) C store ultimate result limb
+ li r3,0 C load cy into ...
+ addze r3,r3 C ... return value register
+ blr
+.Lexit: adde r7,r10,r9
+ stw r7,8(r3)
+ li r3,0 C load cy into ...
+ addze r3,r3 C ... return value register
+ blr
+EPILOGUE(mpn_add_n)
diff --git a/rts/gmp/mpn/powerpc32/addmul_1.asm b/rts/gmp/mpn/powerpc32/addmul_1.asm
new file mode 100644
index 0000000000..3ef75b1532
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/addmul_1.asm
@@ -0,0 +1,124 @@
+dnl PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl the result to a second limb vector.
+
+dnl Copyright (C) 1995, 1997, 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl size r5
+dnl s2_limb r6
+
+dnl This is optimized for the PPC604. It has not been tested on PPC601, PPC603
+dnl or PPC750 since I don't have access to any such machines.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ cmpi cr0,r5,9 C more than 9 limbs?
+ bgt cr0,.Lbig C branch if more than 9 limbs
+
+ mtctr r5
+ lwz r0,0(r4)
+ mullw r7,r0,r6
+ mulhwu r10,r0,r6
+ lwz r9,0(r3)
+ addc r8,r7,r9
+ addi r3,r3,-4
+ bdz .Lend
+.Lloop:
+ lwzu r0,4(r4)
+ stwu r8,4(r3)
+ mullw r8,r0,r6
+ adde r7,r8,r10
+ mulhwu r10,r0,r6
+ lwz r9,4(r3)
+ addze r10,r10
+ addc r8,r7,r9
+ bdnz .Lloop
+.Lend: stw r8,4(r3)
+ addze r3,r10
+ blr
+
+.Lbig: stmw r30,-32(r1)
+ addi r5,r5,-1
+ srwi r0,r5,2
+ mtctr r0
+
+ lwz r7,0(r4)
+ mullw r8,r7,r6
+ mulhwu r0,r7,r6
+ lwz r7,0(r3)
+ addc r8,r8,r7
+ stw r8,0(r3)
+
+.LloopU:
+ lwz r7,4(r4)
+ lwz r12,8(r4)
+ lwz r30,12(r4)
+ lwzu r31,16(r4)
+ mullw r8,r7,r6
+ mullw r9,r12,r6
+ mullw r10,r30,r6
+ mullw r11,r31,r6
+ adde r8,r8,r0 C add cy_limb
+ mulhwu r0,r7,r6
+ lwz r7,4(r3)
+ adde r9,r9,r0
+ mulhwu r0,r12,r6
+ lwz r12,8(r3)
+ adde r10,r10,r0
+ mulhwu r0,r30,r6
+ lwz r30,12(r3)
+ adde r11,r11,r0
+ mulhwu r0,r31,r6
+ lwz r31,16(r3)
+ addze r0,r0 C new cy_limb
+ addc r8,r8,r7
+ stw r8,4(r3)
+ adde r9,r9,r12
+ stw r9,8(r3)
+ adde r10,r10,r30
+ stw r10,12(r3)
+ adde r11,r11,r31
+ stwu r11,16(r3)
+ bdnz .LloopU
+
+ andi. r31,r5,3
+ mtctr r31
+ beq cr0,.Lendx
+
+.LloopE:
+ lwzu r7,4(r4)
+ mullw r8,r7,r6
+ adde r8,r8,r0 C add cy_limb
+ mulhwu r0,r7,r6
+ lwz r7,4(r3)
+ addze r0,r0 C new cy_limb
+ addc r8,r8,r7
+ stwu r8,4(r3)
+ bdnz .LloopE
+.Lendx:
+ addze r3,r0
+ lmw r30,-32(r1)
+ blr
+EPILOGUE(mpn_addmul_1)
diff --git a/rts/gmp/mpn/powerpc32/aix.m4 b/rts/gmp/mpn/powerpc32/aix.m4
new file mode 100644
index 0000000000..2bd8425817
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/aix.m4
@@ -0,0 +1,39 @@
+divert(-1)
+dnl m4 macros for AIX 32-bit assembly.
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+define(`ASM_START',
+ `.toc')
+
+define(`PROLOGUE',
+ `
+ .globl $1
+ .globl .$1
+ .csect $1[DS],2
+$1:
+ .long .$1, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.$1:')
+
+define(`EPILOGUE', `')
+
+divert
diff --git a/rts/gmp/mpn/powerpc32/gmp-mparam.h b/rts/gmp/mpn/powerpc32/gmp-mparam.h
new file mode 100644
index 0000000000..b283185789
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/gmp-mparam.h
@@ -0,0 +1,66 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* These values are for the 604. Presumably, these should be considerably
+ different for the 603 and 750 that have much slower multiply
+ instructions. */
+
+/* Generated by tuneup.c, 2000-05-26. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 26 /* tuneup says 20 */
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 228
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 46 /* tuneup says 44 */
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 262
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 52
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 86
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 23
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 7
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 53
+#endif
diff --git a/rts/gmp/mpn/powerpc32/lshift.asm b/rts/gmp/mpn/powerpc32/lshift.asm
new file mode 100644
index 0000000000..73a85430ab
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/lshift.asm
@@ -0,0 +1,145 @@
+dnl PowerPC-32 mpn_lshift -- Shift a number left.
+
+dnl Copyright (C) 1995, 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl size r5
+dnl cnt r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ cmpi cr0,r5,12 C more than 12 limbs?
+ slwi r0,r5,2
+ add r4,r4,r0 C make r4 point at end of s1
+ add r7,r3,r0 C make r7 point at end of res
+ bgt .LBIG C branch if more than 12 limbs
+
+ mtctr r5 C copy size into CTR
+ subfic r8,r6,32
+ lwzu r11,-4(r4) C load first s1 limb
+ srw r3,r11,r8 C compute function return value
+ bdz .Lend1
+
+.Loop: lwzu r10,-4(r4)
+ slw r9,r11,r6
+ srw r12,r10,r8
+ or r9,r9,r12
+ stwu r9,-4(r7)
+ bdz .Lend2
+ lwzu r11,-4(r4)
+ slw r9,r10,r6
+ srw r12,r11,r8
+ or r9,r9,r12
+ stwu r9,-4(r7)
+ bdnz .Loop
+
+.Lend1: slw r0,r11,r6
+ stw r0,-4(r7)
+ blr
+.Lend2: slw r0,r10,r6
+ stw r0,-4(r7)
+ blr
+
+.LBIG:
+ stmw r24,-32(r1) C save registers we are supposed to preserve
+ lwzu r9,-4(r4)
+ subfic r8,r6,32
+ srw r3,r9,r8 C compute function return value
+ slw r0,r9,r6
+ addi r5,r5,-1
+
+ andi. r10,r5,3 C count for spill loop
+ beq .Le
+ mtctr r10
+ lwzu r28,-4(r4)
+ bdz .Lxe0
+
+.Loop0: slw r12,r28,r6
+ srw r24,r28,r8
+ lwzu r28,-4(r4)
+ or r24,r0,r24
+ stwu r24,-4(r7)
+ mr r0,r12
+ bdnz .Loop0 C taken at most once!
+
+.Lxe0: slw r12,r28,r6
+ srw r24,r28,r8
+ or r24,r0,r24
+ stwu r24,-4(r7)
+ mr r0,r12
+
+.Le: srwi r5,r5,2 C count for unrolled loop
+ addi r5,r5,-1
+ mtctr r5
+ lwz r28,-4(r4)
+ lwz r29,-8(r4)
+ lwz r30,-12(r4)
+ lwzu r31,-16(r4)
+
+.LoopU: slw r9,r28,r6
+ srw r24,r28,r8
+ lwz r28,-4(r4)
+ slw r10,r29,r6
+ srw r25,r29,r8
+ lwz r29,-8(r4)
+ slw r11,r30,r6
+ srw r26,r30,r8
+ lwz r30,-12(r4)
+ slw r12,r31,r6
+ srw r27,r31,r8
+ lwzu r31,-16(r4)
+ or r24,r0,r24
+ stw r24,-4(r7)
+ or r25,r9,r25
+ stw r25,-8(r7)
+ or r26,r10,r26
+ stw r26,-12(r7)
+ or r27,r11,r27
+ stwu r27,-16(r7)
+ mr r0,r12
+ bdnz .LoopU
+
+ slw r9,r28,r6
+ srw r24,r28,r8
+ slw r10,r29,r6
+ srw r25,r29,r8
+ slw r11,r30,r6
+ srw r26,r30,r8
+ slw r12,r31,r6
+ srw r27,r31,r8
+ or r24,r0,r24
+ stw r24,-4(r7)
+ or r25,r9,r25
+ stw r25,-8(r7)
+ or r26,r10,r26
+ stw r26,-12(r7)
+ or r27,r11,r27
+ stwu r27,-16(r7)
+ mr r0,r12
+
+ stw r0,-4(r7)
+ lmw r24,-32(r1) C restore registers
+ blr
+EPILOGUE(mpn_lshift)
diff --git a/rts/gmp/mpn/powerpc32/mul_1.asm b/rts/gmp/mpn/powerpc32/mul_1.asm
new file mode 100644
index 0000000000..ec878b54d5
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/mul_1.asm
@@ -0,0 +1,86 @@
+dnl PowerPC-32 mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl the result in a second limb vector.
+
+dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl size r5
+dnl s2_limb r6
+
+dnl This is optimized for the PPC604 but it runs decently even on PPC601. It
+dnl has not been tested on a PPC603 since I don't have access to any such
+dnl machines.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ mtctr r5
+ addi r3,r3,-4 C adjust res_ptr, it's offset before it's used
+ li r12,0 C clear upper product reg
+ addic r0,r0,0 C clear cy
+C Start software pipeline
+ lwz r8,0(r4)
+ bdz .Lend3
+ stmw r30,-8(r1) C save registers we are supposed to preserve
+ lwzu r9,4(r4)
+ mullw r11,r8,r6
+ mulhwu r0,r8,r6
+ bdz .Lend1
+C Software pipelined main loop
+.Loop: lwz r8,4(r4)
+ mullw r10,r9,r6
+ adde r30,r11,r12
+ mulhwu r12,r9,r6
+ stw r30,4(r3)
+ bdz .Lend2
+ lwzu r9,8(r4)
+ mullw r11,r8,r6
+ adde r31,r10,r0
+ mulhwu r0,r8,r6
+ stwu r31,8(r3)
+ bdnz .Loop
+C Finish software pipeline
+.Lend1: mullw r10,r9,r6
+ adde r30,r11,r12
+ mulhwu r12,r9,r6
+ stw r30,4(r3)
+ adde r31,r10,r0
+ stwu r31,8(r3)
+ addze r3,r12
+ lmw r30,-8(r1) C restore registers from stack
+ blr
+.Lend2: mullw r11,r8,r6
+ adde r31,r10,r0
+ mulhwu r0,r8,r6
+ stwu r31,8(r3)
+ adde r30,r11,r12
+ stw r30,4(r3)
+ addze r3,r0
+ lmw r30,-8(r1) C restore registers from stack
+ blr
+.Lend3: mullw r11,r8,r6
+ stw r11,4(r3)
+ mulhwu r3,r8,r6
+ blr
+EPILOGUE(mpn_mul_1)
diff --git a/rts/gmp/mpn/powerpc32/regmap.m4 b/rts/gmp/mpn/powerpc32/regmap.m4
new file mode 100644
index 0000000000..978f18902a
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/regmap.m4
@@ -0,0 +1,34 @@
+divert(-1)
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl Map register names r0, r1, etc, to just `0', `1', etc.
+dnl This is needed on all systems but NeXT, Rhapsody, and MacOS-X
+forloop(i,0,31,
+`define(`r'i,i)'
+)
+
+dnl Likewise for cr0, cr1, etc.
+forloop(i,0,7,
+`define(`cr'i,i)'
+)
+
+divert
diff --git a/rts/gmp/mpn/powerpc32/rshift.asm b/rts/gmp/mpn/powerpc32/rshift.asm
new file mode 100644
index 0000000000..a09ba04938
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/rshift.asm
@@ -0,0 +1,60 @@
+dnl PowerPC-32 mpn_rshift -- Shift a number right.
+
+dnl Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl size r5
+dnl cnt r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ mtctr r5 C copy size into CTR
+ addi r7,r3,-4 C move adjusted res_ptr to free return reg
+ subfic r8,r6,32
+ lwz r11,0(r4) C load first s1 limb
+ slw r3,r11,r8 C compute function return value
+ bdz .Lend1
+
+.Loop: lwzu r10,4(r4)
+ srw r9,r11,r6
+ slw r12,r10,r8
+ or r9,r9,r12
+ stwu r9,4(r7)
+ bdz .Lend2
+ lwzu r11,4(r4)
+ srw r9,r10,r6
+ slw r12,r11,r8
+ or r9,r9,r12
+ stwu r9,4(r7)
+ bdnz .Loop
+
+.Lend1: srw r0,r11,r6
+ stw r0,4(r7)
+ blr
+
+.Lend2: srw r0,r10,r6
+ stw r0,4(r7)
+ blr
+EPILOGUE(mpn_rshift)
diff --git a/rts/gmp/mpn/powerpc32/sub_n.asm b/rts/gmp/mpn/powerpc32/sub_n.asm
new file mode 100644
index 0000000000..b04b4192ef
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/sub_n.asm
@@ -0,0 +1,61 @@
+dnl PowerPC-32 mpn_sub_n -- Subtract two limb vectors of the same length > 0
+dnl and store difference in a third limb vector.
+
+dnl Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl s2_ptr r5
+dnl size r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ mtctr r6 C copy size into CTR
+ addic r0,r6,-1 C set cy
+ lwz r8,0(r4) C load least significant s1 limb
+ lwz r0,0(r5) C load least significant s2 limb
+ addi r3,r3,-4 C offset res_ptr, it's updated before it's used
+ bdz .Lend C If done, skip loop
+.Loop: lwz r9,4(r4) C load s1 limb
+ lwz r10,4(r5) C load s2 limb
+ subfe r7,r0,r8 C subtract limbs with cy, set cy
+ stw r7,4(r3) C store result limb
+ bdz .Lexit C decrement CTR and exit if done
+ lwzu r8,8(r4) C load s1 limb and update s1_ptr
+ lwzu r0,8(r5) C load s2 limb and update s2_ptr
+ subfe r7,r10,r9 C subtract limbs with cy, set cy
+ stwu r7,8(r3) C store result limb and update res_ptr
+ bdnz .Loop C decrement CTR and loop back
+
+.Lend: subfe r7,r0,r8
+ stw r7,4(r3) C store ultimate result limb
+ subfe r3,r0,r0 C load !cy into ...
+ subfic r3,r3,0 C ... return value register
+ blr
+.Lexit: subfe r7,r10,r9
+ stw r7,8(r3)
+ subfe r3,r0,r0 C load !cy into ...
+ subfic r3,r3,0 C ... return value register
+ blr
+EPILOGUE(mpn_sub_n)
diff --git a/rts/gmp/mpn/powerpc32/submul_1.asm b/rts/gmp/mpn/powerpc32/submul_1.asm
new file mode 100644
index 0000000000..a129e9f9ea
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/submul_1.asm
@@ -0,0 +1,130 @@
+dnl PowerPC-32 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+dnl the result from a second limb vector.
+
+dnl Copyright (C) 1995, 1997, 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+dnl INPUT PARAMETERS
+dnl res_ptr r3
+dnl s1_ptr r4
+dnl size r5
+dnl s2_limb r6
+
+dnl This is optimized for the PPC604. It has not been tested on PPC601, PPC603
+dnl or PPC750 since I don't have access to any such machines.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ cmpi cr0,r5,9 C more than 9 limbs?
+ bgt cr0,.Lbig C branch if more than 9 limbs
+
+ mtctr r5
+ lwz r0,0(r4)
+ mullw r7,r0,r6
+ mulhwu r10,r0,r6
+ lwz r9,0(r3)
+ subfc r8,r7,r9
+ addc r7,r7,r8 C invert cy (r7 is junk)
+ addi r3,r3,-4
+ bdz .Lend
+.Lloop:
+ lwzu r0,4(r4)
+ stwu r8,4(r3)
+ mullw r8,r0,r6
+ adde r7,r8,r10
+ mulhwu r10,r0,r6
+ lwz r9,4(r3)
+ addze r10,r10
+ subfc r8,r7,r9
+ addc r7,r7,r8 C invert cy (r7 is junk)
+ bdnz .Lloop
+.Lend: stw r8,4(r3)
+ addze r3,r10
+ blr
+
+.Lbig: stmw r30,-32(r1)
+ addi r5,r5,-1
+ srwi r0,r5,2
+ mtctr r0
+
+ lwz r7,0(r4)
+ mullw r8,r7,r6
+ mulhwu r0,r7,r6
+ lwz r7,0(r3)
+ subfc r7,r8,r7
+ addc r8,r8,r7
+ stw r7,0(r3)
+
+.LloopU:
+ lwz r7,4(r4)
+ lwz r12,8(r4)
+ lwz r30,12(r4)
+ lwzu r31,16(r4)
+ mullw r8,r7,r6
+ mullw r9,r12,r6
+ mullw r10,r30,r6
+ mullw r11,r31,r6
+ adde r8,r8,r0 C add cy_limb
+ mulhwu r0,r7,r6
+ lwz r7,4(r3)
+ adde r9,r9,r0
+ mulhwu r0,r12,r6
+ lwz r12,8(r3)
+ adde r10,r10,r0
+ mulhwu r0,r30,r6
+ lwz r30,12(r3)
+ adde r11,r11,r0
+ mulhwu r0,r31,r6
+ lwz r31,16(r3)
+ addze r0,r0 C new cy_limb
+ subfc r7,r8,r7
+ stw r7,4(r3)
+ subfe r12,r9,r12
+ stw r12,8(r3)
+ subfe r30,r10,r30
+ stw r30,12(r3)
+ subfe r31,r11,r31
+ stwu r31,16(r3)
+ subfe r11,r11,r11 C invert ...
+ addic r11,r11,1 C ... carry
+ bdnz .LloopU
+
+ andi. r31,r5,3
+ mtctr r31
+ beq cr0,.Lendx
+
+.LloopE:
+ lwzu r7,4(r4)
+ mullw r8,r7,r6
+ adde r8,r8,r0 C add cy_limb
+ mulhwu r0,r7,r6
+ lwz r7,4(r3)
+ addze r0,r0 C new cy_limb
+ subfc r7,r8,r7
+ addc r8,r8,r7
+ stwu r7,4(r3)
+ bdnz .LloopE
+.Lendx:
+ addze r3,r0
+ lmw r30,-32(r1)
+ blr
+EPILOGUE(mpn_submul_1)
diff --git a/rts/gmp/mpn/powerpc32/umul.asm b/rts/gmp/mpn/powerpc32/umul.asm
new file mode 100644
index 0000000000..eeaa0a4dc8
--- /dev/null
+++ b/rts/gmp/mpn/powerpc32/umul.asm
@@ -0,0 +1,32 @@
+dnl PowerPC-32 umul_ppmm -- support for longlong.h
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published by
+dnl the Free Software Foundation; either version 2.1 of the License, or (at your
+dnl option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+ mullw 0,4,5
+ mulhwu 9,4,5
+ stw 0,0(3)
+ mr 3,9
+ blr
+EPILOGUE(mpn_umul_ppmm)
diff --git a/rts/gmp/mpn/powerpc64/README b/rts/gmp/mpn/powerpc64/README
new file mode 100644
index 0000000000..c779276917
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/README
@@ -0,0 +1,36 @@
+PPC630 (aka Power3) pipeline information:
+
+Decoding is 4-way and issue is 8-way with some out-of-order capability.
+LS1 - ld/st unit 1
+LS2 - ld/st unit 2
+FXU1 - integer unit 1, handles any simple integer instructions
+FXU2 - integer unit 2, handles any simple integer instructions
+FXU3 - integer unit 3, handles integer multiply and divide
+FPU1 - floating-point unit 1
+FPU2 - floating-point unit 2
+
+Memory: Any two memory operations can issue, but memory subsystem
+ can sustain just one store per cycle.
+Simple integer: 2 operations (such as add, rl*)
+Integer multiply: 1 operation every 9th cycle worst case; exact timing depends
+ on 2nd operand most significant bit position (10 bits per
+ cycle). Multiply unit is not pipelined, only one multiply
+ operation in progress is allowed.
+Integer divide: ?
+Floating-point: Any plain 2 arithmetic instructions (such as fmul, fadd, fmadd)
+ Latency = 4.
+Floating-point divide:
+ ?
+Floating-point square root:
+ ?
+
+Best possible times for the main loops:
+shift: 1.5 cycles limited by integer unit contention.
+ With 63 special loops, one for each shift count, we could
+ reduce the needed integer instructions to 2, which would
+ reduce the best possible time to 1 cycle.
+add/sub: 1.5 cycles, limited by ld/st unit contention.
+mul: 18 cycles (average) unless floating-point operations are used,
+ but that would only help for multiplies of perhaps 10 and more
+ limbs.
+addmul/submul:Same situation as for mul.
diff --git a/rts/gmp/mpn/powerpc64/add_n.asm b/rts/gmp/mpn/powerpc64/add_n.asm
new file mode 100644
index 0000000000..c3325376dc
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/add_n.asm
@@ -0,0 +1,61 @@
+# PowerPC-64 mpn_add_n -- Add two limb vectors of the same length > 0 and
+# store sum in a third limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ mtctr r6 # copy size into CTR
+ addic r0,r0,0 # clear cy
+ ld r8,0(r4) # load least significant s1 limb
+ ld r0,0(r5) # load least significant s2 limb
+ addi r3,r3,-8 # offset res_ptr, it's updated before it's used
+ bdz .Lend # If done, skip loop
+.Loop: ld r9,8(r4) # load s1 limb
+ ld r10,8(r5) # load s2 limb
+ adde r7,r0,r8 # add limbs with cy, set cy
+ std r7,8(r3) # store result limb
+ bdz .Lexit # decrement CTR and exit if done
+ ldu r8,16(r4) # load s1 limb and update s1_ptr
+ ldu r0,16(r5) # load s2 limb and update s2_ptr
+ adde r7,r10,r9 # add limbs with cy, set cy
+ stdu r7,16(r3) # store result limb and update res_ptr
+ bdnz .Loop # decrement CTR and loop back
+
+.Lend: adde r7,r0,r8
+ std r7,8(r3) # store ultimate result limb
+ li r3,0 # load cy into ...
+ addze r3,r3 # ... return value register
+ blr
+.Lexit: adde r7,r10,r9
+ std r7,16(r3)
+ li r3,0 # load cy into ...
+ addze r3,r3 # ... return value register
+ blr
+EPILOGUE(mpn_add_n)
diff --git a/rts/gmp/mpn/powerpc64/addmul_1.asm b/rts/gmp/mpn/powerpc64/addmul_1.asm
new file mode 100644
index 0000000000..81774482fe
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/addmul_1.asm
@@ -0,0 +1,52 @@
+# PowerPC-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ mtctr 5
+ li 9,0 # cy_limb = 0
+ addic 0,0,0
+ cal 3,-8(3)
+ cal 4,-8(4)
+.Loop:
+ ldu 0,8(4)
+ ld 10,8(3)
+ mulld 7,0,6
+ adde 7,7,9
+ mulhdu 9,0,6
+ addze 9,9
+ addc 7,7,10
+ stdu 7,8(3)
+ bdnz .Loop
+
+ addze 3,9
+ blr
+EPILOGUE(mpn_addmul_1)
diff --git a/rts/gmp/mpn/powerpc64/addsub_n.asm b/rts/gmp/mpn/powerpc64/addsub_n.asm
new file mode 100644
index 0000000000..4ed40d71ae
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/addsub_n.asm
@@ -0,0 +1,107 @@
+# PowerPC-64 mpn_addsub_n -- Simultaneous add and sub.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+
+include(`asm-syntax.m4')
+
+define(SAVE_BORROW_RESTORE_CARRY,
+ `sldi $1,$1,63
+ adde $1,$1,$1')
+define(SAVE_CARRY_RESTORE_BORROW,
+ `sldi $1,$1,63
+ adde $1,$1,$1')
+
+# 19991117
+
+# This is just crafted for testing some ideas, and verifying that we can make
+# it run fast. It runs at 2.55 cycles/limb on the 630, which is very good.
+# We should play a little with the schedule. No time has been spent on that.
+
+# To finish this, the loop warm up and cool down code needs to be written,
+# and the result need to be tested. Also, the proper calling sequence should
+# be used.
+
+# r1p r2p s1p s2p n
+# Use reg r0, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12
+
+ASM_START()
+PROLOGUE(mpn_addsub_n)
+ std r14,-64(1)
+ std r15,-56(1)
+ std r16,-48(1)
+ std r17,-40(1)
+ std r18,-32(1)
+ std r19,-24(1)
+
+ srdi r7,r7,2
+ mtctr r7 # copy size into CTR
+ addic r0,r0,0 # clear cy
+ addi r3,r3,-8 # offset res_ptr, it's updated before it's used
+	addi	r4,r4,-8		# offset 2nd res_ptr (r2p), it's updated before it's used
+
+.Loop:
+ adde r12,r8,r9
+ std r12,8(r3)
+ adde r12,r10,r11
+ std r12,16(r3)
+
+ SAVE_CARRY_RESTORE_BORROW(r0)
+
+ subfe r12,r8,r9
+ std r12,8(r4)
+ ld r8,8(r5) # s1 L 1
+ ld r9,8(r6) # s2 L 1
+ subfe r12,r10,r11
+ std r12,16(r4)
+ ld r10,16(r5) # s1 L 2
+ ld r11,16(r6) # s2 L 2
+# pair -------------------------
+ subfe r12,r14,r15
+ std r12,24(r4)
+ subfe r12,r16,r17
+ stdu r12,32(r4)
+
+ SAVE_BORROW_RESTORE_CARRY(r0)
+
+ adde r12,r14,r15
+ std r12,24(r3)
+ ld r14,24(r5) # s1 L 3
+ ld r15,24(r6) # s2 L 3
+ adde r12,r16,r17
+ stdu r12,32(r3)
+ ldu r16,32(r5) # s1 L 4
+ ldu r17,32(r6) # s2 L 4
+ bdnz .Loop
+
+ ld r14,-64(1)
+ ld r15,-56(1)
+ ld r16,-48(1)
+ ld r17,-40(1)
+ ld r18,-32(1)
+ ld r19,-24(1)
+ blr
+EPILOGUE(mpn_addsub_n)
diff --git a/rts/gmp/mpn/powerpc64/aix.m4 b/rts/gmp/mpn/powerpc64/aix.m4
new file mode 100644
index 0000000000..aee9f1f97a
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/aix.m4
@@ -0,0 +1,40 @@
+divert(-1)
+dnl m4 macros for AIX 64-bit assembly.
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+define(`ASM_START',
+ `.machine "ppc64"
+ .toc')
+
+define(`PROLOGUE',
+ `
+ .globl $1
+ .globl .$1
+ .csect $1[DS],3
+$1:
+ .llong .$1, TOC[tc0], 0
+ .csect .text[PR]
+ .align 2
+.$1:')
+
+define(`EPILOGUE', `')
+
+divert
diff --git a/rts/gmp/mpn/powerpc64/copyd.asm b/rts/gmp/mpn/powerpc64/copyd.asm
new file mode 100644
index 0000000000..d06e8c25fd
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/copyd.asm
@@ -0,0 +1,45 @@
+# PowerPC-64 mpn_copyd -- Copy a limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# rptr r3
+# sptr r4
+# n r5
+
+include(`../config.m4')
+
+# Unrolling this analogous to sparc64/copyi.s doesn't help for any
+# operand sizes.
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+ cmpdi cr0,r5,0
+ mtctr r5
+ sldi r5,r5,3
+ add r4,r4,r5
+ add r3,r3,r5
+ beq cr0,.Lend
+.Loop: ldu r0,-8(r4)
+ stdu r0,-8(r3)
+ bdnz .Loop
+.Lend: blr
+EPILOGUE(mpn_copyd)
diff --git a/rts/gmp/mpn/powerpc64/copyi.asm b/rts/gmp/mpn/powerpc64/copyi.asm
new file mode 100644
index 0000000000..a1bedc4c5b
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/copyi.asm
@@ -0,0 +1,44 @@
+# PowerPC-64 mpn_copyi -- Copy a limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# rptr r3
+# sptr r4
+# n r5
+
+include(`../config.m4')
+
+# Unrolling this analogous to sparc64/copyi.s doesn't help for any
+# operand sizes.
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+ cmpdi cr0,r5,0
+ mtctr r5
+ addi r4,r4,-8
+ addi r3,r3,-8
+ beq cr0,.Lend
+.Loop: ldu r0,8(r4)
+ stdu r0,8(r3)
+ bdnz .Loop
+.Lend: blr
+EPILOGUE(mpn_copyi)
diff --git a/rts/gmp/mpn/powerpc64/gmp-mparam.h b/rts/gmp/mpn/powerpc64/gmp-mparam.h
new file mode 100644
index 0000000000..6fefb960cd
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/gmp-mparam.h
@@ -0,0 +1,62 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* Generated by tuneup.c, 2000-07-16. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 10
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 57
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 16
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 89
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 28
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 216
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 14
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 6
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 163
+#endif
diff --git a/rts/gmp/mpn/powerpc64/lshift.asm b/rts/gmp/mpn/powerpc64/lshift.asm
new file mode 100644
index 0000000000..cef3a81fdd
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/lshift.asm
@@ -0,0 +1,159 @@
+# PowerPC-64 mpn_lshift -- Shift a number left.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# cnt r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ cmpdi cr0,r5,20 # more than 20 limbs?
+ sldi r0,r5,3
+ add r4,r4,r0 # make r4 point at end of s1
+ add r7,r3,r0 # make r7 point at end of res
+	bgt	.LBIG			# branch if more than 20 limbs
+
+ mtctr r5 # copy size into CTR
+ subfic r8,r6,64
+ ldu r11,-8(r4) # load first s1 limb
+ srd r3,r11,r8 # compute function return value
+ bdz .Lend1
+
+.Loop: ldu r10,-8(r4)
+ sld r9,r11,r6
+ srd r12,r10,r8
+ or r9,r9,r12
+ stdu r9,-8(r7)
+ bdz .Lend2
+ ldu r11,-8(r4)
+ sld r9,r10,r6
+ srd r12,r11,r8
+ or r9,r9,r12
+ stdu r9,-8(r7)
+ bdnz .Loop
+
+.Lend1: sld r0,r11,r6
+ std r0,-8(r7)
+ blr
+.Lend2: sld r0,r10,r6
+ std r0,-8(r7)
+ blr
+
+.LBIG:
+ std r24,-64(1)
+ std r25,-56(1)
+ std r26,-48(1)
+ std r27,-40(1)
+ std r28,-32(1)
+ std r29,-24(1)
+ std r30,-16(1)
+ std r31,-8(1)
+ ldu r9,-8(r4)
+ subfic r8,r6,64
+ srd r3,r9,r8 # compute function return value
+ sld r0,r9,r6
+ addi r5,r5,-1
+
+ andi. r10,r5,3 # count for spill loop
+ beq .Le
+ mtctr r10
+ ldu r28,-8(r4)
+ bdz .Lxe0
+
+.Loop0: sld r12,r28,r6
+ srd r24,r28,r8
+ ldu r28,-8(r4)
+ or r24,r0,r24
+ stdu r24,-8(r7)
+ mr r0,r12
+ bdnz .Loop0 # taken at most once!
+
+.Lxe0: sld r12,r28,r6
+ srd r24,r28,r8
+ or r24,r0,r24
+ stdu r24,-8(r7)
+ mr r0,r12
+
+.Le: srdi r5,r5,2 # count for unrolled loop
+ addi r5,r5,-1
+ mtctr r5
+ ld r28,-8(r4)
+ ld r29,-16(r4)
+ ld r30,-24(r4)
+ ldu r31,-32(r4)
+
+.LoopU: sld r9,r28,r6
+ srd r24,r28,r8
+ ld r28,-8(r4)
+ sld r10,r29,r6
+ srd r25,r29,r8
+ ld r29,-16(r4)
+ sld r11,r30,r6
+ srd r26,r30,r8
+ ld r30,-24(r4)
+ sld r12,r31,r6
+ srd r27,r31,r8
+ ldu r31,-32(r4)
+ or r24,r0,r24
+ std r24,-8(r7)
+ or r25,r9,r25
+ std r25,-16(r7)
+ or r26,r10,r26
+ std r26,-24(r7)
+ or r27,r11,r27
+ stdu r27,-32(r7)
+ mr r0,r12
+ bdnz .LoopU
+
+ sld r9,r28,r6
+ srd r24,r28,r8
+ sld r10,r29,r6
+ srd r25,r29,r8
+ sld r11,r30,r6
+ srd r26,r30,r8
+ sld r12,r31,r6
+ srd r27,r31,r8
+ or r24,r0,r24
+ std r24,-8(r7)
+ or r25,r9,r25
+ std r25,-16(r7)
+ or r26,r10,r26
+ std r26,-24(r7)
+ or r27,r11,r27
+ stdu r27,-32(r7)
+ mr r0,r12
+
+ std r0,-8(r7)
+ ld r24,-64(1)
+ ld r25,-56(1)
+ ld r26,-48(1)
+ ld r27,-40(1)
+ ld r28,-32(1)
+ ld r29,-24(1)
+ ld r30,-16(1)
+ ld r31,-8(1)
+ blr
+EPILOGUE(mpn_lshift)
diff --git a/rts/gmp/mpn/powerpc64/mul_1.asm b/rts/gmp/mpn/powerpc64/mul_1.asm
new file mode 100644
index 0000000000..47597283ff
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/mul_1.asm
@@ -0,0 +1,49 @@
+# PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ mtctr 5
+ li 9,0 # cy_limb = 0
+ addic 0,0,0
+ cal 3,-8(3)
+ cal 4,-8(4)
+.Loop:
+ ldu 0,8(4)
+ mulld 7,0,6
+ adde 7,7,9
+ mulhdu 9,0,6
+ stdu 7,8(3)
+ bdnz .Loop
+
+ addze 3,9
+ blr
+EPILOGUE(mpn_mul_1)
diff --git a/rts/gmp/mpn/powerpc64/rshift.asm b/rts/gmp/mpn/powerpc64/rshift.asm
new file mode 100644
index 0000000000..88272c7fa9
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/rshift.asm
@@ -0,0 +1,60 @@
+# PowerPC-64 mpn_rshift -- Shift a number right.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# cnt r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ mtctr r5 # copy size into CTR
+ addi r7,r3,-8 # move adjusted res_ptr to free return reg
+ subfic r8,r6,64
+ ld r11,0(r4) # load first s1 limb
+ sld r3,r11,r8 # compute function return value
+ bdz .Lend1
+
+.Loop: ldu r10,8(r4)
+ srd r9,r11,r6
+ sld r12,r10,r8
+ or r9,r9,r12
+ stdu r9,8(r7)
+ bdz .Lend2
+ ldu r11,8(r4)
+ srd r9,r10,r6
+ sld r12,r11,r8
+ or r9,r9,r12
+ stdu r9,8(r7)
+ bdnz .Loop
+
+.Lend1: srd r0,r11,r6
+ std r0,8(r7)
+ blr
+
+.Lend2: srd r0,r10,r6
+ std r0,8(r7)
+ blr
+EPILOGUE(mpn_rshift)
diff --git a/rts/gmp/mpn/powerpc64/sub_n.asm b/rts/gmp/mpn/powerpc64/sub_n.asm
new file mode 100644
index 0000000000..4de3de69c7
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/sub_n.asm
@@ -0,0 +1,61 @@
+# PowerPC-64 mpn_sub_n -- Subtract two limb vectors of the same length > 0
+# and store difference in a third limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# s2_ptr r5
+# size r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ mtctr r6 # copy size into CTR
+ addic r0,r6,-1 # set cy
+ ld r8,0(r4) # load least significant s1 limb
+ ld r0,0(r5) # load least significant s2 limb
+ addi r3,r3,-8 # offset res_ptr, it's updated before it's used
+ bdz .Lend # If done, skip loop
+.Loop: ld r9,8(r4) # load s1 limb
+ ld r10,8(r5) # load s2 limb
+ subfe r7,r0,r8 # subtract limbs with cy, set cy
+ std r7,8(r3) # store result limb
+ bdz .Lexit # decrement CTR and exit if done
+ ldu r8,16(r4) # load s1 limb and update s1_ptr
+ ldu r0,16(r5) # load s2 limb and update s2_ptr
+ subfe r7,r10,r9 # subtract limbs with cy, set cy
+ stdu r7,16(r3) # store result limb and update res_ptr
+ bdnz .Loop # decrement CTR and loop back
+
+.Lend: subfe r7,r0,r8
+ std r7,8(r3) # store ultimate result limb
+ subfe r3,r0,r0 # load !cy into ...
+ subfic r3,r3,0 # ... return value register
+ blr
+.Lexit: subfe r7,r10,r9
+ std r7,16(r3)
+ subfe r3,r0,r0 # load !cy into ...
+ subfic r3,r3,0 # ... return value register
+ blr
+EPILOGUE(mpn_sub_n)
diff --git a/rts/gmp/mpn/powerpc64/submul_1.asm b/rts/gmp/mpn/powerpc64/submul_1.asm
new file mode 100644
index 0000000000..17f6369a38
--- /dev/null
+++ b/rts/gmp/mpn/powerpc64/submul_1.asm
@@ -0,0 +1,54 @@
+# PowerPC-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr r3
+# s1_ptr r4
+# size r5
+# s2_limb r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ mtctr 5
+ li 9,0 # cy_limb = 0
+ addic 0,0,0
+ cal 3,-8(3)
+ cal 4,-8(4)
+.Loop:
+ ldu 0,8(4)
+ ld 10,8(3)
+ mulld 7,0,6
+ adde 7,7,9
+ mulhdu 9,0,6
+ addze 9,9
+ subfc 7,7,10
+ stdu 7,8(3)
+ subfe 11,11,11 # invert ...
+ addic 11,11,1 # ... carry
+ bdnz .Loop
+
+ addze 3,9
+ blr
+EPILOGUE(mpn_submul_1)
diff --git a/rts/gmp/mpn/pyr/add_n.s b/rts/gmp/mpn/pyr/add_n.s
new file mode 100644
index 0000000000..e1fc535846
--- /dev/null
+++ b/rts/gmp/mpn/pyr/add_n.s
@@ -0,0 +1,76 @@
+# Pyramid __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 2
+.globl ___gmpn_add_n
+___gmpn_add_n:
+ movw $-1,tr0 # representation for carry clear
+
+ movw pr3,tr2
+ andw $3,tr2
+ beq Lend0
+ subw tr2,pr3
+
+Loop0: rsubw $0,tr0 # restore carry bit from carry-save register
+
+ movw (pr1),tr1
+ addwc (pr2),tr1
+ movw tr1,(pr0)
+
+ subwb tr0,tr0
+ addw $4,pr0
+ addw $4,pr1
+ addw $4,pr2
+ addw $-1,tr2
+ bne Loop0
+
+ mtstw pr3,pr3
+ beq Lend
+Lend0:
+Loop: rsubw $0,tr0 # restore carry bit from carry-save register
+
+ movw (pr1),tr1
+ addwc (pr2),tr1
+ movw tr1,(pr0)
+
+ movw 4(pr1),tr1
+ addwc 4(pr2),tr1
+ movw tr1,4(pr0)
+
+ movw 8(pr1),tr1
+ addwc 8(pr2),tr1
+ movw tr1,8(pr0)
+
+ movw 12(pr1),tr1
+ addwc 12(pr2),tr1
+ movw tr1,12(pr0)
+
+ subwb tr0,tr0
+ addw $16,pr0
+ addw $16,pr1
+ addw $16,pr2
+ addw $-4,pr3
+ bne Loop
+Lend:
+ mnegw tr0,pr0
+ ret
diff --git a/rts/gmp/mpn/pyr/addmul_1.s b/rts/gmp/mpn/pyr/addmul_1.s
new file mode 100644
index 0000000000..65c3f8f008
--- /dev/null
+++ b/rts/gmp/mpn/pyr/addmul_1.s
@@ -0,0 +1,45 @@
+# Pyramid __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 2
+.globl ___gmpn_addmul_1
+___gmpn_addmul_1:
+ mova (pr0)[pr2*4],pr0
+ mova (pr1)[pr2*4],pr1
+ mnegw pr2,pr2
+ movw $0,tr3
+
+Loop: movw (pr1)[pr2*4],tr1
+ uemul pr3,tr0
+ addw tr3,tr1
+ movw $0,tr3
+ addwc tr0,tr3
+ movw (pr0)[pr2*0x4],tr0
+ addw tr0,tr1
+ addwc $0,tr3
+ movw tr1,(pr0)[pr2*4]
+ addw $1,pr2
+ bne Loop
+
+ movw tr3,pr0
+ ret
diff --git a/rts/gmp/mpn/pyr/mul_1.s b/rts/gmp/mpn/pyr/mul_1.s
new file mode 100644
index 0000000000..1272297c42
--- /dev/null
+++ b/rts/gmp/mpn/pyr/mul_1.s
@@ -0,0 +1,42 @@
+# Pyramid __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 2
+.globl ___gmpn_mul_1
+___gmpn_mul_1:
+ mova (pr0)[pr2*4],pr0
+ mova (pr1)[pr2*4],pr1
+ mnegw pr2,pr2
+ movw $0,tr3
+
+Loop: movw (pr1)[pr2*4],tr1
+ uemul pr3,tr0
+ addw tr3,tr1
+ movw $0,tr3
+ addwc tr0,tr3
+ movw tr1,(pr0)[pr2*4]
+ addw $1,pr2
+ bne Loop
+
+ movw tr3,pr0
+ ret
diff --git a/rts/gmp/mpn/pyr/sub_n.s b/rts/gmp/mpn/pyr/sub_n.s
new file mode 100644
index 0000000000..1fd2eb0f17
--- /dev/null
+++ b/rts/gmp/mpn/pyr/sub_n.s
@@ -0,0 +1,76 @@
+# Pyramid __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+# store difference in a third limb vector.
+
+# Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+.text
+ .align 2
+.globl ___gmpn_sub_n
+___gmpn_sub_n:
+ movw $-1,tr0 # representation for carry clear
+
+ movw pr3,tr2
+ andw $3,tr2
+ beq Lend0
+ subw tr2,pr3
+
+Loop0: rsubw $0,tr0 # restore carry bit from carry-save register
+
+ movw (pr1),tr1
+ subwb (pr2),tr1
+ movw tr1,(pr0)
+
+ subwb tr0,tr0
+ addw $4,pr0
+ addw $4,pr1
+ addw $4,pr2
+ addw $-1,tr2
+ bne Loop0
+
+ mtstw pr3,pr3
+ beq Lend
+Lend0:
+Loop: rsubw $0,tr0 # restore carry bit from carry-save register
+
+ movw (pr1),tr1
+ subwb (pr2),tr1
+ movw tr1,(pr0)
+
+ movw 4(pr1),tr1
+ subwb 4(pr2),tr1
+ movw tr1,4(pr0)
+
+ movw 8(pr1),tr1
+ subwb 8(pr2),tr1
+ movw tr1,8(pr0)
+
+ movw 12(pr1),tr1
+ subwb 12(pr2),tr1
+ movw tr1,12(pr0)
+
+ subwb tr0,tr0
+ addw $16,pr0
+ addw $16,pr1
+ addw $16,pr2
+ addw $-4,pr3
+ bne Loop
+Lend:
+ mnegw tr0,pr0
+ ret
diff --git a/rts/gmp/mpn/sh/add_n.s b/rts/gmp/mpn/sh/add_n.s
new file mode 100644
index 0000000000..df388b31a3
--- /dev/null
+++ b/rts/gmp/mpn/sh/add_n.s
@@ -0,0 +1,47 @@
+! SH __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+! sum in a third limb vector.
+
+! Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! s2_ptr r6
+! size r7
+
+ .text
+ .align 2
+ .global ___gmpn_add_n
+___gmpn_add_n:
+ mov #0,r3 ! clear cy save reg
+
+Loop: mov.l @r5+,r1
+ mov.l @r6+,r2
+ shlr r3 ! restore cy
+ addc r2,r1
+ movt r3 ! save cy
+ mov.l r1,@r4
+ dt r7
+ bf.s Loop
+ add #4,r4
+
+ rts
+ mov r3,r0 ! return carry-out from most sign. limb
diff --git a/rts/gmp/mpn/sh/sh2/addmul_1.s b/rts/gmp/mpn/sh/sh2/addmul_1.s
new file mode 100644
index 0000000000..f34a7f0503
--- /dev/null
+++ b/rts/gmp/mpn/sh/sh2/addmul_1.s
@@ -0,0 +1,53 @@
+! SH2 __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+! the result to a second limb vector.
+
+! Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! size r6
+! s2_limb r7
+
+ .text
+ .align 1
+ .global ___gmpn_addmul_1
+___gmpn_addmul_1:
+ mov #0,r2 ! cy_limb = 0
+ mov #0,r0 ! Keep r0 = 0 for entire loop
+ clrt
+
+Loop: mov.l @r5+,r3
+ dmulu.l r3,r7
+ sts macl,r1
+ addc r2,r1 ! lo_prod += old cy_limb
+ sts mach,r2 ! new cy_limb = hi_prod
+ mov.l @r4,r3
+ addc r0,r2 ! cy_limb += T, T = 0
+ addc r3,r1
+ addc r0,r2 ! cy_limb += T, T = 0
+ dt r6
+ mov.l r1,@r4
+ bf.s Loop
+ add #4,r4
+
+ rts
+ mov r2,r0
diff --git a/rts/gmp/mpn/sh/sh2/mul_1.s b/rts/gmp/mpn/sh/sh2/mul_1.s
new file mode 100644
index 0000000000..2a117a3175
--- /dev/null
+++ b/rts/gmp/mpn/sh/sh2/mul_1.s
@@ -0,0 +1,50 @@
+! SH2 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+! the result in a second limb vector.
+
+! Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! size r6
+! s2_limb r7
+
+ .text
+ .align 1
+ .global ___gmpn_mul_1
+___gmpn_mul_1:
+ mov #0,r2 ! cy_limb = 0
+ mov #0,r0 ! Keep r0 = 0 for entire loop
+ clrt
+
+Loop: mov.l @r5+,r3
+ dmulu.l r3,r7
+ sts macl,r1
+ addc r2,r1
+ sts mach,r2
+ addc r0,r2 ! propagate carry to cy_limb (dt clobbers T)
+ dt r6
+ mov.l r1,@r4
+ bf.s Loop
+ add #4,r4
+
+ rts
+ mov r2,r0
diff --git a/rts/gmp/mpn/sh/sh2/submul_1.s b/rts/gmp/mpn/sh/sh2/submul_1.s
new file mode 100644
index 0000000000..eb9a27dde3
--- /dev/null
+++ b/rts/gmp/mpn/sh/sh2/submul_1.s
@@ -0,0 +1,53 @@
+! SH2 __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+! the result from a second limb vector.
+
+! Copyright (C) 1995, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! size r6
+! s2_limb r7
+
+ .text
+ .align 1
+ .global ___gmpn_submul_1
+___gmpn_submul_1:
+ mov #0,r2 ! cy_limb = 0
+ mov #0,r0 ! Keep r0 = 0 for entire loop
+ clrt
+
+Loop: mov.l @r5+,r3
+ dmulu.l r3,r7
+ sts macl,r1
+ addc r2,r1 ! lo_prod += old cy_limb
+ sts mach,r2 ! new cy_limb = hi_prod
+ mov.l @r4,r3
+ addc r0,r2 ! cy_limb += T, T = 0
+ subc r3,r1
+ addc r0,r2 ! cy_limb += T, T = 0
+ dt r6
+ mov.l r1,@r4
+ bf.s Loop
+ add #4,r4
+
+ rts
+ mov r2,r0
diff --git a/rts/gmp/mpn/sh/sub_n.s b/rts/gmp/mpn/sh/sub_n.s
new file mode 100644
index 0000000000..5f818c95a8
--- /dev/null
+++ b/rts/gmp/mpn/sh/sub_n.s
@@ -0,0 +1,47 @@
+! SH __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
+! difference in a third limb vector.
+
+! Copyright (C) 1995, 1997, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r4
+! s1_ptr r5
+! s2_ptr r6
+! size r7
+
+ .text
+ .align 2
+ .global ___gmpn_sub_n
+___gmpn_sub_n:
+ mov #0,r3 ! clear cy save reg
+
+Loop: mov.l @r5+,r1
+ mov.l @r6+,r2
+ shlr r3 ! restore cy
+ subc r2,r1
+ movt r3 ! save cy
+ mov.l r1,@r4
+ dt r7
+ bf.s Loop
+ add #4,r4
+
+ rts
+ mov r3,r0 ! return carry-out from most sign. limb
diff --git a/rts/gmp/mpn/sparc32/README b/rts/gmp/mpn/sparc32/README
new file mode 100644
index 0000000000..7c19df7bc4
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/README
@@ -0,0 +1,36 @@
+This directory contains mpn functions for various SPARC chips. Code that
+runs only on version 8 SPARC implementations, is in the v8 subdirectory.
+
+RELEVANT OPTIMIZATION ISSUES
+
+ Load and Store timing
+
+On most early SPARC implementations, the ST instruction takes multiple
+cycles, while a STD takes just a single cycle more than an ST.  For the CPUs
+in SPARCstation I and II, the times are 3 and 4 cycles, respectively.
+Therefore, combining two ST instructions into a STD when possible is a
+significant optimization.
+
+Later SPARC implementations have single cycle ST.
+
+For SuperSPARC, we can perform just one memory instruction per cycle, even
+if up to two integer instructions can be executed in its pipeline. For
+programs that perform so many memory operations that there are not enough
+non-memory operations to issue in parallel with all memory operations, using
+LDD and STD when possible helps.
+
+STATUS
+
+1. On a SuperSPARC, mpn_lshift and mpn_rshift run at 3 cycles/limb, or 2.5
+ cycles/limb asymptotically. We could optimize speed for special counts
+ by using ADDXCC.
+
+2. On a SuperSPARC, mpn_add_n and mpn_sub_n run at 2.5 cycles/limb, or 2
+   cycles/limb asymptotically.
+
+3. mpn_mul_1 runs at what is believed to be optimal speed.
+
+4. On SuperSPARC, mpn_addmul_1 and mpn_submul_1 could both be improved by a
+   cycle by avoiding one of the add instructions.  See a29k/addmul_1.
+
+The speed of the code for other SPARC implementations is uncertain.
diff --git a/rts/gmp/mpn/sparc32/add_n.asm b/rts/gmp/mpn/sparc32/add_n.asm
new file mode 100644
index 0000000000..5f1d00c0e0
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/add_n.asm
@@ -0,0 +1,236 @@
+dnl SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store
+dnl sum in a third limb vector.
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(res_ptr,%o0)
+define(s1_ptr,%o1)
+define(s2_ptr,%o2)
+define(n,%o3)
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ xor s2_ptr,res_ptr,%g1
+ andcc %g1,4,%g0
+ bne L(1) C branch if alignment differs
+ nop
+C ** V1a **
+L(0): andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0
+ be L(v1) C if no, branch
+ nop
+C Add least significant limb separately to align res_ptr and s2_ptr
+ ld [s1_ptr],%g4
+ add s1_ptr,4,s1_ptr
+ ld [s2_ptr],%g2
+ add s2_ptr,4,s2_ptr
+ add n,-1,n
+ addcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+ add res_ptr,4,res_ptr
+L(v1): addx %g0,%g0,%o4 C save cy in register
+ cmp n,2 C if n < 2 ...
+ bl L(end2) C ... branch to tail code
+ subcc %g0,%o4,%g0 C restore cy
+
+ ld [s1_ptr+0],%g4
+ addcc n,-10,n
+ ld [s1_ptr+4],%g1
+ ldd [s2_ptr+0],%g2
+ blt L(fin1)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 8 limbs until less than 8 limbs remain
+L(loop1):
+ addxcc %g4,%g2,%o4
+ ld [s1_ptr+8],%g4
+ addxcc %g1,%g3,%o5
+ ld [s1_ptr+12],%g1
+ ldd [s2_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ addxcc %g4,%g2,%o4
+ ld [s1_ptr+16],%g4
+ addxcc %g1,%g3,%o5
+ ld [s1_ptr+20],%g1
+ ldd [s2_ptr+16],%g2
+ std %o4,[res_ptr+8]
+ addxcc %g4,%g2,%o4
+ ld [s1_ptr+24],%g4
+ addxcc %g1,%g3,%o5
+ ld [s1_ptr+28],%g1
+ ldd [s2_ptr+24],%g2
+ std %o4,[res_ptr+16]
+ addxcc %g4,%g2,%o4
+ ld [s1_ptr+32],%g4
+ addxcc %g1,%g3,%o5
+ ld [s1_ptr+36],%g1
+ ldd [s2_ptr+32],%g2
+ std %o4,[res_ptr+24]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ add s1_ptr,32,s1_ptr
+ add s2_ptr,32,s2_ptr
+ add res_ptr,32,res_ptr
+ bge L(loop1)
+ subcc %g0,%o4,%g0 C restore cy
+
+L(fin1):
+ addcc n,8-2,n
+ blt L(end1)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 2 limbs until less than 2 limbs remain
+L(loope1):
+ addxcc %g4,%g2,%o4
+ ld [s1_ptr+8],%g4
+ addxcc %g1,%g3,%o5
+ ld [s1_ptr+12],%g1
+ ldd [s2_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-2,n
+ add s1_ptr,8,s1_ptr
+ add s2_ptr,8,s2_ptr
+ add res_ptr,8,res_ptr
+ bge L(loope1)
+ subcc %g0,%o4,%g0 C restore cy
+L(end1):
+ addxcc %g4,%g2,%o4
+ addxcc %g1,%g3,%o5
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+
+ andcc n,1,%g0
+ be L(ret1)
+ subcc %g0,%o4,%g0 C restore cy
+C Add last limb
+ ld [s1_ptr+8],%g4
+ ld [s2_ptr+8],%g2
+ addxcc %g4,%g2,%o4
+ st %o4,[res_ptr+8]
+
+L(ret1):
+ retl
+ addx %g0,%g0,%o0 C return carry-out from most sign. limb
+
+L(1): xor s1_ptr,res_ptr,%g1
+ andcc %g1,4,%g0
+ bne L(2)
+ nop
+C ** V1b **
+ mov s2_ptr,%g1
+ mov s1_ptr,s2_ptr
+ b L(0)
+ mov %g1,s1_ptr
+
+C ** V2 **
+C If we come here, the alignment of s1_ptr and res_ptr as well as the
+C alignment of s2_ptr and res_ptr differ. Since there are only two ways
+C things can be aligned (that we care about) we now know that the alignment
+C of s1_ptr and s2_ptr are the same.
+
+L(2): cmp n,1
+ be L(jone)
+ nop
+ andcc s1_ptr,4,%g0 C s1_ptr unaligned? Side effect: cy=0
+ be L(v2) C if no, branch
+ nop
+C Add least significant limb separately to align s1_ptr and s2_ptr
+ ld [s1_ptr],%g4
+ add s1_ptr,4,s1_ptr
+ ld [s2_ptr],%g2
+ add s2_ptr,4,s2_ptr
+ add n,-1,n
+ addcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+ add res_ptr,4,res_ptr
+
+L(v2): addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ blt L(fin2)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 8 limbs until less than 8 limbs remain
+L(loop2):
+ ldd [s1_ptr+0],%g2
+ ldd [s2_ptr+0],%o4
+ addxcc %g2,%o4,%g2
+ st %g2,[res_ptr+0]
+ addxcc %g3,%o5,%g3
+ st %g3,[res_ptr+4]
+ ldd [s1_ptr+8],%g2
+ ldd [s2_ptr+8],%o4
+ addxcc %g2,%o4,%g2
+ st %g2,[res_ptr+8]
+ addxcc %g3,%o5,%g3
+ st %g3,[res_ptr+12]
+ ldd [s1_ptr+16],%g2
+ ldd [s2_ptr+16],%o4
+ addxcc %g2,%o4,%g2
+ st %g2,[res_ptr+16]
+ addxcc %g3,%o5,%g3
+ st %g3,[res_ptr+20]
+ ldd [s1_ptr+24],%g2
+ ldd [s2_ptr+24],%o4
+ addxcc %g2,%o4,%g2
+ st %g2,[res_ptr+24]
+ addxcc %g3,%o5,%g3
+ st %g3,[res_ptr+28]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ add s1_ptr,32,s1_ptr
+ add s2_ptr,32,s2_ptr
+ add res_ptr,32,res_ptr
+ bge L(loop2)
+ subcc %g0,%o4,%g0 C restore cy
+
+L(fin2):
+ addcc n,8-2,n
+ blt L(end2)
+ subcc %g0,%o4,%g0 C restore cy
+L(loope2):
+ ldd [s1_ptr+0],%g2
+ ldd [s2_ptr+0],%o4
+ addxcc %g2,%o4,%g2
+ st %g2,[res_ptr+0]
+ addxcc %g3,%o5,%g3
+ st %g3,[res_ptr+4]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-2,n
+ add s1_ptr,8,s1_ptr
+ add s2_ptr,8,s2_ptr
+ add res_ptr,8,res_ptr
+ bge L(loope2)
+ subcc %g0,%o4,%g0 C restore cy
+L(end2):
+ andcc n,1,%g0
+ be L(ret2)
+ subcc %g0,%o4,%g0 C restore cy
+C Add last limb
+L(jone):
+ ld [s1_ptr],%g4
+ ld [s2_ptr],%g2
+ addxcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+
+L(ret2):
+ retl
+ addx %g0,%g0,%o0 C return carry-out from most sign. limb
+EPILOGUE(mpn_add_n)
diff --git a/rts/gmp/mpn/sparc32/addmul_1.asm b/rts/gmp/mpn/sparc32/addmul_1.asm
new file mode 100644
index 0000000000..80c94e4251
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/addmul_1.asm
@@ -0,0 +1,146 @@
+dnl SPARC mpn_addmul_1 -- Multiply a limb vector with a limb and add the
+dnl result to a second limb vector.
+
+dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ C Make S1_PTR and RES_PTR point at the end of their blocks
+ C and put (- 4 x SIZE) in index/loop counter.
+ sll %o2,2,%o2
+ add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval
+ add %o1,%o2,%o1
+ sub %g0,%o2,%o2
+
+ cmp %o3,0xfff
+ bgu L(large)
+ nop
+
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ b L(0)
+ add %o4,-4,%o4
+L(loop0):
+ addcc %o5,%g1,%g1
+ ld [%o1+%o2],%o5
+ addx %o0,%g0,%o0
+ st %g1,[%o4+%o2]
+L(0): wr %g0,%o3,%y
+ sra %o5,31,%g2
+ and %o3,%g2,%g2
+ andcc %g1,0,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,0,%g1
+ sra %g1,20,%g4
+ sll %g1,12,%g1
+ rd %y,%g3
+ srl %g3,20,%g3
+ or %g1,%g3,%g1
+
+ addcc %g1,%o0,%g1
+ addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb
+ addcc %o2,4,%o2 C loop counter
+ bne L(loop0)
+ ld [%o4+%o2],%o5
+
+ addcc %o5,%g1,%g1
+ addx %o0,%g0,%o0
+ retl
+ st %g1,[%o4+%o2]
+
+L(large):
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0
+ b L(1)
+ add %o4,-4,%o4
+L(loop):
+ addcc %o5,%g3,%g3
+ ld [%o1+%o2],%o5
+ addx %o0,%g0,%o0
+ st %g3,[%o4+%o2]
+L(1): wr %g0,%o5,%y
+ and %o5,%g4,%g2
+ andcc %g0,%g0,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%g0,%g1
+ rd %y,%g3
+ addcc %g3,%o0,%g3
+ addx %g2,%g1,%o0
+ addcc %o2,4,%o2
+ bne L(loop)
+ ld [%o4+%o2],%o5
+
+ addcc %o5,%g3,%g3
+ addx %o0,%g0,%o0
+ retl
+ st %g3,[%o4+%o2]
+EPILOGUE(mpn_addmul_1)
diff --git a/rts/gmp/mpn/sparc32/lshift.asm b/rts/gmp/mpn/sparc32/lshift.asm
new file mode 100644
index 0000000000..529733ac2d
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/lshift.asm
@@ -0,0 +1,97 @@
+dnl SPARC mpn_lshift -- Shift a number left.
+dnl
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr %o0
+C src_ptr %o1
+C size %o2
+C cnt %o3
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ sll %o2,2,%g1
+ add %o1,%g1,%o1 C make %o1 point at end of src
+ ld [%o1-4],%g2 C load first limb
+ sub %g0,%o3,%o5 C negate shift count
+ add %o0,%g1,%o0 C make %o0 point at end of res
+ add %o2,-1,%o2
+ andcc %o2,4-1,%g4 C number of limbs in first loop
+ srl %g2,%o5,%g1 C compute function result
+ be L(0) C if multiple of 4 limbs, skip first loop
+ st %g1,[%sp+80]
+
+ sub %o2,%g4,%o2 C adjust count for main loop
+
+L(loop0):
+ ld [%o1-8],%g3
+ add %o0,-4,%o0
+ add %o1,-4,%o1
+ addcc %g4,-1,%g4
+ sll %g2,%o3,%o4
+ srl %g3,%o5,%g1
+ mov %g3,%g2
+ or %o4,%g1,%o4
+ bne L(loop0)
+ st %o4,[%o0+0]
+
+L(0): tst %o2
+ be L(end)
+ nop
+
+L(loop):
+ ld [%o1-8],%g3
+ add %o0,-16,%o0
+ addcc %o2,-4,%o2
+ sll %g2,%o3,%o4
+ srl %g3,%o5,%g1
+
+ ld [%o1-12],%g2
+ sll %g3,%o3,%g4
+ or %o4,%g1,%o4
+ st %o4,[%o0+12]
+ srl %g2,%o5,%g1
+
+ ld [%o1-16],%g3
+ sll %g2,%o3,%o4
+ or %g4,%g1,%g4
+ st %g4,[%o0+8]
+ srl %g3,%o5,%g1
+
+ ld [%o1-20],%g2
+ sll %g3,%o3,%g4
+ or %o4,%g1,%o4
+ st %o4,[%o0+4]
+ srl %g2,%o5,%g1
+
+ add %o1,-16,%o1
+ or %g4,%g1,%g4
+ bne L(loop)
+ st %g4,[%o0+0]
+
+L(end): sll %g2,%o3,%g2
+ st %g2,[%o0-4]
+ retl
+ ld [%sp+80],%o0
+EPILOGUE(mpn_lshift)
diff --git a/rts/gmp/mpn/sparc32/mul_1.asm b/rts/gmp/mpn/sparc32/mul_1.asm
new file mode 100644
index 0000000000..e5fedeabaa
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/mul_1.asm
@@ -0,0 +1,137 @@
+dnl SPARC mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl the result in a second limb vector.
+
+dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ C Make S1_PTR and RES_PTR point at the end of their blocks
+ C and put (- 4 x SIZE) in index/loop counter.
+ sll %o2,2,%o2
+ add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval
+ add %o1,%o2,%o1
+ sub %g0,%o2,%o2
+
+ cmp %o3,0xfff
+ bgu L(large)
+ nop
+
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ b L(0)
+ add %o4,-4,%o4
+L(loop0):
+ st %g1,[%o4+%o2]
+L(0): wr %g0,%o3,%y
+ sra %o5,31,%g2
+ and %o3,%g2,%g2
+ andcc %g1,0,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,0,%g1
+ sra %g1,20,%g4
+ sll %g1,12,%g1
+ rd %y,%g3
+ srl %g3,20,%g3
+ or %g1,%g3,%g1
+
+ addcc %g1,%o0,%g1
+ addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb
+ addcc %o2,4,%o2 C loop counter
+ bne,a L(loop0)
+ ld [%o1+%o2],%o5
+
+ retl
+ st %g1,[%o4+%o2]
+
+
+L(large):
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0
+ b L(1)
+ add %o4,-4,%o4
+L(loop):
+ st %g3,[%o4+%o2]
+L(1): wr %g0,%o5,%y
+ and %o5,%g4,%g2 C g2 = S1_LIMB iff S2_LIMB < 0, else 0
+ andcc %g0,%g0,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%g0,%g1
+ rd %y,%g3
+ addcc %g3,%o0,%g3
+ addx %g2,%g1,%o0 C add sign-compensation and cy to hi limb
+ addcc %o2,4,%o2 C loop counter
+ bne,a L(loop)
+ ld [%o1+%o2],%o5
+
+ retl
+ st %g3,[%o4+%o2]
+EPILOGUE(mpn_mul_1)
diff --git a/rts/gmp/mpn/sparc32/rshift.asm b/rts/gmp/mpn/sparc32/rshift.asm
new file mode 100644
index 0000000000..9187dbaa6f
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/rshift.asm
@@ -0,0 +1,93 @@
+dnl SPARC mpn_rshift -- Shift a number right.
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr %o0
+C src_ptr %o1
+C size %o2
+C cnt %o3
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ld [%o1],%g2 C load first limb
+ sub %g0,%o3,%o5 C negate shift count
+ add %o2,-1,%o2
+ andcc %o2,4-1,%g4 C number of limbs in first loop
+ sll %g2,%o5,%g1 C compute function result
+ be L(0) C if multiple of 4 limbs, skip first loop
+ st %g1,[%sp+80]
+
+ sub %o2,%g4,%o2 C adjust count for main loop
+
+L(loop0):
+ ld [%o1+4],%g3
+ add %o0,4,%o0
+ add %o1,4,%o1
+ addcc %g4,-1,%g4
+ srl %g2,%o3,%o4
+ sll %g3,%o5,%g1
+ mov %g3,%g2
+ or %o4,%g1,%o4
+ bne L(loop0)
+ st %o4,[%o0-4]
+
+L(0): tst %o2
+ be L(end)
+ nop
+
+L(loop):
+ ld [%o1+4],%g3
+ add %o0,16,%o0
+ addcc %o2,-4,%o2
+ srl %g2,%o3,%o4
+ sll %g3,%o5,%g1
+
+ ld [%o1+8],%g2
+ srl %g3,%o3,%g4
+ or %o4,%g1,%o4
+ st %o4,[%o0-16]
+ sll %g2,%o5,%g1
+
+ ld [%o1+12],%g3
+ srl %g2,%o3,%o4
+ or %g4,%g1,%g4
+ st %g4,[%o0-12]
+ sll %g3,%o5,%g1
+
+ ld [%o1+16],%g2
+ srl %g3,%o3,%g4
+ or %o4,%g1,%o4
+ st %o4,[%o0-8]
+ sll %g2,%o5,%g1
+
+ add %o1,16,%o1
+ or %g4,%g1,%g4
+ bne L(loop)
+ st %g4,[%o0-4]
+
+L(end): srl %g2,%o3,%g2
+ st %g2,[%o0-0]
+ retl
+ ld [%sp+80],%o0
+EPILOGUE(mpn_rshift)
diff --git a/rts/gmp/mpn/sparc32/sub_n.asm b/rts/gmp/mpn/sparc32/sub_n.asm
new file mode 100644
index 0000000000..071909a1b6
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/sub_n.asm
@@ -0,0 +1,326 @@
+dnl SPARC mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector.
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(res_ptr,%o0)
+define(s1_ptr,%o1)
+define(s2_ptr,%o2)
+define(n,%o3)
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ xor s2_ptr,res_ptr,%g1
+ andcc %g1,4,%g0
+ bne L(1) C branch if alignment differs
+ nop
+C ** V1a **
+ andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0
+ be L(v1) C if no, branch
+ nop
+C Add least significant limb separately to align res_ptr and s2_ptr
+ ld [s1_ptr],%g4
+ add s1_ptr,4,s1_ptr
+ ld [s2_ptr],%g2
+ add s2_ptr,4,s2_ptr
+ add n,-1,n
+ subcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+ add res_ptr,4,res_ptr
+L(v1): addx %g0,%g0,%o4 C save cy in register
+ cmp n,2 C if n < 2 ...
+ bl L(end2) C ... branch to tail code
+ subcc %g0,%o4,%g0 C restore cy
+
+ ld [s1_ptr+0],%g4
+ addcc n,-10,n
+ ld [s1_ptr+4],%g1
+ ldd [s2_ptr+0],%g2
+ blt L(fin1)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 8 limbs until less than 8 limbs remain
+L(loop1):
+ subxcc %g4,%g2,%o4
+ ld [s1_ptr+8],%g4
+ subxcc %g1,%g3,%o5
+ ld [s1_ptr+12],%g1
+ ldd [s2_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ subxcc %g4,%g2,%o4
+ ld [s1_ptr+16],%g4
+ subxcc %g1,%g3,%o5
+ ld [s1_ptr+20],%g1
+ ldd [s2_ptr+16],%g2
+ std %o4,[res_ptr+8]
+ subxcc %g4,%g2,%o4
+ ld [s1_ptr+24],%g4
+ subxcc %g1,%g3,%o5
+ ld [s1_ptr+28],%g1
+ ldd [s2_ptr+24],%g2
+ std %o4,[res_ptr+16]
+ subxcc %g4,%g2,%o4
+ ld [s1_ptr+32],%g4
+ subxcc %g1,%g3,%o5
+ ld [s1_ptr+36],%g1
+ ldd [s2_ptr+32],%g2
+ std %o4,[res_ptr+24]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ add s1_ptr,32,s1_ptr
+ add s2_ptr,32,s2_ptr
+ add res_ptr,32,res_ptr
+ bge L(loop1)
+ subcc %g0,%o4,%g0 C restore cy
+
+L(fin1):
+ addcc n,8-2,n
+ blt L(end1)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 2 limbs until less than 2 limbs remain
+L(loope1):
+ subxcc %g4,%g2,%o4
+ ld [s1_ptr+8],%g4
+ subxcc %g1,%g3,%o5
+ ld [s1_ptr+12],%g1
+ ldd [s2_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-2,n
+ add s1_ptr,8,s1_ptr
+ add s2_ptr,8,s2_ptr
+ add res_ptr,8,res_ptr
+ bge L(loope1)
+ subcc %g0,%o4,%g0 C restore cy
+L(end1):
+ subxcc %g4,%g2,%o4
+ subxcc %g1,%g3,%o5
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+
+ andcc n,1,%g0
+ be L(ret1)
+ subcc %g0,%o4,%g0 C restore cy
+C Add last limb
+ ld [s1_ptr+8],%g4
+ ld [s2_ptr+8],%g2
+ subxcc %g4,%g2,%o4
+ st %o4,[res_ptr+8]
+
+L(ret1):
+ retl
+ addx %g0,%g0,%o0 C return carry-out from most sign. limb
+
+L(1): xor s1_ptr,res_ptr,%g1
+ andcc %g1,4,%g0
+ bne L(2)
+ nop
+C ** V1b **
+ andcc res_ptr,4,%g0 C res_ptr unaligned? Side effect: cy=0
+ be L(v1b) C if no, branch
+ nop
+C Add least significant limb separately to align res_ptr and s1_ptr
+ ld [s2_ptr],%g4
+ add s2_ptr,4,s2_ptr
+ ld [s1_ptr],%g2
+ add s1_ptr,4,s1_ptr
+ add n,-1,n
+ subcc %g2,%g4,%o4
+ st %o4,[res_ptr]
+ add res_ptr,4,res_ptr
+L(v1b): addx %g0,%g0,%o4 C save cy in register
+ cmp n,2 C if n < 2 ...
+ bl L(end2) C ... branch to tail code
+ subcc %g0,%o4,%g0 C restore cy
+
+ ld [s2_ptr+0],%g4
+ addcc n,-10,n
+ ld [s2_ptr+4],%g1
+ ldd [s1_ptr+0],%g2
+ blt L(fin1b)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 8 limbs until less than 8 limbs remain
+L(loop1b):
+ subxcc %g2,%g4,%o4
+ ld [s2_ptr+8],%g4
+ subxcc %g3,%g1,%o5
+ ld [s2_ptr+12],%g1
+ ldd [s1_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ subxcc %g2,%g4,%o4
+ ld [s2_ptr+16],%g4
+ subxcc %g3,%g1,%o5
+ ld [s2_ptr+20],%g1
+ ldd [s1_ptr+16],%g2
+ std %o4,[res_ptr+8]
+ subxcc %g2,%g4,%o4
+ ld [s2_ptr+24],%g4
+ subxcc %g3,%g1,%o5
+ ld [s2_ptr+28],%g1
+ ldd [s1_ptr+24],%g2
+ std %o4,[res_ptr+16]
+ subxcc %g2,%g4,%o4
+ ld [s2_ptr+32],%g4
+ subxcc %g3,%g1,%o5
+ ld [s2_ptr+36],%g1
+ ldd [s1_ptr+32],%g2
+ std %o4,[res_ptr+24]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ add s1_ptr,32,s1_ptr
+ add s2_ptr,32,s2_ptr
+ add res_ptr,32,res_ptr
+ bge L(loop1b)
+ subcc %g0,%o4,%g0 C restore cy
+
+L(fin1b):
+ addcc n,8-2,n
+ blt L(end1b)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 2 limbs until less than 2 limbs remain
+L(loope1b):
+ subxcc %g2,%g4,%o4
+ ld [s2_ptr+8],%g4
+ subxcc %g3,%g1,%o5
+ ld [s2_ptr+12],%g1
+ ldd [s1_ptr+8],%g2
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-2,n
+ add s1_ptr,8,s1_ptr
+ add s2_ptr,8,s2_ptr
+ add res_ptr,8,res_ptr
+ bge L(loope1b)
+ subcc %g0,%o4,%g0 C restore cy
+L(end1b):
+ subxcc %g2,%g4,%o4
+ subxcc %g3,%g1,%o5
+ std %o4,[res_ptr+0]
+ addx %g0,%g0,%o4 C save cy in register
+
+ andcc n,1,%g0
+ be L(ret1b)
+ subcc %g0,%o4,%g0 C restore cy
+C Add last limb
+ ld [s2_ptr+8],%g4
+ ld [s1_ptr+8],%g2
+ subxcc %g2,%g4,%o4
+ st %o4,[res_ptr+8]
+
+L(ret1b):
+ retl
+ addx %g0,%g0,%o0 C return carry-out from most sign. limb
+
+C ** V2 **
+C If we come here, the alignment of s1_ptr and res_ptr as well as the
+C alignment of s2_ptr and res_ptr differ. Since there are only two ways
+C things can be aligned (that we care about) we now know that the alignment
+C of s1_ptr and s2_ptr are the same.
+
+L(2): cmp n,1
+ be L(jone)
+ nop
+ andcc s1_ptr,4,%g0 C s1_ptr unaligned? Side effect: cy=0
+ be L(v2) C if no, branch
+ nop
+C Add least significant limb separately to align s1_ptr and s2_ptr
+ ld [s1_ptr],%g4
+ add s1_ptr,4,s1_ptr
+ ld [s2_ptr],%g2
+ add s2_ptr,4,s2_ptr
+ add n,-1,n
+ subcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+ add res_ptr,4,res_ptr
+
+L(v2): addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ blt L(fin2)
+ subcc %g0,%o4,%g0 C restore cy
+C Add blocks of 8 limbs until less than 8 limbs remain
+L(loop2):
+ ldd [s1_ptr+0],%g2
+ ldd [s2_ptr+0],%o4
+ subxcc %g2,%o4,%g2
+ st %g2,[res_ptr+0]
+ subxcc %g3,%o5,%g3
+ st %g3,[res_ptr+4]
+ ldd [s1_ptr+8],%g2
+ ldd [s2_ptr+8],%o4
+ subxcc %g2,%o4,%g2
+ st %g2,[res_ptr+8]
+ subxcc %g3,%o5,%g3
+ st %g3,[res_ptr+12]
+ ldd [s1_ptr+16],%g2
+ ldd [s2_ptr+16],%o4
+ subxcc %g2,%o4,%g2
+ st %g2,[res_ptr+16]
+ subxcc %g3,%o5,%g3
+ st %g3,[res_ptr+20]
+ ldd [s1_ptr+24],%g2
+ ldd [s2_ptr+24],%o4
+ subxcc %g2,%o4,%g2
+ st %g2,[res_ptr+24]
+ subxcc %g3,%o5,%g3
+ st %g3,[res_ptr+28]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-8,n
+ add s1_ptr,32,s1_ptr
+ add s2_ptr,32,s2_ptr
+ add res_ptr,32,res_ptr
+ bge L(loop2)
+ subcc %g0,%o4,%g0 C restore cy
+
+L(fin2):
+ addcc n,8-2,n
+ blt L(end2)
+ subcc %g0,%o4,%g0 C restore cy
+L(loope2):
+ ldd [s1_ptr+0],%g2
+ ldd [s2_ptr+0],%o4
+ subxcc %g2,%o4,%g2
+ st %g2,[res_ptr+0]
+ subxcc %g3,%o5,%g3
+ st %g3,[res_ptr+4]
+ addx %g0,%g0,%o4 C save cy in register
+ addcc n,-2,n
+ add s1_ptr,8,s1_ptr
+ add s2_ptr,8,s2_ptr
+ add res_ptr,8,res_ptr
+ bge L(loope2)
+ subcc %g0,%o4,%g0 C restore cy
+L(end2):
+ andcc n,1,%g0
+ be L(ret2)
+ subcc %g0,%o4,%g0 C restore cy
+C Add last limb
+L(jone):
+ ld [s1_ptr],%g4
+ ld [s2_ptr],%g2
+ subxcc %g4,%g2,%o4
+ st %o4,[res_ptr]
+
+L(ret2):
+ retl
+ addx %g0,%g0,%o0 C return carry-out from most sign. limb
+EPILOGUE(mpn_sub_n)
diff --git a/rts/gmp/mpn/sparc32/submul_1.asm b/rts/gmp/mpn/sparc32/submul_1.asm
new file mode 100644
index 0000000000..12abd844ce
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/submul_1.asm
@@ -0,0 +1,146 @@
+dnl SPARC mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+dnl the result from a second limb vector.
+
+dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ C Make S1_PTR and RES_PTR point at the end of their blocks
+ C and put (- 4 x SIZE) in index/loop counter.
+ sll %o2,2,%o2
+ add %o0,%o2,%o4 C RES_PTR in o4 since o0 is retval
+ add %o1,%o2,%o1
+ sub %g0,%o2,%o2
+
+ cmp %o3,0xfff
+ bgu L(large)
+ nop
+
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ b L(0)
+ add %o4,-4,%o4
+L(loop0):
+ subcc %o5,%g1,%g1
+ ld [%o1+%o2],%o5
+ addx %o0,%g0,%o0
+ st %g1,[%o4+%o2]
+L(0): wr %g0,%o3,%y
+ sra %o5,31,%g2
+ and %o3,%g2,%g2
+ andcc %g1,0,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,%o5,%g1
+ mulscc %g1,0,%g1
+ sra %g1,20,%g4
+ sll %g1,12,%g1
+ rd %y,%g3
+ srl %g3,20,%g3
+ or %g1,%g3,%g1
+
+ addcc %g1,%o0,%g1
+ addx %g2,%g4,%o0 C add sign-compensation and cy to hi limb
+ addcc %o2,4,%o2 C loop counter
+ bne L(loop0)
+ ld [%o4+%o2],%o5
+
+ subcc %o5,%g1,%g1
+ addx %o0,%g0,%o0
+ retl
+ st %g1,[%o4+%o2]
+
+L(large):
+ ld [%o1+%o2],%o5
+ mov 0,%o0
+ sra %o3,31,%g4 C g4 = mask of ones iff S2_LIMB < 0
+ b L(1)
+ add %o4,-4,%o4
+L(loop):
+ subcc %o5,%g3,%g3
+ ld [%o1+%o2],%o5
+ addx %o0,%g0,%o0
+ st %g3,[%o4+%o2]
+L(1): wr %g0,%o5,%y
+ and %o5,%g4,%g2
+ andcc %g0,%g0,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%o3,%g1
+ mulscc %g1,%g0,%g1
+ rd %y,%g3
+ addcc %g3,%o0,%g3
+ addx %g2,%g1,%o0
+ addcc %o2,4,%o2
+ bne L(loop)
+ ld [%o4+%o2],%o5
+
+ subcc %o5,%g3,%g3
+ addx %o0,%g0,%o0
+ retl
+ st %g3,[%o4+%o2]
+EPILOGUE(mpn_submul_1)
diff --git a/rts/gmp/mpn/sparc32/udiv_fp.asm b/rts/gmp/mpn/sparc32/udiv_fp.asm
new file mode 100644
index 0000000000..e340e147d2
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/udiv_fp.asm
@@ -0,0 +1,158 @@
+dnl SPARC v7 __udiv_qrnnd division support, used from longlong.h.
+dnl This is for v7 CPUs with a floating-point unit.
+
+dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr i0
+C n1 i1
+C n0 i2
+C d i3
+
+ASM_START()
+
+ifdef(`PIC',
+` TEXT
+L(getpc):
+ retl
+ nop')
+
+ TEXT
+ ALIGN(8)
+L(C0): .double 0r4294967296
+L(C1): .double 0r2147483648
+
+PROLOGUE(mpn_udiv_qrnnd)
+ save %sp,-104,%sp
+ st %i1,[%fp-8]
+ ld [%fp-8],%f10
+
+ifdef(`PIC',
+`L(pc): call L(getpc) C put address of this insn in %o7
+ ldd [%o7+L(C0)-L(pc)],%f8',
+` sethi %hi(L(C0)),%o7
+ ldd [%o7+%lo(L(C0))],%f8')
+
+ fitod %f10,%f4
+ cmp %i1,0
+ bge L(248)
+ mov %i0,%i5
+ faddd %f4,%f8,%f4
+L(248):
+ st %i2,[%fp-8]
+ ld [%fp-8],%f10
+ fmuld %f4,%f8,%f6
+ cmp %i2,0
+ bge L(249)
+ fitod %f10,%f2
+ faddd %f2,%f8,%f2
+L(249):
+ st %i3,[%fp-8]
+ faddd %f6,%f2,%f2
+ ld [%fp-8],%f10
+ cmp %i3,0
+ bge L(250)
+ fitod %f10,%f4
+ faddd %f4,%f8,%f4
+L(250):
+ fdivd %f2,%f4,%f2
+
+ifdef(`PIC',
+` ldd [%o7+L(C1)-L(pc)],%f4',
+` sethi %hi(L(C1)),%o7
+ ldd [%o7+%lo(L(C1))],%f4')
+
+ fcmped %f2,%f4
+ nop
+ fbge,a L(251)
+ fsubd %f2,%f4,%f2
+ fdtoi %f2,%f2
+ st %f2,[%fp-8]
+ b L(252)
+ ld [%fp-8],%i4
+L(251):
+ fdtoi %f2,%f2
+ st %f2,[%fp-8]
+ ld [%fp-8],%i4
+ sethi %hi(-2147483648),%g2
+ xor %i4,%g2,%i4
+L(252):
+ wr %g0,%i4,%y
+ sra %i3,31,%g2
+ and %i4,%g2,%g2
+ andcc %g0,0,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,%i3,%g1
+ mulscc %g1,0,%g1
+ add %g1,%g2,%i0
+ rd %y,%g3
+ subcc %i2,%g3,%o7
+ subxcc %i1,%i0,%g0
+ be L(253)
+ cmp %o7,%i3
+
+ add %i4,-1,%i0
+ add %o7,%i3,%o7
+ st %o7,[%i5]
+ ret
+ restore
+L(253):
+ blu L(246)
+ mov %i4,%i0
+ add %i4,1,%i0
+ sub %o7,%i3,%o7
+L(246):
+ st %o7,[%i5]
+ ret
+ restore
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/rts/gmp/mpn/sparc32/udiv_nfp.asm b/rts/gmp/mpn/sparc32/udiv_nfp.asm
new file mode 100644
index 0000000000..ae19f4c6e9
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/udiv_nfp.asm
@@ -0,0 +1,193 @@
+dnl SPARC v7 __udiv_qrnnd division support, used from longlong.h.
+dnl This is for v7 CPUs without a floating-point unit.
+
+dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr o0
+C n1 o1
+C n0 o2
+C d o3
+
+ASM_START()
+PROLOGUE(mpn_udiv_qrnnd)
+ tst %o3
+ bneg L(largedivisor)
+ mov 8,%g1
+
+ b L(p1)
+ addxcc %o2,%o2,%o2
+
+L(plop):
+ bcc L(n1)
+ addxcc %o2,%o2,%o2
+L(p1): addx %o1,%o1,%o1
+ subcc %o1,%o3,%o4
+ bcc L(n2)
+ addxcc %o2,%o2,%o2
+L(p2): addx %o1,%o1,%o1
+ subcc %o1,%o3,%o4
+ bcc L(n3)
+ addxcc %o2,%o2,%o2
+L(p3): addx %o1,%o1,%o1
+ subcc %o1,%o3,%o4
+ bcc L(n4)
+ addxcc %o2,%o2,%o2
+L(p4): addx %o1,%o1,%o1
+ addcc %g1,-1,%g1
+ bne L(plop)
+ subcc %o1,%o3,%o4
+ bcc L(n5)
+ addxcc %o2,%o2,%o2
+L(p5): st %o1,[%o0]
+ retl
+ xnor %g0,%o2,%o0
+
+L(nlop):
+ bcc L(p1)
+ addxcc %o2,%o2,%o2
+L(n1): addx %o4,%o4,%o4
+ subcc %o4,%o3,%o1
+ bcc L(p2)
+ addxcc %o2,%o2,%o2
+L(n2): addx %o4,%o4,%o4
+ subcc %o4,%o3,%o1
+ bcc L(p3)
+ addxcc %o2,%o2,%o2
+L(n3): addx %o4,%o4,%o4
+ subcc %o4,%o3,%o1
+ bcc L(p4)
+ addxcc %o2,%o2,%o2
+L(n4): addx %o4,%o4,%o4
+ addcc %g1,-1,%g1
+ bne L(nlop)
+ subcc %o4,%o3,%o1
+ bcc L(p5)
+ addxcc %o2,%o2,%o2
+L(n5): st %o4,[%o0]
+ retl
+ xnor %g0,%o2,%o0
+
+L(largedivisor):
+ and %o2,1,%o5 C %o5 = n0 & 1
+
+ srl %o2,1,%o2
+ sll %o1,31,%g2
+ or %g2,%o2,%o2 C %o2 = lo(n1n0 >> 1)
+ srl %o1,1,%o1 C %o1 = hi(n1n0 >> 1)
+
+ and %o3,1,%g2
+ srl %o3,1,%g3 C %g3 = floor(d / 2)
+ add %g3,%g2,%g3 C %g3 = ceil(d / 2)
+
+ b L(Lp1)
+ addxcc %o2,%o2,%o2
+
+L(Lplop):
+ bcc L(Ln1)
+ addxcc %o2,%o2,%o2
+L(Lp1): addx %o1,%o1,%o1
+ subcc %o1,%g3,%o4
+ bcc L(Ln2)
+ addxcc %o2,%o2,%o2
+L(Lp2): addx %o1,%o1,%o1
+ subcc %o1,%g3,%o4
+ bcc L(Ln3)
+ addxcc %o2,%o2,%o2
+L(Lp3): addx %o1,%o1,%o1
+ subcc %o1,%g3,%o4
+ bcc L(Ln4)
+ addxcc %o2,%o2,%o2
+L(Lp4): addx %o1,%o1,%o1
+ addcc %g1,-1,%g1
+ bne L(Lplop)
+ subcc %o1,%g3,%o4
+ bcc L(Ln5)
+ addxcc %o2,%o2,%o2
+L(Lp5): add %o1,%o1,%o1 C << 1
+ tst %g2
+ bne L(oddp)
+ add %o5,%o1,%o1
+ st %o1,[%o0]
+ retl
+ xnor %g0,%o2,%o0
+
+L(Lnlop):
+ bcc L(Lp1)
+ addxcc %o2,%o2,%o2
+L(Ln1): addx %o4,%o4,%o4
+ subcc %o4,%g3,%o1
+ bcc L(Lp2)
+ addxcc %o2,%o2,%o2
+L(Ln2): addx %o4,%o4,%o4
+ subcc %o4,%g3,%o1
+ bcc L(Lp3)
+ addxcc %o2,%o2,%o2
+L(Ln3): addx %o4,%o4,%o4
+ subcc %o4,%g3,%o1
+ bcc L(Lp4)
+ addxcc %o2,%o2,%o2
+L(Ln4): addx %o4,%o4,%o4
+ addcc %g1,-1,%g1
+ bne L(Lnlop)
+ subcc %o4,%g3,%o1
+ bcc L(Lp5)
+ addxcc %o2,%o2,%o2
+L(Ln5): add %o4,%o4,%o4 C << 1
+ tst %g2
+ bne L(oddn)
+ add %o5,%o4,%o4
+ st %o4,[%o0]
+ retl
+ xnor %g0,%o2,%o0
+
+L(oddp):
+ xnor %g0,%o2,%o2
+ C q' in %o2. r' in %o1
+ addcc %o1,%o2,%o1
+ bcc L(Lp6)
+ addx %o2,0,%o2
+ sub %o1,%o3,%o1
+L(Lp6): subcc %o1,%o3,%g0
+ bcs L(Lp7)
+ subx %o2,-1,%o2
+ sub %o1,%o3,%o1
+L(Lp7): st %o1,[%o0]
+ retl
+ mov %o2,%o0
+
+L(oddn):
+ xnor %g0,%o2,%o2
+ C q' in %o2. r' in %o4
+ addcc %o4,%o2,%o4
+ bcc L(Ln6)
+ addx %o2,0,%o2
+ sub %o4,%o3,%o4
+L(Ln6): subcc %o4,%o3,%g0
+ bcs L(Ln7)
+ subx %o2,-1,%o2
+ sub %o4,%o3,%o4
+L(Ln7): st %o4,[%o0]
+ retl
+ mov %o2,%o0
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/rts/gmp/mpn/sparc32/umul.asm b/rts/gmp/mpn/sparc32/umul.asm
new file mode 100644
index 0000000000..efa56851d6
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/umul.asm
@@ -0,0 +1,68 @@
+dnl SPARC mpn_umul_ppmm -- support for longlong.h for non-gcc.
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+ wr %g0,%o1,%y
+ sra %o2,31,%g2 C Don't move this insn
+ and %o1,%g2,%g2 C Don't move this insn
+ andcc %g0,0,%g1 C Don't move this insn
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,%o2,%g1
+ mulscc %g1,0,%g1
+ rd %y,%g3
+ st %g3,[%o0]
+ retl
+ add %g1,%g2,%o0
+EPILOGUE(mpn_umul_ppmm)
diff --git a/rts/gmp/mpn/sparc32/v8/addmul_1.asm b/rts/gmp/mpn/sparc32/v8/addmul_1.asm
new file mode 100644
index 0000000000..da44644b51
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v8/addmul_1.asm
@@ -0,0 +1,122 @@
+dnl SPARC v8 mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl add the result to a second limb vector.
+
+dnl Copyright (C) 1992, 1993, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ orcc %g0,%g0,%g2
+ ld [%o1+0],%o4 C 1
+
+ sll %o2,4,%g1
+ and %g1,(4-1)<<4,%g1
+ifdef(`PIC',
+` mov %o7,%g4 C Save return address register
+0: call 1f
+ add %o7,L(1)-0b,%g3
+1: mov %g4,%o7 C Restore return address register
+',
+` sethi %hi(L(1)),%g3
+ or %g3,%lo(L(1)),%g3
+')
+ jmp %g3+%g1
+ nop
+L(1):
+L(L00): add %o0,-4,%o0
+ b L(loop00) C 4, 8, 12, ...
+ add %o1,-4,%o1
+ nop
+L(L01): b L(loop01) C 1, 5, 9, ...
+ nop
+ nop
+ nop
+L(L10): add %o0,-12,%o0 C 2, 6, 10, ...
+ b L(loop10)
+ add %o1,4,%o1
+ nop
+L(L11): add %o0,-8,%o0 C 3, 7, 11, ...
+ b L(loop11)
+ add %o1,-8,%o1
+ nop
+
+L(loop):
+ addcc %g3,%g2,%g3 C 1
+ ld [%o1+4],%o4 C 2
+ rd %y,%g2 C 1
+ addx %g0,%g2,%g2
+ ld [%o0+0],%g1 C 2
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+0] C 1
+L(loop00):
+ umul %o4,%o3,%g3 C 2
+ ld [%o0+4],%g1 C 2
+ addxcc %g3,%g2,%g3 C 2
+ ld [%o1+8],%o4 C 3
+ rd %y,%g2 C 2
+ addx %g0,%g2,%g2
+ nop
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+4] C 2
+L(loop11):
+ umul %o4,%o3,%g3 C 3
+ addxcc %g3,%g2,%g3 C 3
+ ld [%o1+12],%o4 C 4
+ rd %y,%g2 C 3
+ add %o1,16,%o1
+ addx %g0,%g2,%g2
+ ld [%o0+8],%g1 C 2
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+8] C 3
+L(loop10):
+ umul %o4,%o3,%g3 C 4
+ addxcc %g3,%g2,%g3 C 4
+ ld [%o1+0],%o4 C 1
+ rd %y,%g2 C 4
+ addx %g0,%g2,%g2
+ ld [%o0+12],%g1 C 2
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+12] C 4
+ add %o0,16,%o0
+ addx %g0,%g2,%g2
+L(loop01):
+ addcc %o2,-4,%o2
+ bg L(loop)
+ umul %o4,%o3,%g3 C 1
+
+ addcc %g3,%g2,%g3 C 4
+ rd %y,%g2 C 4
+ addx %g0,%g2,%g2
+ ld [%o0+0],%g1 C 2
+ addcc %g1,%g3,%g3
+ st %g3,[%o0+0] C 4
+ addx %g0,%g2,%o0
+
+ retl
+ nop
+EPILOGUE(mpn_addmul_1)
diff --git a/rts/gmp/mpn/sparc32/v8/mul_1.asm b/rts/gmp/mpn/sparc32/v8/mul_1.asm
new file mode 100644
index 0000000000..801247553a
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v8/mul_1.asm
@@ -0,0 +1,103 @@
+dnl SPARC v8 mpn_mul_1 -- Multiply a limb vector with a single limb and
+dnl store the product in a second limb vector.
+
+dnl Copyright (C) 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ sll %o2,4,%g1
+ and %g1,(4-1)<<4,%g1
+ifdef(`PIC',
+` mov %o7,%g4 C Save return address register
+0: call 1f
+ add %o7,L(1)-0b,%g3
+1: mov %g4,%o7 C Restore return address register
+',
+` sethi %hi(L(1)),%g3
+ or %g3,%lo(L(1)),%g3
+')
+ jmp %g3+%g1
+ ld [%o1+0],%o4 C 1
+L(1):
+L(L00): add %o0,-4,%o0
+ add %o1,-4,%o1
+ b L(loop00) C 4, 8, 12, ...
+ orcc %g0,%g0,%g2
+L(L01): b L(loop01) C 1, 5, 9, ...
+ orcc %g0,%g0,%g2
+ nop
+ nop
+L(L10): add %o0,-12,%o0 C 2, 6, 10, ...
+ add %o1,4,%o1
+ b L(loop10)
+ orcc %g0,%g0,%g2
+ nop
+L(L11): add %o0,-8,%o0 C 3, 7, 11, ...
+ add %o1,-8,%o1
+ b L(loop11)
+ orcc %g0,%g0,%g2
+
+L(loop):
+ addcc %g3,%g2,%g3 C 1
+ ld [%o1+4],%o4 C 2
+ st %g3,[%o0+0] C 1
+ rd %y,%g2 C 1
+L(loop00):
+ umul %o4,%o3,%g3 C 2
+ addxcc %g3,%g2,%g3 C 2
+ ld [%o1+8],%o4 C 3
+ st %g3,[%o0+4] C 2
+ rd %y,%g2 C 2
+L(loop11):
+ umul %o4,%o3,%g3 C 3
+ addxcc %g3,%g2,%g3 C 3
+ ld [%o1+12],%o4 C 4
+ add %o1,16,%o1
+ st %g3,[%o0+8] C 3
+ rd %y,%g2 C 3
+L(loop10):
+ umul %o4,%o3,%g3 C 4
+ addxcc %g3,%g2,%g3 C 4
+ ld [%o1+0],%o4 C 1
+ st %g3,[%o0+12] C 4
+ add %o0,16,%o0
+ rd %y,%g2 C 4
+ addx %g0,%g2,%g2
+L(loop01):
+ addcc %o2,-4,%o2
+ bg L(loop)
+ umul %o4,%o3,%g3 C 1
+
+ addcc %g3,%g2,%g3 C 4
+ st %g3,[%o0+0] C 4
+ rd %y,%g2 C 4
+
+ retl
+ addx %g0,%g2,%o0
+EPILOGUE(mpn_mul_1)
diff --git a/rts/gmp/mpn/sparc32/v8/submul_1.asm b/rts/gmp/mpn/sparc32/v8/submul_1.asm
new file mode 100644
index 0000000000..9ed132f4c1
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v8/submul_1.asm
@@ -0,0 +1,58 @@
+dnl SPARC v8 mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright (C) 1992, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr o0
+C s1_ptr o1
+C size o2
+C s2_limb o3
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ sub %g0,%o2,%o2 C negate ...
+ sll %o2,2,%o2 C ... and scale size
+ sub %o1,%o2,%o1 C o1 is offset s1_ptr
+ sub %o0,%o2,%g1 C g1 is offset res_ptr
+
+ mov 0,%o0 C clear cy_limb
+
+L(loop):
+ ld [%o1+%o2],%o4
+ ld [%g1+%o2],%g2
+ umul %o4,%o3,%o5
+ rd %y,%g3
+ addcc %o5,%o0,%o5
+ addx %g3,0,%o0
+ subcc %g2,%o5,%g2
+ addx %o0,0,%o0
+ st %g2,[%g1+%o2]
+
+ addcc %o2,4,%o2
+ bne L(loop)
+ nop
+
+ retl
+ nop
+EPILOGUE(mpn_submul_1)
diff --git a/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm b/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm
new file mode 100644
index 0000000000..0d5e8d415d
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v8/supersparc/udiv.asm
@@ -0,0 +1,122 @@
+dnl SuperSPARC mpn_udiv_qrnnd division support, used from longlong.h.
+dnl This is for SuperSPARC only, to compensate for its semi-functional
+dnl udiv instruction.
+
+dnl Copyright (C) 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr i0
+C n1 i1
+C n0 i2
+C d i3
+
+ASM_START()
+
+ifdef(`PIC',
+` TEXT
+L(getpc):
+ retl
+ nop')
+
+ TEXT
+ ALIGN(8)
+L(C0): .double 0r4294967296
+L(C1): .double 0r2147483648
+
+PROLOGUE(mpn_udiv_qrnnd)
+ save %sp,-104,%sp
+ st %i1,[%fp-8]
+ ld [%fp-8],%f10
+
+ifdef(`PIC',
+`L(pc): call L(getpc) C put address of this insn in %o7
+ ldd [%o7+L(C0)-L(pc)],%f8',
+` sethi %hi(L(C0)),%o7
+ ldd [%o7+%lo(L(C0))],%f8')
+
+ fitod %f10,%f4
+ cmp %i1,0
+ bge L(248)
+ mov %i0,%i5
+ faddd %f4,%f8,%f4
+L(248):
+ st %i2,[%fp-8]
+ ld [%fp-8],%f10
+ fmuld %f4,%f8,%f6
+ cmp %i2,0
+ bge L(249)
+ fitod %f10,%f2
+ faddd %f2,%f8,%f2
+L(249):
+ st %i3,[%fp-8]
+ faddd %f6,%f2,%f2
+ ld [%fp-8],%f10
+ cmp %i3,0
+ bge L(250)
+ fitod %f10,%f4
+ faddd %f4,%f8,%f4
+L(250):
+ fdivd %f2,%f4,%f2
+
+ifdef(`PIC',
+` ldd [%o7+L(C1)-L(pc)],%f4',
+` sethi %hi(L(C1)),%o7
+ ldd [%o7+%lo(L(C1))],%f4')
+
+ fcmped %f2,%f4
+ nop
+ fbge,a L(251)
+ fsubd %f2,%f4,%f2
+ fdtoi %f2,%f2
+ st %f2,[%fp-8]
+ b L(252)
+ ld [%fp-8],%i4
+L(251):
+ fdtoi %f2,%f2
+ st %f2,[%fp-8]
+ ld [%fp-8],%i4
+ sethi %hi(-2147483648),%g2
+ xor %i4,%g2,%i4
+L(252):
+ umul %i3,%i4,%g3
+ rd %y,%i0
+ subcc %i2,%g3,%o7
+ subxcc %i1,%i0,%g0
+ be L(253)
+ cmp %o7,%i3
+
+ add %i4,-1,%i0
+ add %o7,%i3,%o7
+ st %o7,[%i5]
+ ret
+ restore
+L(253):
+ blu L(246)
+ mov %i4,%i0
+ add %i4,1,%i0
+ sub %o7,%i3,%o7
+L(246):
+ st %o7,[%i5]
+ ret
+ restore
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/rts/gmp/mpn/sparc32/v8/umul.asm b/rts/gmp/mpn/sparc32/v8/umul.asm
new file mode 100644
index 0000000000..ae8f692a0a
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v8/umul.asm
@@ -0,0 +1,31 @@
+dnl SPARC v8 mpn_umul_ppmm -- support for longlong.h for non-gcc.
+
+dnl Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+ umul %o1,%o2,%g2
+ st %g2,[%o0]
+ retl
+ rd %y,%o0
+EPILOGUE(mpn_umul_ppmm)
diff --git a/rts/gmp/mpn/sparc32/v9/README b/rts/gmp/mpn/sparc32/v9/README
new file mode 100644
index 0000000000..9b39713271
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v9/README
@@ -0,0 +1,4 @@
+Code for SPARC processors implementing version 9 of the SPARC architecture.
+This code is for systems that don't preserve the full 64-bit contents of
+integer registers at context switch. For other systems (such as Solaris 7 or
+later) use the code in ../../sparc64.
diff --git a/rts/gmp/mpn/sparc32/v9/addmul_1.asm b/rts/gmp/mpn/sparc32/v9/addmul_1.asm
new file mode 100644
index 0000000000..c1762cc41f
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v9/addmul_1.asm
@@ -0,0 +1,288 @@
+dnl SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl add the result to a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+PROLOGUE(mpn_addmul_1)
+ save %sp,-256,%sp
+
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hi(L(noll)),%g1
+ ld [%g1+%lo(L(noll))],%f10')
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-16]
+ ld [%fp-16],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-16]
+ ld [%fp-16],%f11
+ fxtod %f10,%f8
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1],%f11
+ subcc %i2,1,%i2
+ be,pn %icc,L(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,L(end2)
+ std %f12,[%fp-16]
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,L(end3)
+ std %f12,[%fp-32]
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ld [%i0],%g5
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,L(end4)
+ std %f12,[%fp-16]
+
+ b,a L(loopm)
+
+ .align 16
+C BEGIN LOOP
+L(loop):
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+ subcc %i2,1,%i2
+ be,pn %icc,L(loope)
+ add %i0,4,%i0 C res_ptr++
+L(loopm):
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-32],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ subcc %i2,1,%i2
+ bne,pt %icc,L(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+
+ fxtod %f10,%f2
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ b,a L(xxx)
+L(loope):
+L(end4):
+ fxtod %f10,%f2
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-32],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ b,a L(yyy)
+
+L(end3):
+ fxtod %f10,%f2
+ ld [%i0],%g5
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+L(xxx): fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ ldx [%fp-32],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+ b,a L(ret)
+
+L(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ ld [%i0],%g5
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+L(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ ldx [%fp-32],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+ b,a L(ret)
+
+L(end1):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+
+ ld [%i0],%g5
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+L(ret): add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ st %g4,[%i0-4]
+
+ ret
+ restore %g0,%g3,%o0 C sideeffect: put cy in retreg
+EPILOGUE(mpn_addmul_1)
diff --git a/rts/gmp/mpn/sparc32/v9/gmp-mparam.h b/rts/gmp/mpn/sparc32/v9/gmp-mparam.h
new file mode 100644
index 0000000000..f946b900f0
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v9/gmp-mparam.h
@@ -0,0 +1,69 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+/* These values are for UltraSPARC I, II, and IIi. It is bogus that
+ this file lives in v9, but that will do for now. */
+
+/* Variations in addmul_1 speed make the multiply and square thresholds
+ doubtful. TOOM3_SQR_THRESHOLD had to be estimated here. */
+
+/* Generated by tuneup.c, 2000-07-06. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 30
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 200
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 59
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 500
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 107
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 146
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 29
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 3
+#endif
diff --git a/rts/gmp/mpn/sparc32/v9/mul_1.asm b/rts/gmp/mpn/sparc32/v9/mul_1.asm
new file mode 100644
index 0000000000..f8f0fdd8c2
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v9/mul_1.asm
@@ -0,0 +1,267 @@
+dnl SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and
+dnl store the result in a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+PROLOGUE(mpn_mul_1)
+ save %sp,-256,%sp
+
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hi(L(noll)),%g1
+ ld [%g1+%lo(L(noll))],%f10')
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-16]
+ ld [%fp-16],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-16]
+ ld [%fp-16],%f11
+ fxtod %f10,%f8
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1],%f11
+ subcc %i2,1,%i2
+ be,pn %icc,L(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,L(end2)
+ std %f12,[%fp-16]
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,L(end3)
+ std %f12,[%fp-32]
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,L(end4)
+ std %f12,[%fp-16]
+
+ b,a L(loopm)
+
+ .align 16
+C BEGIN LOOP
+L(loop):
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+ subcc %i2,1,%i2
+ be,pn %icc,L(loope)
+ add %i0,4,%i0 C res_ptr++
+L(loopm):
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-32],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ subcc %i2,1,%i2
+ bne,pt %icc,L(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+
+ fxtod %f10,%f2
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ b,a L(xxx)
+L(loope):
+L(end4):
+ fxtod %f10,%f2
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-32],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ b,a L(yyy)
+
+L(end3):
+ fxtod %f10,%f2
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+L(xxx): fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ ldx [%fp-32],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+ b,a L(ret)
+
+L(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+L(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ ldx [%fp-32],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+ b,a L(ret)
+
+L(end1):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+L(ret): add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ st %g4,[%i0-4]
+
+ ret
+ restore %g0,%g3,%o0 C sideeffect: put cy in retreg
+EPILOGUE(mpn_mul_1)
diff --git a/rts/gmp/mpn/sparc32/v9/submul_1.asm b/rts/gmp/mpn/sparc32/v9/submul_1.asm
new file mode 100644
index 0000000000..6195ea88ea
--- /dev/null
+++ b/rts/gmp/mpn/sparc32/v9/submul_1.asm
@@ -0,0 +1,291 @@
+dnl SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+PROLOGUE(mpn_submul_1)
+ save %sp,-256,%sp
+
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hi(L(noll)),%g1
+ ld [%g1+%lo(L(noll))],%f10')
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-16]
+ ld [%fp-16],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-16]
+ ld [%fp-16],%f11
+ fxtod %f10,%f8
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1],%f11
+ subcc %i2,1,%i2
+ be,pn %icc,L(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,L(end2)
+ std %f12,[%fp-16]
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,L(end3)
+ std %f12,[%fp-32]
+
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ld [%i0],%g5
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,L(end4)
+ std %f12,[%fp-16]
+
+ b,a L(loopm)
+
+ .align 16
+C BEGIN LOOP
+L(loop):
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4]
+ addx %g3,0,%g3
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+ subcc %i2,1,%i2
+ be,pn %icc,L(loope)
+ add %i0,4,%i0 C res_ptr++
+L(loopm):
+ fxtod %f10,%f2
+ ld [%i1],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-32],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4]
+ addx %g3,0,%g3
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ subcc %i2,1,%i2
+ bne,pt %icc,L(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+
+ fxtod %f10,%f2
+ add %g3,%g1,%g4 C p += cy
+ subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4]
+ b,a L(xxx)
+L(loope):
+L(end4):
+ fxtod %f10,%f2
+ add %g3,%g1,%g4 C p += cy
+ subcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-32],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4]
+ b,a L(yyy)
+
+L(end3):
+ fxtod %f10,%f2
+ ld [%i0],%g5
+ ldx [%fp-24],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-16],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+L(xxx): fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ ldx [%fp-32],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+ b,a L(ret)
+
+L(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-40]
+ fdtox %f4,%f12
+ std %f12,[%fp-32]
+ ld [%i0],%g5
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+L(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-40],%g2 C p16
+ ldx [%fp-32],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+ b,a L(ret)
+
+L(end1):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-24]
+ fdtox %f4,%f12
+ std %f12,[%fp-16]
+
+ ld [%i0],%g5
+ ldx [%fp-24],%g2 C p16
+ ldx [%fp-16],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+L(ret): add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ srlx %g4,32,%g3
+ st %l2,[%i0-4]
+
+ addx %g3,%g0,%g3
+ ret
+ restore %g0,%g3,%o0 C sideeffect: put cy in retreg
+EPILOGUE(mpn_submul_1)
diff --git a/rts/gmp/mpn/sparc64/README b/rts/gmp/mpn/sparc64/README
new file mode 100644
index 0000000000..6923a133f3
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/README
@@ -0,0 +1,48 @@
+This directory contains mpn functions for 64-bit V9 SPARC
+
+RELEVANT OPTIMIZATION ISSUES
+
+The Ultra I/II pipeline executes up to two simple integer arithmetic operations
+per cycle. The 64-bit integer multiply instruction mulx takes from 5 cycles to
+35 cycles, depending on the position of the most significant bit of the 1st
+source operand. It cannot overlap with other instructions. For our use of
+mulx, it will take from 5 to 20 cycles.
+
+Integer conditional move instructions cannot dual-issue with other integer
+instructions. No conditional move can issue 1-5 cycles after a load. (Or
+something such bizzare.)
+
+Integer branches can issue with two integer arithmetic instructions. Likewise
+for integer loads. Four instructions may issue (arith, arith, ld/st, branch)
+but only if the branch is last.
+
+(The V9 architecture manual recommends that the 2nd operand of a multiply
+instruction be the smaller one. For UltraSPARC, they got things backwards and
+optimize for the wrong operand! Really helpful, given that multiply
+is incredibly slow on these CPUs!)
+
+STATUS
+
+There is new code in ~/prec/gmp-remote/sparc64. Not tested or completed, but
+the pipelines are worked out. Here are the timings:
+
+* lshift, rshift: The code is well-optimized and runs at 2.0 cycles/limb.
+
+* add_n, sub_n: add3.s currently runs at 6 cycles/limb. We use a bizarre
+ scheme of compares and branches (with some nops and fnops to align things)
+ and carefully stay away from the instructions intended for this application
+ (i.e., movcs and movcc).
+
+ Using movcc/movcs, even with deep unrolling, seems to get down to 7
+ cycles/limb.
+
+ The most promising approach is to split operands in 32-bit pieces using
+  srlx, then use two addccc, and finally combine the results with sllx+or.
+ The result could run at 5 cycles/limb, I think. It might be possible to
+ do without unrolling, or with minimal unrolling.
+
+* addmul_1/submul_1: Should optimize for when scalar operand < 2^32.
+* addmul_1/submul_1: Since mulx is horrendously slow on UltraSPARC I/II,
+ Karatsuba's method should save up to 16 cycles (i.e. > 20%).
+* mul_1 (and possibly the other multiply functions): Handle carry in the
+ same tricky way as add_n,sub_n.
diff --git a/rts/gmp/mpn/sparc64/add_n.asm b/rts/gmp/mpn/sparc64/add_n.asm
new file mode 100644
index 0000000000..72b3895a5b
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/add_n.asm
@@ -0,0 +1,172 @@
+! SPARC v9 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+! sum in a third limb vector.
+
+! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! s1_ptr %o1
+! s2_ptr %o2
+! size %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_add_n)
+
+! 12 mem ops >= 12 cycles
+! 8 shift insn >= 8 cycles
+! 8 addccc, executing alone, +8 cycles
+! Unrolling not mandatory...perhaps 2-way is best?
+! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl
+! All in all, it runs at 5 cycles/limb
+
+ save %sp,-160,%sp
+
+ addcc %g0,%g0,%g0
+
+ add %i3,-4,%i3
+ brlz,pn %i3,L(there)
+ nop
+
+ ldx [%i1+0],%l0
+ ldx [%i2+0],%l4
+ ldx [%i1+8],%l1
+ ldx [%i2+8],%l5
+ ldx [%i1+16],%l2
+ ldx [%i2+16],%l6
+ ldx [%i1+24],%l3
+ ldx [%i2+24],%l7
+ add %i1,32,%i1
+ add %i2,32,%i2
+
+ add %i3,-4,%i3
+ brlz,pn %i3,L(skip)
+ nop
+ b L(loop1) ! jump instead of executing many NOPs
+ nop
+ ALIGN(32)
+!--------- Start main loop ---------
+L(loop1):
+ addccc %l0,%l4,%g1
+!-
+ srlx %l0,32,%o0
+ ldx [%i1+0],%l0
+!-
+ srlx %l4,32,%o4
+ ldx [%i2+0],%l4
+!-
+ addccc %o0,%o4,%g0
+!-
+ addccc %l1,%l5,%g2
+!-
+ srlx %l1,32,%o1
+ ldx [%i1+8],%l1
+!-
+ srlx %l5,32,%o5
+ ldx [%i2+8],%l5
+!-
+ addccc %o1,%o5,%g0
+!-
+ addccc %l2,%l6,%g3
+!-
+ srlx %l2,32,%o2
+ ldx [%i1+16],%l2
+!-
+ srlx %l6,32,%g5 ! asymmetry
+ ldx [%i2+16],%l6
+!-
+ addccc %o2,%g5,%g0
+!-
+ addccc %l3,%l7,%g4
+!-
+ srlx %l3,32,%o3
+ ldx [%i1+24],%l3
+ add %i1,32,%i1
+!-
+ srlx %l7,32,%o7
+ ldx [%i2+24],%l7
+ add %i2,32,%i2
+!-
+ addccc %o3,%o7,%g0
+!-
+ stx %g1,[%i0+0]
+!-
+ stx %g2,[%i0+8]
+!-
+ stx %g3,[%i0+16]
+ add %i3,-4,%i3
+!-
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
+ brgez,pt %i3,L(loop1)
+ nop
+!--------- End main loop ---------
+L(skip):
+ addccc %l0,%l4,%g1
+ srlx %l0,32,%o0
+ srlx %l4,32,%o4
+ addccc %o0,%o4,%g0
+ addccc %l1,%l5,%g2
+ srlx %l1,32,%o1
+ srlx %l5,32,%o5
+ addccc %o1,%o5,%g0
+ addccc %l2,%l6,%g3
+ srlx %l2,32,%o2
+ srlx %l6,32,%g5 ! asymmetry
+ addccc %o2,%g5,%g0
+ addccc %l3,%l7,%g4
+ srlx %l3,32,%o3
+ srlx %l7,32,%o7
+ addccc %o3,%o7,%g0
+ stx %g1,[%i0+0]
+ stx %g2,[%i0+8]
+ stx %g3,[%i0+16]
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
+L(there):
+ add %i3,4,%i3
+ brz,pt %i3,L(end)
+ nop
+
+L(loop2):
+ ldx [%i1+0],%l0
+ add %i1,8,%i1
+ ldx [%i2+0],%l4
+ add %i2,8,%i2
+ srlx %l0,32,%g2
+ srlx %l4,32,%g3
+ addccc %l0,%l4,%g1
+ addccc %g2,%g3,%g0
+ stx %g1,[%i0+0]
+ add %i0,8,%i0
+ add %i3,-1,%i3
+ brgz,pt %i3,L(loop2)
+ nop
+
+L(end): addc %g0,%g0,%i0
+ ret
+ restore
+EPILOGUE(mpn_add_n)
diff --git a/rts/gmp/mpn/sparc64/addmul1h.asm b/rts/gmp/mpn/sparc64/addmul1h.asm
new file mode 100644
index 0000000000..96cb5f7369
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/addmul1h.asm
@@ -0,0 +1,203 @@
+dnl SPARC 64-bit addmull/addmulu -- Helper for mpn_addmul_1 and mpn_mul_1.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+ifdef(`LOWPART',
+`addmull:',
+`addmulu:')
+ save %sp,-256,%sp
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f8
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1+4],%f11
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,E(end2)
+ std %f12,[%fp-17]
+
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end3)
+ std %f12,[%fp-33]
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ld [%i0+DLO],%g5
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,E(end4)
+ std %f12,[%fp-17]
+
+ b,a E(loop)
+ nop C nop is cheap to nullify
+
+ ALIGN(16)
+C BEGIN LOOP
+E(loop):
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0+DHI],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ sub %i2,2,%i2
+ add %i0,4,%i0 C res_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0+DLO],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DHI]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ std %f12,[%fp-17]
+ brnz,pt %i2,E(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+E(loope):
+E(end4):
+ fxtod %f10,%f2
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0+DHI],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ ld [%i0+DLO],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DHI]
+ b,a E(yyy)
+
+E(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ ld [%i0+DLO],%g5
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g5,%g1,%g1 C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ifdef(`LOWPART',
+` ld [%i0+DHI],%g5')
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ ldx [%fp-33],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ifdef(`LOWPART',
+` add %g5,%g1,%g1') C add *res_ptr to p0 (ADD2)
+ add %g3,%g1,%g4 C p += cy
+ifdef(`LOWPART',
+` st %g4,[%i0-4+DHI]
+ srlx %g4,32,%g4')
+
+ ret
+ restore %g0,%g4,%o0 C sideeffect: put cy in retreg
+ifdef(`LOWPART',
+`EPILOGUE(addmull)',
+`EPILOGUE(addmulu)')
diff --git a/rts/gmp/mpn/sparc64/addmul_1.asm b/rts/gmp/mpn/sparc64/addmul_1.asm
new file mode 100644
index 0000000000..c3f04cea6a
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/addmul_1.asm
@@ -0,0 +1,114 @@
+dnl SPARC 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl add the result to a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+
+PROLOGUE(mpn_addmul_1)
+ save %sp,-256,%sp
+
+C We store 0.0 in f10 and keep it invariant across the two
+C function calls below. Note that this is not ABI conformant,
+C but since the functions are local, that's acceptable.
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hh(L(noll)),%g2
+ sethi %lm(L(noll)),%g1
+ or %g2,%hm(L(noll)),%g2
+ or %g1,%lo(L(noll)),%g1
+ sllx %g2,32,%g2
+ ld [%g1+%g2],%f10')
+
+ sub %i1,%i0,%g1
+ srlx %g1,3,%g1
+ cmp %g1,%i2
+ bcc,pt %xcc,L(nooverlap)
+ nop
+
+ sllx %i2,3,%g2 C compute stack allocation byte count
+ add %g2,15,%o0
+ and %o0,-16,%o0
+ sub %sp,%o0,%sp
+ add %sp,2223,%o0
+
+ mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp
+ call mpn_copyi
+ mov %i2,%o2 C copy n to mpn_copyi's count parameter
+
+ add %sp,2223,%i1
+
+L(nooverlap):
+C First multiply-add with low 32 bits of s2_limb
+ mov %i0,%o0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call addmull
+ srl %i3,0,%o3
+
+	mov	%o0,%l0		C keep carry-out from addmull
+
+C Now multiply-add with high 32 bits of s2_limb, unless it is zero.
+ srlx %i3,32,%o3
+ brz,a,pn %o3,L(small)
+ mov %o0,%i0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call addmulu
+ add %i0,4,%o0
+
+ add %l0,%o0,%i0
+L(small):
+ ret
+ restore %g0,%g0,%g0
+EPILOGUE(mpn_addmul_1)
+
+C Put a zero in the text segment to allow us to get the address
+C quickly when compiling for PIC
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+define(`LO',`(+4)')
+define(`HI',`(-4)')
+
+define(`DLO',`(+4)')
+define(`DHI',`(-4)')
+define(`LOWPART')
+define(`E',`L(l.$1)')
+include_mpn(`sparc64/addmul1h.asm')
+
+define(`DLO',`(-4)')
+define(`DHI',`(+4)')
+undefine(`LOWPART')
+define(`E',`L(u.$1)')
+include_mpn(`sparc64/addmul1h.asm')
diff --git a/rts/gmp/mpn/sparc64/copyi.asm b/rts/gmp/mpn/sparc64/copyi.asm
new file mode 100644
index 0000000000..d9957e3c90
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/copyi.asm
@@ -0,0 +1,79 @@
+! SPARC v9 __gmpn_copy -- Copy a limb vector.
+
+! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! rptr %o0
+! sptr %o1
+! n %o2
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_copyi)
+ add %o2,-8,%o2
+ brlz,pn %o2,L(skip)
+ nop
+ b,a L(loop1)
+ nop
+
+ ALIGN(16)
+L(loop1):
+ ldx [%o1+0],%g1
+ ldx [%o1+8],%g2
+ ldx [%o1+16],%g3
+ ldx [%o1+24],%g4
+ ldx [%o1+32],%g5
+ ldx [%o1+40],%o3
+ ldx [%o1+48],%o4
+ ldx [%o1+56],%o5
+ add %o1,64,%o1
+ stx %g1,[%o0+0]
+ stx %g2,[%o0+8]
+ stx %g3,[%o0+16]
+ stx %g4,[%o0+24]
+ stx %g5,[%o0+32]
+ stx %o3,[%o0+40]
+ stx %o4,[%o0+48]
+ stx %o5,[%o0+56]
+ add %o2,-8,%o2
+ brgez,pt %o2,L(loop1)
+ add %o0,64,%o0
+
+L(skip):
+ add %o2,8,%o2
+ brz,pt %o2,L(end)
+ nop
+
+L(loop2):
+ ldx [%o1],%g1
+ add %o1,8,%o1
+ add %o2,-1,%o2
+ stx %g1,[%o0]
+ add %o0,8,%o0
+ brgz,pt %o2,L(loop2)
+ nop
+
+L(end): retl
+ nop
+EPILOGUE(mpn_copyi)
diff --git a/rts/gmp/mpn/sparc64/gmp-mparam.h b/rts/gmp/mpn/sparc64/gmp-mparam.h
new file mode 100644
index 0000000000..74f61661c1
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/gmp-mparam.h
@@ -0,0 +1,88 @@
+/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 64
+#define BYTES_PER_MP_LIMB 8
+#define BITS_PER_LONGINT 64
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+/* Tell the toom3 multiply implementation to call low-level mpn
+ functions instead of open-coding operations in C. */
+#define USE_MORE_MPN 1
+
+
+/* Run on sun workshop cc. */
+/* Generated by tuneup.c, 2000-07-30. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 12
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 95
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 33
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 125
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 27
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 107
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 12
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 199
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 304, 608, 1344, 2304, 7168, 20480, 49152, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 320
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 1664
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 304, 608, 1344, 2816, 7168, 20480, 49152, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 320
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 1664
+#endif
diff --git a/rts/gmp/mpn/sparc64/lshift.asm b/rts/gmp/mpn/sparc64/lshift.asm
new file mode 100644
index 0000000000..2d2edc50a7
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/lshift.asm
@@ -0,0 +1,97 @@
+! SPARC v9 __gmpn_lshift --
+
+! Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! src_ptr %o1
+! size %o2
+! cnt %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_lshift)
+ sllx %o2,3,%g1
+ add %o1,%g1,%o1 ! make %o1 point at end of src
+ ldx [%o1-8],%g2 ! load first limb
+ sub %g0,%o3,%o5 ! negate shift count
+ add %o0,%g1,%o0 ! make %o0 point at end of res
+ add %o2,-1,%o2
+ and %o2,4-1,%g4 ! number of limbs in first loop
+ srlx %g2,%o5,%g1 ! compute function result
+ brz,pn %g4,L(0) ! if multiple of 4 limbs, skip first loop
+ mov %g1,%g5
+
+ sub %o2,%g4,%o2 ! adjust count for main loop
+
+L(loop0):
+ ldx [%o1-16],%g3
+ add %o0,-8,%o0
+ add %o1,-8,%o1
+ add %g4,-1,%g4
+ sllx %g2,%o3,%o4
+ srlx %g3,%o5,%g1
+ mov %g3,%g2
+ or %o4,%g1,%o4
+ brnz,pt %g4,L(loop0)
+ stx %o4,[%o0+0]
+
+L(0): brz,pn %o2,L(end)
+ nop
+
+L(loop1):
+ ldx [%o1-16],%g3
+ add %o0,-32,%o0
+ add %o2,-4,%o2
+ sllx %g2,%o3,%o4
+ srlx %g3,%o5,%g1
+
+ ldx [%o1-24],%g2
+ sllx %g3,%o3,%g4
+ or %o4,%g1,%o4
+ stx %o4,[%o0+24]
+ srlx %g2,%o5,%g1
+
+ ldx [%o1-32],%g3
+ sllx %g2,%o3,%o4
+ or %g4,%g1,%g4
+ stx %g4,[%o0+16]
+ srlx %g3,%o5,%g1
+
+ ldx [%o1-40],%g2
+ sllx %g3,%o3,%g4
+ or %o4,%g1,%o4
+ stx %o4,[%o0+8]
+ srlx %g2,%o5,%g1
+
+ add %o1,-32,%o1
+ or %g4,%g1,%g4
+ brnz,pt %o2,L(loop1)
+ stx %g4,[%o0+0]
+
+L(end): sllx %g2,%o3,%g2
+ stx %g2,[%o0-8]
+ retl
+ mov %g5,%o0
+EPILOGUE(mpn_lshift)
diff --git a/rts/gmp/mpn/sparc64/mul_1.asm b/rts/gmp/mpn/sparc64/mul_1.asm
new file mode 100644
index 0000000000..f2f2821d51
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/mul_1.asm
@@ -0,0 +1,113 @@
+dnl SPARC 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and
+dnl store the result to a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+
+PROLOGUE(mpn_mul_1)
+ save %sp,-256,%sp
+
+C We store 0.0 in f10 and keep it invariant across the two
+C function calls below. Note that this is not ABI conformant,
+C but since the functions are local, that's acceptable.
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hh(L(noll)),%g2
+ sethi %lm(L(noll)),%g1
+ or %g2,%hm(L(noll)),%g2
+ or %g1,%lo(L(noll)),%g1
+ sllx %g2,32,%g2
+ ld [%g1+%g2],%f10')
+
+ sub %i1,%i0,%g1
+ srlx %g1,3,%g1
+ cmp %g1,%i2
+ bcc,pt %xcc,L(nooverlap)
+ nop
+
+ sllx %i2,3,%g2 C compute stack allocation byte count
+ add %g2,15,%o0
+ and %o0,-16,%o0
+ sub %sp,%o0,%sp
+ add %sp,2223,%o0
+
+ mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp
+ call mpn_copyi
+ mov %i2,%o2 C copy n to mpn_copyi's count parameter
+
+ add %sp,2223,%i1
+
+L(nooverlap):
+C First multiply-add with low 32 bits of s2_limb
+ mov %i0,%o0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call mull
+ srl %i3,0,%o3
+
+	mov	%o0,%l0		C keep carry-out from mull
+
+C Now multiply-add with high 32 bits of s2_limb, unless it is zero.
+ srlx %i3,32,%o3
+ brz,a,pn %o3,L(small)
+ mov %o0,%i0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call addmulu
+ add %i0,4,%o0
+
+ add %l0,%o0,%i0
+L(small):
+ ret
+ restore %g0,%g0,%g0
+EPILOGUE(mpn_mul_1)
+
+C Put a zero in the text segment to allow us to get the address
+C quickly when compiling for PIC
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+define(`LO',`(+4)')
+define(`HI',`(-4)')
+
+define(`DLO',`(+4)')
+define(`DHI',`(-4)')
+define(`E',`L($1)')
+include_mpn(`sparc64/mul_1h.asm')
+
+define(`DLO',`(-4)')
+define(`DHI',`(+4)')
+undefine(`LOWPART')
+define(`E',`L(u.$1)')
+include_mpn(`sparc64/addmul1h.asm')
diff --git a/rts/gmp/mpn/sparc64/mul_1h.asm b/rts/gmp/mpn/sparc64/mul_1h.asm
new file mode 100644
index 0000000000..5078c01c3f
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/mul_1h.asm
@@ -0,0 +1,183 @@
+dnl SPARC 64-bit mull -- Helper for mpn_mul_1.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+mull:
+ save %sp,-256,%sp
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f8
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1+4],%f11
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,E(end2)
+ std %f12,[%fp-17]
+
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end3)
+ std %f12,[%fp-33]
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,E(end4)
+ std %f12,[%fp-17]
+
+ b,a E(loop)
+ nop C nop is cheap to nullify
+
+ ALIGN(16)
+C BEGIN LOOP
+E(loop):
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ sub %i2,2,%i2
+ add %i0,4,%i0 C res_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DHI]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ std %f12,[%fp-17]
+ brnz,pt %i2,E(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+E(loope):
+E(end4):
+ fxtod %f10,%f2
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DHI]
+ b,a E(yyy)
+
+E(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ ldx [%fp-33],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %g4,[%i0-4+DLO]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ st %g4,[%i0-4+DHI]
+ srlx %g4,32,%g4
+
+ ret
+ restore %g0,%g4,%o0 C sideeffect: put cy in retreg
+EPILOGUE(mull)
diff --git a/rts/gmp/mpn/sparc64/rshift.asm b/rts/gmp/mpn/sparc64/rshift.asm
new file mode 100644
index 0000000000..baf7920efb
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/rshift.asm
@@ -0,0 +1,94 @@
+! SPARC v9 __gmpn_rshift --
+
+! Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! src_ptr %o1
+! size %o2
+! cnt %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_rshift)
+ ldx [%o1],%g2 ! load first limb
+ sub %g0,%o3,%o5 ! negate shift count
+ add %o2,-1,%o2
+ and %o2,4-1,%g4 ! number of limbs in first loop
+ sllx %g2,%o5,%g1 ! compute function result
+ brz,pn %g4,L(0) ! if multiple of 4 limbs, skip first loop
+ mov %g1,%g5
+
+ sub %o2,%g4,%o2 ! adjust count for main loop
+
+L(loop0):
+ ldx [%o1+8],%g3
+ add %o0,8,%o0
+ add %o1,8,%o1
+ add %g4,-1,%g4
+ srlx %g2,%o3,%o4
+ sllx %g3,%o5,%g1
+ mov %g3,%g2
+ or %o4,%g1,%o4
+ brnz,pt %g4,L(loop0)
+ stx %o4,[%o0-8]
+
+L(0): brz,pn %o2,L(end)
+ nop
+
+L(loop1):
+ ldx [%o1+8],%g3
+ add %o0,32,%o0
+ add %o2,-4,%o2
+ srlx %g2,%o3,%o4
+ sllx %g3,%o5,%g1
+
+ ldx [%o1+16],%g2
+ srlx %g3,%o3,%g4
+ or %o4,%g1,%o4
+ stx %o4,[%o0-32]
+ sllx %g2,%o5,%g1
+
+ ldx [%o1+24],%g3
+ srlx %g2,%o3,%o4
+ or %g4,%g1,%g4
+ stx %g4,[%o0-24]
+ sllx %g3,%o5,%g1
+
+ ldx [%o1+32],%g2
+ srlx %g3,%o3,%g4
+ or %o4,%g1,%o4
+ stx %o4,[%o0-16]
+ sllx %g2,%o5,%g1
+
+ add %o1,32,%o1
+ or %g4,%g1,%g4
+ brnz %o2,L(loop1)
+ stx %g4,[%o0-8]
+
+L(end): srlx %g2,%o3,%g2
+ stx %g2,[%o0-0]
+ retl
+ mov %g5,%o0
+EPILOGUE(mpn_rshift)
diff --git a/rts/gmp/mpn/sparc64/sub_n.asm b/rts/gmp/mpn/sparc64/sub_n.asm
new file mode 100644
index 0000000000..61547138e0
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/sub_n.asm
@@ -0,0 +1,172 @@
+! SPARC v9 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+! store difference in a third limb vector.
+
+! Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr %o0
+! s1_ptr %o1
+! s2_ptr %o2
+! size %o3
+
+include(`../config.m4')
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+PROLOGUE(mpn_sub_n)
+
+! 12 mem ops >= 12 cycles
+! 8 shift insn >= 8 cycles
+! 8 addccc, executing alone, +8 cycles
+! Unrolling not mandatory...perhaps 2-way is best?
+! Put one ldx/stx and one s?lx per issue tuple, fill with pointer arith and loop ctl
+! All in all, it runs at 5 cycles/limb
+
+ save %sp,-160,%sp
+
+ addcc %g0,%g0,%g0
+
+ add %i3,-4,%i3
+ brlz,pn %i3,L(there)
+ nop
+
+ ldx [%i1+0],%l0
+ ldx [%i2+0],%l4
+ ldx [%i1+8],%l1
+ ldx [%i2+8],%l5
+ ldx [%i1+16],%l2
+ ldx [%i2+16],%l6
+ ldx [%i1+24],%l3
+ ldx [%i2+24],%l7
+ add %i1,32,%i1
+ add %i2,32,%i2
+
+ add %i3,-4,%i3
+ brlz,pn %i3,L(skip)
+ nop
+ b L(loop1) ! jump instead of executing many NOPs
+ nop
+ ALIGN(32)
+!--------- Start main loop ---------
+L(loop1):
+ subccc %l0,%l4,%g1
+!-
+ srlx %l0,32,%o0
+ ldx [%i1+0],%l0
+!-
+ srlx %l4,32,%o4
+ ldx [%i2+0],%l4
+!-
+ subccc %o0,%o4,%g0
+!-
+ subccc %l1,%l5,%g2
+!-
+ srlx %l1,32,%o1
+ ldx [%i1+8],%l1
+!-
+ srlx %l5,32,%o5
+ ldx [%i2+8],%l5
+!-
+ subccc %o1,%o5,%g0
+!-
+ subccc %l2,%l6,%g3
+!-
+ srlx %l2,32,%o2
+ ldx [%i1+16],%l2
+!-
+ srlx %l6,32,%g5 ! asymmetry
+ ldx [%i2+16],%l6
+!-
+ subccc %o2,%g5,%g0
+!-
+ subccc %l3,%l7,%g4
+!-
+ srlx %l3,32,%o3
+ ldx [%i1+24],%l3
+ add %i1,32,%i1
+!-
+ srlx %l7,32,%o7
+ ldx [%i2+24],%l7
+ add %i2,32,%i2
+!-
+ subccc %o3,%o7,%g0
+!-
+ stx %g1,[%i0+0]
+!-
+ stx %g2,[%i0+8]
+!-
+ stx %g3,[%i0+16]
+ add %i3,-4,%i3
+!-
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
+ brgez,pt %i3,L(loop1)
+ nop
+!--------- End main loop ---------
+L(skip):
+ subccc %l0,%l4,%g1
+ srlx %l0,32,%o0
+ srlx %l4,32,%o4
+ subccc %o0,%o4,%g0
+ subccc %l1,%l5,%g2
+ srlx %l1,32,%o1
+ srlx %l5,32,%o5
+ subccc %o1,%o5,%g0
+ subccc %l2,%l6,%g3
+ srlx %l2,32,%o2
+ srlx %l6,32,%g5 ! asymmetry
+ subccc %o2,%g5,%g0
+ subccc %l3,%l7,%g4
+ srlx %l3,32,%o3
+ srlx %l7,32,%o7
+ subccc %o3,%o7,%g0
+ stx %g1,[%i0+0]
+ stx %g2,[%i0+8]
+ stx %g3,[%i0+16]
+ stx %g4,[%i0+24]
+ add %i0,32,%i0
+
+L(there):
+ add %i3,4,%i3
+ brz,pt %i3,L(end)
+ nop
+
+L(loop2):
+ ldx [%i1+0],%l0
+ add %i1,8,%i1
+ ldx [%i2+0],%l4
+ add %i2,8,%i2
+ srlx %l0,32,%g2
+ srlx %l4,32,%g3
+ subccc %l0,%l4,%g1
+ subccc %g2,%g3,%g0
+ stx %g1,[%i0+0]
+ add %i0,8,%i0
+ add %i3,-1,%i3
+ brgz,pt %i3,L(loop2)
+ nop
+
+L(end): addc %g0,%g0,%i0
+ ret
+ restore
+EPILOGUE(mpn_sub_n)
diff --git a/rts/gmp/mpn/sparc64/submul1h.asm b/rts/gmp/mpn/sparc64/submul1h.asm
new file mode 100644
index 0000000000..7f51ba59c6
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/submul1h.asm
@@ -0,0 +1,204 @@
+dnl SPARC 64-bit submull/submulu -- Helper for mpn_submul_1 and mpn_mul_1.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+ifdef(`LOWPART',
+`submull:',
+`submulu:')
+ save %sp,-256,%sp
+
+ sethi %hi(0xffff0000),%o0
+ andn %i3,%o0,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f6
+
+ srl %i3,16,%o0
+ st %o0,[%fp-17]
+ ld [%fp-17],%f11
+ fxtod %f10,%f8
+
+ mov 0,%g3 C cy = 0
+
+ ld [%i1+4],%f11
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end1)
+ add %i1,4,%i1 C s1_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+ be,pn %icc,E(end2)
+ std %f12,[%fp-17]
+
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ subcc %i2,1,%i2
+dnl be,pn %icc,E(end3)
+ std %f12,[%fp-33]
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ ld [%i0+DLO],%g5
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ add %i0,4,%i0 C res_ptr++
+ subcc %i2,1,%i2
+ be,pn %icc,E(end4)
+ std %f12,[%fp-17]
+
+ b,a E(loop)
+ nop C nop is cheap to nullify
+
+ ALIGN(16)
+C BEGIN LOOP
+E(loop):
+ fxtod %f10,%f2
+ ld [%i1+4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0+DHI],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ sub %i2,2,%i2
+ add %i0,4,%i0 C res_ptr++
+
+ fxtod %f10,%f2
+ ld [%i1-4],%f11
+ add %i1,4,%i1 C s1_ptr++
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0+DLO],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-17],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DHI]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-25]
+ fdtox %f4,%f12
+ std %f12,[%fp-17]
+ brnz,pt %i2,E(loop)
+ add %i0,4,%i0 C res_ptr++
+C END LOOP
+E(loope):
+E(end4):
+ fxtod %f10,%f2
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0+DHI],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ fmuld %f2,%f8,%f16
+ ldx [%fp-33],%g1 C p0
+ fmuld %f2,%f6,%f4
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DLO]
+ fdtox %f16,%f14
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ ld [%i0+DLO],%g5
+ srlx %g4,32,%g3
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DHI]
+ b,a E(yyy)
+
+E(end2):
+ fxtod %f10,%f2
+ fmuld %f2,%f8,%f16
+ fmuld %f2,%f6,%f4
+ fdtox %f16,%f14
+ std %f14,[%fp-41]
+ fdtox %f4,%f12
+ std %f12,[%fp-33]
+ ld [%i0+DLO],%g5
+ ldx [%fp-25],%g2 C p16
+ ldx [%fp-17],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+E(yyy): add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ subxcc %g5,%g4,%l2 C add *res_ptr to p0 (ADD2)
+ifdef(`LOWPART',
+` ld [%i0+DHI],%g5')
+ srlx %g4,32,%g3
+ ldx [%fp-41],%g2 C p16
+ ldx [%fp-33],%g1 C p0
+ sllx %g2,16,%g2 C align p16
+ st %l2,[%i0-4+DLO]
+ add %g2,%g1,%g1 C add p16 to p0 (ADD1)
+ add %i0,4,%i0 C res_ptr++
+
+ add %g3,%g1,%g4 C p += cy
+ifdef(`LOWPART',
+` subxcc %g5,%g4,%l2') C add *res_ptr to p0 (ADD2)
+ifdef(`LOWPART',
+` st %l2,[%i0-4+DHI]
+ srlx %g4,32,%g4')
+
+ addx %g4,0,%g4
+ ret
+ restore %g0,%g4,%o0 C sideeffect: put cy in retreg
+ifdef(`LOWPART',
+`EPILOGUE(submull)',
+`EPILOGUE(submulu)')
diff --git a/rts/gmp/mpn/sparc64/submul_1.asm b/rts/gmp/mpn/sparc64/submul_1.asm
new file mode 100644
index 0000000000..7c6af0a98b
--- /dev/null
+++ b/rts/gmp/mpn/sparc64/submul_1.asm
@@ -0,0 +1,114 @@
+dnl SPARC 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright (C) 1998, 2000 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 2.1 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+dnl MA 02111-1307, USA.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr i0
+C s1_ptr i1
+C size i2
+C s2_limb i3
+
+ASM_START()
+ .register %g2,#scratch
+ .register %g3,#scratch
+
+PROLOGUE(mpn_submul_1)
+ save %sp,-256,%sp
+
+C We store 0.0 in f10 and keep it invariant across the two
+C function calls below. Note that this is not ABI conformant,
+C but since the functions are local, that's acceptable.
+ifdef(`PIC',
+`L(pc): rd %pc,%o7
+ ld [%o7+L(noll)-L(pc)],%f10',
+` sethi %hh(L(noll)),%g2
+ sethi %lm(L(noll)),%g1
+ or %g2,%hm(L(noll)),%g2
+ or %g1,%lo(L(noll)),%g1
+ sllx %g2,32,%g2
+ ld [%g1+%g2],%f10')
+
+ sub %i1,%i0,%g1
+ srlx %g1,3,%g1
+ cmp %g1,%i2
+ bcc,pt %xcc,L(nooverlap)
+ nop
+
+ sllx %i2,3,%g2 C compute stack allocation byte count
+ add %g2,15,%o0
+ and %o0,-16,%o0
+ sub %sp,%o0,%sp
+ add %sp,2223,%o0
+
+ mov %i1,%o1 C copy s1_ptr to mpn_copyi's srcp
+ call mpn_copyi
+ mov %i2,%o2 C copy n to mpn_copyi's count parameter
+
+ add %sp,2223,%i1
+
+L(nooverlap):
+C First multiply-add with low 32 bits of s2_limb
+ mov %i0,%o0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call submull
+ srl %i3,0,%o3
+
+	mov	%o0,%l0			C keep carry-out from submull
+
+C Now multiply-add with high 32 bits of s2_limb, unless it is zero.
+ srlx %i3,32,%o3
+ brz,a,pn %o3,L(small)
+ mov %o0,%i0
+ mov %i1,%o1
+ add %i2,%i2,%o2
+ call submulu
+ add %i0,4,%o0
+
+ add %l0,%o0,%i0
+L(small):
+ ret
+ restore %g0,%g0,%g0
+EPILOGUE(mpn_submul_1)
+
+C Put a zero in the text segment to allow us to get the address
+C quickly when compiling for PIC
+ TEXT
+ ALIGN(4)
+L(noll):
+ .word 0
+
+define(`LO',`(+4)')
+define(`HI',`(-4)')
+
+define(`DLO',`(+4)')
+define(`DHI',`(-4)')
+define(`LOWPART')
+define(`E',`L(l.$1)')
+include_mpn(`sparc64/submul1h.asm')
+
+define(`DLO',`(-4)')
+define(`DHI',`(+4)')
+undefine(`LOWPART')
+define(`E',`L(u.$1)')
+include_mpn(`sparc64/submul1h.asm')
diff --git a/rts/gmp/mpn/thumb/add_n.s b/rts/gmp/mpn/thumb/add_n.s
new file mode 100644
index 0000000000..c1eeb6ca87
--- /dev/null
+++ b/rts/gmp/mpn/thumb/add_n.s
@@ -0,0 +1,50 @@
+@ ARM/Thumb __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+@ sum in a third limb vector.
+
+@ Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+
+@ INPUT PARAMETERS
+@ RES_ptr r0
+@ S1_ptr r1
+@ S2_ptr r2
+@ SIZE r3
+
+@ NOT TESTED CODE
+
+ .text
+ .thumb
+ .align 0
+ .global ___gmpn_add_n
+___gmpn_add_n:
+ push {r4, r5, r6, lr}
+ mov r6, #1 @ init carry save register
+
+Loop: sub r6, #1 @ restore carry (set iff r6 was 0)
+ ldmia r1!, {r4} @ load next limb from S1
+ ldmia r2!, {r5} @ load next limb from S2
+ adc r4, r5
+ stmia r0!, {r4} @ store result limb to RES
+ sbc r6, r6 @ save negated carry
+ sub r3, #1
+	bge	Loop			@ loop back while remaining count > 0
+
+ mov r0, r6
+ pop {r4, r5, r6, pc}
diff --git a/rts/gmp/mpn/thumb/sub_n.s b/rts/gmp/mpn/thumb/sub_n.s
new file mode 100644
index 0000000000..53c292375f
--- /dev/null
+++ b/rts/gmp/mpn/thumb/sub_n.s
@@ -0,0 +1,50 @@
+@ ARM/Thumb __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+@ store difference in a third limb vector.
+
+@ Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+@ This file is part of the GNU MP Library.
+
+@ The GNU MP Library is free software; you can redistribute it and/or modify
+@ it under the terms of the GNU Lesser General Public License as published by
+@ the Free Software Foundation; either version 2.1 of the License, or (at your
+@ option) any later version.
+
+@ The GNU MP Library is distributed in the hope that it will be useful, but
+@ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+@ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+@ License for more details.
+
+@ You should have received a copy of the GNU Lesser General Public License
+@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+@ the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+@ MA 02111-1307, USA.
+
+
+@ INPUT PARAMETERS
+@ RES_ptr r0
+@ S1_ptr r1
+@ S2_ptr r2
+@ SIZE r3
+
+@ NOT TESTED CODE
+
+ .text
+ .thumb
+ .align 0
+ .global ___gmpn_sub_n
+___gmpn_sub_n:
+ push {r4, r5, r6, lr}
+ mov r6, #1 @ init carry save register
+
+Loop: sub r6, #1 @ restore carry (set iff r6 was 0)
+ ldmia r1!, {r4} @ load next limb from S1
+ ldmia r2!, {r5} @ load next limb from S2
+ sbc r4, r5
+ stmia r0!, {r4} @ store result limb to RES
+ sbc r6, r6 @ save negated carry
+ sub r3, #1
+	bge	Loop			@ loop back while remaining count > 0
+
+ mov r0, r6
+ pop {r4, r5, r6, pc}
diff --git a/rts/gmp/mpn/underscore.h b/rts/gmp/mpn/underscore.h
new file mode 100644
index 0000000000..240dae0f63
--- /dev/null
+++ b/rts/gmp/mpn/underscore.h
@@ -0,0 +1,26 @@
+/*
+Copyright (C) 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#if __STDC__
+#define C_SYMBOL_NAME(name) _##name
+#else
+#define C_SYMBOL_NAME(name) _/**/name
+#endif
diff --git a/rts/gmp/mpn/vax/add_n.s b/rts/gmp/mpn/vax/add_n.s
new file mode 100644
index 0000000000..cf4060f521
--- /dev/null
+++ b/rts/gmp/mpn/vax/add_n.s
@@ -0,0 +1,61 @@
+# VAX __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+# sum in a third limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr (sp + 4)
+# s1_ptr (sp + 8)
+# s2_ptr (sp + 12)
+# size (sp + 16)
+
+.text
+ .align 1
+.globl ___gmpn_add_n
+___gmpn_add_n:
+ .word 0x0
+ movl 16(ap),r0
+ movl 12(ap),r1
+ movl 8(ap),r2
+ movl 4(ap),r3
+ mnegl r0,r5
+ addl2 $3,r0
+ ashl $-2,r0,r0 # unroll loop count
+ bicl2 $-4,r5 # mask out low 2 bits
+ movaq (r5)[r5],r5 # 9x
+ jmp Loop(r5)
+
+Loop: movl (r2)+,r4
+ adwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ adwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ adwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ adwc (r1)+,r4
+ movl r4,(r3)+
+ sobgtr r0,Loop
+
+ adwc r0,r0
+ ret
diff --git a/rts/gmp/mpn/vax/addmul_1.s b/rts/gmp/mpn/vax/addmul_1.s
new file mode 100644
index 0000000000..379061dcb7
--- /dev/null
+++ b/rts/gmp/mpn/vax/addmul_1.s
@@ -0,0 +1,126 @@
+# VAX __gmpn_addmul_1 -- Multiply a limb vector with a limb and add
+# the result to a second limb vector.
+
+# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr (sp + 4)
+# s1_ptr (sp + 8)
+# size (sp + 12)
+# s2_limb (sp + 16)
+
+.text
+ .align 1
+.globl ___gmpn_addmul_1
+___gmpn_addmul_1:
+ .word 0xfc0
+ movl 12(ap),r4
+ movl 8(ap),r8
+ movl 4(ap),r9
+ movl 16(ap),r6
+ jlss s2_big
+
+ clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L1
+ clrl r11
+
+# Loop for S2_LIMB < 0x80000000
+Loop1: movl (r8)+,r1
+ jlss L1n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc $0,r3
+ addl2 r2,(r9)+
+ adwc $0,r3
+L1: movl (r8)+,r1
+ jlss L1n1
+L1p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc $0,r11
+ addl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+L1n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3
+ addl2 r2,(r9)+
+ adwc $0,r3
+ movl (r8)+,r1
+ jgeq L1p1
+L1n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11
+ addl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+
+s2_big: clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L2
+ clrl r11
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2: movl (r8)+,r1
+ jlss L2n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r1,r3
+ addl2 r2,(r9)+
+ adwc $0,r3
+L2: movl (r8)+,r1
+ jlss L2n1
+L2p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r1,r11
+ addl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
+
+L2n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3
+ addl2 r2,(r9)+
+ adwc r1,r3
+ movl (r8)+,r1
+ jgeq L2p1
+L2n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11
+ addl2 r10,(r9)+
+ adwc r1,r11
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
diff --git a/rts/gmp/mpn/vax/lshift.s b/rts/gmp/mpn/vax/lshift.s
new file mode 100644
index 0000000000..fd311a9782
--- /dev/null
+++ b/rts/gmp/mpn/vax/lshift.s
@@ -0,0 +1,58 @@
+# VAX __gmpn_lshift -- left shift.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# rptr (sp + 4)
+# sptr (sp + 8)
+# size (sp + 12)
+# cnt (sp + 16)
+# r0=retval r1=size r2,r3=itmp r4,r5=otmp call-used registers
+# r6=sptr r7=rptr r8=cnt r9 r10 r11 call-saved registers
+
+.text
+ .align 1
+.globl ___gmpn_lshift
+___gmpn_lshift:
+ .word 0x1c0
+ movl 4(ap),r7
+ movl 8(ap),r6
+ movl 12(ap),r1
+ movl 16(ap),r8
+
+ moval (r6)[r1],r6
+ moval (r7)[r1],r7
+ clrl r3
+ movl -(r6),r2
+ ashq r8,r2,r4
+ movl r5,r0
+ movl r2,r3
+ decl r1
+ jeql Lend
+
+Loop: movl -(r6),r2
+ ashq r8,r2,r4
+ movl r5,-(r7)
+ movl r2,r3
+ jsobgtr r1,Loop
+
+Lend: movl r4,-4(r7)
+ ret
diff --git a/rts/gmp/mpn/vax/mul_1.s b/rts/gmp/mpn/vax/mul_1.s
new file mode 100644
index 0000000000..708e8ca6ca
--- /dev/null
+++ b/rts/gmp/mpn/vax/mul_1.s
@@ -0,0 +1,123 @@
+# VAX __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+# the result in a second limb vector.
+
+# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr (sp + 4)
+# s1_ptr (sp + 8)
+# size (sp + 12)
+# s2_limb (sp + 16)
+
+.text
+ .align 1
+.globl ___gmpn_mul_1
+___gmpn_mul_1:
+ .word 0xfc0
+ movl 12(ap),r4
+ movl 8(ap),r8
+ movl 4(ap),r9
+ movl 16(ap),r6
+ jlss s2_big
+
+# One might want to combine the addl2 and the store below, but that
+# is actually just slower according to my timing tests. (VAX 3600)
+
+ clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L1
+ clrl r11
+
+# Loop for S2_LIMB < 0x80000000
+Loop1: movl (r8)+,r1
+ jlss L1n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc $0,r3
+ movl r2,(r9)+
+L1: movl (r8)+,r1
+ jlss L1n1
+L1p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc $0,r11
+ movl r10,(r9)+
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+L1n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3
+ movl r2,(r9)+
+ movl (r8)+,r1
+ jgeq L1p1
+L1n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11
+ movl r10,(r9)+
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+
+s2_big: clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L2
+ clrl r11
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2: movl (r8)+,r1
+ jlss L2n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r1,r3
+ movl r2,(r9)+
+L2: movl (r8)+,r1
+ jlss L2n1
+L2p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r1,r11
+ movl r10,(r9)+
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
+
+L2n0: emul r1,r6,$0,r2
+ addl2 r1,r3
+ addl2 r11,r2
+ adwc r6,r3
+ movl r2,(r9)+
+ movl (r8)+,r1
+ jgeq L2p1
+L2n1: emul r1,r6,$0,r10
+ addl2 r1,r11
+ addl2 r3,r10
+ adwc r6,r11
+ movl r10,(r9)+
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
diff --git a/rts/gmp/mpn/vax/rshift.s b/rts/gmp/mpn/vax/rshift.s
new file mode 100644
index 0000000000..515813208d
--- /dev/null
+++ b/rts/gmp/mpn/vax/rshift.s
@@ -0,0 +1,56 @@
+# VAX __gmpn_rshift -- right shift.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# rptr (sp + 4)
+# sptr (sp + 8)
+# size (sp + 12)
+# cnt (sp + 16)
+# r0=retval r1=size r2,r3=itmp r4,r5=otmp call-used registers
+# r6=sptr r7=rptr r8=cnt r9 r10 r11 call-saved registers
+
+.text
+ .align 1
+.globl ___gmpn_rshift
+___gmpn_rshift:
+ .word 0x1c0
+ movl 4(ap),r7
+ movl 8(ap),r6
+ movl 12(ap),r1
+ movl 16(ap),r8
+
+ movl (r6)+,r2
+ subl3 r8,$32,r8
+ ashl r8,r2,r0
+ decl r1
+ jeql Lend
+
+Loop: movl (r6)+,r3
+ ashq r8,r2,r4
+ movl r5,(r7)+
+ movl r3,r2
+ jsobgtr r1,Loop
+
+Lend: clrl r3
+ ashq r8,r2,r4
+ movl r5,(r7)
+ ret
diff --git a/rts/gmp/mpn/vax/sub_n.s b/rts/gmp/mpn/vax/sub_n.s
new file mode 100644
index 0000000000..eff4b1c044
--- /dev/null
+++ b/rts/gmp/mpn/vax/sub_n.s
@@ -0,0 +1,61 @@
+# VAX __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
+# difference in a third limb vector.
+
+# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr (sp + 4)
+# s1_ptr (sp + 8)
+# s2_ptr (sp + 12)
+# size (sp + 16)
+
+.text
+ .align 1
+.globl ___gmpn_sub_n
+___gmpn_sub_n:
+ .word 0x0
+ movl 16(ap),r0
+ movl 12(ap),r1
+ movl 8(ap),r2
+ movl 4(ap),r3
+ mnegl r0,r5
+ addl2 $3,r0
+ ashl $-2,r0,r0 # unroll loop count
+ bicl2 $-4,r5 # mask out low 2 bits
+ movaq (r5)[r5],r5 # 9x
+ jmp Loop(r5)
+
+Loop: movl (r2)+,r4
+ sbwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ sbwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ sbwc (r1)+,r4
+ movl r4,(r3)+
+ movl (r2)+,r4
+ sbwc (r1)+,r4
+ movl r4,(r3)+
+ sobgtr r0,Loop
+
+ adwc r0,r0
+ ret
diff --git a/rts/gmp/mpn/vax/submul_1.s b/rts/gmp/mpn/vax/submul_1.s
new file mode 100644
index 0000000000..be42286935
--- /dev/null
+++ b/rts/gmp/mpn/vax/submul_1.s
@@ -0,0 +1,126 @@
+# VAX __gmpn_submul_1 -- Multiply a limb vector with a limb and subtract
+# the result from a second limb vector.
+
+# Copyright (C) 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+# This file is part of the GNU MP Library.
+
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# INPUT PARAMETERS
+# res_ptr (sp + 4)
+# s1_ptr (sp + 8)
+# size (sp + 12)
+# s2_limb (sp + 16)
+
+.text
+ .align 1
+.globl ___gmpn_submul_1
+___gmpn_submul_1:
+ .word 0xfc0
+ movl 12(ap),r4
+ movl 8(ap),r8
+ movl 4(ap),r9
+ movl 16(ap),r6
+ jlss s2_big
+
+ clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L1
+ clrl r11
+
+# Loop for S2_LIMB < 0x80000000
+Loop1: movl (r8)+,r1
+ jlss L1n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc $0,r3
+ subl2 r2,(r9)+
+ adwc $0,r3
+L1: movl (r8)+,r1
+ jlss L1n1
+L1p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc $0,r11
+ subl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+L1n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3
+ subl2 r2,(r9)+
+ adwc $0,r3
+ movl (r8)+,r1
+ jgeq L1p1
+L1n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11
+ subl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop1
+ movl r11,r0
+ ret
+
+
+s2_big: clrl r3
+ incl r4
+ ashl $-1,r4,r7
+ jlbc r4,L2
+ clrl r11
+
+# Loop for S2_LIMB >= 0x80000000
+Loop2: movl (r8)+,r1
+ jlss L2n0
+ emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r1,r3
+ subl2 r2,(r9)+
+ adwc $0,r3
+L2: movl (r8)+,r1
+ jlss L2n1
+L2p1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r1,r11
+ subl2 r10,(r9)+
+ adwc $0,r11
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
+
+L2n0: emul r1,r6,$0,r2
+ addl2 r11,r2
+ adwc r6,r3
+ subl2 r2,(r9)+
+ adwc r1,r3
+ movl (r8)+,r1
+ jgeq L2p1
+L2n1: emul r1,r6,$0,r10
+ addl2 r3,r10
+ adwc r6,r11
+ subl2 r10,(r9)+
+ adwc r1,r11
+
+ sobgtr r7,Loop2
+ movl r11,r0
+ ret
diff --git a/rts/gmp/mpn/x86/README b/rts/gmp/mpn/x86/README
new file mode 100644
index 0000000000..3507548b8c
--- /dev/null
+++ b/rts/gmp/mpn/x86/README
@@ -0,0 +1,40 @@
+
+ X86 MPN SUBROUTINES
+
+
+This directory contains mpn functions for various 80x86 chips.
+
+
+CODE ORGANIZATION
+
+ x86 i386, i486, generic
+ x86/pentium Intel Pentium (P5, P54)
+ x86/pentium/mmx Intel Pentium with MMX (P55)
+ x86/p6 Intel Pentium Pro
+ x86/p6/mmx Intel Pentium II, III
+ x86/p6/p3mmx Intel Pentium III
+ x86/k6 AMD K6, K6-2, K6-3
+ x86/k6/mmx
+ x86/k6/k62mmx AMD K6-2
+ x86/k7 AMD Athlon
+ x86/k7/mmx
+
+
+The x86 directory is also the main support for P6 at the moment, and
+is something of a blended style, meant to be reasonable on all x86s.
+
+
+
+STATUS
+
+The code is well-optimized for AMD and Intel chips, but not so well
+optimized for Cyrix chips.
+
+
+
+RELEVANT OPTIMIZATION ISSUES
+
+For implementations with slow double shift instructions (SHLD and
+SHRD), it might be better to mimic their operation with SHL+SHR+OR.
+(M2 is likely to benefit from that, but not Pentium due to its slow
+plain SHL and SHR.)
diff --git a/rts/gmp/mpn/x86/README.family b/rts/gmp/mpn/x86/README.family
new file mode 100644
index 0000000000..3bc73f58b0
--- /dev/null
+++ b/rts/gmp/mpn/x86/README.family
@@ -0,0 +1,333 @@
+
+ X86 CPU FAMILY MPN SUBROUTINES
+
+
+This file has some notes on things common to all the x86 family code.
+
+
+
+ASM FILES
+
+The x86 .asm files are BSD style x86 assembler code, first put through m4
+for macro processing. The generic mpn/asm-defs.m4 is used, together with
+mpn/x86/x86-defs.m4. Detailed notes are in those files.
+
+The code is meant for use with GNU "gas" or a system "as". There's no
+support for assemblers that demand Intel style, and with gas freely
+available and easy to use that shouldn't be a problem.
+
+
+
+STACK FRAME
+
+m4 macros are used to define the parameters passed on the stack, and these
+act like comments on what the stack frame looks like too. For example,
+mpn_mul_1() has the following.
+
+ defframe(PARAM_MULTIPLIER, 16)
+ defframe(PARAM_SIZE, 12)
+ defframe(PARAM_SRC, 8)
+ defframe(PARAM_DST, 4)
+
+Here PARAM_MULTIPLIER gets defined as `FRAME+16(%esp)', and the others
+similarly. The return address is at offset 0, but there's not normally any
+need to access that.
+
+FRAME is redefined as necessary through the code so it's the number of bytes
+pushed on the stack, and hence the offsets in the parameter macros stay
+correct. At the start of a routine FRAME should be zero.
+
+ deflit(`FRAME',0)
+ ...
+ deflit(`FRAME',4)
+ ...
+ deflit(`FRAME',8)
+ ...
+
+Helper macros FRAME_pushl(), FRAME_popl(), FRAME_addl_esp() and
+FRAME_subl_esp() exist to adjust FRAME for the effect of those instructions,
+and can be used instead of explicit definitions if preferred.
+defframe_pushl() is a combination FRAME_pushl() and defframe().
+
+There's generally some slackness in redefining FRAME. If new values aren't
+going to get used, then the redefinitions are omitted to keep from
+cluttering up the code. This happens for instance at the end of a routine,
+where there might be just four register pops and then a ret, so FRAME isn't
+getting used.
+
+Local variables and saved registers can be similarly defined, with negative
+offsets representing stack space below the initial stack pointer. For
+example,
+
+ defframe(SAVE_ESI, -4)
+ defframe(SAVE_EDI, -8)
+ defframe(VAR_COUNTER,-12)
+
+ deflit(STACK_SPACE, 12)
+
+Here STACK_SPACE gets used in a "subl $STACK_SPACE, %esp" to allocate the
+space, and that instruction must be followed by a redefinition of FRAME
+(setting it equal to STACK_SPACE) to reflect the change in %esp.
+
+Definitions for pushed registers are only put in when they're going to be
+used. If registers are just saved and restored with pushes and pops then
+definitions aren't made.
+
+
+
+ASSEMBLER EXPRESSIONS
+
+Only addition and subtraction seem to be universally available, certainly
+that's all the Solaris 8 "as" seems to accept. If expressions are wanted
+then m4 eval() should be used.
+
+In particular note that a "/" anywhere in a line starts a comment in Solaris
+"as", and in some configurations of gas too.
+
+ addl $32/2, %eax <-- wrong
+
+ addl $eval(32/2), %eax <-- right
+
+Binutils gas/config/tc-i386.c has a choice between "/" being a comment
+anywhere in a line, or only at the start. FreeBSD patches 2.9.1 to select
+the latter, and as of 2.9.5 it's the default for GNU/Linux too.
+
+
+
+ASSEMBLER COMMENTS
+
+Solaris "as" doesn't support "#" commenting, using /* */ instead,
+unfortunately. For that reason "C" commenting is used (see asm-defs.m4) and
+the intermediate ".s" files have no comments.
+
+
+
+ZERO DISPLACEMENTS
+
+In a couple of places addressing modes like 0(%ebx) with a byte-sized zero
+displacement are wanted, rather than (%ebx) with no displacement. These are
+either for computed jumps or to get desirable code alignment. Explicit
+.byte sequences are used to ensure the assembler doesn't turn 0(%ebx) into
+(%ebx). The Zdisp() macro in x86-defs.m4 is used for this.
+
+Current gas 2.9.5 or recent 2.9.1 leave 0(%ebx) as written, but old gas
+1.92.3 changes it. In general changing would be the sort of "optimization"
+an assembler might perform, hence explicit ".byte"s are used where
+necessary.
+
+
+
+SHLD/SHRD INSTRUCTIONS
+
+The %cl count forms of double shift instructions like "shldl %cl,%eax,%ebx"
+must be written "shldl %eax,%ebx" for some assemblers. gas takes either,
+Solaris "as" doesn't allow %cl, gcc generates %cl for gas and NeXT (which is
+gas), and omits %cl elsewhere.
+
+For GMP an autoconf test is used to determine whether %cl should be used and
+the macros shldl, shrdl, shldw and shrdw in mpn/x86/x86-defs.m4 then pass
+through or omit %cl as necessary. See comments with those macros for usage.
+
+
+
+DIRECTION FLAG
+
+The x86 calling conventions say that the direction flag should be clear at
+function entry and exit. (See iBCS2 and SVR4 ABI books, references below.)
+
+Although this has been so since the year dot, it's not absolutely clear
+whether it's universally respected. Since it's better to be safe than
+sorry, gmp follows glibc and does a "cld" if it depends on the direction
+flag being clear. This happens only in a few places.
+
+
+
+POSITION INDEPENDENT CODE
+
+Defining the symbol PIC in m4 processing selects position independent code.
+This mainly affects computed jumps, and these are implemented in a
+self-contained fashion (without using the global offset table). The few
+calls from assembly code to global functions use the normal procedure
+linkage table.
+
+PIC is necessary for ELF shared libraries because they can be mapped into
+different processes at different virtual addresses. Text relocations in
+shared libraries are allowed, but that presumably means a page with such a
+relocation isn't shared. The use of the PLT for PIC adds a fixed cost to
+every function call, which is small but might be noticeable when working with
+small operands.
+
+Calls from one library function to another don't need to go through the PLT,
+since of course the call instruction uses a displacement, not an absolute
+address, and the relative locations of object files are known when libgmp.so
+is created. "ld -Bsymbolic" (or "gcc -Wl,-Bsymbolic") will resolve calls
+this way, so that there's no jump through the PLT, but of course leaving
+setups of the GOT address in %ebx that may be unnecessary.
+
+The %ebx setup could be avoided in assembly if a separate option controlled
+PIC for calls as opposed to computed jumps etc. But there's only ever
+likely to be a handful of calls out of assembler, and getting the same
+optimization for C intra-library calls would be more important. There seems
+no easy way to tell gcc that certain functions can be called non-PIC, and
+unfortunately many gmp functions use the global memory allocation variables,
+so they need the GOT anyway. Object files with no global data references
+and only intra-library calls could go into the library as non-PIC under
+-Bsymbolic. Integrating this into libtool and automake is left as an
+exercise for the reader.
+
+
+
+SIMPLE LOOPS
+
+The overheads in setting up for an unrolled loop can mean that at small
+sizes a simple loop is faster. Making small sizes go fast is important,
+even if it adds a cycle or two to bigger sizes. To this end various
+routines choose between a simple loop and an unrolled loop according to
+operand size. The path to the simple loop, or to special case code for
+small sizes, is always as fast as possible.
+
+Adding a simple loop requires a conditional jump to choose between the
+simple and unrolled code. The size of a branch misprediction penalty
+affects whether a simple loop is worthwhile.
+
+The convention is for an m4 definition UNROLL_THRESHOLD to set the crossover
+point, with sizes < UNROLL_THRESHOLD using the simple loop, sizes >=
+UNROLL_THRESHOLD using the unrolled loop. If position independent code adds
+a couple of cycles to an unrolled loop setup, the threshold will vary with
+PIC or non-PIC. Something like the following is typical.
+
+ ifdef(`PIC',`
+ deflit(UNROLL_THRESHOLD, 10)
+ ',`
+ deflit(UNROLL_THRESHOLD, 8)
+ ')
+
+There's no automated way to determine the threshold. Setting it to a small
+value and then to a big value makes it possible to measure the simple and
+unrolled loops each over a range of sizes, from which the crossover point
+can be determined. Alternately, just adjust the threshold up or down until
+there are no more speedups.
+
+
+
+UNROLLED LOOP CODING
+
+The x86 addressing modes allow a byte displacement of -128 to +127, making
+it possible to access 256 bytes, which is 64 limbs, without adjusting
+pointer registers within the loop. Dword sized displacements can be used
+too, but they increase code size, and unrolling to 64 ought to be enough.
+
+When unrolling to the full 64 limbs/loop, the limb at the top of the loop
+will have a displacement of -128, so pointers have to have a corresponding
++128 added before entering the loop. When unrolling to 32 limbs/loop
+displacements 0 to 127 can be used with 0 at the top of the loop and no
+adjustment needed to the pointers.
+
+Where 64 limbs/loop is supported, the +128 adjustment is done only when 64
+limbs/loop is selected. Usually the gain in speed using 64 instead of 32 or
+16 is small, so support for 64 limbs/loop is generally only for comparison.
+
+
+
+COMPUTED JUMPS
+
+When working from least significant limb to most significant limb (most
+routines) the computed jump and pointer calculations in preparation for an
+unrolled loop are as follows.
+
+ S = operand size in limbs
+ N = number of limbs per loop (UNROLL_COUNT)
+ L = log2 of unrolling (UNROLL_LOG2)
+ M = mask for unrolling (UNROLL_MASK)
+ C = code bytes per limb in the loop
+ B = bytes per limb (4 for x86)
+
+ computed jump (-S & M) * C + entrypoint
+ subtract from pointers (-S & M) * B
+ initial loop counter (S-1) >> L
+ displacements 0 to B*(N-1)
+
+The loop counter is decremented at the end of each loop, and the looping
+stops when the decrement takes the counter to -1. The displacements are for
+the addressing accessing each limb, eg. a load with "movl disp(%ebx), %eax".
+
+Usually the multiply by "C" can be handled without an imul, using instead an
+leal, or a shift and subtract.
+
+When working from most significant to least significant limb (eg. mpn_lshift
+and mpn_copyd), the calculations change as follows.
+
+ add to pointers (-S & M) * B
+ displacements 0 to -B*(N-1)
+
+
+
+OLD GAS 1.92.3
+
+This version comes with FreeBSD 2.2.8 and has a couple of gremlins that
+affect gmp code.
+
+Firstly, an expression involving two forward references to labels comes out
+as zero. For example,
+
+ addl $bar-foo, %eax
+ foo:
+ nop
+ bar:
+
+This should lead to "addl $1, %eax", but it comes out as "addl $0, %eax".
+When only one forward reference is involved, it works correctly, as for
+example,
+
+ foo:
+ addl $bar-foo, %eax
+ nop
+ bar:
+
+Secondly, an expression involving two labels can't be used as the
+displacement for an leal. For example,
+
+ foo:
+ nop
+ bar:
+ leal bar-foo(%eax,%ebx,8), %ecx
+
+A slightly cryptic error is given, "Unimplemented segment type 0 in
+parse_operand". When only one label is used it's ok, and the label can be a
+forward reference too, as for example,
+
+ leal foo(%eax,%ebx,8), %ecx
+ nop
+ foo:
+
+These problems only affect PIC computed jump calculations. The workarounds
+are just to do an leal without a displacement and then an addl, and to make
+sure the code is placed so that there's at most one forward reference in the
+addl.
+
+
+
+REFERENCES
+
+"Intel Architecture Software Developer's Manual", volumes 1 to 3, 1999,
+order numbers 243190, 243191 and 243192. Available on-line,
+
+ ftp://download.intel.com/design/PentiumII/manuals/243190.htm
+ ftp://download.intel.com/design/PentiumII/manuals/243191.htm
+ ftp://download.intel.com/design/PentiumII/manuals/243192.htm
+
+"Intel386 Family Binary Compatibility Specification 2", Intel Corporation,
+published by McGraw-Hill, 1991, ISBN 0-07-031219-2.
+
+"System V Application Binary Interface", Unix System Laboratories Inc, 1992,
+published by Prentice Hall, ISBN 0-13-880410-9. And the "Intel386 Processor
+Supplement", AT&T, 1991, ISBN 0-13-877689-X. (These have details of ELF
+shared library PIC coding.)
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/mpn/x86/addsub_n.S b/rts/gmp/mpn/x86/addsub_n.S
new file mode 100644
index 0000000000..fe6f648f53
--- /dev/null
+++ b/rts/gmp/mpn/x86/addsub_n.S
@@ -0,0 +1,174 @@
+/* Currently not working and not used. */
+
+/*
+Copyright (C) 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+
+#define SAVE_BORROW_RESTORE_CARRY(r) adcl r,r; shll $31,r
+#define SAVE_CARRY_RESTORE_BORROW(r) adcl r,r
+
+ .globl mpn_addsub_n_0
+ .globl mpn_addsub_n_1
+
+/* Cute i386/i486/p6 addsub loop for the "full overlap" case r1==s2,r2==s1.
+ We let subtraction and addition alternate in being two limbs
+ ahead of the other, thereby avoiding some SAVE_RESTORE. */
+// r1 = r2 + r1 edi = esi + edi
+// r2 = r2 - r1 esi = esi - edi
+// s1 s2
+// r2 r1
+// eax,ebx,ecx,edx,esi,edi,ebp
+mpn_addsub_n_0:
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+
+ movl 20(%esp),%edi /* res_ptr */
+ movl 24(%esp),%esi /* s1_ptr */
+ movl 36(%esp),%ebp /* size */
+
+ shrl $2,%ebp
+ xorl %edx,%edx
+ .align 4
+Loop0: // L=load E=execute S=store
+ movl (%esi),%ebx // sub 0 L
+ movl 4(%esi),%ecx // sub 1 L
+ sbbl (%edi),%ebx // sub 0 LE
+ sbbl 4(%edi),%ecx // sub 1 LE
+// SAVE_BORROW_RESTORE_CARRY(%edx)
+ movl (%esi),%eax // add 0 L
+ adcl %eax,(%edi) // add 0 LES
+ movl 4(%esi),%eax // add 1 L
+ adcl %eax,4(%edi) // add 1 LES
+ movl %ebx,(%esi) // sub 0 S
+ movl %ecx,4(%esi) // sub 1 S
+ movl 8(%esi),%ebx // add 2 L
+ adcl 8(%edi),%ebx // add 2 LE
+ movl 12(%esi),%ecx // add 3 L
+ adcl 12(%edi),%ecx // add 3 LE
+// SAVE_CARRY_RESTORE_BORROW(%edx)
+ movl 8(%edi),%eax // sub 2 L
+ sbbl %eax,8(%esi) // sub 2 LES
+ movl 12(%edi),%eax // sub 3 L
+ sbbl %eax,12(%esi) // sub 3 LES
+ movl %ebx,8(%edi) // add 2 S
+ movl %ecx,12(%edi) // add 3 S
+ leal 16(%esi),%esi
+ leal 16(%edi),%edi
+ decl %ebp
+ jnz Loop0
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+/* Cute i386/i486/p6 addsub loop for the "full overlap" case r1==s1,r2==s2.
+ We let subtraction and addition alternate in being two limbs
+ ahead of the other, thereby avoiding some SAVE_RESTORE. */
+// r1 = r1 + r2 edi = edi + esi
+// r2 = r1 - r2 esi = edi - esi
+// s2 s1
+// r2 r1
+// eax,ebx,ecx,edx,esi,edi,ebp
+mpn_addsub_n_1:
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+
+ movl 20(%esp),%edi /* res_ptr */
+ movl 24(%esp),%esi /* s1_ptr */
+ movl 36(%esp),%ebp /* size */
+
+ shrl $2,%ebp
+ xorl %edx,%edx
+ .align 4
+Loop1: // L=load E=execute S=store
+ movl (%edi),%ebx // sub 0 L
+ sbbl (%esi),%ebx // sub 0 LE
+ movl 4(%edi),%ecx // sub 1 L
+ sbbl 4(%esi),%ecx // sub 1 LE
+// SAVE_BORROW_RESTORE_CARRY(%edx)
+ movl (%esi),%eax // add 0 L
+ adcl %eax,(%edi) // add 0 LES
+ movl 4(%esi),%eax // add 1 L
+ adcl %eax,4(%edi) // add 1 LES
+ movl %ebx,(%esi) // sub 0 S
+ movl %ecx,4(%esi) // sub 1 S
+ movl 8(%esi),%ebx // add 2 L
+ adcl 8(%edi),%ebx // add 2 LE
+ movl 12(%esi),%ecx // add 3 L
+ adcl 12(%edi),%ecx // add 3 LE
+// SAVE_CARRY_RESTORE_BORROW(%edx)
+ movl 8(%edi),%eax // sub 2 L
+ sbbl 8(%esi),%eax // sub 2 LES
+ movl %eax,8(%esi) // sub 2 S
+ movl 12(%edi),%eax // sub 3 L
+ sbbl 12(%esi),%eax // sub 3 LE
+ movl %eax,12(%esi) // sub 3 S
+ movl %ebx,8(%edi) // add 2 S
+ movl %ecx,12(%edi) // add 3 S
+ leal 16(%esi),%esi
+ leal 16(%edi),%edi
+ decl %ebp
+ jnz Loop1
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+ .globl mpn_copy
+mpn_copy:
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+
+ movl 20(%esp),%edi /* res_ptr */
+ movl 24(%esp),%esi /* s1_ptr */
+ movl 28(%esp),%ebp /* size */
+
+ shrl $2,%ebp
+ .align 4
+Loop2:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl %eax,(%edi)
+ movl %ebx,4(%edi)
+ movl 8(%esi),%eax
+ movl 12(%esi),%ebx
+ movl %eax,8(%edi)
+ movl %ebx,12(%edi)
+ leal 16(%esi),%esi
+ leal 16(%edi),%edi
+ decl %ebp
+ jnz Loop2
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
diff --git a/rts/gmp/mpn/x86/aors_n.asm b/rts/gmp/mpn/x86/aors_n.asm
new file mode 100644
index 0000000000..18ef816b4d
--- /dev/null
+++ b/rts/gmp/mpn/x86/aors_n.asm
@@ -0,0 +1,187 @@
+dnl x86 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_add_n',`
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+
+',`ifdef(`OPERATION_sub_n',`
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(M4_function_nc)
+deflit(`FRAME',0)
+
+ pushl %edi FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC1,%esi
+ movl PARAM_SRC2,%edx
+ movl PARAM_SIZE,%ecx
+
+ movl %ecx,%eax
+ shrl $3,%ecx C compute count for unrolled loop
+ negl %eax
+ andl $7,%eax C get index where to start loop
+ jz LF(M4_function_n,oopgo) C necessary special case for 0
+ incl %ecx C adjust loop count
+ shll $2,%eax C adjustment for pointers...
+ subl %eax,%edi C ... since they are offset ...
+ subl %eax,%esi C ... by a constant when we ...
+ subl %eax,%edx C ... enter the loop
+ shrl $2,%eax C restore previous value
+
+ifdef(`PIC',`
+ C Calculate start address in loop for PIC. Due to limitations in
+ C old gas, LF(M4_function_n,oop)-L(0a)-3 cannot be put into the leal
+ call L(0a)
+L(0a): leal (%eax,%eax,8),%eax
+ addl (%esp),%eax
+ addl $LF(M4_function_n,oop)-L(0a)-3,%eax
+ addl $4,%esp
+',`
+ C Calculate start address in loop for non-PIC.
+ leal LF(M4_function_n,oop)-3(%eax,%eax,8),%eax
+')
+
+ C These lines initialize carry from the 5th parameter. Should be
+ C possible to simplify.
+ pushl %ebp FRAME_pushl()
+ movl PARAM_CARRY,%ebp
+ shrl $1,%ebp C shift bit 0 into carry
+ popl %ebp FRAME_popl()
+
+ jmp *%eax C jump into loop
+
+EPILOGUE()
+
+
+ ALIGN(8)
+PROLOGUE(M4_function_n)
+deflit(`FRAME',0)
+
+ pushl %edi FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC1,%esi
+ movl PARAM_SRC2,%edx
+ movl PARAM_SIZE,%ecx
+
+ movl %ecx,%eax
+ shrl $3,%ecx C compute count for unrolled loop
+ negl %eax
+ andl $7,%eax C get index where to start loop
+ jz L(oop) C necessary special case for 0
+ incl %ecx C adjust loop count
+ shll $2,%eax C adjustment for pointers...
+ subl %eax,%edi C ... since they are offset ...
+ subl %eax,%esi C ... by a constant when we ...
+ subl %eax,%edx C ... enter the loop
+ shrl $2,%eax C restore previous value
+
+ifdef(`PIC',`
+ C Calculate start address in loop for PIC. Due to limitations in
+ C some assemblers, L(oop)-L(0b)-3 cannot be put into the leal
+ call L(0b)
+L(0b): leal (%eax,%eax,8),%eax
+ addl (%esp),%eax
+ addl $L(oop)-L(0b)-3,%eax
+ addl $4,%esp
+',`
+ C Calculate start address in loop for non-PIC.
+ leal L(oop)-3(%eax,%eax,8),%eax
+')
+ jmp *%eax C jump into loop
+
+L(oopgo):
+ pushl %ebp FRAME_pushl()
+ movl PARAM_CARRY,%ebp
+ shrl $1,%ebp C shift bit 0 into carry
+ popl %ebp FRAME_popl()
+
+ ALIGN(8)
+L(oop): movl (%esi),%eax
+ M4_inst (%edx),%eax
+ movl %eax,(%edi)
+ movl 4(%esi),%eax
+ M4_inst 4(%edx),%eax
+ movl %eax,4(%edi)
+ movl 8(%esi),%eax
+ M4_inst 8(%edx),%eax
+ movl %eax,8(%edi)
+ movl 12(%esi),%eax
+ M4_inst 12(%edx),%eax
+ movl %eax,12(%edi)
+ movl 16(%esi),%eax
+ M4_inst 16(%edx),%eax
+ movl %eax,16(%edi)
+ movl 20(%esi),%eax
+ M4_inst 20(%edx),%eax
+ movl %eax,20(%edi)
+ movl 24(%esi),%eax
+ M4_inst 24(%edx),%eax
+ movl %eax,24(%edi)
+ movl 28(%esi),%eax
+ M4_inst 28(%edx),%eax
+ movl %eax,28(%edi)
+ leal 32(%edi),%edi
+ leal 32(%esi),%esi
+ leal 32(%edx),%edx
+ decl %ecx
+ jnz L(oop)
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/aorsmul_1.asm b/rts/gmp/mpn/x86/aorsmul_1.asm
new file mode 100644
index 0000000000..f32ad83989
--- /dev/null
+++ b/rts/gmp/mpn/x86/aorsmul_1.asm
@@ -0,0 +1,134 @@
+dnl x86 __gmpn_addmul_1 (for 386 and 486) -- Multiply a limb vector with a
+dnl limb and add the result to a second limb vector.
+
+
+dnl Copyright (C) 1992, 1994, 1997, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_addmul_1',`
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+
+',`ifdef(`OPERATION_submul_1',`
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+
+define(PARAM_MULTIPLIER, `FRAME+16(%esp)')
+define(PARAM_SIZE, `FRAME+12(%esp)')
+define(PARAM_SRC, `FRAME+8(%esp)')
+define(PARAM_DST, `FRAME+4(%esp)')
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(M4_function_1)
+deflit(`FRAME',0)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%ecx
+
+ xorl %ebx,%ebx
+ andl $3,%ecx
+ jz L(end0)
+
+L(oop0):
+ movl (%esi),%eax
+ mull PARAM_MULTIPLIER
+ leal 4(%esi),%esi
+ addl %ebx,%eax
+ movl $0,%ebx
+ adcl %ebx,%edx
+ M4_inst %eax,(%edi)
+ adcl %edx,%ebx C propagate carry into cylimb
+
+ leal 4(%edi),%edi
+ decl %ecx
+ jnz L(oop0)
+
+L(end0):
+ movl PARAM_SIZE,%ecx
+ shrl $2,%ecx
+ jz L(end)
+
+ ALIGN(8)
+L(oop): movl (%esi),%eax
+ mull PARAM_MULTIPLIER
+ addl %eax,%ebx
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 4(%esi),%eax
+ mull PARAM_MULTIPLIER
+ M4_inst %ebx,(%edi)
+ adcl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ movl 8(%esi),%eax
+ mull PARAM_MULTIPLIER
+ M4_inst %ebp,4(%edi)
+ adcl %eax,%ebx C new lo + cylimb
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 12(%esi),%eax
+ mull PARAM_MULTIPLIER
+ M4_inst %ebx,8(%edi)
+ adcl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ M4_inst %ebp,12(%edi)
+ adcl $0,%ebx C propagate carry into cylimb
+
+ leal 16(%esi),%esi
+ leal 16(%edi),%edi
+ decl %ecx
+ jnz L(oop)
+
+L(end): movl %ebx,%eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/copyd.asm b/rts/gmp/mpn/x86/copyd.asm
new file mode 100644
index 0000000000..439640e836
--- /dev/null
+++ b/rts/gmp/mpn/x86/copyd.asm
@@ -0,0 +1,80 @@
+dnl x86 mpn_copyd -- copy limb vector, decrementing.
+dnl
+dnl Future: On P6 an MMX loop should be able to go faster than this code.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size, working from high to low addresses.
+C
+C The code here is very generic and can be expected to be reasonable on all
+C the x86 family.
+C
+C P5 - 1.0 cycles/limb.
+C
+C P6 - 2.4 cycles/limb, approx 40 cycles startup.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_copyd)
+ C eax saved esi
+ C ebx
+ C ecx counter
+ C edx saved edi
+ C esi src
+ C edi dst
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+ movl %esi, %eax
+
+ movl PARAM_SRC, %esi
+ movl %edi, %edx
+
+ movl PARAM_DST, %edi
+ leal -4(%esi,%ecx,4), %esi
+
+ leal -4(%edi,%ecx,4), %edi
+
+ std
+
+ rep
+ movsl
+
+ cld
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/copyi.asm b/rts/gmp/mpn/x86/copyi.asm
new file mode 100644
index 0000000000..5bc4e36689
--- /dev/null
+++ b/rts/gmp/mpn/x86/copyi.asm
@@ -0,0 +1,79 @@
+dnl x86 mpn_copyi -- copy limb vector, incrementing.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size, working from low to high addresses.
+C
+C The code here is very generic and can be expected to be reasonable on all
+C the x86 family.
+C
+C P5 - 1.0 cycles/limb.
+C
+C P6 - 0.75 cycles/limb. An MMX based copy was tried, but was found to be
+C slower than a rep movs in all cases. The fastest MMX found was 0.8
+C cycles/limb (when fully aligned). A rep movs seems to have a startup
+C time of about 15 cycles, but doing something special for small sizes
+C could lead to a branch misprediction that would destroy any saving.
+C For now a plain rep movs seems ok for P6.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+
+ C eax saved esi
+ C ebx
+ C ecx counter
+ C edx saved edi
+ C esi src
+ C edi dst
+ C ebp
+
+PROLOGUE(mpn_copyi)
+
+ movl PARAM_SIZE, %ecx
+ movl %esi, %eax
+
+ movl PARAM_SRC, %esi
+ movl %edi, %edx
+
+ movl PARAM_DST, %edi
+
+ cld C better safe than sorry, see mpn/x86/README.family
+
+ rep
+ movsl
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/diveby3.asm b/rts/gmp/mpn/x86/diveby3.asm
new file mode 100644
index 0000000000..df879da9e1
--- /dev/null
+++ b/rts/gmp/mpn/x86/diveby3.asm
@@ -0,0 +1,115 @@
+dnl x86 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl The following all have their own optimized versions of this routine,
+dnl but for reference the code here runs as follows.
+dnl
+dnl cycles/limb
+dnl P54 18.0
+dnl P55 17.0
+dnl P6 14.5
+dnl K6 14.0
+dnl K7 10.0
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t carry);
+
+defframe(PARAM_CARRY,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl multiplicative inverse of 3, modulo 2^32
+deflit(INVERSE_3, 0xAAAAAAAB)
+
+dnl ceil(b/3) and ceil(b*2/3) where b=2^32
+deflit(ONE_THIRD_CEIL, 0x55555556)
+deflit(TWO_THIRDS_CEIL, 0xAAAAAAAB)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_divexact_by3c)
+deflit(`FRAME',0)
+
+ movl PARAM_SRC, %ecx
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_SIZE, %ebp
+ pushl %edi FRAME_pushl()
+
+ movl PARAM_DST, %edi
+ pushl %esi FRAME_pushl()
+
+ movl $INVERSE_3, %esi
+ pushl %ebx FRAME_pushl()
+
+ leal (%ecx,%ebp,4), %ecx
+ movl PARAM_CARRY, %ebx
+
+ leal (%edi,%ebp,4), %edi
+ negl %ebp
+
+
+ ALIGN(8)
+L(top):
+ C eax scratch, low product
+ C ebx carry limb (0 to 3)
+ C ecx &src[size]
+ C edx scratch, high product
+ C esi multiplier
+ C edi &dst[size]
+ C ebp counter, limbs, negative
+
+ movl (%ecx,%ebp,4), %eax
+
+ subl %ebx, %eax
+
+ setc %bl
+
+ imull %esi
+
+ cmpl $ONE_THIRD_CEIL, %eax
+ movl %eax, (%edi,%ebp,4)
+
+ sbbl $-1, %ebx C +1 if eax>=ceil(b/3)
+ cmpl $TWO_THIRDS_CEIL, %eax
+
+ sbbl $-1, %ebx C +1 if eax>=ceil(b*2/3)
+ incl %ebp
+
+ jnz L(top)
+
+
+ movl %ebx, %eax
+ popl %ebx
+ popl %esi
+ popl %edi
+ popl %ebp
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/divrem_1.asm b/rts/gmp/mpn/x86/divrem_1.asm
new file mode 100644
index 0000000000..12f14676d6
--- /dev/null
+++ b/rts/gmp/mpn/x86/divrem_1.asm
@@ -0,0 +1,232 @@
+dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl cycles/limb
+dnl K6 20
+dnl P5 44
+dnl P6 39
+dnl 486 approx 43 maybe
+dnl
+dnl
+dnl The following have their own optimized divrem_1 implementations, but
+dnl for reference the code here runs as follows.
+dnl
+dnl cycles/limb
+dnl P6MMX 39
+dnl K7 42
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C
+C Divide src,size by divisor and store the quotient in dst+xsize,size.
+C Extend the division to fractional quotient limbs in dst,xsize. Return the
+C remainder. Either or both xsize and size can be 0.
+C
+C mpn_divrem_1c takes a carry parameter which is an initial high limb,
+C effectively one extra limb at the top of src,size. Must have
+C carry<divisor.
+C
+C
+C Essentially the code is the same as the division based part of
+C mpn/generic/divrem_1.c, but has the following advantages.
+C
+C - If gcc isn't being used then divrem_1.c will get the generic C
+C udiv_qrnnd() and be rather slow.
+C
+C - On K6, using the loop instruction is a 10% speedup, but gcc doesn't
+C generate that instruction (as of gcc 2.95.2 at least).
+C
+C A test is done to see if the high limb is less than the divisor, and if so
+C one less div is done. A div is between 20 and 40 cycles on the various
+C x86s, so assuming high<divisor about half the time, then this test saves
+C half that amount. The branch misprediction penalty on each chip is less
+C than half a div.
+C
+C
+C K6: Back-to-back div instructions run at 20 cycles, the same as the loop
+C here, so it seems there's nothing to gain by rearranging the loop.
+C Pairing the mov and loop instructions was found to gain nothing. (The
+C same is true of the mpn/x86/mod_1.asm loop.)
+C
+C With a "decl/jnz" rather than a "loop" this code runs at 22 cycles.
+C The loop_or_decljnz macro is an easy way to get a 10% speedup.
+C
+C The fast K6 multiply might be thought to suit a multiply-by-inverse,
+C but that algorithm has been found to suffer from the relatively poor
+C carry handling on K6 and too many auxiliary instructions. The
+C fractional part however could be done at about 13 c/l.
+C
+C P5: Moving the load down to pair with the store might save 1 cycle, but
+C that doesn't seem worth bothering with, since it'd be only a 2.2%
+C saving.
+C
+C Again here the auxiliary instructions hinder a multiply-by-inverse,
+C though there might be a 10-15% speedup available
+
+
+defframe(PARAM_CARRY, 24)
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC, 12)
+defframe(PARAM_XSIZE, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(16)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %edi FRAME_pushl()
+
+ movl PARAM_SRC, %edi
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_DIVISOR, %esi
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_DST, %ebx
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_XSIZE, %ebp
+ orl %ecx, %ecx
+
+ movl PARAM_CARRY, %edx
+ jz LF(mpn_divrem_1,fraction)
+
+ leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
+ jmp LF(mpn_divrem_1,integer_top)
+
+EPILOGUE()
+
+
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %edi FRAME_pushl()
+
+ movl PARAM_SRC, %edi
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_DIVISOR, %esi
+ orl %ecx,%ecx
+
+ jz L(size_zero)
+ pushl %ebx FRAME_pushl()
+
+ movl -4(%edi,%ecx,4), %eax C src high limb
+ xorl %edx, %edx
+
+ movl PARAM_DST, %ebx
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_XSIZE, %ebp
+ cmpl %esi, %eax
+
+ leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
+ jae L(integer_entry)
+
+
+ C high<divisor, so high of dst is zero, and avoid one div
+
+ movl %edx, (%ebx,%ecx,4)
+ decl %ecx
+
+ movl %eax, %edx
+ jz L(fraction)
+
+
+L(integer_top):
+ C eax scratch (quotient)
+ C ebx dst+4*xsize-4
+ C ecx counter
+ C edx scratch (remainder)
+ C esi divisor
+ C edi src
+ C ebp xsize
+
+ movl -4(%edi,%ecx,4), %eax
+L(integer_entry):
+
+ divl %esi
+
+ movl %eax, (%ebx,%ecx,4)
+ loop_or_decljnz L(integer_top)
+
+
+L(fraction):
+ orl %ebp, %ecx
+ jz L(done)
+
+ movl PARAM_DST, %ebx
+
+
+L(fraction_top):
+ C eax scratch (quotient)
+ C ebx dst
+ C ecx counter
+ C edx scratch (remainder)
+ C esi divisor
+ C edi
+ C ebp
+
+ xorl %eax, %eax
+
+ divl %esi
+
+ movl %eax, -4(%ebx,%ecx,4)
+ loop_or_decljnz L(fraction_top)
+
+
+L(done):
+ popl %ebp
+ movl %edx, %eax
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+
+L(size_zero):
+deflit(`FRAME',8)
+ movl PARAM_XSIZE, %ecx
+ xorl %eax, %eax
+
+ movl PARAM_DST, %edi
+
+ cld C better safe than sorry, see mpn/x86/README.family
+
+ rep
+ stosl
+
+ popl %esi
+ popl %edi
+ ret
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/README b/rts/gmp/mpn/x86/k6/README
new file mode 100644
index 0000000000..3ad96c8b89
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/README
@@ -0,0 +1,237 @@
+
+ AMD K6 MPN SUBROUTINES
+
+
+
+This directory contains code optimized for AMD K6 CPUs, meaning K6, K6-2 and
+K6-3.
+
+The mmx and k62mmx subdirectories have routines using MMX instructions. All
+K6s have MMX, the separate directories are just so that ./configure can omit
+them if the assembler doesn't support MMX.
+
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache, are as follows.
+
+ cycles/limb
+
+ mpn_add_n/sub_n 3.25 normal, 2.75 in-place
+
+ mpn_mul_1 6.25
+ mpn_add/submul_1 7.65-8.4 (varying with data values)
+
+ mpn_mul_basecase 9.25 cycles/crossproduct (approx)
+ mpn_sqr_basecase 4.7 cycles/crossproduct (approx)
+ or 9.2 cycles/triangleproduct (approx)
+
+ mpn_divrem_1 20.0
+ mpn_mod_1 20.0
+ mpn_divexact_by3 11.0
+
+ mpn_l/rshift 3.0
+
+ mpn_copyi/copyd 1.0
+
+ mpn_com_n 1.5-1.85 \
+ mpn_and/andn/ior/xor_n 1.5-1.75 | varying with
+ mpn_iorn/xnor_n 2.0-2.25 | data alignment
+ mpn_nand/nior_n 2.0-2.25 /
+
+ mpn_popcount 12.5
+ mpn_hamdist 13.0
+
+
+K6-2 and K6-3 have dual-issue MMX and get the following improvements.
+
+ mpn_l/rshift 1.75
+
+ mpn_copyi/copyd 0.56 or 1.0 \
+ |
+ mpn_com_n 1.0-1.2 | varying with
+ mpn_and/andn/ior/xor_n 1.2-1.5 | data alignment
+ mpn_iorn/xnor_n 1.5-2.0 |
+ mpn_nand/nior_n 1.75-2.0 /
+
+ mpn_popcount 9.0
+ mpn_hamdist 11.5
+
+
+Prefetching of sources hasn't yet given any joy. With the 3DNow "prefetch"
+instruction, code seems to run slower, and with just "mov" loads it doesn't
+seem faster. Results so far are inconsistent. The K6 does a hardware
+prefetch of the second cache line in a sector, so the penalty for not
+prefetching in software is reduced.
+
+
+
+
+NOTES
+
+All K6 family chips have MMX, but only K6-2 and K6-3 have 3DNow.
+
+Plain K6 executes MMX instructions only in the X pipe, but K6-2 and K6-3 can
+execute them in both X and Y (and together).
+
+Branch misprediction penalty is 1 to 4 cycles (Optimization Manual
+chapter 6 table 12).
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+Store queue is 7 entries of 64 bits each.
+
+Floating point multiplications can be done in parallel with integer
+multiplications, but there doesn't seem to be any way to make use of this.
+
+
+
+OPTIMIZATIONS
+
+Unrolled loops are used to reduce looping overhead. The unrolling is
+configurable up to 32 limbs/loop for most routines, up to 64 for some.
+
+Sometimes computed jumps into the unrolling are used to handle sizes not a
+multiple of the unrolling. An attractive feature of this is that times
+smoothly increase with operand size, but an indirect jump is about 6 cycles
+and the setups about another 6, so it depends on how much the unrolled code
+is faster than a simple loop as to whether a computed jump ought to be used.
+
+Position independent code is implemented using a call to get eip for
+computed jumps and a ret is always done, rather than an addl $4,%esp or a
+popl, so the CPU return address branch prediction stack stays synchronised
+with the actual stack in memory. Such a call however still costs 4 to 7
+cycles.
+
+Branch prediction, in absence of any history, will guess forward jumps are
+not taken and backward jumps are taken. Where possible it's arranged that
+the less likely or less important case is under a taken forward jump.
+
+
+
+MMX
+
+Putting emms or femms as late as possible in a routine seems to be fastest.
+Perhaps an emms or femms stalls until all outstanding MMX instructions have
+completed, so putting it later gives them a chance to complete on their own,
+in parallel with other operations (like register popping).
+
+The Optimization Manual chapter 5 recommends using a femms on K6-2 and K6-3
+at the start of a routine, in case it's been preceded by x87 floating point
+operations. This isn't done because in gmp programs it's expected that x87
+floating point won't be much used and that chances are an mpn routine won't
+have been preceded by any x87 code.
+
+
+
+CODING
+
+Instructions in general code are shown paired if they can decode and execute
+together, meaning two short decode instructions with the second not
+depending on the first, only the first using the shifter, no more than one
+load, and no more than one store.
+
+K6 does some out of order execution so the pairings aren't essential, they
+just show what slots might be available. When decoding is the limiting
+factor things can be scheduled that might not execute until later.
+
+
+
+NOTES
+
+Code alignment
+
+- if an opcode/modrm or 0Fh/opcode/modrm crosses a cache line boundary,
+ short decode is inhibited. The cross.pl script detects this.
+
+- loops and branch targets should be aligned to 16 bytes, or ensure at least
+ 2 instructions before a 32 byte boundary. This makes use of the 16 byte
+ cache in the BTB.
+
+Addressing modes
+
+- (%esi) degrades decoding from short to vector. 0(%esi) doesn't have this
+ problem, and can be used as an equivalent, or easier is just to use a
+ different register, like %ebx.
+
+- K6 and pre-CXT core K6-2 have the following problem. (K6-2 CXT and K6-3
+ have it fixed, these being cpuid function 1 signatures 0x588 to 0x58F).
+
+ If more than 3 bytes are needed to determine instruction length then
+ decoding degrades from direct to long, or from long to vector. This
+ happens with forms like "0F opcode mod/rm" with mod/rm=00-xxx-100 since
+ with mod=00 the sib determines whether there's a displacement.
+
+ This affects all MMX and 3DNow instructions, and others with an 0F prefix
+ like movzbl. The modes affected are anything with an index and no
+ displacement, or an index but no base, and this includes (%esp) which is
+ really (,%esp,1).
+
+ The cross.pl script detects problem cases. The workaround is to always
+ use a displacement, and to do this with Zdisp if it's zero so the
+ assembler doesn't discard it.
+
+ See Optimization Manual rev D page 67 and 3DNow Porting Guide rev B pages
+ 13-14 and 36-37.
+
+Calls
+
+- indirect jumps and calls are not branch predicted, they measure about 6
+ cycles.
+
+Various
+
+- adcl 2 cycles of decode, maybe 2 cycles executing in the X pipe
+- bsf 12-27 cycles
+- emms 5 cycles
+- femms 3 cycles
+- jecxz 2 cycles taken, 13 not taken (optimization manual says 7 not taken)
+- divl 20 cycles back-to-back
+- imull 2 decode, 2 execute
+- mull 2 decode, 3 execute (optimization manual decoding sample)
+- prefetch 2 cycles
+- rcll/rcrl implicit by one bit: 2 cycles
+ immediate or %cl count: 11 + 2 per bit for dword
+ 13 + 4 per bit for byte
+- setCC 2 cycles
+- xchgl %eax,reg 1.5 cycles, back-to-back (strange)
+ reg,reg 2 cycles, back-to-back
+
+
+
+
+REFERENCES
+
+"AMD-K6 Processor Code Optimization Application Note", AMD publication
+number 21924, revision D amendment 0, January 2000. This describes K6-2 and
+K6-3. Available on-line,
+
+ http://www.amd.com/K6/k6docs/pdf/21924.pdf
+
+"AMD-K6 MMX Enhanced Processor x86 Code Optimization Application Note", AMD
+publication number 21828, revision A amendment 0, August 1997. This is an
+older edition of the above document, describing plain K6. Available
+on-line,
+
+ http://www.amd.com/K6/k6docs/pdf/21828.pdf
+
+"3DNow Technology Manual", AMD publication number 21928F/0-August 1999.
+This describes the femms and prefetch instructions, but nothing else from
+3DNow has been used. Available on-line,
+
+ http://www.amd.com/K6/k6docs/pdf/21928.pdf
+
+"3DNow Instruction Porting Guide", AMD publication number 22621, revision B,
+August 1999. This has some notes on general K6 optimizations as well as
+3DNow. Available on-line,
+
+ http://www.amd.com/products/cpg/athlon/techdocs/pdf/22621.pdf
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/mpn/x86/k6/aors_n.asm b/rts/gmp/mpn/x86/k6/aors_n.asm
new file mode 100644
index 0000000000..31b05ada51
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/aors_n.asm
@@ -0,0 +1,329 @@
+dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
+dnl
+dnl K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_add_n', `
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+ define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+ define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size. The return value is the carry bit from the top of the result
+C (1 or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation. Note values other than 1 or 0 here will lead to garbage
+C results.
+C
+C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
+C an in-place dst+=src to 2.5 c/l. The unrolled loops have 1 cycle/loop of
+C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
+
+define(PARAM_CARRY, `FRAME+20(%esp)')
+define(PARAM_SIZE, `FRAME+16(%esp)')
+define(PARAM_SRC2, `FRAME+12(%esp)')
+define(PARAM_SRC1, `FRAME+8(%esp)')
+define(PARAM_DST, `FRAME+4(%esp)')
+deflit(`FRAME',0)
+
+dnl minimum 5 because the unrolled code can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(M4_function_nc)
+ movl PARAM_CARRY, %eax
+ jmp LF(M4_function_n,start)
+EPILOGUE()
+
+
+PROLOGUE(M4_function_n)
+ xorl %eax, %eax
+L(start):
+ movl PARAM_SIZE, %ecx
+ pushl %ebx
+FRAME_pushl()
+
+ movl PARAM_SRC1, %ebx
+ pushl %edi
+FRAME_pushl()
+
+ movl PARAM_SRC2, %edx
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_DST, %edi
+ jae L(unroll)
+
+
+ shrl %eax C initial carry flag
+
+ C offset 0x21 here, close enough to aligned
+L(simple):
+ C eax scratch
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+ C
+ C The store to (%edi) could be done with a stosl; it'd be smaller
+ C code, but there's no speed gain and a cld would have to be added
+ C (per mpn/x86/README.family).
+
+ movl (%ebx), %eax
+ leal 4(%ebx), %ebx
+
+ M4_inst (%edx), %eax
+
+ movl %eax, (%edi)
+ leal 4(%edi), %edi
+
+ leal 4(%edx), %edx
+ loop L(simple)
+
+
+ movl $0, %eax
+ popl %edi
+
+ setc %al
+
+ popl %ebx
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(unroll):
+ C eax carry
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+
+ cmpl %edi, %ebx
+ pushl %esi
+
+ je L(inplace)
+
+ifdef(`OPERATION_add_n',`
+ cmpl %edi, %edx
+
+ je L(inplace_reverse)
+')
+
+ movl %ecx, %esi
+
+ andl $-4, %ecx
+ andl $3, %esi
+
+ leal (%ebx,%ecx,4), %ebx
+ leal (%edx,%ecx,4), %edx
+ leal (%edi,%ecx,4), %edi
+
+ negl %ecx
+ shrl %eax
+
+ ALIGN(32)
+L(normal_top):
+ C eax counter, qwords, negative
+ C ebx src1
+ C ecx scratch
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+
+ movl (%ebx,%ecx,4), %eax
+ leal 5(%ecx), %ecx
+ M4_inst -20(%edx,%ecx,4), %eax
+ movl %eax, -20(%edi,%ecx,4)
+
+ movl 4-20(%ebx,%ecx,4), %eax
+ M4_inst 4-20(%edx,%ecx,4), %eax
+ movl %eax, 4-20(%edi,%ecx,4)
+
+ movl 8-20(%ebx,%ecx,4), %eax
+ M4_inst 8-20(%edx,%ecx,4), %eax
+ movl %eax, 8-20(%edi,%ecx,4)
+
+ movl 12-20(%ebx,%ecx,4), %eax
+ M4_inst 12-20(%edx,%ecx,4), %eax
+ movl %eax, 12-20(%edi,%ecx,4)
+
+ loop L(normal_top)
+
+
+ decl %esi
+ jz L(normal_finish_one)
+ js L(normal_done)
+
+ C two or three more limbs
+
+ movl (%ebx), %eax
+ M4_inst (%edx), %eax
+ movl %eax, (%edi)
+
+ movl 4(%ebx), %eax
+ M4_inst 4(%edx), %eax
+ decl %esi
+ movl %eax, 4(%edi)
+
+ jz L(normal_done)
+ movl $2, %ecx
+
+L(normal_finish_one):
+ movl (%ebx,%ecx,4), %eax
+ M4_inst (%edx,%ecx,4), %eax
+ movl %eax, (%edi,%ecx,4)
+
+L(normal_done):
+ popl %esi
+ popl %edi
+
+ movl $0, %eax
+ popl %ebx
+
+ setc %al
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+
+ifdef(`OPERATION_add_n',`
+L(inplace_reverse):
+ C dst==src2
+
+ movl %ebx, %edx
+')
+
+L(inplace):
+ C eax initial carry
+ C ebx
+ C ecx size
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ leal -1(%ecx), %esi
+ decl %ecx
+
+ andl $-4, %ecx
+ andl $3, %esi
+
+ movl (%edx), %ebx C src low limb
+ leal (%edx,%ecx,4), %edx
+
+ leal (%edi,%ecx,4), %edi
+ negl %ecx
+
+ shrl %eax
+
+
+ ALIGN(32)
+L(inplace_top):
+ C eax
+ C ebx next src limb
+ C ecx size
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ M4_inst %ebx, (%edi,%ecx,4)
+
+ movl 4(%edx,%ecx,4), %eax
+ leal 5(%ecx), %ecx
+
+ M4_inst %eax, 4-20(%edi,%ecx,4)
+
+ movl 8-20(%edx,%ecx,4), %eax
+ movl 12-20(%edx,%ecx,4), %ebx
+
+ M4_inst %eax, 8-20(%edi,%ecx,4)
+ M4_inst %ebx, 12-20(%edi,%ecx,4)
+
+ movl 16-20(%edx,%ecx,4), %ebx
+ loop L(inplace_top)
+
+
+ C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
+
+ M4_inst %ebx, (%edi)
+
+ decl %esi
+ jz L(inplace_finish_one)
+ js L(inplace_done)
+
+ C two or three more limbs
+
+ movl 4(%edx), %eax
+ movl 8(%edx), %ebx
+ M4_inst %eax, 4(%edi)
+ M4_inst %ebx, 8(%edi)
+
+ decl %esi
+ movl $2, %ecx
+
+ jz L(normal_done)
+
+L(inplace_finish_one):
+ movl 4(%edx,%ecx,4), %eax
+ M4_inst %eax, 4(%edi,%ecx,4)
+
+L(inplace_done):
+ popl %esi
+ popl %edi
+
+ movl $0, %eax
+ popl %ebx
+
+ setc %al
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/aorsmul_1.asm b/rts/gmp/mpn/x86/k6/aorsmul_1.asm
new file mode 100644
index 0000000000..da4120fe2f
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/aorsmul_1.asm
@@ -0,0 +1,372 @@
+dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+dnl
+dnl K6: 7.65 to 8.5 cycles/limb (at 16 limbs/loop and depending on the data),
+dnl PIC adds about 6 cycles at the start.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K6: large multipliers small multipliers
+dnl UNROLL_COUNT cycles/limb cycles/limb
+dnl 4 9.5 7.78
+dnl 8 9.0 7.78
+dnl 16 8.4 7.65
+dnl 32 8.4 8.2
+dnl
+dnl Maximum possible unrolling with the current code is 32.
+dnl
+dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
+dnl byte block, which might explain the good speed at that unrolling.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1', `
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+ define(M4_function_1c, mpn_addmul_1c)
+ define(M4_description, add it to)
+ define(M4_desc_retval, carry)
+',`ifdef(`OPERATION_submul_1', `
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+ define(M4_function_1c, mpn_submul_1c)
+ define(M4_description, subtract it from)
+ define(M4_desc_retval, borrow)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C Calculate src,size multiplied by mult and M4_description dst,size.
+C Return the M4_desc_retval limb from the top of the result.
+C
+C The jadcl0()s in the unrolled loop makes the speed data dependent. Small
+C multipliers (most significant few bits clear) result in few carry bits and
+C speeds up to 7.65 cycles/limb are attained. Large multipliers (most
+C significant few bits set) make the carry bits 50/50 and lead to something
+C more like 8.4 c/l. (With adcl's both of these would be 9.3 c/l.)
+C
+C It's important that the gains for jadcl0 on small multipliers don't come
+C at the cost of slowing down other data. Tests on uniformly distributed
+C random data, designed to confound branch prediction, show about a 7%
+C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
+C overheads included).
+C
+C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
+C 11.0 cycles/limb), and hence isn't used.
+C
+C In the simple loop, note that running ecx from negative to zero and using
+C it as an index in the two movs wouldn't help. It would save one
+C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
+C that would be collapsed by this.
+C
+C
+C jadcl0
+C ------
+C
+C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
+C firstly the instruction decoding and secondly the fact that there's a
+C carry bit for the jadcl0 only on average about 1/4 of the time.
+C
+C The code in the unrolled loop decodes something like the following.
+C
+C decode cycles
+C mull %ebp 2
+C M4_inst %esi, disp(%edi) 1
+C adcl %eax, %ecx 2
+C movl %edx, %esi \ 1
+C jnc 1f /
+C incl %esi \ 1
+C 1: movl disp(%ebx), %eax /
+C ---
+C 7
+C
+C In a back-to-back style test this measures 7 with the jnc not taken, or 8
+C with it taken (both when correctly predicted). This is opposite to the
+C measurements showing small multipliers running faster than large ones.
+C Watch this space for more info ...
+C
+C It's not clear how much branch misprediction might be costing. The K6
+C doco says it will be 1 to 4 cycles, but presumably it's near the low end
+C of that range to get the measured results.
+C
+C
+C In the code the two carries are more or less the preceding mul product and
+C the calculation is roughly
+C
+C x*y + u*b+v
+C
+C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
+C v are the two limbs it's added to (being the low of the next mul, and a
+C limb from the destination).
+C
+C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
+C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
+C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0
+C and b-1, then the total probability can be summed over x and y,
+C
+C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1)
+C --- * sum sum --- = --- * ------- * ------- = 1/4
+C b^2 x=0 y=1 b^2 b^4 2 2
+C
+C Actually it's a very tiny bit less than 1/4 of course. If y is fixed,
+C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
+
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 9)
+',`
+deflit(UNROLL_THRESHOLD, 6)
+')
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(M4_function_1c)
+ pushl %esi
+deflit(`FRAME',4)
+ movl PARAM_CARRY, %esi
+ jmp LF(M4_function_1,start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function_1)
+ push %esi
+deflit(`FRAME',4)
+ xorl %esi, %esi C initial carry
+
+L(start_nc):
+ movl PARAM_SIZE, %ecx
+ pushl %ebx
+deflit(`FRAME',8)
+
+ movl PARAM_SRC, %ebx
+ pushl %edi
+deflit(`FRAME',12)
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_DST, %edi
+
+ pushl %ebp
+deflit(`FRAME',16)
+ jae L(unroll)
+
+
+ C simple loop
+
+ movl PARAM_MULTIPLIER, %ebp
+
+L(simple):
+ C eax scratch
+ C ebx src
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi dst
+ C ebp multiplier
+
+ movl (%ebx), %eax
+ addl $4, %ebx
+
+ mull %ebp
+
+ addl $4, %edi
+ addl %esi, %eax
+
+ adcl $0, %edx
+
+ M4_inst %eax, -4(%edi)
+
+ adcl $0, %edx
+
+ movl %edx, %esi
+ loop L(simple)
+
+
+ popl %ebp
+ popl %edi
+
+ popl %ebx
+ movl %esi, %eax
+
+ popl %esi
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop
+C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
+C For the computed jump an odd size means they start one way around, an even
+C size the other.
+C
+C VAR_JUMP holds the computed jump temporarily because there's not enough
+C registers at the point of doing the mul for the initial two carry limbs.
+C
+C The add/adc for the initial carry in %esi is necessary only for the
+C mpn_addmul/submul_1c entry points. Duplicating the startup code to
+C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
+C idea.
+
+dnl overlapping with parameters already fetched
+define(VAR_COUNTER, `PARAM_SIZE')
+define(VAR_JUMP, `PARAM_DST')
+
+L(unroll):
+ C eax
+ C ebx src
+ C ecx size
+ C edx
+ C esi initial carry
+ C edi dst
+ C ebp
+
+ movl %ecx, %edx
+ decl %ecx
+
+ subl $2, %edx
+ negl %ecx
+
+ shrl $UNROLL_LOG2, %edx
+ andl $UNROLL_MASK, %ecx
+
+ movl %edx, VAR_COUNTER
+ movl %ecx, %edx
+
+ shll $4, %edx
+ negl %ecx
+
+ C 15 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%edx,%ecx,1), %edx
+')
+ movl (%ebx), %eax C src low limb
+
+ movl PARAM_MULTIPLIER, %ebp
+ movl %edx, VAR_JUMP
+
+ mull %ebp
+
+ addl %esi, %eax C initial carry (from _1c)
+ jadcl0( %edx)
+
+
+ leal 4(%ebx,%ecx,4), %ebx
+ movl %edx, %esi C high carry
+
+ movl VAR_JUMP, %edx
+ leal (%edi,%ecx,4), %edi
+
+ testl $1, %ecx
+ movl %eax, %ecx C low carry
+
+ jz L(noswap)
+ movl %esi, %ecx C high,low carry other way around
+
+ movl %eax, %esi
+L(noswap):
+
+ jmp *%edx
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%edx,%ecx,1), %edx
+ addl $L(entry)-L(here), %edx
+ addl (%esp), %edx
+ ret
+')
+
+
+C -----------------------------------------------------------
+ ALIGN(32)
+L(top):
+deflit(`FRAME',16)
+ C eax scratch
+ C ebx src
+ C ecx carry lo
+ C edx scratch
+ C esi carry hi
+ C edi dst
+ C ebp multiplier
+ C
+ C 15 code bytes per limb
+
+ leal UNROLL_BYTES(%edi), %edi
+
+L(entry):
+forloop(`i', 0, UNROLL_COUNT/2-1, `
+ deflit(`disp0', eval(2*i*4))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%ebx), %eax)
+ mull %ebp
+Zdisp( M4_inst,%ecx, disp0,(%edi))
+ adcl %eax, %esi
+ movl %edx, %ecx
+ jadcl0( %ecx)
+
+ movl disp1(%ebx), %eax
+ mull %ebp
+ M4_inst %esi, disp1(%edi)
+ adcl %eax, %ecx
+ movl %edx, %esi
+ jadcl0( %esi)
+')
+
+ decl VAR_COUNTER
+ leal UNROLL_BYTES(%ebx), %ebx
+
+ jns L(top)
+
+
+ popl %ebp
+ M4_inst %ecx, UNROLL_BYTES(%edi)
+
+ popl %edi
+ movl %esi, %eax
+
+ popl %ebx
+ jadcl0( %eax)
+
+ popl %esi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/cross.pl b/rts/gmp/mpn/x86/k6/cross.pl
new file mode 100644
index 0000000000..21734f3e52
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/cross.pl
@@ -0,0 +1,141 @@
+#! /usr/bin/perl
+
+# Copyright (C) 2000 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+# Usage: cross.pl [filename.o]...
+#
+# Produce an annotated disassembly of the given object files, indicating
+# certain code alignment and addressing mode problems afflicting K6 chips.
+# "ZZ" is used on all annotations, so this can be searched for.
+#
+# With no arguments, all .o files corresponding to .asm files are processed.
+# This is good in the mpn object directory of a k6*-*-* build.
+#
+# As far as fixing problems goes, any cache line crossing problems in loops
+# get attention, but as a rule it's too tedious to rearrange code or slip in
+# nops to fix every problem in setup or finishup code.
+#
+# Bugs:
+#
+# Instructions without mod/rm bytes or which are already vector decoded are
+# unaffected by cache line boundary crossing, but not all of these have yet
+# been put in as exceptions. All that occur in practice in GMP are present
+# though.
+#
+# There's no messages for using the vector decoded addressing mode (%esi),
+# but that mode is easy to avoid when coding.
+
+use strict;
+
+sub disassemble {
+ my ($file) = @_;
+ my ($addr,$b1,$b2,$b3, $prefix,$opcode,$modrm);
+
+ open (IN, "objdump -Srfh $file |")
+ || die "Cannot open pipe from objdump\n";
+ while (<IN>) {
+ print;
+
+ if (/^[ \t]*[0-9]+[ \t]+\.text[ \t]/ && /2\*\*([0-9]+)$/) {
+ if ($1 < 5) {
+ print "ZZ need at least 2**5 for predictable cache line crossing\n";
+ }
+ }
+
+ if (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
+ ($addr,$b1,$b2,$b3) = ($1,$2,$3,$4);
+
+ } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
+ ($addr,$b1,$b2,$b3) = ($1,$2,$3,'');
+
+ } elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)/) {
+ ($addr,$b1,$b2,$b3) = ($1,$2,'','');
+
+ } else {
+ next;
+ }
+
+ if ($b1 =~ /0f/) {
+ $prefix = $b1;
+ $opcode = $b2;
+ $modrm = $b3;
+ } else {
+ $prefix = '';
+ $opcode = $b1;
+ $modrm = $b2;
+ }
+
+ # modrm of the form 00-xxx-100 with an 0F prefix is the problem case
+ # for K6 and pre-CXT K6-2
+ if ($prefix =~ /0f/
+ && $opcode !~ /^8/ # jcond disp32
+ && $modrm =~ /^[0-3][4c]/) {
+ print "ZZ ($file) >3 bytes to determine instruction length\n";
+ }
+
+ # with just an opcode, starting 1f mod 20h
+ if ($addr =~ /[13579bdf]f$/
+ && $prefix !~ /0f/
+ && $opcode !~ /1[012345]/ # adc
+ && $opcode !~ /1[89abcd]/ # sbb
+ && $opcode !~ /68/ # push $imm32
+ && $opcode !~ /^7/ # jcond disp8
+ && $opcode !~ /a[89]/ # test+imm
+ && $opcode !~ /a[a-f]/ # stos/lods/scas
+ && $opcode !~ /b8/ # movl $imm32,%eax
+ && $opcode !~ /e[0123]/ # loop/loopz/loopnz/jcxz
+ && $opcode !~ /e[b9]/ # jmp disp8/disp32
+ && $opcode !~ /f[89abcd]/ # clc,stc,cli,sti,cld,std
+ && !($opcode =~ /f[67]/ # grp 1
+ && $modrm =~ /^[2367abef]/) # mul, imul, div, idiv
+ && $modrm !~ /^$/) {
+ print "ZZ ($file) opcode/modrm cross 32-byte boundary\n";
+ }
+
+ # with an 0F prefix, anything starting at 1f mod 20h
+ if ($addr =~ /[13579bdf][f]$/
+ && $prefix =~ /0f/) {
+ print "ZZ ($file) prefix/opcode cross 32-byte boundary\n";
+ }
+
+ # with an 0F prefix, anything with mod/rm starting at 1e mod 20h
+ if ($addr =~ /[13579bdf][e]$/
+ && $prefix =~ /0f/
+ && $opcode !~ /^8/ # jcond disp32
+ && $modrm !~ /^$/) {
+ print "ZZ ($file) prefix/opcode/modrm cross 32-byte boundary\n";
+ }
+ }
+ close IN || die "Error from objdump (or objdump not available)\n";
+}
+
+
+my @files;
+if ($#ARGV >= 0) {
+ @files = @ARGV;
+} else {
+ @files = glob "*.asm";
+ map {s/.asm/.o/} @files;
+}
+
+foreach (@files) {
+ disassemble($_);
+}
diff --git a/rts/gmp/mpn/x86/k6/diveby3.asm b/rts/gmp/mpn/x86/k6/diveby3.asm
new file mode 100644
index 0000000000..ffb97bc380
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/diveby3.asm
@@ -0,0 +1,110 @@
+dnl AMD K6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
+dnl
+dnl K6: 11.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t carry);
+C
+C Using %esi in (%esi,%ecx,4) or 0(%esi,%ecx,4) addressing modes doesn't
+C lead to vector decoding, unlike plain (%esi) does.
+
+defframe(PARAM_CARRY,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl multiplicative inverse of 3, modulo 2^32
+deflit(INVERSE_3, 0xAAAAAAAB)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_divexact_by3c)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %esi defframe_pushl(SAVE_ESI)
+
+ movl PARAM_SRC, %esi
+ pushl %edi defframe_pushl(SAVE_EDI)
+
+ movl PARAM_DST, %edi
+ pushl %ebx defframe_pushl(SAVE_EBX)
+
+ movl PARAM_CARRY, %ebx
+ leal (%esi,%ecx,4), %esi
+
+ pushl $3 defframe_pushl(VAR_THREE)
+ leal (%edi,%ecx,4), %edi
+
+ negl %ecx
+
+
+ C Need 32 alignment for claimed speed, to avoid the movl store
+ C opcode/modrm crossing a cache line boundary
+
+ ALIGN(32)
+L(top):
+ C eax scratch, low product
+ C ebx carry limb (0 to 3)
+ C ecx counter, limbs, negative
+ C edx scratch, high product
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+ C
+ C The 0(%esi,%ecx,4) form pads so the finishup "movl %ebx, %eax"
+ C doesn't cross a 32 byte boundary, saving a couple of cycles
+ C (that's a fixed couple, not per loop).
+
+Zdisp( movl, 0,(%esi,%ecx,4), %eax)
+ subl %ebx, %eax
+
+ setc %bl
+
+ imull $INVERSE_3, %eax
+
+ movl %eax, (%edi,%ecx,4)
+ addl $2, %ecx
+
+ mull VAR_THREE
+
+ addl %edx, %ebx
+ loop L(top)
+
+
+ movl SAVE_ESI, %esi
+ movl %ebx, %eax
+
+ movl SAVE_EBX, %ebx
+
+ movl SAVE_EDI, %edi
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/gmp-mparam.h b/rts/gmp/mpn/x86/k6/gmp-mparam.h
new file mode 100644
index 0000000000..77f3948d77
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/gmp-mparam.h
@@ -0,0 +1,97 @@
+/* AMD K6 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 3 /* cycles */
+#endif
+
+#ifndef UDIV_TIME
+#define UDIV_TIME 20 /* cycles */
+#endif
+
+/* bsfl takes 12-27 cycles; use an average value for uniformly random operands */
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 14 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-04. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 18
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 130
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 34
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 116
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 68
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 98
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 13
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 67
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 472
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 4352
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 528, 1184, 2176, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 544
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 4352
+#endif
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm b/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm
new file mode 100644
index 0000000000..20a33e6ccf
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/k62mmx/copyd.asm
@@ -0,0 +1,179 @@
+dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
+dnl
+dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
+dnl alignment.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K6-2 aligned:
+dnl UNROLL_COUNT cycles/limb
+dnl 8 0.75
+dnl 16 0.625
+dnl 32 0.5625
+dnl 64 0.53
+dnl Maximum possible with the current code is 64; the minimum is 2.
+
+deflit(UNROLL_COUNT, 32)
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size, processing limbs from high to low addresses.
+C
+C The comments in copyi.asm apply here too.
+
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_copyd)
+ movl PARAM_SIZE, %ecx
+ movl %esi, %eax
+
+ movl PARAM_SRC, %esi
+ movl %edi, %edx
+
+ std
+
+ movl PARAM_DST, %edi
+ cmpl $UNROLL_COUNT, %ecx
+
+ leal -4(%esi,%ecx,4), %esi
+
+ leal -4(%edi,%ecx,4), %edi
+ ja L(unroll)
+
+L(simple):
+ rep
+ movsl
+
+ cld
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ ret
+
+
+L(unroll):
+ C if src and dst are different alignments mod8, then use rep movs
+ C if src and dst are both 4mod8 then process one limb to get 0mod8
+
+ pushl %ebx
+ leal (%esi,%edi), %ebx
+
+ testb $4, %bl
+ popl %ebx
+
+ jnz L(simple)
+ testl $4, %esi
+
+ leal -UNROLL_COUNT(%ecx), %ecx
+ jnz L(already_aligned)
+
+ movsl
+
+ decl %ecx
+L(already_aligned):
+
+
+ifelse(UNROLL_BYTES,256,`
+ subl $128, %esi
+ subl $128, %edi
+')
+
+ C offset 0x3D here, but gets full speed without further alignment
+L(top):
+ C eax saved esi
+ C ebx
+ C ecx counter, limbs
+ C edx saved edi
+ C esi src, incrementing
+ C edi dst, incrementing
+ C ebp
+ C
+ C `disp' is never 0, so don't need to force 0(%esi).
+
+deflit(CHUNK_COUNT, 2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
+ movq disp(%esi), %mm0
+ movq %mm0, disp(%edi)
+')
+
+ leal -UNROLL_BYTES(%esi), %esi
+ subl $UNROLL_COUNT, %ecx
+
+ leal -UNROLL_BYTES(%edi), %edi
+ jns L(top)
+
+
+ C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to
+ C UNROLL_COUNT-1 limbs remaining
+
+ testb $eval(UNROLL_COUNT/2), %cl
+
+ leal UNROLL_COUNT(%ecx), %ecx
+ jz L(not_half)
+
+
+ C at an unroll count of 32 this block of code is 16 cycles faster than
+ C the rep movs, less 3 or 4 to test whether to do it
+
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
+ deflit(`disp', eval(-4-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,+128)))
+ movq disp(%esi), %mm0
+ movq %mm0, disp(%edi)
+')
+
+ subl $eval(UNROLL_BYTES/2), %esi
+ subl $eval(UNROLL_BYTES/2), %edi
+
+ subl $eval(UNROLL_COUNT/2), %ecx
+L(not_half):
+
+
+ifelse(UNROLL_BYTES,256,`
+ addl $128, %esi
+ addl $128, %edi
+')
+
+ rep
+ movsl
+
+ cld
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ femms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm b/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm
new file mode 100644
index 0000000000..215d805f2e
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/k62mmx/copyi.asm
@@ -0,0 +1,196 @@
+dnl AMD K6-2 mpn_copyi -- copy limb vector, incrementing.
+dnl
+dnl K6-2: 0.56 or 1.0 cycles/limb (at 32 limbs/loop), depending on data
+dnl alignment.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K6-2 aligned:
+dnl UNROLL_COUNT cycles/limb
+dnl 8 0.75
+dnl 16 0.625
+dnl 32 0.5625
+dnl 64 0.53
+dnl Maximum possible with the current code is 64; the minimum is 2.
+
+deflit(UNROLL_COUNT, 32)
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The MMX loop is faster than a rep movs when src and dst are both 0mod8.
+C With one 0mod8 and one 4mod8 it's 1.056 c/l and the rep movs at 1.0 c/l is
+C used instead.
+C
+C mod8
+C src dst
+C 0 0 both aligned, use mmx
+C 0 4 unaligned, use rep movs
+C 4 0 unaligned, use rep movs
+C 4 4 do one movs, then both aligned, use mmx
+C
+C The MMX code on aligned data is 0.5 c/l, plus loop overhead of 2
+C cycles/loop, which is 0.0625 c/l at 32 limbs/loop.
+C
+C A pattern of two movq loads and two movq stores (or four and four) was
+C tried, but found to be the same speed as just one of each.
+C
+C Note that this code only suits K6-2 and K6-3. Plain K6 does only one mmx
+C instruction per cycle, so "movq"s are no faster than the simple 1 c/l rep
+C movs.
+C
+C Enhancement:
+C
+C Addressing modes like disp(%esi,%ecx,4) aren't currently used. They'd
+C make it possible to avoid incrementing %esi and %edi in the loop and hence
+C get loop overhead down to 1 cycle. Care would be needed to avoid bad
+C cache line crossings since the "movq"s would then be 5 code bytes rather
+C than 4.
+
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_copyi)
+ movl PARAM_SIZE, %ecx
+ movl %esi, %eax
+
+ movl PARAM_SRC, %esi
+ movl %edi, %edx
+
+ cld
+
+ movl PARAM_DST, %edi
+ cmpl $UNROLL_COUNT, %ecx
+
+ ja L(unroll)
+
+L(simple):
+ rep
+ movsl
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ ret
+
+
+L(unroll):
+ C if src and dst are different alignments mod8, then use rep movs
+ C if src and dst are both 4mod8 then process one limb to get 0mod8
+
+ pushl %ebx
+ leal (%esi,%edi), %ebx
+
+ testb $4, %bl
+ popl %ebx
+
+ jnz L(simple)
+ testl $4, %esi
+
+ leal -UNROLL_COUNT(%ecx), %ecx
+ jz L(already_aligned)
+
+ decl %ecx
+
+ movsl
+L(already_aligned):
+
+
+ifelse(UNROLL_BYTES,256,`
+ addl $128, %esi
+ addl $128, %edi
+')
+
+ C this is offset 0x34, no alignment needed
+L(top):
+ C eax saved esi
+ C ebx
+ C ecx counter, limbs
+ C edx saved edi
+ C esi src, incrementing
+ C edi dst, incrementing
+ C ebp
+ C
+ C Zdisp gets 0(%esi) left that way to avoid vector decode, and with
+ C 0(%edi) keeps code aligned to 16 byte boundaries.
+
+deflit(CHUNK_COUNT, 2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+Zdisp( movq, disp,(%esi), %mm0)
+Zdisp( movq, %mm0, disp,(%edi))
+')
+
+ addl $UNROLL_BYTES, %esi
+ subl $UNROLL_COUNT, %ecx
+
+ leal UNROLL_BYTES(%edi), %edi
+ jns L(top)
+
+
+ C now %ecx is -UNROLL_COUNT to -1 representing respectively 0 to
+ C UNROLL_COUNT-1 limbs remaining
+
+ testb $eval(UNROLL_COUNT/2), %cl
+
+ leal UNROLL_COUNT(%ecx), %ecx
+ jz L(not_half)
+
+ C at an unroll count of 32 this block of code is 16 cycles faster than
+ C the rep movs, less 3 or 4 to test whether to do it
+
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT/2-1, `
+ deflit(`disp', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ movq disp(%esi), %mm0
+ movq %mm0, disp(%edi)
+')
+ addl $eval(UNROLL_BYTES/2), %esi
+ addl $eval(UNROLL_BYTES/2), %edi
+
+ subl $eval(UNROLL_COUNT/2), %ecx
+L(not_half):
+
+
+ifelse(UNROLL_BYTES,256,`
+ subl $128, %esi
+ subl $128, %edi
+')
+
+ rep
+ movsl
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ femms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm b/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm
new file mode 100644
index 0000000000..f6d54f97a8
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/k62mmx/lshift.asm
@@ -0,0 +1,286 @@
+dnl AMD K6-2 mpn_lshift -- mpn left shift.
+dnl
+dnl K6-2: 1.75 cycles/limb
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl used after src has been fetched
+define(VAR_RETVAL,`PARAM_SRC')
+
+dnl minimum 9, because unrolled loop can't handle less
+deflit(UNROLL_THRESHOLD, 9)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shldl( %cl, %edx, %eax) C return value
+
+ shll %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16) C avoid offset 0x1f
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx,%eax,4), %edx C src high limb
+ negl %ecx
+
+ movd PARAM_SHIFT, %mm6
+ addl $32, %ecx C 32-shift
+
+ shrl %cl, %edx
+ cmpl $UNROLL_THRESHOLD-1, %eax
+
+ movl %edx, VAR_RETVAL
+ jae L(unroll)
+
+
+ movd %ecx, %mm7
+ movl %eax, %ecx
+
+ movl PARAM_DST, %eax
+
+L(simple):
+ C eax dst
+ C ebx src
+ C ecx counter, size-1 to 1
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%ebx,%ecx,4), %mm0
+
+ psrlq %mm7, %mm0
+
+Zdisp( movd, %mm0, 0,(%eax,%ecx,4))
+ loop L(simple)
+
+
+ movd (%ebx), %mm0
+ popl %ebx
+
+ psllq %mm6, %mm0
+
+ movd %mm0, (%eax)
+ movl %edx, %eax
+
+ femms
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx src
+ C ecx 32-shift
+ C edx retval (but instead VAR_RETVAL is used)
+ C
+ C mm6 shift
+
+ addl $32, %ecx
+ movl PARAM_DST, %edx
+
+ movd %ecx, %mm7
+ subl $7, %eax C size-8
+
+ leal (%edx,%eax,4), %ecx C alignment of dst
+
+ movq 32-8(%ebx,%eax,4), %mm2 C src high qword
+ testb $4, %cl
+
+ jz L(dst_aligned)
+ psllq %mm6, %mm2
+
+ psrlq $32, %mm2
+ decl %eax
+
+ movd %mm2, 32(%edx,%eax,4) C dst high limb
+ movq 32-8(%ebx,%eax,4), %mm2 C new src high qword
+L(dst_aligned):
+
+ movq 32-16(%ebx,%eax,4), %mm0 C src second highest qword
+
+
+ C This loop is the important bit, the rest is just support for it.
+ C Four src limbs are held at the start, and four more will be read.
+ C Four dst limbs will be written. This schedule seems necessary for
+ C full speed.
+ C
+ C The use of size-8 lets the loop stop when %eax goes negative and
+ C leaves -4 to -1 which can be tested with test $1 and $2.
+
+L(top):
+ C eax counter, size-8 step by -4 until <0
+ C ebx src
+ C ecx
+ C edx dst
+ C
+ C mm0 src next qword
+ C mm1 scratch
+ C mm2 src prev qword
+ C mm6 shift
+ C mm7 64-shift
+
+ psllq %mm6, %mm2
+ subl $4, %eax
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm0, %mm2
+ movq 24(%ebx,%eax,4), %mm0
+
+ psllq %mm6, %mm1
+ movq %mm2, 40(%edx,%eax,4)
+
+ movq %mm0, %mm2
+ psrlq %mm7, %mm0
+
+ por %mm0, %mm1
+ movq 16(%ebx,%eax,4), %mm0
+
+ movq %mm1, 32(%edx,%eax,4)
+ jnc L(top)
+
+
+ C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
+ C
+ C 8(%ebx) is the next source, and 24(%edx) is the next destination.
+ C %eax is between -4 and -1, representing respectively 0 to 3 extra
+ C limbs that must be read.
+
+
+ testl $2, %eax C testl to avoid bad cache line crossing
+ jz L(finish_nottwo)
+
+ C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
+ C new mm2 and a new mm0 is loaded.
+
+ psllq %mm6, %mm2
+ movq %mm0, %mm1
+
+ psrlq %mm7, %mm0
+ subl $2, %eax
+
+ por %mm0, %mm2
+ movq 16(%ebx,%eax,4), %mm0
+
+ movq %mm2, 32(%edx,%eax,4)
+ movq %mm1, %mm2
+L(finish_nottwo):
+
+
+ C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0
+
+ testb $1, %al
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm0, %mm2
+ psllq %mm6, %mm1
+
+ movq %mm2, 24(%edx,%eax,4)
+ jz L(finish_even)
+
+
+ C Size is odd, so mm1 and one extra limb to process.
+
+ movd (%ebx), %mm0 C src[0]
+ popl %ebx
+deflit(`FRAME',0)
+
+ movq %mm0, %mm2
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ psllq %mm6, %mm2
+ por %mm0, %mm1
+
+ movq %mm1, 4(%edx) C dst[1,2]
+ movd %mm2, (%edx) C dst[0]
+
+ movl VAR_RETVAL, %eax
+
+ femms
+ ret
+
+
+ nop C avoid bad cache line crossing
+L(finish_even):
+deflit(`FRAME',4)
+ C Size is even, so only mm1 left to process.
+
+ movq %mm1, (%edx) C dst[0,1]
+ movl VAR_RETVAL, %eax
+
+ popl %ebx
+ femms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm b/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm
new file mode 100644
index 0000000000..8a8c144241
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/k62mmx/rshift.asm
@@ -0,0 +1,285 @@
+dnl AMD K6-2 mpn_rshift -- mpn right shift.
+dnl
+dnl K6-2: 1.75 cycles/limb
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl Minimum 9, because the unrolled loop can't handle less.
+dnl
+deflit(UNROLL_THRESHOLD, 9)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shrdl( %cl, %edx, %eax) C return value
+
+ shrl %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16) C avoid offset 0x1f
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx), %edx C src low limb
+ negl %ecx
+
+ addl $32, %ecx
+ movd PARAM_SHIFT, %mm6
+
+ shll %cl, %edx
+ cmpl $UNROLL_THRESHOLD-1, %eax
+
+ jae L(unroll)
+
+
+ C eax size-1
+ C ebx src
+ C ecx 32-shift
+ C edx retval
+ C
+ C mm6 shift
+
+ movl PARAM_DST, %ecx
+ leal (%ebx,%eax,4), %ebx
+
+ leal -4(%ecx,%eax,4), %ecx
+ negl %eax
+
+ C This loop runs at about 3 cycles/limb, limited by instruction
+ C decoding, and this is despite every second access being unaligned.
+
+L(simple):
+ C eax counter, -(size-1) to -1
+ C ebx &src[size-1]
+ C ecx &dst[size-1]
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+
+Zdisp( movq, 0,(%ebx,%eax,4), %mm0)
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+Zdisp( movd, %mm0, 0,(%ecx,%eax,4))
+ jnz L(simple)
+
+
+ movq %mm0, (%ecx)
+ movl %edx, %eax
+
+ popl %ebx
+
+ femms
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx src
+ C ecx 32-shift
+ C edx retval
+ C
+ C mm6 shift
+
+ addl $32, %ecx
+ subl $7, %eax C size-8
+
+ movd %ecx, %mm7
+ movl PARAM_DST, %ecx
+
+ movq (%ebx), %mm2 C src low qword
+ leal (%ebx,%eax,4), %ebx C src end - 32
+
+ testb $4, %cl
+ leal (%ecx,%eax,4), %ecx C dst end - 32
+
+ notl %eax C -(size-7)
+ jz L(dst_aligned)
+
+ psrlq %mm6, %mm2
+ incl %eax
+
+Zdisp( movd, %mm2, 0,(%ecx,%eax,4)) C dst low limb
+ movq 4(%ebx,%eax,4), %mm2 C new src low qword
+L(dst_aligned):
+
+ movq 12(%ebx,%eax,4), %mm0 C src second lowest qword
+ nop C avoid bad cache line crossing
+
+
+ C This loop is the important bit, the rest is just support for it.
+ C Four src limbs are held at the start, and four more will be read.
+ C Four dst limbs will be written. This schedule seems necessary for
+ C full speed.
+ C
+ C The use of -(size-7) lets the loop stop when %eax becomes >= 0,
+ C and leaves 0 to 3 which can be tested with test $1 and $2.
+
+L(top):
+ C eax counter, -(size-7) step by +4 until >=0
+ C ebx src end - 32
+ C ecx dst end - 32
+ C edx retval
+ C
+ C mm0 src next qword
+ C mm1 scratch
+ C mm2 src prev qword
+ C mm6 shift
+ C mm7 64-shift
+
+ psrlq %mm6, %mm2
+ addl $4, %eax
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ por %mm0, %mm2
+ movq 4(%ebx,%eax,4), %mm0
+
+ psrlq %mm6, %mm1
+ movq %mm2, -12(%ecx,%eax,4)
+
+ movq %mm0, %mm2
+ psllq %mm7, %mm0
+
+ por %mm0, %mm1
+ movq 12(%ebx,%eax,4), %mm0
+
+ movq %mm1, -4(%ecx,%eax,4)
+ ja L(top) C jump if no carry and not zero
+
+
+
+ C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
+ C to 3 representing respectively 3 to 0 further limbs.
+
+ testl $2, %eax C testl to avoid bad cache line crossings
+ jnz L(finish_nottwo)
+
+ C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
+ C becomes new mm2 and a new mm0 is loaded.
+
+ psrlq %mm6, %mm2
+ movq %mm0, %mm1
+
+ psllq %mm7, %mm0
+ addl $2, %eax
+
+ por %mm0, %mm2
+ movq 12(%ebx,%eax,4), %mm0
+
+ movq %mm2, -4(%ecx,%eax,4)
+ movq %mm1, %mm2
+L(finish_nottwo):
+
+
+ testb $1, %al
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ por %mm0, %mm2
+ psrlq %mm6, %mm1
+
+ movq %mm2, 4(%ecx,%eax,4)
+ jnz L(finish_even)
+
+
+ C one further extra limb to process
+
+ movd 32-4(%ebx), %mm0 C src[size-1], most significant limb
+ popl %ebx
+
+ movq %mm0, %mm2
+ psllq %mm7, %mm0
+
+ por %mm0, %mm1
+ psrlq %mm6, %mm2
+
+ movq %mm1, 32-12(%ecx) C dst[size-3,size-2]
+ movd %mm2, 32-4(%ecx) C dst[size-1]
+
+ movl %edx, %eax C retval
+
+ femms
+ ret
+
+
+ nop C avoid bad cache line crossing
+L(finish_even):
+ C no further extra limbs
+
+ movq %mm1, 32-8(%ecx) C dst[size-2,size-1]
+ movl %edx, %eax C retval
+
+ popl %ebx
+
+ femms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/com_n.asm b/rts/gmp/mpn/x86/k6/mmx/com_n.asm
new file mode 100644
index 0000000000..8915080f0f
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/com_n.asm
@@ -0,0 +1,91 @@
+dnl AMD K6-2 mpn_com_n -- mpn bitwise one's complement.
+dnl
+dnl alignment dst/src, A=0mod8, N=4mod8
+dnl A/A A/N N/A N/N
+dnl K6-2 1.0 1.18 1.18 1.18 cycles/limb
+dnl K6 1.5 1.85 1.75 1.85
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Take the bitwise ones-complement of src,size and write it to dst,size.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_com_n)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+ shrl %ecx
+ jnz L(two_or_more)
+
+ movl (%eax), %eax
+ notl %eax
+ movl %eax, (%edx)
+ ret
+
+
+L(two_or_more):
+ pushl %ebx
+FRAME_pushl()
+ movl %ecx, %ebx
+
+ pcmpeqd %mm7, %mm7 C all ones
+
+
+ ALIGN(16)
+L(top):
+ C eax src
+ C ebx floor(size/2)
+ C ecx counter
+ C edx dst
+ C esi
+ C edi
+ C ebp
+
+ movq -8(%eax,%ecx,8), %mm0
+ pxor %mm7, %mm0
+ movq %mm0, -8(%edx,%ecx,8)
+ loop L(top)
+
+
+ jnc L(no_extra)
+ movl (%eax,%ebx,8), %eax
+ notl %eax
+ movl %eax, (%edx,%ebx,8)
+L(no_extra):
+
+ popl %ebx
+ emms_or_femms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/logops_n.asm b/rts/gmp/mpn/x86/k6/mmx/logops_n.asm
new file mode 100644
index 0000000000..46cb3b7ea5
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/logops_n.asm
@@ -0,0 +1,212 @@
+dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
+dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
+dnl
+dnl alignment dst/src1/src2, A=0mod8, N=4mod8
+dnl A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
+dnl
+dnl K6-2 1.2 1.5 1.5 1.2 1.2 1.5 1.5 1.2 and,andn,ior,xor
+dnl K6-2 1.5 1.75 2.0 1.75 1.75 2.0 1.75 1.5 iorn,xnor
+dnl K6-2 1.75 2.0 2.0 2.0 2.0 2.0 2.0 1.75 nand,nior
+dnl
+dnl K6 1.5 1.68 1.75 1.2 1.75 1.75 1.68 1.5 and,andn,ior,xor
+dnl K6 2.0 2.0 2.25 2.25 2.25 2.25 2.0 2.0 iorn,xnor
+dnl K6 2.0 2.25 2.25 2.25 2.25 2.25 2.25 2.0 nand,nior
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl M4_p and M4_i are the MMX and integer instructions
+dnl M4_*_neg_dst means whether to negate the final result before writing
+dnl M4_*_neg_src2 means whether to negate the src2 values before using them
+
+define(M4_choose_op,
+m4_assert_numargs(7)
+`ifdef(`OPERATION_$1',`
+define(`M4_function', `mpn_$1')
+define(`M4_operation', `$1')
+define(`M4_p', `$2')
+define(`M4_p_neg_dst', `$3')
+define(`M4_p_neg_src2',`$4')
+define(`M4_i', `$5')
+define(`M4_i_neg_dst', `$6')
+define(`M4_i_neg_src2',`$7')
+')')
+
+dnl xnor is done in "iorn" style because it's a touch faster than "nior"
+dnl style (the two are equivalent for xor).
+
+M4_choose_op( and_n, pand,0,0, andl,0,0)
+M4_choose_op( andn_n, pandn,0,0, andl,0,1)
+M4_choose_op( nand_n, pand,1,0, andl,1,0)
+M4_choose_op( ior_n, por,0,0, orl,0,0)
+M4_choose_op( iorn_n, por,0,1, orl,0,1)
+M4_choose_op( nior_n, por,1,0, orl,1,0)
+M4_choose_op( xor_n, pxor,0,0, xorl,0,0)
+M4_choose_op( xnor_n, pxor,0,1, xorl,0,1)
+
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+
+C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C
+C Do src1,size M4_operation src2,size, storing the result in dst,size.
+C
+C Unaligned movq loads and stores are a bit slower than aligned ones. The
+C test at the start of the routine checks the alignment of src1 and if
+C necessary processes one limb separately at the low end to make it aligned.
+C
+C The raw speeds without this alignment switch are as follows.
+C
+C alignment dst/src1/src2, A=0mod8, N=4mod8
+C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
+C
+C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor
+C K6 1.75 2.2 2.0 2.28 iorn,xnor
+C K6 2.0 2.25 2.35 2.28 nand,nior
+C
+C
+C Future:
+C
+C K6 can do one 64-bit load per cycle so each of these routines should be
+C able to approach 1.0 c/l, if aligned. The basic and/andn/ior/xor might be
+C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
+C The others are 4 instructions per 2 limbs, and so can only approach 1.0
+C because there's nowhere to hide some loop control.
+
+defframe(PARAM_SIZE,16)
+defframe(PARAM_SRC2,12)
+defframe(PARAM_SRC1,8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+PROLOGUE(M4_function)
+ movl PARAM_SIZE, %ecx
+ pushl %ebx
+ FRAME_pushl()
+ movl PARAM_SRC1, %eax
+ movl PARAM_SRC2, %ebx
+ cmpl $1, %ecx
+ movl PARAM_DST, %edx
+ ja L(two_or_more)
+
+
+ movl (%ebx), %ecx
+ popl %ebx
+ifelse(M4_i_neg_src2,1,`notl %ecx')
+ M4_i (%eax), %ecx
+ifelse(M4_i_neg_dst,1,` notl %ecx')
+ movl %ecx, (%edx)
+
+ ret
+
+
+L(two_or_more):
+ C eax src1
+ C ebx src2
+ C ecx size
+ C edx dst
+ C esi
+ C edi
+ C ebp
+ C
+ C carry bit is low of size
+
+ pushl %esi
+ FRAME_pushl()
+ testl $4, %eax
+ jz L(alignment_ok)
+
+ movl (%ebx), %esi
+ addl $4, %ebx
+ifelse(M4_i_neg_src2,1,`notl %esi')
+ M4_i (%eax), %esi
+ addl $4, %eax
+ifelse(M4_i_neg_dst,1,` notl %esi')
+ movl %esi, (%edx)
+ addl $4, %edx
+ decl %ecx
+
+L(alignment_ok):
+ movl %ecx, %esi
+ shrl %ecx
+ jnz L(still_two_or_more)
+
+ movl (%ebx), %ecx
+ popl %esi
+ifelse(M4_i_neg_src2,1,`notl %ecx')
+ M4_i (%eax), %ecx
+ifelse(M4_i_neg_dst,1,` notl %ecx')
+ popl %ebx
+ movl %ecx, (%edx)
+ ret
+
+
+L(still_two_or_more):
+ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
+ pcmpeqd %mm7, %mm7 C all ones
+')
+
+ ALIGN(16)
+L(top):
+ C eax src1
+ C ebx src2
+ C ecx counter
+ C edx dst
+ C esi
+ C edi
+ C ebp
+ C
+ C carry bit is low of size
+
+ movq -8(%ebx,%ecx,8), %mm0
+ifelse(M4_p_neg_src2,1,`pxor %mm7, %mm0')
+ M4_p -8(%eax,%ecx,8), %mm0
+ifelse(M4_p_neg_dst,1,` pxor %mm7, %mm0')
+ movq %mm0, -8(%edx,%ecx,8)
+
+ loop L(top)
+
+
+ jnc L(no_extra)
+
+ movl -4(%ebx,%esi,4), %ebx
+ifelse(M4_i_neg_src2,1,`notl %ebx')
+ M4_i -4(%eax,%esi,4), %ebx
+ifelse(M4_i_neg_dst,1,` notl %ebx')
+ movl %ebx, -4(%edx,%esi,4)
+L(no_extra):
+
+ popl %esi
+ popl %ebx
+ emms_or_femms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/lshift.asm b/rts/gmp/mpn/x86/k6/mmx/lshift.asm
new file mode 100644
index 0000000000..f1dc83db46
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/lshift.asm
@@ -0,0 +1,122 @@
+dnl AMD K6 mpn_lshift -- mpn left shift.
+dnl
+dnl K6: 3.0 cycles/limb
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
+C instructions. This is despite every second fetch being unaligned.
+
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shldl( %cl, %edx, %eax) C return value
+
+ shll %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+ ALIGN(16) C avoid offset 0x1f
+ nop C avoid bad cache line crossing
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx,%eax,4), %edx C src high limb
+ negl %ecx
+
+ movd PARAM_SHIFT, %mm6
+ addl $32, %ecx C 32-shift
+
+ shrl %cl, %edx
+
+ movd %ecx, %mm7
+ movl PARAM_DST, %ecx
+
+L(top):
+ C eax counter, size-1 to 1
+ C ebx src
+ C ecx dst
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%ebx,%eax,4), %mm0
+ decl %eax
+
+ psrlq %mm7, %mm0
+
+ movd %mm0, 4(%ecx,%eax,4)
+ jnz L(top)
+
+
+ movd (%ebx), %mm0
+ popl %ebx
+
+ psllq %mm6, %mm0
+ movl %edx, %eax
+
+ movd %mm0, (%ecx)
+
+ emms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/popham.asm b/rts/gmp/mpn/x86/k6/mmx/popham.asm
new file mode 100644
index 0000000000..2c619252bb
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/popham.asm
@@ -0,0 +1,238 @@
+dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
+dnl hamming distance.
+dnl
+dnl popcount hamdist
+dnl K6-2: 9.0 11.5 cycles/limb
+dnl K6: 12.5 13.0
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C The code here isn't optimal, but it's already a 2x speedup over the plain
+C integer mpn/generic/popcount.c,hamdist.c.
+
+
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist
+')m4exit(1)')')
+
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+
+HAM(`
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+ dnl non-PIC
+
+ DATA
+ ALIGN(8)
+
+define(LS,
+m4_assert_numargs(1)
+`LF(M4_function,`$1')')
+
+LS(rodata_AAAAAAAAAAAAAAAA):
+ .long 0xAAAAAAAA
+ .long 0xAAAAAAAA
+
+LS(rodata_3333333333333333):
+ .long 0x33333333
+ .long 0x33333333
+
+LS(rodata_0F0F0F0F0F0F0F0F):
+ .long 0x0F0F0F0F
+ .long 0x0F0F0F0F
+
+LS(rodata_000000FF000000FF):
+ .long 0x000000FF
+ .long 0x000000FF
+')
+
+ .text
+ ALIGN(32)
+
+POP(`ifdef(`PIC', `
+ C avoid shrl crossing a 32-byte boundary
+ nop')')
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ orl %ecx, %ecx
+ jz L(zero)
+
+ifdef(`PIC',`
+ movl $0xAAAAAAAA, %eax
+ movl $0x33333333, %edx
+
+ movd %eax, %mm7
+ movd %edx, %mm6
+
+ movl $0x0F0F0F0F, %eax
+ movl $0x000000FF, %edx
+
+ punpckldq %mm7, %mm7
+ punpckldq %mm6, %mm6
+
+ movd %eax, %mm5
+ movd %edx, %mm4
+
+ punpckldq %mm5, %mm5
+ punpckldq %mm4, %mm4
+',`
+
+ movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7
+ movq LS(rodata_3333333333333333), %mm6
+ movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5
+ movq LS(rodata_000000FF000000FF), %mm4
+')
+
+define(REG_AAAAAAAAAAAAAAAA, %mm7)
+define(REG_3333333333333333, %mm6)
+define(REG_0F0F0F0F0F0F0F0F, %mm5)
+define(REG_000000FF000000FF, %mm4)
+
+
+ movl PARAM_SRC, %eax
+HAM(` movl PARAM_SRC2, %edx')
+
+ pxor %mm2, %mm2 C total
+
+ shrl %ecx
+ jnc L(top)
+
+Zdisp( movd, 0,(%eax,%ecx,8), %mm1)
+
+HAM(`
+Zdisp( movd, 0,(%edx,%ecx,8), %mm0)
+ pxor %mm0, %mm1
+')
+
+ incl %ecx
+ jmp L(loaded)
+
+
+ ALIGN(16)
+POP(` nop C alignment to avoid crossing 32-byte boundaries')
+
+L(top):
+ C eax src
+ C ebx
+ C ecx counter, qwords, decrementing
+ C edx [hamdist] src2
+ C
+ C mm0 (scratch)
+ C mm1 (scratch)
+ C mm2 total (low dword)
+ C mm3
+ C mm4 \
+ C mm5 | special constants
+ C mm6 |
+ C mm7 /
+
+ movq -8(%eax,%ecx,8), %mm1
+HAM(` pxor -8(%edx,%ecx,8), %mm1')
+
+L(loaded):
+ movq %mm1, %mm0
+ pand REG_AAAAAAAAAAAAAAAA, %mm1
+
+ psrlq $1, %mm1
+HAM(` nop C code alignment')
+
+ psubd %mm1, %mm0 C bit pairs
+HAM(` nop C code alignment')
+
+
+ movq %mm0, %mm1
+ psrlq $2, %mm0
+
+ pand REG_3333333333333333, %mm0
+ pand REG_3333333333333333, %mm1
+
+ paddd %mm1, %mm0 C nibbles
+
+
+ movq %mm0, %mm1
+ psrlq $4, %mm0
+
+ pand REG_0F0F0F0F0F0F0F0F, %mm0
+ pand REG_0F0F0F0F0F0F0F0F, %mm1
+
+ paddd %mm1, %mm0 C bytes
+
+ movq %mm0, %mm1
+ psrlq $8, %mm0
+
+
+ paddb %mm1, %mm0 C words
+
+
+ movq %mm0, %mm1
+ psrlq $16, %mm0
+
+ paddd %mm1, %mm0 C dwords
+
+ pand REG_000000FF000000FF, %mm0
+
+ paddd %mm0, %mm2 C low to total
+ psrlq $32, %mm0
+
+ paddd %mm0, %mm2 C high to total
+ loop L(top)
+
+
+
+ movd %mm2, %eax
+ emms_or_femms
+ ret
+
+L(zero):
+ movl $0, %eax
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mmx/rshift.asm b/rts/gmp/mpn/x86/k6/mmx/rshift.asm
new file mode 100644
index 0000000000..cc5948f26c
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mmx/rshift.asm
@@ -0,0 +1,122 @@
+dnl AMD K6 mpn_rshift -- mpn right shift.
+dnl
+dnl K6: 3.0 cycles/limb
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
+C instructions. This is despite every second fetch being unaligned.
+
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shrdl( %cl, %edx, %eax) C return value
+
+ shrl %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+ ALIGN(16) C avoid offset 0x1f
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx), %edx C src low limb
+ negl %ecx
+
+ addl $32, %ecx C 32-shift
+ movd PARAM_SHIFT, %mm6
+
+ shll %cl, %edx C retval
+ movl PARAM_DST, %ecx
+
+ leal (%ebx,%eax,4), %ebx
+
+ leal -4(%ecx,%eax,4), %ecx
+ negl %eax
+
+
+L(simple):
+ C eax counter (negative)
+ C ebx &src[size-1]
+ C ecx &dst[size-1]
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+
+Zdisp( movq, 0,(%ebx,%eax,4), %mm0)
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+Zdisp( movd, %mm0, 0,(%ecx,%eax,4))
+ jnz L(simple)
+
+
+ movq %mm0, (%ecx)
+ movl %edx, %eax
+
+ popl %ebx
+
+ emms
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mul_1.asm b/rts/gmp/mpn/x86/k6/mul_1.asm
new file mode 100644
index 0000000000..c2220fe4ca
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mul_1.asm
@@ -0,0 +1,272 @@
+dnl AMD K6 mpn_mul_1 -- mpn by limb multiply.
+dnl
+dnl K6: 6.25 cycles/limb.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier, mp_limb_t carry);
+C
+C Multiply src,size by mult and store the result in dst,size.
+C Return the carry limb from the top of the result.
+C
+C mpn_mul_1c() accepts an initial carry for the calculation, it's added into
+C the low limb of the result.
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl minimum 5 because the unrolled code can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_mul_1c)
+ pushl %esi
+deflit(`FRAME',4)
+ movl PARAM_CARRY, %esi
+ jmp LF(mpn_mul_1,start_nc)
+EPILOGUE()
+
+
+PROLOGUE(mpn_mul_1)
+ push %esi
+deflit(`FRAME',4)
+ xorl %esi, %esi C initial carry
+
+L(start_nc):
+ mov PARAM_SIZE, %ecx
+ push %ebx
+FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ push %edi
+FRAME_pushl()
+
+ movl PARAM_DST, %edi
+ pushl %ebp
+FRAME_pushl()
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_MULTIPLIER, %ebp
+
+ jae L(unroll)
+
+
+ C code offset 0x22 here, close enough to aligned
+L(simple):
+ C eax scratch
+ C ebx src
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi dst
+ C ebp multiplier
+ C
+ C this loop 8 cycles/limb
+
+ movl (%ebx), %eax
+ addl $4, %ebx
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi)
+ addl $4, %edi
+
+ loop L(simple)
+
+
+ popl %ebp
+
+ popl %edi
+ popl %ebx
+
+ movl %esi, %eax
+ popl %esi
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C The code for each limb is 6 cycles, with instruction decoding being the
+C limiting factor. At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
+C cycles/limb in total.
+C
+C The secret ingredient to get 6.25 is to start the loop with the mul and
+C have the load/store pair at the end. Rotating the load/store to the top
+C is an 0.5 c/l slowdown. (Some address generation effect probably.)
+C
+C The whole unrolled loop fits nicely in exactly 80 bytes.
+
+
+ ALIGN(16) C already aligned to 16 here actually
+L(unroll):
+ movl (%ebx), %eax
+ leal -16(%ebx,%ecx,4), %ebx
+
+ leal -16(%edi,%ecx,4), %edi
+ subl $4, %ecx
+
+ negl %ecx
+
+
+ ALIGN(16) C one byte nop for this alignment
+L(top):
+ C eax scratch
+ C ebx &src[size-4]
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi &dst[size-4]
+ C ebp multiplier
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi,%ecx,4)
+ movl 4(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 4(%edi,%ecx,4)
+ movl 8(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 8(%edi,%ecx,4)
+ movl 12(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 12(%edi,%ecx,4)
+ movl 16(%ebx,%ecx,4), %eax
+
+
+ addl $4, %ecx
+ js L(top)
+
+
+
+ C eax next src limb
+ C ebx &src[size-4]
+ C ecx 0 to 3 representing respectively 4 to 1 further limbs
+ C edx
+ C esi carry
+ C edi &dst[size-4]
+
+ testb $2, %cl
+ jnz L(finish_not_two)
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi,%ecx,4)
+ movl 4(%ebx,%ecx,4), %eax
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 4(%edi,%ecx,4)
+ movl 8(%ebx,%ecx,4), %eax
+
+ addl $2, %ecx
+L(finish_not_two):
+
+
+ testb $1, %cl
+ jnz L(finish_not_one)
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, 8(%edi)
+ movl 12(%ebx), %eax
+L(finish_not_one):
+
+
+ mull %ebp
+
+ addl %esi, %eax
+ popl %ebp
+
+ adcl $0, %edx
+
+ movl %eax, 12(%edi)
+ popl %edi
+
+ popl %ebx
+ movl %edx, %eax
+
+ popl %esi
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/mul_basecase.asm b/rts/gmp/mpn/x86/k6/mul_basecase.asm
new file mode 100644
index 0000000000..1f5a3a4b4b
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/mul_basecase.asm
@@ -0,0 +1,600 @@
+dnl AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
+dnl
+dnl K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
+dnl unrolling).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K6: UNROLL_COUNT cycles/product (approx)
+dnl 8 9.75
+dnl 16 9.3
+dnl 32 9.3
+dnl Maximum possible with the current code is 32.
+dnl
+dnl With 16 the inner unrolled loop fits exactly in a 256 byte block, which
+dnl might explain it's good performance.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+C
+C Calculate xp,xsize multiplied by yp,ysize, storing the result in
+C wp,xsize+ysize.
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() entry code only
+C once. The saving is about 10-20% on typical sizes coming from the
+C Karatsuba multiply code.
+C
+C Future:
+C
+C The unrolled loop could be shared by mpn_addmul_1, with some extra stack
+C setups and maybe 2 or 3 wasted cycles at the end. Code saving would be
+C 256 bytes.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 8)
+',`
+deflit(UNROLL_THRESHOLD, 8)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_XSIZE, %ecx
+ movl PARAM_YP, %eax
+
+ movl PARAM_XP, %edx
+ movl (%eax), %eax C yp low limb
+
+ cmpl $2, %ecx
+ ja L(xsize_more_than_two_limbs)
+ je L(two_by_something)
+
+
+ C one limb by one limb
+
+ movl (%edx), %edx C xp low limb
+ movl PARAM_WP, %ecx
+
+ mull %edx
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+ decl PARAM_YSIZE
+ pushl %ebx
+deflit(`FRAME',4)
+
+ movl PARAM_WP, %ebx
+ pushl %esi
+deflit(`FRAME',8)
+
+ movl %eax, %ecx C yp low limb
+ movl (%edx), %eax C xp low limb
+
+ movl %edx, %esi C xp
+ jnz L(two_by_two)
+
+
+ C two limbs by one limb
+
+ mull %ecx
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+
+ movl %edx, %esi C carry
+
+ mull %ecx
+
+ addl %eax, %esi
+ movl %esi, 4(%ebx)
+
+ adcl $0, %edx
+
+ movl %edx, 8(%ebx)
+ popl %esi
+
+ popl %ebx
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(two_by_two):
+ C eax xp low limb
+ C ebx wp
+ C ecx yp low limb
+ C edx
+ C esi xp
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ mull %ecx C xp[0] * yp[0]
+
+ push %edi
+deflit(`FRAME',12)
+ movl %eax, (%ebx)
+
+ movl 4(%esi), %eax
+ movl %edx, %edi C carry, for wp[1]
+
+ mull %ecx C xp[1] * yp[0]
+
+ addl %eax, %edi
+ movl PARAM_YP, %ecx
+
+ adcl $0, %edx
+
+ movl %edi, 4(%ebx)
+ movl 4(%ecx), %ecx C yp[1]
+
+ movl 4(%esi), %eax C xp[1]
+ movl %edx, %edi C carry, for wp[2]
+
+ mull %ecx C xp[1] * yp[1]
+
+ addl %eax, %edi
+
+ adcl $0, %edx
+
+ movl (%esi), %eax C xp[0]
+ movl %edx, %esi C carry, for wp[3]
+
+ mull %ecx C xp[0] * yp[1]
+
+ addl %eax, 4(%ebx)
+ adcl %edx, %edi
+ adcl $0, %esi
+
+ movl %edi, 8(%ebx)
+ popl %edi
+
+ movl %esi, 12(%ebx)
+ popl %esi
+
+ popl %ebx
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(xsize_more_than_two_limbs):
+
+C The first limb of yp is processed with a simple mpn_mul_1 style loop
+C inline. Unrolling this doesn't seem worthwhile since it's only run once
+C (whereas the addmul below is run ysize-1 many times). A call to the
+C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
+C popping, and doesn't seem likely to be worthwhile on the typical 10-20
+C limb operations the Karatsuba code calls here with.
+
+ C eax yp[0]
+ C ebx
+ C ecx xsize
+ C edx xp
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',0)
+
+ pushl %edi defframe_pushl(SAVE_EDI)
+ pushl %ebp defframe_pushl(SAVE_EBP)
+
+ movl PARAM_WP, %edi
+ pushl %esi defframe_pushl(SAVE_ESI)
+
+ movl %eax, %ebp
+ pushl %ebx defframe_pushl(SAVE_EBX)
+
+ leal (%edx,%ecx,4), %ebx C xp end
+ xorl %esi, %esi
+
+ leal (%edi,%ecx,4), %edi C wp end of mul1
+ negl %ecx
+
+
+L(mul1):
+ C eax scratch
+ C ebx xp end
+ C ecx counter, negative
+ C edx scratch
+ C esi carry
+ C edi wp end of mul1
+ C ebp multiplier
+
+ movl (%ebx,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi,%ecx,4)
+ incl %ecx
+
+ jnz L(mul1)
+
+
+ movl PARAM_YSIZE, %edx
+ movl %esi, (%edi) C final carry
+
+ movl PARAM_XSIZE, %ecx
+ decl %edx
+
+ jnz L(ysize_more_than_one_limb)
+
+ popl %ebx
+ popl %esi
+ popl %ebp
+ popl %edi
+ ret
+
+
+L(ysize_more_than_one_limb):
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_YP, %eax
+
+ jae L(unroll)
+
+
+C -----------------------------------------------------------------------------
+C Simple addmul loop.
+C
+C Using ebx and edi pointing at the ends of their respective locations saves
+C a couple of instructions in the outer loop. The inner loop is still 11
+C cycles, the same as the simple loop in aorsmul_1.asm.
+
+ C eax yp
+ C ebx xp end
+ C ecx xsize
+ C edx ysize-1
+ C esi
+ C edi wp end of mul1
+ C ebp
+
+ movl 4(%eax), %ebp C multiplier
+ negl %ecx
+
+ movl %ecx, PARAM_XSIZE C -xsize
+ xorl %esi, %esi C initial carry
+
+ leal 4(%eax,%edx,4), %eax C yp end
+ negl %edx
+
+ movl %eax, PARAM_YP
+ movl %edx, PARAM_YSIZE
+
+ jmp L(simple_outer_entry)
+
+
+ C aligning here saves a couple of cycles
+ ALIGN(16)
+L(simple_outer_top):
+ C edx ysize counter, negative
+
+ movl PARAM_YP, %eax C yp end
+ xorl %esi, %esi C carry
+
+ movl PARAM_XSIZE, %ecx C -xsize
+ movl %edx, PARAM_YSIZE
+
+ movl (%eax,%edx,4), %ebp C yp limb multiplier
+L(simple_outer_entry):
+ addl $4, %edi
+
+
+L(simple_inner):
+ C eax scratch
+ C ebx xp end
+ C ecx counter, negative
+ C edx scratch
+ C esi carry
+ C edi wp end of this addmul
+ C ebp multiplier
+
+ movl (%ebx,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl $0, %edx
+ addl %eax, (%edi,%ecx,4)
+ adcl %edx, %esi
+
+ incl %ecx
+ jnz L(simple_inner)
+
+
+ movl PARAM_YSIZE, %edx
+ movl %esi, (%edi)
+
+ incl %edx
+ jnz L(simple_outer_top)
+
+
+ popl %ebx
+ popl %esi
+ popl %ebp
+ popl %edi
+ ret
+
+
+C -----------------------------------------------------------------------------
+C Unrolled loop.
+C
+C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
+C some comments.
+C
+C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
+C 0, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
+C is entered.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop. This can't just be fetched through the xp
+C pointer because of the offset applied to it.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
+C to give the starting point in the destination for each unrolled loop (this
+C point is one limb upwards for each limb of yp processed).
+C
+C Having PARAM_YSIZE count negative to zero means it's not necessary to
+C store new values of PARAM_YP and PARAM_WP on each loop. Those values on
+C the stack remain constant and on each loop an leal adjusts them with the
+C PARAM_YSIZE counter value.
+
+
+defframe(VAR_COUNTER, -20)
+defframe(VAR_COUNTER_INIT, -24)
+defframe(VAR_JMP, -28)
+defframe(VAR_XP_LOW, -32)
+deflit(VAR_STACK_SPACE, 16)
+
+dnl For some strange reason using (%esp) instead of 0(%esp) is a touch
+dnl slower in this code, hence the defframe empty-if-zero feature is
+dnl disabled.
+dnl
+dnl If VAR_COUNTER is at (%esp), the effect is worse. In this case the
+dnl unrolled loop is 255 instead of 256 bytes, but quite how this affects
+dnl anything isn't clear.
+dnl
+define(`defframe_empty_if_zero_disabled',1)
+
+L(unroll):
+ C eax yp (not used)
+ C ebx xp end (not used)
+ C ecx xsize
+ C edx ysize-1
+ C esi
+ C edi wp end of mul1 (not used)
+ C ebp
+deflit(`FRAME', 16)
+
+ leal -2(%ecx), %ebp C one limb processed at start,
+ decl %ecx C and ebp is one less
+
+ shrl $UNROLL_LOG2, %ebp
+ negl %ecx
+
+ subl $VAR_STACK_SPACE, %esp
+deflit(`FRAME', 16+VAR_STACK_SPACE)
+ andl $UNROLL_MASK, %ecx
+
+ movl %ecx, %esi
+ shll $4, %ecx
+
+ movl %ebp, VAR_COUNTER_INIT
+ negl %esi
+
+ C 15 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(unroll_here):
+',`
+ leal L(unroll_entry) (%ecx,%esi,1), %ecx
+')
+
+ movl PARAM_XP, %ebx
+ movl %ebp, VAR_COUNTER
+
+ movl PARAM_WP, %edi
+ movl %ecx, VAR_JMP
+
+ movl (%ebx), %eax
+ leal 4(%edi,%esi,4), %edi C wp adjust for unrolling and mul1
+
+ leal (%ebx,%esi,4), %ebx C xp adjust for unrolling
+
+ movl %eax, VAR_XP_LOW
+
+ movl %ebx, PARAM_XP
+ movl PARAM_YP, %ebx
+
+ leal (%edi,%edx,4), %ecx C wp adjust for ysize indexing
+ movl 4(%ebx), %ebp C multiplier (yp second limb)
+
+ leal 4(%ebx,%edx,4), %ebx C yp adjust for ysize indexing
+
+ movl %ecx, PARAM_WP
+
+ leal 1(%esi), %ecx C adjust parity for decl %ecx above
+
+ movl %ebx, PARAM_YP
+ negl %edx
+
+ movl %edx, PARAM_YSIZE
+ jmp L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%ecx,%esi,1), %ecx
+ addl $L(unroll_entry)-L(unroll_here), %ecx
+ addl (%esp), %ecx
+ ret
+')
+
+
+C -----------------------------------------------------------------------------
+ C Aligning here saves a couple of cycles per loop. Using 32 doesn't
+ C cost any extra space, since the inner unrolled loop below is
+ C aligned to 32.
+ ALIGN(32)
+L(unroll_outer_top):
+ C edx ysize
+
+ movl PARAM_YP, %eax
+ movl %edx, PARAM_YSIZE C incremented ysize counter
+
+ movl PARAM_WP, %edi
+
+ movl VAR_COUNTER_INIT, %ebx
+ movl (%eax,%edx,4), %ebp C next multiplier
+
+ movl PARAM_XSIZE, %ecx
+ leal (%edi,%edx,4), %edi C adjust wp for where we are in yp
+
+ movl VAR_XP_LOW, %eax
+ movl %ebx, VAR_COUNTER
+
+L(unroll_outer_entry):
+ mull %ebp
+
+ C using testb is a tiny bit faster than testl
+ testb $1, %cl
+
+ movl %eax, %ecx C low carry
+ movl VAR_JMP, %eax
+
+ movl %edx, %esi C high carry
+ movl PARAM_XP, %ebx
+
+ jnz L(unroll_noswap)
+ movl %ecx, %esi C high,low carry other way around
+
+ movl %edx, %ecx
+L(unroll_noswap):
+
+ jmp *%eax
+
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(unroll_top):
+ C eax scratch
+ C ebx xp
+ C ecx carry low
+ C edx scratch
+ C esi carry high
+ C edi wp
+ C ebp multiplier
+ C VAR_COUNTER loop counter
+ C
+ C 15 code bytes each limb
+
+ leal UNROLL_BYTES(%edi), %edi
+
+L(unroll_entry):
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4))
+ deflit(`disp1', eval(disp0 + 4))
+ deflit(`disp2', eval(disp1 + 4))
+
+ movl disp1(%ebx), %eax
+ mull %ebp
+Zdisp( addl, %ecx, disp0,(%edi))
+ adcl %eax, %esi
+ movl %edx, %ecx
+ jadcl0( %ecx)
+
+ movl disp2(%ebx), %eax
+ mull %ebp
+ addl %esi, disp1(%edi)
+ adcl %eax, %ecx
+ movl %edx, %esi
+ jadcl0( %esi)
+')
+
+ decl VAR_COUNTER
+ leal UNROLL_BYTES(%ebx), %ebx
+
+ jns L(unroll_top)
+
+
+ movl PARAM_YSIZE, %edx
+ addl %ecx, UNROLL_BYTES(%edi)
+
+ adcl $0, %esi
+
+ incl %edx
+ movl %esi, UNROLL_BYTES+4(%edi)
+
+ jnz L(unroll_outer_top)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+
+ addl $FRAME, %esp
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k6/sqr_basecase.asm b/rts/gmp/mpn/x86/k6/sqr_basecase.asm
new file mode 100644
index 0000000000..70d49b3e57
--- /dev/null
+++ b/rts/gmp/mpn/x86/k6/sqr_basecase.asm
@@ -0,0 +1,672 @@
+dnl AMD K6 mpn_sqr_basecase -- square an mpn number.
+dnl
+dnl K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
+dnl product (measured on the speed difference between 17 and 33 limbs,
+dnl which is roughly the Karatsuba recursing range).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl KARATSUBA_SQR_THRESHOLD_MAX is the maximum KARATSUBA_SQR_THRESHOLD this
+dnl code supports. This value is used only by the tune program to know
+dnl what it can go up to. (An attempt to compile with a bigger value will
+dnl trigger some m4_assert()s in the code, making the build fail.)
+dnl
+dnl The value is determined by requiring the displacements in the unrolled
+dnl addmul to fit in single bytes. This means a maximum UNROLL_COUNT of
+dnl 63, giving a maximum KARATSUBA_SQR_THRESHOLD of 66.
+
+deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66)
+
+
+dnl Allow a value from the tune program to override config.m4.
+
+ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
+`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
+
+
+dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul. The
+dnl number required is determined by KARATSUBA_SQR_THRESHOLD, since
+dnl mpn_sqr_basecase only needs to handle sizes < KARATSUBA_SQR_THRESHOLD.
+dnl
+dnl The first addmul is the biggest, and this takes the second least
+dnl significant limb and multiplies it by the third least significant and
+dnl up. Hence for a maximum operand size of KARATSUBA_SQR_THRESHOLD-1
+dnl limbs, UNROLL_COUNT needs to be KARATSUBA_SQR_THRESHOLD-3.
+
+m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
+deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the given size
+C is small.
+C
+C The code size might look a bit excessive, but not all of it is executed
+C and so won't fill up the code cache. The 1x1, 2x2 and 3x3 special cases
+C clearly apply only to those sizes; mid sizes like 10x10 only need part of
+C the unrolled addmul; and big sizes like 35x35 that do need all of it will
+C at least be getting value for money, because 35x35 spends something like
+C 5780 cycles here.
+C
+C Different values of UNROLL_COUNT give slightly different speeds, between
+C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
+C This isn't a big difference, but it's presumably some alignment effect
+C which if understood could give a simple speedup.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %eax
+
+ cmpl $2, %ecx
+ je L(two_limbs)
+
+ movl PARAM_DST, %edx
+ ja L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+
+ movl (%eax), %eax
+ movl %edx, %ecx
+
+ mull %eax
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+
+ pushl %ebx
+ movl %eax, %ebx C src
+deflit(`FRAME',4)
+
+ movl (%ebx), %eax
+ movl PARAM_DST, %ecx
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx)
+ movl 4(%ebx), %eax
+
+ movl %edx, 4(%ecx)
+
+ mull %eax C src[1]^2
+
+ movl %eax, 8(%ecx)
+ movl (%ebx), %eax
+
+ movl %edx, 12(%ecx)
+ movl 4(%ebx), %edx
+
+ mull %edx C src[0]*src[1]
+
+ addl %eax, 4(%ecx)
+
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+
+ popl %ebx
+ addl %eax, 4(%ecx)
+
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(three_or_more):
+deflit(`FRAME',0)
+ cmpl $4, %ecx
+ jae L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+ C eax src
+ C ecx size
+ C edx dst
+
+ pushl %ebx
+ movl %eax, %ebx C src
+
+ movl (%ebx), %eax
+ movl %edx, %ecx C dst
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl 4(%ebx), %eax
+
+ movl %edx, 4(%ecx)
+ pushl %esi
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl 8(%ebx), %eax
+
+ movl %edx, 12(%ecx)
+ pushl %edi
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl (%ebx), %eax
+
+ movl %edx, 20(%ecx)
+ movl 4(%ebx), %edx
+
+ mull %edx C src[0] * src[1]
+
+ movl %eax, %esi
+ movl (%ebx), %eax
+
+ movl %edx, %edi
+ movl 8(%ebx), %edx
+
+ pushl %ebp
+ xorl %ebp, %ebp
+
+ mull %edx C src[0] * src[2]
+
+ addl %eax, %edi
+ movl 4(%ebx), %eax
+
+ adcl %edx, %ebp
+
+ movl 8(%ebx), %edx
+
+ mull %edx C src[1] * src[2]
+
+ addl %eax, %ebp
+
+ adcl $0, %edx
+
+
+ C eax will be dst[5]
+ C ebx
+ C ecx dst
+ C edx dst[4]
+ C esi dst[1]
+ C edi dst[2]
+ C ebp dst[3]
+
+ xorl %eax, %eax
+ addl %esi, %esi
+ adcl %edi, %edi
+ adcl %ebp, %ebp
+ adcl %edx, %edx
+ adcl $0, %eax
+
+ addl %esi, 4(%ecx)
+ adcl %edi, 8(%ecx)
+ adcl %ebp, 12(%ecx)
+
+ popl %ebp
+ popl %edi
+
+ adcl %edx, 16(%ecx)
+
+ popl %esi
+ popl %ebx
+
+ adcl %eax, 20(%ecx)
+ ASSERT(nc)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+defframe(VAR_COUNTER,-20)
+defframe(VAR_JMP, -24)
+deflit(STACK_SPACE, 24)
+
+ ALIGN(16)
+L(four_or_more):
+
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+ C esi
+ C edi
+ C ebp
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+C
+C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
+C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
+C a 5780 cycle operation, which is not surprising since the loop here is 8
+C c/l and mpn_mul_1 is 6.25 c/l.
+
+ subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
+
+ movl %edi, SAVE_EDI
+ leal 4(%edx), %edi
+
+ movl %ebx, SAVE_EBX
+ leal 4(%eax), %ebx
+
+ movl %esi, SAVE_ESI
+ xorl %esi, %esi
+
+ movl %ebp, SAVE_EBP
+
+ C eax
+ C ebx src+4
+ C ecx size
+ C edx
+ C esi
+ C edi dst+4
+ C ebp
+
+ movl (%eax), %ebp C multiplier
+ leal -1(%ecx), %ecx C size-1, and pad to a 16 byte boundary
+
+
+ ALIGN(16)
+L(mul_1):
+ C eax scratch
+ C ebx src ptr
+ C ecx counter
+ C edx scratch
+ C esi carry
+ C edi dst ptr
+ C ebp multiplier
+
+ movl (%ebx), %eax
+ addl $4, %ebx
+
+ mull %ebp
+
+ addl %esi, %eax
+ movl $0, %esi
+
+ adcl %edx, %esi
+
+ movl %eax, (%edi)
+ addl $4, %edi
+
+ loop L(mul_1)
+
+
+C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two addmuls, which are the bottom right corner of the product
+C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as mpn_addmul_1(), see that routine for some
+C comments.
+C
+C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+C
+C K6 doesn't do any branch prediction on indirect jumps, which is good
+C actually because it's a different target each time. The unrolled addmul
+C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
+C the indirect jump is quickly recovered.
+
+
+dnl This value is also implicitly encoded in a shift and add.
+dnl
+deflit(CODE_BYTES_PER_LIMB, 15)
+
+dnl With the unmodified &src[size] and &dst[size] pointers, the
+dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
+dnl values up to 31. Above that an offset must be added to them.
+dnl
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>31),1,
+eval((UNROLL_COUNT-31)*4),
+0))
+
+ C eax
+ C ebx &src[size]
+ C ecx
+ C edx
+ C esi carry
+ C edi &dst[size]
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+ movl %esi, (%edi)
+
+ subl $4, %ecx
+ jz L(corner)
+
+ movl %ecx, %edx
+ifelse(OFFSET,0,,
+` subl $OFFSET, %ebx')
+
+ shll $4, %ecx
+ifelse(OFFSET,0,,
+` subl $OFFSET, %edi')
+
+ negl %ecx
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+ negl %edx
+
+
+ C The calculated jump mustn't be before the start of the available
+ C code. This is the limitation UNROLL_COUNT puts on the src operand
+ C size, but checked here using the jump address directly.
+ C
+ ASSERT(ae,`
+ movl_text_address( L(unroll_inner_start), %eax)
+ cmpl %eax, %ecx
+ ')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll_outer_top):
+ C eax
+ C ebx &src[size], constant
+ C ecx VAR_JMP
+ C edx VAR_COUNTER, limbs, negative
+ C esi high limb to store
+ C edi dst ptr, high of last addmul
+ C ebp
+
+ movl -12+OFFSET(%ebx,%edx,4), %ebp C multiplier
+ movl %edx, VAR_COUNTER
+
+ movl -8+OFFSET(%ebx,%edx,4), %eax C first limb of multiplicand
+
+ mull %ebp
+
+ testb $1, %cl
+
+ movl %edx, %esi C high carry
+ movl %ecx, %edx C jump
+
+ movl %eax, %ecx C low carry
+ leal CODE_BYTES_PER_LIMB(%edx), %edx
+
+ movl %edx, VAR_JMP
+ leal 4(%edi), %edi
+
+ C A branch-free version of this using some xors was found to be a
+ C touch slower than just a conditional jump, despite the jump
+ C switching between taken and not taken on every loop.
+
+ifelse(eval(UNROLL_COUNT%2),0,
+ jz,jnz) L(unroll_noswap)
+ movl %esi, %eax C high,low carry other way around
+
+ movl %ecx, %esi
+ movl %eax, %ecx
+L(unroll_noswap):
+
+ jmp *%edx
+
+
+ C Must be on an even address here so the low bit of the jump address
+ C will indicate which way around ecx/esi should start.
+ C
+ C An attempt was made at padding here to get the end of the unrolled
+ C code to come out on a good alignment, to save padding before
+ C L(corner). This worked, but turned out to run slower than just an
+ C ALIGN(2). The reason for this is not clear, it might be related
+ C to the different speeds on different UNROLL_COUNTs noted above.
+
+ ALIGN(2)
+
+L(unroll_inner_start):
+ C eax scratch
+ C ebx src
+ C ecx carry low
+ C edx scratch
+ C esi carry high
+ C edi dst
+ C ebp multiplier
+ C
+ C 15 code bytes each limb
+ C ecx/esi swapped on each chunk
+
+forloop(`i', UNROLL_COUNT, 1, `
+ deflit(`disp_src', eval(-i*4 + OFFSET))
+ deflit(`disp_dst', eval(disp_src - 4))
+
+ m4_assert(`disp_src>=-128 && disp_src<128')
+ m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp( movl, disp_src,(%ebx), %eax)
+ mull %ebp
+Zdisp( addl, %esi, disp_dst,(%edi))
+ adcl %eax, %ecx
+ movl %edx, %esi
+ jadcl0( %esi)
+',`
+ dnl this one comes out last
+Zdisp( movl, disp_src,(%ebx), %eax)
+ mull %ebp
+Zdisp( addl, %ecx, disp_dst,(%edi))
+ adcl %eax, %esi
+ movl %edx, %ecx
+ jadcl0( %ecx)
+')
+')
+L(unroll_inner_end):
+
+ addl %esi, -4+OFFSET(%edi)
+
+ movl VAR_COUNTER, %edx
+ jadcl0( %ecx)
+
+ movl %ecx, m4_empty_if_zero(OFFSET)(%edi)
+ movl VAR_JMP, %ecx
+
+ incl %edx
+ jnz L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+ addl $OFFSET, %ebx
+ addl $OFFSET, %edi
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(corner):
+ C ebx &src[size]
+ C edi &dst[2*size-5]
+
+ movl -12(%ebx), %ebp
+
+ movl -8(%ebx), %eax
+ movl %eax, %ecx
+
+ mull %ebp
+
+ addl %eax, -4(%edi)
+ adcl $0, %edx
+
+ movl -4(%ebx), %eax
+ movl %edx, %esi
+ movl %eax, %ebx
+
+ mull %ebp
+
+ addl %esi, %eax
+ adcl $0, %edx
+
+ addl %eax, (%edi)
+ adcl $0, %edx
+
+ movl %edx, %esi
+ movl %ebx, %eax
+
+ mull %ecx
+
+ addl %esi, %eax
+ movl %eax, 4(%edi)
+
+ adcl $0, %edx
+
+ movl %edx, 8(%edi)
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
+C The loop measures about 6 cycles/iteration, though it looks like it should
+C decode in 5.
+
+L(lshift_start):
+ movl PARAM_SIZE, %ecx
+
+ movl PARAM_DST, %edi
+ subl $1, %ecx C size-1 and clear carry
+
+ movl PARAM_SRC, %ebx
+ movl %ecx, %edx
+
+ xorl %eax, %eax C ready for adcl
+
+
+ ALIGN(16)
+L(lshift):
+ C eax
+ C ebx src (for later use)
+ C ecx counter, decrementing
+ C edx size-1 (for later use)
+ C esi
+ C edi dst, incrementing
+ C ebp
+
+ rcll 4(%edi)
+ rcll 8(%edi)
+ leal 8(%edi), %edi
+ loop L(lshift)
+
+
+ adcl %eax, %eax
+
+ movl %eax, 4(%edi) C dst most significant limb
+ movl (%ebx), %eax C src[0]
+
+ leal 4(%ebx,%edx,4), %ebx C &src[size]
+ subl %edx, %ecx C -(size-1)
+
+
+C -----------------------------------------------------------------------------
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+
+ mull %eax
+
+ movl %eax, (%edi,%ecx,8) C dst[0]
+
+
+ ALIGN(16)
+L(diag):
+ C eax scratch
+ C ebx &src[size]
+ C ecx counter, negative
+ C edx carry
+ C esi scratch
+ C edi dst[2*size-2]
+ C ebp
+
+ movl (%ebx,%ecx,4), %eax
+ movl %edx, %esi
+
+ mull %eax
+
+ addl %esi, 4(%edi,%ecx,8)
+ adcl %eax, 8(%edi,%ecx,8)
+ adcl $0, %edx
+
+ incl %ecx
+ jnz L(diag)
+
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+
+ addl %edx, 4(%edi) C dst most significant limb
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ addl (%esp), %ecx
+ addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
+ addl %edx, %ecx
+ ret
+')
+
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/README b/rts/gmp/mpn/x86/k7/README
new file mode 100644
index 0000000000..c34315c401
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/README
@@ -0,0 +1,145 @@
+
+ AMD K7 MPN SUBROUTINES
+
+
+This directory contains code optimized for the AMD Athlon CPU.
+
+The mmx subdirectory has routines using MMX instructions. All Athlons have
+MMX, the separate directory is just so that configure can omit it if the
+assembler doesn't support MMX.
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache.
+
+ cycles/limb
+ mpn_add/sub_n 1.6
+
+ mpn_copyi 0.75 or 1.0 \ varying with data alignment
+ mpn_copyd 0.75 or 1.0 /
+
+ mpn_divrem_1 17.0 integer part, 15.0 fractional part
+ mpn_mod_1 17.0
+ mpn_divexact_by3 8.0
+
+ mpn_l/rshift 1.2
+
+ mpn_mul_1 3.4
+ mpn_addmul/submul_1 3.9
+
+ mpn_mul_basecase 4.42 cycles/crossproduct (approx)
+
+ mpn_popcount 5.0
+ mpn_hamdist 6.0
+
+Prefetching of sources hasn't yet been tried.
+
+
+
+NOTES
+
+cmov, MMX, 3DNow and some extensions to MMX and 3DNow are available.
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+
+Floating point multiplications can be done in parallel with integer
+multiplications, but there doesn't seem to be any way to make use of this.
+
+Unsigned "mul"s can be issued every 3 cycles. This suggests 3 is a limit on
+the speed of the multiplication routines. The documentation shows mul
+executing in IEU0 (or maybe in IEU0 and IEU1 together), so it might be that,
+to get near 3 cycles code has to be arranged so that nothing else is issued
+to IEU0. A busy IEU0 could explain why some code takes 4 cycles and other
+apparently equivalent code takes 5.
+
+
+
+OPTIMIZATIONS
+
+Unrolled loops are used to reduce looping overhead. The unrolling is
+configurable up to 32 limbs/loop for most routines and up to 64 for some.
+The K7 has 64k L1 code cache so quite big unrolling is allowable.
+
+Computed jumps into the unrolling are used to handle sizes not a multiple of
+the unrolling. An attractive feature of this is that times increase
+smoothly with operand size, but it may be that some routines should just
+have simple loops to finish up, especially when PIC adds between 2 and 16
+cycles to get %eip.
+
+Position independent code is implemented using a call to get %eip for the
+computed jumps and a ret is always done, rather than an addl $4,%esp or a
+popl, so the CPU return address branch prediction stack stays synchronised
+with the actual stack in memory.
+
+Branch prediction, in absence of any history, will guess forward jumps are
+not taken and backward jumps are taken. Where possible it's arranged that
+the less likely or less important case is under a taken forward jump.
+
+
+
+CODING
+
+Instructions in general code have been shown grouped if they can execute
+together, which means up to three direct-path instructions which have no
+successive dependencies. K7 always decodes three and has out-of-order
+execution, but the groupings show what slots might be available and what
+dependency chains exist.
+
+When there's vector-path instructions an effort is made to get triplets of
+direct-path instructions in between them, even if there's dependencies,
+since this maximizes decoding throughput and might save a cycle or two if
+decoding is the limiting factor.
+
+
+
+INSTRUCTIONS
+
+adcl direct
+divl 39 cycles back-to-back
+lodsl,etc vector
+loop 1 cycle vector (decl/jnz opens up one decode slot)
+movd reg vector
+movd mem direct
+mull issue every 3 cycles, latency 4 cycles low word, 6 cycles high word
+popl vector (use movl for more than one pop)
+pushl direct, will pair with a load
+shrdl %cl vector, 3 cycles, seems to be 3 decode too
+xorl r,r false read dependency recognised
+
+
+
+REFERENCES
+
+"AMD Athlon Processor X86 Code Optimization Guide", AMD publication number
+22007, revision E, November 1999. Available on-line,
+
+ http://www.amd.com/products/cpg/athlon/techdocs/pdf/22007.pdf
+
+"3DNow Technology Manual", AMD publication number 21928F/0-August 1999.
+This describes the femms and prefetch instructions. Available on-line,
+
+ http://www.amd.com/K6/k6docs/pdf/21928.pdf
+
+"AMD Extensions to the 3DNow and MMX Instruction Sets Manual", AMD
+publication number 22466, revision B, August 1999. This describes
+instructions added in the Athlon processor, such as pswapd and the extra
+prefetch forms. Available on-line,
+
+ http://www.amd.com/products/cpg/athlon/techdocs/pdf/22466.pdf
+
+"3DNow Instruction Porting Guide", AMD publication number 22621, revision B,
+August 1999. This has some notes on general Athlon optimizations as well as
+3DNow. Available on-line,
+
+ http://www.amd.com/products/cpg/athlon/techdocs/pdf/22621.pdf
+
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/mpn/x86/k7/aors_n.asm b/rts/gmp/mpn/x86/k7/aors_n.asm
new file mode 100644
index 0000000000..85fa9d3036
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/aors_n.asm
@@ -0,0 +1,250 @@
+dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
+dnl
+dnl K7: 1.64 cycles/limb (at 16 limb/loop).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 8 1.9
+dnl 16 1.64
+dnl 32 1.7
+dnl 64 2.0
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_add_n', `
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+ define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+ define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size. The return value is the carry bit from the top of the result (1
+C or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation. Note values other than 1 or 0 here will lead to garbage
+C results.
+C
+C This code runs at 1.64 cycles/limb, which is probably the best possible
+C with plain integer operations. Each limb is 2 loads and 1 store, and in
+C one cycle the K7 can do two loads, or a load and a store, leading to 1.5
+C c/l.
+
+dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 8)
+',`
+deflit(UNROLL_THRESHOLD, 8)
+')
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBP, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+defframe(SAVE_EDI, -16)
+deflit(STACK_SPACE, 16)
+
+ .text
+ ALIGN(32)
+deflit(`FRAME',0)
+
+PROLOGUE(M4_function_nc)
+ movl PARAM_CARRY, %eax
+ jmp LF(M4_function_n,start)
+EPILOGUE()
+
+PROLOGUE(M4_function_n)
+
+ xorl %eax, %eax C carry
+L(start):
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %edi, SAVE_EDI
+ movl %ebx, SAVE_EBX
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_SRC2, %edx
+ movl PARAM_SRC1, %ebx
+ jae L(unroll)
+
+ movl PARAM_DST, %edi
+ leal (%ebx,%ecx,4), %ebx
+ leal (%edx,%ecx,4), %edx
+
+ leal (%edi,%ecx,4), %edi
+ negl %ecx
+ shrl %eax
+
+ C This loop in in a single 16 byte code block already, so no
+ C alignment necessary.
+L(simple):
+ C eax scratch
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi
+ C edi dst
+ C ebp
+
+ movl (%ebx,%ecx,4), %eax
+ M4_inst (%edx,%ecx,4), %eax
+ movl %eax, (%edi,%ecx,4)
+ incl %ecx
+ jnz L(simple)
+
+ movl $0, %eax
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBX, %ebx
+ setc %al
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ C This is at 0x55, close enough to aligned.
+L(unroll):
+deflit(`FRAME',STACK_SPACE)
+ movl %ebp, SAVE_EBP
+ andl $-2, %ecx C size low bit masked out
+ andl $1, PARAM_SIZE C size low bit kept
+
+ movl %ecx, %edi
+ decl %ecx
+ movl PARAM_DST, %ebp
+
+ shrl $UNROLL_LOG2, %ecx
+ negl %edi
+ movl %esi, SAVE_ESI
+
+ andl $UNROLL_MASK, %edi
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%edi,%edi,8), %esi C 9 bytes per
+')
+ negl %edi
+ shrl %eax
+
+ leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
+ leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
+ leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
+
+ jmp *%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%edi,%edi,8), %esi
+ addl $L(entry)-L(here), %esi
+ addl (%esp), %esi
+ ret
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(top):
+ C eax zero
+ C ebx src1
+ C ecx counter
+ C edx src2
+ C esi scratch (was computed jump)
+ C edi dst
+ C ebp scratch
+
+ leal UNROLL_BYTES(%edx), %edx
+
+L(entry):
+deflit(CHUNK_COUNT, 2)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%ebx), %esi)
+ movl disp1(%ebx), %ebp
+Zdisp( M4_inst,disp0,(%edx), %esi)
+Zdisp( movl, %esi, disp0,(%edi))
+ M4_inst disp1(%edx), %ebp
+ movl %ebp, disp1(%edi)
+')
+
+ decl %ecx
+ leal UNROLL_BYTES(%ebx), %ebx
+ leal UNROLL_BYTES(%edi), %edi
+ jns L(top)
+
+
+ mov PARAM_SIZE, %esi
+ movl SAVE_EBP, %ebp
+ movl $0, %eax
+
+ decl %esi
+ js L(even)
+
+ movl (%ebx), %ecx
+ M4_inst UNROLL_BYTES(%edx), %ecx
+ movl %ecx, (%edi)
+L(even):
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+ setc %al
+
+ movl SAVE_ESI, %esi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/aorsmul_1.asm b/rts/gmp/mpn/x86/k7/aorsmul_1.asm
new file mode 100644
index 0000000000..9f9c3daaf4
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/aorsmul_1.asm
@@ -0,0 +1,364 @@
+dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+dnl
+dnl K7: 3.9 cycles/limb.
+dnl
+dnl Future: It should be possible to avoid the separate mul after the
+dnl unrolled loop by moving the movl/adcl to the top.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 4 4.42
+dnl 8 4.16
+dnl 16 3.9
+dnl 32 3.9
+dnl 64 3.87
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1',`
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+ define(M4_function_1c, mpn_addmul_1c)
+ define(M4_description, add it to)
+ define(M4_desc_retval, carry)
+',`ifdef(`OPERATION_submul_1',`
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+ define(M4_function_1c, mpn_submul_1c)
+ define(M4_description, subtract it from)
+ define(M4_desc_retval, borrow)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C Calculate src,size multiplied by mult and M4_description dst,size.
+C Return the M4_desc_retval limb from the top of the result.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 9)
+',`
+deflit(UNROLL_THRESHOLD, 6)
+')
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(SAVE_SIZE, 16)
+
+ .text
+ ALIGN(32)
+PROLOGUE(M4_function_1)
+ movl PARAM_SIZE, %edx
+ movl PARAM_SRC, %eax
+ xorl %ecx, %ecx
+
+ decl %edx
+ jnz LF(M4_function_1c,start_1)
+
+ movl (%eax), %eax
+ movl PARAM_DST, %ecx
+
+ mull PARAM_MULTIPLIER
+
+ M4_inst %eax, (%ecx)
+ adcl $0, %edx
+ movl %edx, %eax
+
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(M4_function_1c)
+ movl PARAM_SIZE, %edx
+ movl PARAM_SRC, %eax
+
+ decl %edx
+ jnz L(more_than_one_limb)
+
+ movl (%eax), %eax
+ movl PARAM_DST, %ecx
+
+ mull PARAM_MULTIPLIER
+
+ addl PARAM_CARRY, %eax
+
+ adcl $0, %edx
+ M4_inst %eax, (%ecx)
+
+ adcl $0, %edx
+ movl %edx, %eax
+
+ ret
+
+
+ C offset 0x44 so close enough to aligned
+L(more_than_one_limb):
+ movl PARAM_CARRY, %ecx
+L(start_1):
+ C eax src
+ C ecx initial carry
+ C edx size-1
+ subl $SAVE_SIZE, %esp
+deflit(`FRAME',16)
+
+ movl %ebx, SAVE_EBX
+ movl %esi, SAVE_ESI
+ movl %edx, %ebx C size-1
+
+ movl PARAM_SRC, %esi
+ movl %ebp, SAVE_EBP
+ cmpl $UNROLL_THRESHOLD, %edx
+
+ movl PARAM_MULTIPLIER, %ebp
+ movl %edi, SAVE_EDI
+
+ movl (%esi), %eax C src low limb
+ movl PARAM_DST, %edi
+ ja L(unroll)
+
+
+ C simple loop
+
+ leal 4(%esi,%ebx,4), %esi C point one limb past last
+ leal (%edi,%ebx,4), %edi C point at last limb
+ negl %ebx
+
+ C The movl to load the next source limb is done well ahead of the
+ C mul. This is necessary for full speed, and leads to one limb
+ C handled separately at the end.
+
+L(simple):
+ C eax src limb
+ C ebx loop counter
+ C ecx carry limb
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+ mull %ebp
+
+ addl %eax, %ecx
+ adcl $0, %edx
+
+ M4_inst %ecx, (%edi,%ebx,4)
+ movl (%esi,%ebx,4), %eax
+ adcl $0, %edx
+
+ incl %ebx
+ movl %edx, %ecx
+ jnz L(simple)
+
+
+ mull %ebp
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+
+ addl %eax, %ecx
+ adcl $0, %edx
+
+ M4_inst %ecx, (%edi)
+ adcl $0, %edx
+ movl SAVE_EDI, %edi
+
+ addl $SAVE_SIZE, %esp
+ movl %edx, %eax
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax src low limb
+ C ebx size-1
+ C ecx carry
+ C edx size-1
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+dnl overlapping with parameters no longer needed
+define(VAR_COUNTER,`PARAM_SIZE')
+define(VAR_JUMP, `PARAM_MULTIPLIER')
+
+ subl $2, %ebx C (size-2)-1
+ decl %edx C size-2
+
+ shrl $UNROLL_LOG2, %ebx
+ negl %edx
+
+ movl %ebx, VAR_COUNTER
+ andl $UNROLL_MASK, %edx
+
+ movl %edx, %ebx
+ shll $4, %edx
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%edx,%ebx,1), %edx
+')
+ negl %ebx
+ movl %edx, VAR_JUMP
+
+ mull %ebp
+
+ addl %eax, %ecx C initial carry, becomes low carry
+ adcl $0, %edx
+ testb $1, %bl
+
+ movl 4(%esi), %eax C src second limb
+ leal ifelse(UNROLL_BYTES,256,128+) 8(%esi,%ebx,4), %esi
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebx,4), %edi
+
+ movl %edx, %ebx C high carry
+ cmovnz( %ecx, %ebx) C high,low carry other way around
+ cmovnz( %edx, %ecx)
+
+ jmp *VAR_JUMP
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%edx,%ebx,1), %edx
+ addl $L(entry)-L(here), %edx
+ addl (%esp), %edx
+ ret
+')
+
+
+C -----------------------------------------------------------------------------
+C This code uses a "two carry limbs" scheme. At the top of the loop the
+C carries are ebx=lo, ecx=hi, then they swap for each limb processed. For
+C the computed jump an odd size means they start one way around, an even
+C size the other. Either way one limb is handled separately at the start of
+C the loop.
+C
+C The positioning of the movl to load the next source limb is important.
+C Moving it after the adcl with a view to avoiding a separate mul at the end
+C of the loop slows the code down.
+
+ ALIGN(32)
+L(top):
+ C eax src limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src+8
+ C edi dst
+ C ebp multiplier
+ C
+ C VAR_COUNTER loop counter
+ C
+ C 17 bytes each limb
+
+L(entry):
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+ mull %ebp
+
+Zdisp( M4_inst,%ecx, disp0,(%edi))
+ movl $0, %ecx
+
+ adcl %eax, %ebx
+
+Zdisp( movl, disp0,(%esi), %eax)
+ adcl %edx, %ecx
+
+
+ mull %ebp
+
+ M4_inst %ebx, disp1(%edi)
+ movl $0, %ebx
+
+ adcl %eax, %ecx
+
+ movl disp1(%esi), %eax
+ adcl %edx, %ebx
+')
+
+ decl VAR_COUNTER
+ leal UNROLL_BYTES(%esi), %esi
+ leal UNROLL_BYTES(%edi), %edi
+
+ jns L(top)
+
+
+ C eax src limb
+ C ebx carry high
+ C ecx carry low
+ C edx
+ C esi
+ C edi dst (points at second last limb)
+ C ebp multiplier
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 4))
+
+ mull %ebp
+
+ M4_inst %ecx, disp0(%edi)
+ movl SAVE_EBP, %ebp
+
+ adcl %ebx, %eax
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+
+ adcl $0, %edx
+ M4_inst %eax, disp1(%edi)
+ movl SAVE_EDI, %edi
+
+ adcl $0, %edx
+ addl $SAVE_SIZE, %esp
+
+ movl %edx, %eax
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/diveby3.asm b/rts/gmp/mpn/x86/k7/diveby3.asm
new file mode 100644
index 0000000000..57684958a5
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/diveby3.asm
@@ -0,0 +1,131 @@
+dnl AMD K7 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
+dnl
+dnl K7: 8.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t carry);
+C
+C Divide src,size exactly by 3, store the quotient at dst,size, and return
+C the carry-out (borrow) limb.  Each quotient limb is src_limb * INVERSE_3
+C mod 2^32; the compares against ONE_THIRD_CEIL and TWO_THIRDS_FLOOR below
+C recover the borrow to propagate into the next limb.
+C NOTE(review): this is the usual multiply-by-inverse exact-division-by-3
+C scheme -- confirm against mpn/generic/diveby3.c.
+
+defframe(PARAM_CARRY,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl multiplicative inverse of 3, modulo 2^32
+deflit(INVERSE_3, 0xAAAAAAAB)
+
+dnl ceil(b/3) and floor(b*2/3) where b=2^32
+deflit(ONE_THIRD_CEIL, 0x55555556)
+deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_divexact_by3c)
+deflit(`FRAME',0)
+
+ movl PARAM_SRC, %ecx
+ pushl %ebx defframe_pushl(SAVE_EBX)
+
+ movl PARAM_CARRY, %ebx
+ pushl %ebp defframe_pushl(SAVE_EBP)
+
+ movl PARAM_SIZE, %ebp
+ pushl %edi defframe_pushl(SAVE_EDI)
+
+ movl (%ecx), %eax C src low limb
+ pushl %esi defframe_pushl(SAVE_ESI)
+
+ movl PARAM_DST, %edi
+ movl $TWO_THIRDS_FLOOR, %esi
+ leal -4(%ecx,%ebp,4), %ecx C &src[size-1]
+
+ subl %ebx, %eax C apply carry-in to the low limb
+
+ setc %bl C borrow from the carry-in subtraction
+ decl %ebp
+ jz L(last)
+
+ leal (%edi,%ebp,4), %edi C &dst[size-1]
+ negl %ebp
+
+
+ ALIGN(16)
+L(top):
+ C eax src limb, carry subtracted
+ C ebx carry limb (0 or 1)
+ C ecx &src[size-1]
+ C edx scratch
+ C esi TWO_THIRDS_FLOOR
+ C edi &dst[size-1]
+ C ebp counter, limbs, negative
+
+ imull $INVERSE_3, %eax, %edx
+
+ movl 4(%ecx,%ebp,4), %eax C next src limb
+ cmpl $ONE_THIRD_CEIL, %edx
+
+ sbbl $-1, %ebx C +1 if result>=ceil(b/3)
+ cmpl %edx, %esi
+
+ sbbl %ebx, %eax C and further 1 if result>=ceil(b*2/3)
+ movl %edx, (%edi,%ebp,4)
+ incl %ebp
+
+ setc %bl C new carry
+ jnz L(top)
+
+
+
+L(last):
+ C eax src limb, carry subtracted
+ C ebx carry limb (0 or 1)
+ C ecx &src[size-1]
+ C edx scratch
+ C esi multiplier
+ C edi &dst[size-1]
+ C ebp
+
+ imull $INVERSE_3, %eax
+
+ cmpl $ONE_THIRD_CEIL, %eax
+ movl %eax, (%edi)
+ movl SAVE_EBP, %ebp
+
+ sbbl $-1, %ebx C +1 if eax>=ceil(b/3)
+ cmpl %eax, %esi
+ movl $0, %eax
+
+ adcl %ebx, %eax C further +1 if eax>=ceil(b*2/3)
+ movl SAVE_EDI, %edi
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/gmp-mparam.h b/rts/gmp/mpn/x86/k7/gmp-mparam.h
new file mode 100644
index 0000000000..c3bba0afc4
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/gmp-mparam.h
@@ -0,0 +1,100 @@
+/* AMD K7 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+/* the low limb is ready after 4 cycles, but normally it's the high limb
+ which is of interest, and that comes out after 6 cycles */
+#ifndef UMUL_TIME
+#define UMUL_TIME 6 /* cycles */
+#endif
+
+/* AMD doco says 40, but it measures 39 back-to-back */
+#ifndef UDIV_TIME
+#define UDIV_TIME 39 /* cycles */
+#endif
+
+/* using bsf */
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 7 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-06. */
+/* Algorithm crossover points measured on K7.  Presumably operand sizes in
+ limbs, as consumed by the generic mpn routines -- confirm against
+ gmp-impl.h.  The #ifndef guards let a build override each value. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 26
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 177
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 52
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 173
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 76
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 114
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 34
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 5
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 54
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 720, 1440, 2944, 7680, 18432, 57344, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 736
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 6912
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 784, 1696, 3200, 7680, 18432, 57344, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 800
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 8448
+#endif
diff --git a/rts/gmp/mpn/x86/k7/mmx/copyd.asm b/rts/gmp/mpn/x86/k7/mmx/copyd.asm
new file mode 100644
index 0000000000..33214daa1f
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/copyd.asm
@@ -0,0 +1,136 @@
+dnl AMD K7 mpn_copyd -- copy limb vector, decrementing.
+dnl
+dnl alignment dst/src, A=0mod8 N=4mod8
+dnl A/A A/N N/A N/N
+dnl K7 0.75 1.0 1.0 0.75
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy the size limbs at src to dst, processing from the highest limb
+C downwards.  NOTE(review): a decrementing copy is conventionally safe for
+C overlap with dst > src, but no overlap guarantee is stated here --
+C confirm what callers expect.
+C
+C The various comments in mpn/x86/k7/copyi.asm apply here too.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl parameter space reused
+define(SAVE_EBX,`PARAM_SIZE')
+define(SAVE_ESI,`PARAM_SRC')
+
+dnl minimum 5 since the unrolled code can't handle less than 5
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_copyd)
+
+ movl PARAM_SIZE, %ecx
+ movl %ebx, SAVE_EBX
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ jae L(unroll)
+
+ orl %ecx, %ecx
+ jz L(simple_done)
+
+L(simple):
+ C eax src
+ C ebx scratch
+ C ecx counter
+ C edx dst
+ C
+ C this loop is 2 cycles/limb
+
+ movl -4(%eax,%ecx,4), %ebx
+ movl %ebx, -4(%edx,%ecx,4)
+ decl %ecx
+ jnz L(simple)
+
+L(simple_done):
+ movl SAVE_EBX, %ebx
+ ret
+
+
+L(unroll):
+ movl %esi, SAVE_ESI
+ leal (%eax,%ecx,4), %ebx
+ leal (%edx,%ecx,4), %esi
+
+ C bit 2 of (src_end AND dst_end) is set only when both end addresses
+ C are 4mod8 (limb pointers being 4-byte aligned), in which case one
+ C odd limb below brings the main loop to 8-byte alignment
+ andl %esi, %ebx
+ movl SAVE_ESI, %esi
+ subl $4, %ecx C size-4
+
+ testl $4, %ebx C testl to pad code closer to 16 bytes for L(top)
+ jz L(aligned)
+
+ C both src and dst unaligned, process one limb to align them
+ movl 12(%eax,%ecx,4), %ebx
+ movl %ebx, 12(%edx,%ecx,4)
+ decl %ecx
+L(aligned):
+
+
+ ALIGN(16)
+L(top):
+ C eax src
+ C ebx
+ C ecx counter, limbs
+ C edx dst
+
+ movq 8(%eax,%ecx,4), %mm0
+ movq (%eax,%ecx,4), %mm1
+ subl $4, %ecx
+ movq %mm0, 16+8(%edx,%ecx,4)
+ movq %mm1, 16(%edx,%ecx,4)
+ jns L(top)
+
+
+ C now %ecx is -4 to -1 representing respectively 0 to 3 limbs remaining
+
+ testb $2, %cl
+ jz L(finish_not_two)
+
+ movq 8(%eax,%ecx,4), %mm0
+ movq %mm0, 8(%edx,%ecx,4)
+L(finish_not_two):
+
+ testb $1, %cl
+ jz L(done)
+
+ movl (%eax), %ebx
+ movl %ebx, (%edx)
+
+L(done):
+ movl SAVE_EBX, %ebx
+ emms C leave MMX state clean for FPU code
+ ret
+
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/copyi.asm b/rts/gmp/mpn/x86/k7/mmx/copyi.asm
new file mode 100644
index 0000000000..b234a1628c
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/copyi.asm
@@ -0,0 +1,147 @@
+dnl AMD K7 mpn_copyi -- copy limb vector, incrementing.
+dnl
+dnl alignment dst/src, A=0mod8 N=4mod8
+dnl A/A A/N N/A N/N
+dnl K7 0.75 1.0 1.0 0.75
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size.
+C
+C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
+C 1.33 c/l.
+C
+C The K7 can do two loads, or two stores, or a load and a store, in one
+C cycle, so if those are 64-bit operations then 0.5 c/l should be possible,
+C however nothing under 0.7 c/l is known.
+C
+C If both source and destination are unaligned then one limb is processed at
+C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
+C used unaligned it would be 1.5 c/l.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl parameter space reused
+define(SAVE_EBX,`PARAM_SIZE')
+
+dnl minimum 5 since the unrolled code can't handle less than 5
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_copyi)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl %ebx, SAVE_EBX
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ jae L(unroll)
+
+ orl %ecx, %ecx
+ jz L(simple_done)
+
+L(simple):
+ C eax src, incrementing
+ C ebx scratch
+ C ecx counter
+ C edx dst, incrementing
+ C
+ C this loop is 2 cycles/limb
+
+ movl (%eax), %ebx
+ movl %ebx, (%edx)
+ decl %ecx
+ leal 4(%eax), %eax
+ leal 4(%edx), %edx
+ jnz L(simple)
+
+L(simple_done):
+ movl SAVE_EBX, %ebx
+ ret
+
+
+L(unroll):
+ movl %eax, %ebx
+ leal -12(%eax,%ecx,4), %eax C src end - 12
+ subl $3, %ecx C size-3
+
+ C bit 2 of (src AND dst) is set only when both are 4mod8 (limb
+ C pointers being 4-byte aligned), so one odd limb aligns the loop
+ andl %edx, %ebx
+ leal (%edx,%ecx,4), %edx C dst end - 12
+ negl %ecx
+
+ testl $4, %ebx C testl to pad code closer to 16 bytes for L(top)
+ jz L(aligned)
+
+ C both src and dst unaligned, process one limb to align them
+ movl (%eax,%ecx,4), %ebx
+ movl %ebx, (%edx,%ecx,4)
+ incl %ecx
+L(aligned):
+
+
+ ALIGN(16)
+L(top):
+ C eax src end - 12
+ C ebx
+ C ecx counter, negative, limbs
+ C edx dst end - 12
+
+ movq (%eax,%ecx,4), %mm0
+ movq 8(%eax,%ecx,4), %mm1
+ addl $4, %ecx
+ movq %mm0, -16(%edx,%ecx,4)
+ movq %mm1, -16+8(%edx,%ecx,4)
+ ja L(top) C jump no carry and not zero
+
+
+ C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
+ C (the branch senses below are inverted relative to copyd because this
+ C counter runs upwards)
+
+ testb $2, %cl
+ jnz L(finish_not_two)
+
+ movq (%eax,%ecx,4), %mm0
+ movq %mm0, (%edx,%ecx,4)
+L(finish_not_two):
+
+ testb $1, %cl
+ jnz L(done)
+
+ movl 8(%eax), %ebx
+ movl %ebx, 8(%edx)
+
+L(done):
+ movl SAVE_EBX, %ebx
+ emms C leave MMX state clean for FPU code
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm b/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm
new file mode 100644
index 0000000000..483ad6a9a1
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/divrem_1.asm
@@ -0,0 +1,718 @@
+dnl AMD K7 mpn_divrem_1 -- mpn by limb division.
+dnl
+dnl K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C
+C The method and nomenclature follow part 8 of "Division by Invariant
+C Integers using Multiplication" by Granlund and Montgomery, reference in
+C gmp.texi.
+C
+C The "and"s shown in the paper are done here with "cmov"s. "m" is written
+C for m', and "d" for d_norm, which won't cause any confusion since it's
+C only the normalized divisor that's of any use in the code. "b" is written
+C for 2^N, the size of a limb, N being 32 here.
+C
+C mpn_divrem_1 avoids one division if the src high limb is less than the
+C divisor. mpn_divrem_1c doesn't check for a zero carry, since in normal
+C circumstances that will be a very rare event.
+C
+C There's a small bias towards expecting xsize==0, by having code for
+C xsize==0 in a straight line and xsize!=0 under forward jumps.
+
+
+dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
+dnl
+dnl The inverse takes about 50 cycles to calculate, but after that the
+dnl multiply is 17 c/l versus division at 42 c/l.
+dnl
+dnl At 3 limbs the mul is a touch faster than div on the integer part, and
+dnl even more so on the fractional part.
+
+deflit(MUL_THRESHOLD, 3)
+
+
+defframe(PARAM_CARRY, 24)
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC, 12)
+defframe(PARAM_XSIZE, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC, -28)
+defframe(VAR_DST, -32)
+defframe(VAR_DST_STOP,-36)
+
+deflit(STACK_SPACE, 36)
+
+ .text
+ ALIGN(32)
+
+C mpn_divrem_1c: same as mpn_divrem_1 but with an explicit carry limb,
+C loaded into %edx here before falling into the common code at start_1c.
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+ jmp LF(mpn_divrem_1,start_1c) C tail-enter with carry in %edx
+
+EPILOGUE()
+
+
+C mpn_divrem_1: divide src,size by divisor, also developing xsize further
+C "fraction" quotient limbs (as if src had xsize zero limbs appended
+C below).  The integer quotient goes to dst[xsize..xsize+size-1], the
+C fraction limbs to dst[0..xsize-1], and the remainder is returned.
+C
+ C offset 0x31, close enough to aligned
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl $0, %edx C initial carry (if can't skip a div)
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ orl %ecx, %ecx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+
+ jz L(no_skip_div)
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+ cmpl %ebp, %eax C one less div if high<divisor
+ jnb L(no_skip_div)
+
+ movl $0, (%edi,%ecx,4) C dst high limb
+ decl %ecx C size-1
+ movl %eax, %edx C src high limb as initial carry
+L(no_skip_div):
+
+
+L(start_1c):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ leal (%ebx,%ecx), %eax C size+xsize
+ cmpl $MUL_THRESHOLD, %eax
+ jae L(mul_by_inverse)
+
+
+C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
+C It'd be possible to write them out without the looping, but no speedup
+C would be expected.
+C
+C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
+C integer part, but curiously not on the fractional part, where %ebp is a
+C (fixed) couple of cycles faster.
+
+ orl %ecx, %ecx
+ jz L(divide_no_integer)
+
+L(divide_integer):
+ C eax scratch (quotient)
+ C ebx xsize
+ C ecx counter
+ C edx scratch (remainder)
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl PARAM_DIVISOR
+
+ movl %eax, (%edi,%ecx,4)
+ decl %ecx
+ jnz L(divide_integer)
+
+
+L(divide_no_integer):
+ movl PARAM_DST, %edi
+ orl %ebx, %ebx
+ jnz L(divide_fraction)
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+ movl SAVE_EDI, %edi
+ movl %edx, %eax C remainder is the return value
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+L(divide_fraction):
+ C eax scratch (quotient)
+ C ebx counter
+ C ecx
+ C edx scratch (remainder)
+ C esi
+ C edi dst
+ C ebp divisor
+
+ movl $0, %eax C fraction limbs are zero below the remainder
+
+ divl %ebp
+
+ movl %eax, -4(%edi,%ebx,4)
+ decl %ebx
+ jnz L(divide_fraction)
+
+ jmp L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ bsrl %ebp, %eax C 31-l
+
+ leal 12(%edi), %ebx
+ leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
+
+ movl %edi, VAR_DST
+ movl %ebx, VAR_DST_STOP
+
+ movl %ecx, %ebx C size
+ movl $31, %ecx
+
+ movl %edx, %edi C carry
+ movl $-1, %edx
+
+ C
+
+ xorl %eax, %ecx C l
+ incl %eax C 32-l
+
+ shll %cl, %ebp C d normalized
+ movl %ecx, VAR_NORM
+
+ movd %eax, %mm7
+
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+ orl %ebx, %ebx C size
+ movl %eax, VAR_INVERSE
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ jz L(start_zero)
+ movl %eax, VAR_SRC
+ cmpl $1, %ebx
+
+ movl 8(%eax), %esi C src high limb
+ jz L(start_one)
+
+L(start_two_or_more):
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ cmpl $2, %ebx
+ je L(integer_two_left)
+ jmp L(integer_top)
+
+
+L(start_one):
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shll %cl, %esi C n10 = high << l
+ movl %eax, VAR_SRC
+ jmp L(integer_one_left)
+
+
+L(start_zero):
+ shll %cl, %edi C n2 = carry << l
+ movl $0, %esi C n10 = 0
+
+ C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then
+ C must have xsize!=0
+ jmp L(fraction_some)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
+C execution. The instruction scheduling is important, with various
+C apparently equivalent forms running 1 to 5 cycles slower.
+C
+C A lower bound for the time would seem to be 16 cycles, based on the
+C following successive dependencies.
+C
+C cycles
+C n2+n1 1
+C mul 6
+C q1+1 1
+C mul 6
+C sub 1
+C addback 1
+C ---
+C 16
+C
+C This chain is what the loop has already, but 16 cycles isn't achieved.
+C K7 has enough decode, and probably enough execute (depending maybe on what
+C a mul actually consumes), but nothing running under 17 has been found.
+C
+C In theory n2+n1 could be done in the sub and addback stages (by
+C calculating both n2 and n2+n1 there), but lack of registers makes this an
+C unlikely proposition.
+C
+C The jz in the loop keeps the q1+1 stage to 1 cycle. Handling an overflow
+C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
+C chain, and nothing better than 18 cycles has been found when using it.
+C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
+C be an extremely rare event.
+C
+C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
+C if some special data is coming out with this always, the q1_ff special
+C case actually runs at 15 c/l. 0x2FFF...FFFD divided by 3 is a good way to
+C induce the q1_ff case, for speed measurements or testing. Note that
+C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
+C
+C The instruction groupings and empty comments show the cycles for a naive
+C in-order view of the code (conveniently ignoring the load latency on
+C VAR_INVERSE). This shows some of where the time is going, but is nonsense
+C to the extent that out-of-order execution rearranges it. In this case
+C there's 19 cycles shown, but it executes at 17.
+
+ ALIGN(16)
+L(integer_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+ movl VAR_SRC, %ecx
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movq (%ecx), %mm0 C next limb and the one below it
+ subl $4, %ecx
+
+ movl %ecx, VAR_SRC
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff)
+ movl VAR_DST, %ecx
+
+ mull %ebx C (q1+1)*d
+
+ psrlq %mm7, %mm0
+
+ leal -4(%ecx), %ecx
+
+ C
+
+ subl %eax, %esi
+ movl VAR_DST_STOP, %eax
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+ cmpl %eax, %ecx
+
+ movl %ebx, (%ecx)
+ movl %ecx, VAR_DST
+ jne L(integer_top)
+
+
+L(integer_loop_done):
+
+
+C -----------------------------------------------------------------------------
+C
+C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
+C q1_ff special case. This makes the code a bit smaller and simpler, and
+C costs only 1 cycle (each).
+
+L(integer_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+ movl PARAM_SRC, %ecx
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd (%ecx), %mm0 C src low limb
+
+ movl VAR_DST_STOP, %ecx
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ subl %eax, %esi
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+
+ movl %ebx, -4(%ecx)
+
+
+C -----------------------------------------------------------------------------
+L(integer_one_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx dst
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+ movl VAR_DST_STOP, %ecx
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1)
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx C q1 if q1+1 overflowed
+
+ mull %ebx
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+
+ movl %ebx, -8(%ecx)
+ subl $8, %ecx
+
+
+
+L(integer_none):
+ cmpl $0, PARAM_XSIZE
+ jne L(fraction_some)
+
+ movl %edi, %eax C remainder, still shifted left by l
+L(fraction_done):
+ movl VAR_NORM, %ecx
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EDI, %edi
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EBX, %ebx
+ addl $STACK_SPACE, %esp
+
+ shrl %cl, %eax C undo the normalization shift
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx
+ C edx
+ C esi n10
+ C edi n2
+ C ebp divisor
+
+ movl VAR_DST, %ecx
+ movl VAR_DST_STOP, %edx
+ subl $4, %ecx
+
+ psrlq %mm7, %mm0
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+ movl %ecx, VAR_DST
+
+ movd %mm0, %esi C next n10
+
+ movl $-1, (%ecx)
+ cmpl %ecx, %edx
+ jne L(integer_top)
+
+ jmp L(integer_loop_done)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C Being the fractional part, the "source" limbs are all zero, meaning
+C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
+C
+C The loop runs at 15 cycles. The dependent chain is the same as the
+C general case above, but without the n2+n1 stage (due to n1==0), so 15
+C would seem to be the lower bound.
+C
+C A not entirely obvious simplification is that q1+1 never overflows a limb,
+C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
+C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
+C rnd() means rounding down to a multiple of d.
+C
+C m*n2 + b*n2 <= m*(d-1) + b*(d-1)
+C = m*d + b*d - m - b
+C = floor((b(b-d)-1)/d)*d + b*d - m - b
+C = rnd(b(b-d)-1) + b*d - m - b
+C = rnd(b(b-d)-1 + b*d) - m - b
+C = rnd(b*b-1) - m - b
+C <= (b-2)*b
+C
+C Unchanged from the general case is that the final quotient limb q can be
+C either q1 or q1+1, and the q1+1 case occurs often. This can be seen from
+C equation 8.4 of the paper which simplifies as follows when n1==0 and
+C n0==0.
+C
+C n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
+C
+C As before, the instruction groupings and empty comments show a naive
+C in-order view of the code, which is made a nonsense by out of order
+C execution. There's 17 cycles shown, but it executes at 15.
+C
+C Rotating the store q and remainder->n2 instructions up to the top of the
+C loop gets the run time down from 16 to 15.
+
+ ALIGN(16)
+L(fraction_some):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi
+ C edi carry
+ C ebp divisor
+
+ movl PARAM_DST, %esi
+ movl VAR_DST_STOP, %ecx
+ movl %edi, %eax
+
+ subl $8, %ecx
+
+ jmp L(fraction_entry)
+
+
+ ALIGN(16)
+L(fraction_top):
+ C eax n2 carry, then scratch
+ C ebx scratch (nadj, q1)
+ C ecx dst, decrementing
+ C edx scratch
+ C esi dst stop point
+ C edi (will be n2)
+ C ebp divisor
+
+ movl %ebx, (%ecx) C previous q
+ movl %eax, %edi C remainder->n2
+
+L(fraction_entry):
+ mull VAR_INVERSE C m*n2
+
+ movl %ebp, %eax C d
+ subl $4, %ecx C dst
+ leal 1(%edi), %ebx
+
+ C
+
+ C
+
+ C
+
+ C
+
+ addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
+
+ mull %ebx C (q1+1)*d
+
+ C
+
+ C
+
+ C
+
+ negl %eax C low of n - (q1+1)*d
+
+ C
+
+ sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
+ leal (%ebp,%eax), %edx
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+ sbbl $0, %ebx C q
+ cmpl %esi, %ecx
+
+ jne L(fraction_top)
+
+
+ movl %ebx, (%ecx)
+ jmp L(fraction_done)
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/lshift.asm b/rts/gmp/mpn/x86/k7/mmx/lshift.asm
new file mode 100644
index 0000000000..4d17c881ec
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/lshift.asm
@@ -0,0 +1,472 @@
+dnl AMD K7 mpn_lshift -- mpn left shift.
+dnl
+dnl K7: 1.21 cycles/limb (at 16 limbs/loop).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 4 1.51
+dnl 8 1.26
+dnl 16 1.21
+dnl 32 1.2
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size left by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the right. The bits shifted out at the left are
+C the return value.
+C
+C The comments in mpn_rshift apply here too.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 10)
+',`
+deflit(UNROLL_THRESHOLD, 10)
+')
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EDI, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+deflit(SAVE_SIZE, 12)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_SRC, %edx
+ subl $SAVE_SIZE, %esp
+deflit(`FRAME',SAVE_SIZE)
+
+ movl PARAM_SHIFT, %ecx
+ movl %edi, SAVE_EDI
+
+ movl PARAM_DST, %edi
+ decl %eax
+ jnz L(more_than_one_limb)
+
+ movl (%edx), %edx
+
+ shldl( %cl, %edx, %eax) C eax was decremented to zero
+
+ shll %cl, %edx
+
+ movl %edx, (%edi)
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(more_than_one_limb):
+ C eax size-1
+ C ebx
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ movd PARAM_SHIFT, %mm6
+ movd (%edx,%eax,4), %mm5 C src high limb
+ cmp $UNROLL_THRESHOLD-1, %eax
+
+ jae L(unroll)
+ negl %ecx
+ movd (%edx), %mm4 C src low limb
+
+ addl $32, %ecx
+
+ movd %ecx, %mm7
+
+L(simple_top):
+ C eax loop counter, limbs
+ C ebx
+ C ecx
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+ C
+ C mm0 scratch
+ C mm4 src low limb
+ C mm5 src high limb
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%edx,%eax,4), %mm0
+ decl %eax
+
+ psrlq %mm7, %mm0
+
+ movd %mm0, 4(%edi,%eax,4)
+ jnz L(simple_top)
+
+
+ psllq %mm6, %mm5
+ psllq %mm6, %mm4
+
+ psrlq $32, %mm5
+ movd %mm4, (%edi) C dst low limb
+
+ movd %mm5, %eax C return value
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx (saved)
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+ C
+ C mm5 src high limb, for return value
+ C mm6 lshift
+
+ movl %esi, SAVE_ESI
+ movl %ebx, SAVE_EBX
+ leal -4(%edx,%eax,4), %edx C &src[size-2]
+
+ testb $4, %dl
+ movq (%edx), %mm1 C src high qword
+
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process high limb (marked xxx) separately to
+ C make it so
+ C
+ C source -4(edx,%eax,4)
+ C |
+ C +-------+-------+-------+--
+ C | xxx |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+ C
+ C dest -4(edi,%eax,4)
+ C |
+ C +-------+-------+--
+ C | xxx | |
+ C +-------+-------+--
+
+ psllq %mm6, %mm1
+ subl $4, %edx
+ movl %eax, PARAM_SIZE C size-1
+
+ psrlq $32, %mm1
+ decl %eax C size-2 is new size-1
+
+ movd %mm1, 4(%edi,%eax,4)
+ movq (%edx), %mm1 C new src high qword
+L(start_src_aligned):
+
+
+ leal -4(%edi,%eax,4), %edi C &dst[size-2]
+ psllq %mm6, %mm5
+
+ testl $4, %edi
+ psrlq $32, %mm5 C return value
+
+ jz L(start_dst_aligned)
+
+
+ C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
+ C shift is 32 bits extra. High limb of dst (marked xxx) handled
+ C here separately.
+ C
+ C source %edx
+ C +-------+-------+--
+ C | mm1 |
+ C +-------+-------+--
+ C 0mod8 4mod8
+ C
+ C dest %edi
+ C +-------+-------+-------+--
+ C | xxx |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+
+ movq %mm1, %mm0
+ psllq %mm6, %mm1
+ addl $32, %ecx C shift+32
+
+ psrlq $32, %mm1
+
+ movd %mm1, 4(%edi)
+ movq %mm0, %mm1
+ subl $4, %edi
+
+ movd %ecx, %mm6 C new lshift
+L(start_dst_aligned):
+
+ decl %eax C size-2, two last limbs handled at end
+ movq %mm1, %mm2 C copy of src high qword
+ negl %ecx
+
+ andl $-2, %eax C round size down to even
+ addl $64, %ecx
+
+ movl %eax, %ebx
+ negl %eax
+
+ andl $UNROLL_MASK, %eax
+ decl %ebx
+
+ shll %eax
+
+ movd %ecx, %mm7 C rshift = 64-lshift
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%eax,%eax,4), %esi
+')
+ shrl $UNROLL_LOG2, %ebx C loop counter
+
+ leal ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
+ movl PARAM_SIZE, %eax C for use at end
+ jmp *%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%eax,%eax,4), %esi
+ addl $L(entry)-L(here), %esi
+ addl (%esp), %esi
+
+ ret
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(top):
+ C eax size (for use at end)
+ C ebx loop counter
+ C ecx rshift
+ C edx src
+ C esi computed jump
+ C edi dst
+ C ebp
+ C
+ C mm0 scratch
+ C mm1 \ carry (alternating, mm2 first)
+ C mm2 /
+ C mm6 lshift
+ C mm7 rshift
+ C
+ C 10 code bytes/limb
+ C
+ C The two chunks differ in whether mm1 or mm2 hold the carry.
+ C The computed jump puts the initial carry in both mm1 and mm2.
+
+L(entry):
+deflit(CHUNK_COUNT, 4)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 - 8))
+
+ movq disp0(%edx), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm2, %mm0
+ movq %mm0, disp0(%edi)
+
+
+ movq disp1(%edx), %mm0
+ psllq %mm6, %mm1
+
+ movq %mm0, %mm2
+ psrlq %mm7, %mm0
+
+ por %mm1, %mm0
+ movq %mm0, disp1(%edi)
+')
+
+ subl $UNROLL_BYTES, %edx
+ subl $UNROLL_BYTES, %edi
+ decl %ebx
+
+ jns L(top)
+
+
+
+define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')
+
+L(end):
+ testb $1, %al
+ movl SAVE_EBX, %ebx
+ psllq %mm6, %mm2 C wanted left shifted in all cases below
+
+ movd %mm5, %eax
+
+ movl SAVE_ESI, %esi
+ jz L(end_even)
+
+
+L(end_odd):
+
+ C Size odd, destination was aligned.
+ C
+ C source edx+8 edx+4
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edi
+ C --+---------------+---------------+-------+
+ C | written | | |
+ C --+---------------+---------------+-------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size odd, destination was unaligned.
+ C
+ C source edx+8 edx+4
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edi
+ C --+---------------+---------------+
+ C | written | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword at (%edi), and in the aligned case
+ C there's an extra limb of dst to be formed from that extra src limb
+ C left shifted.
+
+ movd disp(4) (%edx), %mm0
+ testb $32, %cl
+
+ movq %mm0, %mm1
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+ psllq %mm6, %mm1
+
+ por %mm2, %mm0
+
+ movq %mm0, disp(0) (%edi)
+ jz L(end_odd_unaligned)
+ movd %mm1, disp(-4) (%edi)
+L(end_odd_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+L(end_even):
+
+ C Size even, destination was aligned.
+ C
+ C source edx+8
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edi
+ C --+---------------+---------------+
+ C | written | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size even, destination was unaligned.
+ C
+ C source edx+8
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edi+4
+ C --+---------------+-------+
+ C | written | |
+ C --+---------------+-------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C The movq for the aligned case overwrites the movd for the
+ C unaligned case.
+
+ movq %mm2, %mm0
+ psrlq $32, %mm2
+
+ testb $32, %cl
+ movd %mm2, disp(4) (%edi)
+
+ jz L(end_even_unaligned)
+ movq %mm0, disp(0) (%edi)
+L(end_even_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/mod_1.asm b/rts/gmp/mpn/x86/k7/mmx/mod_1.asm
new file mode 100644
index 0000000000..545ca56ddf
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/mod_1.asm
@@ -0,0 +1,457 @@
+dnl AMD K7 mpn_mod_1 -- mpn by limb remainder.
+dnl
+dnl K7: 17.0 cycles/limb.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C
+C The code here is the same as mpn_divrem_1, but with the quotient
+C discarded. See mpn/x86/k7/mmx/divrem_1.asm for some comments.
+
+
+dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
+dnl used, rather than plain "divl"s. Minimum value 2.
+dnl
+dnl The inverse takes about 50 cycles to calculate, but after that the
+dnl multiply is 17 c/l versus division at 41 c/l.
+dnl
+dnl Using mul or div is about the same speed at 3 limbs, so the threshold
+dnl is set to 4 to get the smaller div code used at 3.
+
+deflit(MUL_THRESHOLD, 4)
+
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC_STOP,-28)
+
+deflit(STACK_SPACE, 28)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ jmp LF(mpn_mod_1,start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(32)
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl $0, %edx C initial carry (if can't skip a div)
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ orl %ecx, %ecx
+ jz L(divide_done)
+
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+ cmpl %ebp, %eax C carry flag if high<divisor
+
+ cmovc( %eax, %edx) C src high limb as initial carry
+ sbbl $0, %ecx C size-1 to skip one div
+ jz L(divide_done)
+
+
+ ALIGN(16)
+L(start_1c):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ cmpl $MUL_THRESHOLD, %ecx
+ jae L(mul_by_inverse)
+
+
+
+C With a MUL_THRESHOLD of 4, this "loop" only ever does 1 to 3 iterations,
+C but it's already fast and compact, and there's nothing to gain by
+C expanding it out.
+C
+C Using PARAM_DIVISOR in the divl is a couple of cycles faster than %ebp.
+
+ orl %ecx, %ecx
+ jz L(divide_done)
+
+
+L(divide_top):
+ C eax scratch (quotient)
+ C ebx
+ C ecx counter, limbs, decrementing
+ C edx scratch (remainder)
+ C esi src
+ C edi
+ C ebp
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl PARAM_DIVISOR
+
+ decl %ecx
+ jnz L(divide_top)
+
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ movl %edx, %eax
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ bsrl %ebp, %eax C 31-l
+
+ movl %ebx, SAVE_EBX
+ leal -4(%esi), %ebx
+
+ movl %ebx, VAR_SRC_STOP
+ movl %edi, SAVE_EDI
+
+ movl %ecx, %ebx C size
+ movl $31, %ecx
+
+ movl %edx, %edi C carry
+ movl $-1, %edx
+
+ C
+
+ xorl %eax, %ecx C l
+ incl %eax C 32-l
+
+ shll %cl, %ebp C d normalized
+ movl %ecx, VAR_NORM
+
+ movd %eax, %mm7
+
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+ C
+
+ movl %eax, VAR_INVERSE
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ movl 8(%eax), %esi C src high limb
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ movl %eax, %ecx C &src[size-3]
+
+
+ifelse(MUL_THRESHOLD,2,`
+ cmpl $2, %ebx
+ je L(inverse_two_left)
+')
+
+
+C The dependent chain here is the same as in mpn_divrem_1, but a few
+C instructions are saved by not needing to store the quotient limbs.
+C Unfortunately this doesn't get the code down to the theoretical 16 c/l.
+C
+C There's four dummy instructions in the loop, all of which are necessary
+C for the claimed 17 c/l. It's a 1 to 3 cycle slowdown if any are removed,
+C or changed from load to store or vice versa. They're not completely
+C random, since they correspond to what mpn_divrem_1 has, but there's no
+C obvious reason why they're necessary. Presumably they induce something
+C good in the out of order execution, perhaps through some load/store
+C ordering and/or decoding effects.
+C
+C The q1==0xFFFFFFFF case is handled here the same as in mpn_divrem_1. On
+C on special data that comes out as q1==0xFFFFFFFF always, the loop runs at
+C about 13.5 c/l.
+
+ ALIGN(32)
+L(inverse_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx src pointer, decrementing
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+ movl PARAM_SIZE, %ebx C dummy
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movq (%ecx), %mm0 C next src limb and the one below it
+ subl $4, %ecx
+
+ movl %ecx, PARAM_SIZE C dummy
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff)
+ nop C dummy
+
+ mull %ebx C (q1+1)*d
+
+ psrlq %mm7, %mm0
+ leal 0(%ecx), %ecx C dummy
+
+ C
+
+ C
+
+ subl %eax, %esi
+ movl VAR_SRC_STOP, %eax
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ cmpl %eax, %ecx
+ jne L(inverse_top)
+
+
+L(inverse_loop_done):
+
+
+C -----------------------------------------------------------------------------
+
+L(inverse_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx &src[-1]
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src dword)
+ C mm7 rshift
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd 4(%ecx), %mm0 C src low limb
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ subl %eax, %esi
+
+ C
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ movd %mm0, %esi
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+
+
+C One limb left
+
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+ cmpl $0x80000000, %esi C n1 as 0=c, 1=nc
+ movl %edi, %eax C n2
+
+ leal (%ebp,%esi), %ebx
+ cmovc( %esi, %ebx) C nadj = n10 + (-n1 & d), ignoring overflow
+ sbbl $-1, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movl VAR_NORM, %ecx C for final denorm
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ movl SAVE_EBX, %ebx
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ movl %esi, %eax C remainder
+ movl SAVE_ESI, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ leal (%ebp,%eax), %edx
+ movl SAVE_EBP, %ebp
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+ movl SAVE_EDI, %edi
+
+ shrl %cl, %eax C denorm remainder
+ addl $STACK_SPACE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx src pointer
+ C edx
+ C esi n10
+ C edi (n2)
+ C ebp divisor
+
+ movl VAR_SRC_STOP, %edx
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+ psrlq %mm7, %mm0
+
+ movd %mm0, %esi C next n10
+
+ cmpl %ecx, %edx
+ jne L(inverse_top)
+ jmp L(inverse_loop_done)
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/popham.asm b/rts/gmp/mpn/x86/k7/mmx/popham.asm
new file mode 100644
index 0000000000..fa7c8c04a5
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/popham.asm
@@ -0,0 +1,239 @@
+dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
+dnl distance.
+dnl
+dnl K7: popcount 5.0 cycles/limb, hamdist 6.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl Only recent versions of gas know psadbw, in particular gas 2.9.1 on
+dnl FreeBSD 3.3 and 3.4 doesn't recognise it.
+
+define(psadbw_mm4_mm0,
+`ifelse(m4_ifdef_anyof_p(`HAVE_TARGET_CPU_athlon',
+ `HAVE_TARGET_CPU_pentium3'),1,
+ `.byte 0x0f,0xf6,0xc4 C psadbw %mm4, %mm0',
+
+`m4_warning(`warning, using simulated and only partly functional psadbw, use for testing only
+') C this works enough for the sum of bytes done below, making it
+ C possible to test on an older cpu
+ leal -8(%esp), %esp
+ movq %mm4, (%esp)
+ movq %mm0, %mm4
+forloop(i,1,7,
+` psrlq $ 8, %mm4
+ paddb %mm4, %mm0
+')
+ pushl $ 0
+ pushl $ 0xFF
+ pand (%esp), %mm0
+ movq 8(%esp), %mm4
+ leal 16(%esp), %esp
+')')
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C The code here is almost certainly not optimal, but is already a 3x speedup
+C over the generic C code. The main improvement would be to interleave
+C processing of two qwords in the loop so as to fully exploit the available
+C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
+C
+C The loop is based on the example "Efficient 64-bit population count using
+C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
+C page 158 of rev E (reference in mpn/x86/k7/README).
+
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
+')')')
+
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+
+HAM(`
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+ dnl non-PIC
+
+ DATA
+ ALIGN(8)
+
+define(LS,
+m4_assert_numargs(1)
+`LF(M4_function,`$1')')
+
+LS(rodata_AAAAAAAAAAAAAAAA):
+ .long 0xAAAAAAAA
+ .long 0xAAAAAAAA
+
+LS(rodata_3333333333333333):
+ .long 0x33333333
+ .long 0x33333333
+
+LS(rodata_0F0F0F0F0F0F0F0F):
+ .long 0x0F0F0F0F
+ .long 0x0F0F0F0F
+')
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ orl %ecx, %ecx
+ jz L(zero)
+
+ifdef(`PIC',`
+ movl $0xAAAAAAAA, %eax
+ movl $0x33333333, %edx
+
+ movd %eax, %mm7
+ movd %edx, %mm6
+
+ movl $0x0F0F0F0F, %eax
+
+ punpckldq %mm7, %mm7
+ punpckldq %mm6, %mm6
+
+ movd %eax, %mm5
+ movd %edx, %mm4
+
+ punpckldq %mm5, %mm5
+
+',`
+ movq LS(rodata_AAAAAAAAAAAAAAAA), %mm7
+ movq LS(rodata_3333333333333333), %mm6
+ movq LS(rodata_0F0F0F0F0F0F0F0F), %mm5
+')
+ pxor %mm4, %mm4
+
+define(REG_AAAAAAAAAAAAAAAA,%mm7)
+define(REG_3333333333333333,%mm6)
+define(REG_0F0F0F0F0F0F0F0F,%mm5)
+define(REG_0000000000000000,%mm4)
+
+
+ movl PARAM_SRC, %eax
+HAM(` movl PARAM_SRC2, %edx')
+
+ pxor %mm2, %mm2 C total
+
+ shrl %ecx
+ jnc L(top)
+
+ movd (%eax,%ecx,8), %mm1
+
+HAM(` movd 0(%edx,%ecx,8), %mm0
+ pxor %mm0, %mm1
+')
+ orl %ecx, %ecx
+ jmp L(loaded)
+
+
+ ALIGN(16)
+L(top):
+ C eax src
+ C ebx
+ C ecx counter, qwords, decrementing
+ C edx [hamdist] src2
+ C
+ C mm0 (scratch)
+ C mm1 (scratch)
+ C mm2 total (low dword)
+ C mm3
+ C mm4 \
+ C mm5 | special constants
+ C mm6 |
+ C mm7 /
+
+ movq -8(%eax,%ecx,8), %mm1
+
+HAM(` pxor -8(%edx,%ecx,8), %mm1')
+ decl %ecx
+
+L(loaded):
+ movq %mm1, %mm0
+ pand REG_AAAAAAAAAAAAAAAA, %mm1
+
+ psrlq $1, %mm1
+
+ psubd %mm1, %mm0 C bit pairs
+
+
+ movq %mm0, %mm1
+ psrlq $2, %mm0
+
+ pand REG_3333333333333333, %mm0
+ pand REG_3333333333333333, %mm1
+
+ paddd %mm1, %mm0 C nibbles
+
+
+ movq %mm0, %mm1
+ psrlq $4, %mm0
+
+ pand REG_0F0F0F0F0F0F0F0F, %mm0
+ pand REG_0F0F0F0F0F0F0F0F, %mm1
+
+ paddd %mm1, %mm0 C bytes
+
+
+ psadbw_mm4_mm0
+
+ paddd %mm0, %mm2 C add to total
+ jnz L(top)
+
+
+ movd %mm2, %eax
+ emms
+ ret
+
+
+L(zero):
+ movl $0, %eax
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mmx/rshift.asm b/rts/gmp/mpn/x86/k7/mmx/rshift.asm
new file mode 100644
index 0000000000..abb546cd5b
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mmx/rshift.asm
@@ -0,0 +1,471 @@
+dnl AMD K7 mpn_rshift -- mpn right shift.
+dnl
+dnl K7: 1.21 cycles/limb (at 16 limbs/loop).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 4 1.51
+dnl 8 1.26
+dnl 16 1.21
+dnl 32 1.2
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size right by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the left. The bits shifted out at the right are
+C the return value.
+C
+C This code uses 64-bit MMX operations, which makes it possible to handle
+C two limbs at a time, for a theoretical 1.0 cycles/limb. Plain integer
+C code, on the other hand, suffers from shrd being a vector path decode and
+C running at 3 cycles back-to-back.
+C
+C Full speed depends on source and destination being aligned, and some hairy
+C setups and finish-ups are done to arrange this for the loop.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 10)
+',`
+deflit(UNROLL_THRESHOLD, 10)
+')
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EDI, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+deflit(SAVE_SIZE, 12)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_SRC, %edx
+ subl $SAVE_SIZE, %esp
+deflit(`FRAME',SAVE_SIZE)
+
+ movl PARAM_SHIFT, %ecx
+ movl %edi, SAVE_EDI
+
+ movl PARAM_DST, %edi
+ decl %eax
+ jnz L(more_than_one_limb)
+
+ movl (%edx), %edx C src limb
+
+ shrdl( %cl, %edx, %eax) C eax was decremented to zero
+
+ shrl %cl, %edx
+
+ movl %edx, (%edi) C dst limb
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(more_than_one_limb):
+ C eax size-1
+ C ebx
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+
+ movd PARAM_SHIFT, %mm6 C rshift
+ movd (%edx), %mm5 C src low limb
+ cmp $UNROLL_THRESHOLD-1, %eax
+
+ jae L(unroll)
+ leal (%edx,%eax,4), %edx C &src[size-1]
+ leal -4(%edi,%eax,4), %edi C &dst[size-2]
+
+ movd (%edx), %mm4 C src high limb
+ negl %eax
+
+
+L(simple_top):
+ C eax loop counter, limbs, negative
+ C ebx
+ C ecx shift
+	C esi
+ C edx &src[size-1]
+ C edi &dst[size-2]
+ C ebp
+ C
+ C mm0 scratch
+ C mm4 src high limb
+ C mm5 src low limb
+ C mm6 shift
+
+ movq (%edx,%eax,4), %mm0
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+ movd %mm0, (%edi,%eax,4)
+ jnz L(simple_top)
+
+
+ psllq $32, %mm5
+ psrlq %mm6, %mm4
+
+ psrlq %mm6, %mm5
+ movd %mm4, 4(%edi) C dst high limb
+
+ movd %mm5, %eax C return value
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx
+ C ecx shift
+ C edx src
+ C esi
+ C edi dst
+ C ebp
+ C
+ C mm5 src low limb
+ C mm6 rshift
+
+ testb $4, %dl
+ movl %esi, SAVE_ESI
+ movl %ebx, SAVE_EBX
+
+ psllq $32, %mm5
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process low limb separately (marked xxx) and
+ C step src and dst by one limb, making src aligned.
+ C
+ C source edx
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+ C
+ C dest edi
+ C --+-------+-------+
+ C | | xxx |
+ C --+-------+-------+
+
+ movq (%edx), %mm0 C src low two limbs
+ addl $4, %edx
+ movl %eax, PARAM_SIZE C size-1
+
+ addl $4, %edi
+ decl %eax C size-2 is new size-1
+
+ psrlq %mm6, %mm0
+ movl %edi, PARAM_DST C new dst
+
+ movd %mm0, -4(%edi)
+L(start_src_aligned):
+
+
+ movq (%edx), %mm1 C src low two limbs
+ decl %eax C size-2, two last limbs handled at end
+ testl $4, %edi
+
+ psrlq %mm6, %mm5
+ jz L(start_dst_aligned)
+
+
+ C dst isn't aligned, add 4 to make it so, and pretend the shift is
+ C 32 bits extra. Low limb of dst (marked xxx) handled here separately.
+ C
+ C source edx
+ C --+-------+-------+
+ C | mm1 |
+ C --+-------+-------+
+ C 4mod8 0mod8
+ C
+ C dest edi
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+
+ movq %mm1, %mm0
+ psrlq %mm6, %mm1
+ addl $32, %ecx C shift+32
+
+ movd %mm1, (%edi)
+ movq %mm0, %mm1
+ addl $4, %edi C new dst
+
+ movd %ecx, %mm6
+L(start_dst_aligned):
+
+
+ movq %mm1, %mm2 C copy of src low two limbs
+ negl %ecx
+ andl $-2, %eax C round size down to even
+
+ movl %eax, %ebx
+ negl %eax
+ addl $64, %ecx
+
+ andl $UNROLL_MASK, %eax
+ decl %ebx
+
+ shll %eax
+
+ movd %ecx, %mm7 C lshift = 64-rshift
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%eax,%eax,4), %esi
+ negl %eax
+')
+ shrl $UNROLL_LOG2, %ebx C loop counter
+
+ leal ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
+ movl PARAM_SIZE, %eax C for use at end
+
+ jmp *%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%eax,%eax,4), %esi
+ addl $L(entry)-L(here), %esi
+ addl (%esp), %esi
+ negl %eax
+
+ ret
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(64)
+L(top):
+ C eax size, for use at end
+ C ebx loop counter
+ C ecx lshift
+ C edx src
+ C esi was computed jump
+ C edi dst
+ C ebp
+ C
+ C mm0 scratch
+ C mm1 \ carry (alternating)
+ C mm2 /
+ C mm6 rshift
+ C mm7 lshift
+ C
+ C 10 code bytes/limb
+ C
+ C The two chunks differ in whether mm1 or mm2 hold the carry.
+ C The computed jump puts the initial carry in both mm1 and mm2.
+
+L(entry):
+deflit(CHUNK_COUNT, 4)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 8))
+
+ movq disp0(%edx), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ por %mm2, %mm0
+ movq %mm0, disp0(%edi)
+
+
+ movq disp1(%edx), %mm0
+ psrlq %mm6, %mm1
+
+ movq %mm0, %mm2
+ psllq %mm7, %mm0
+
+ por %mm1, %mm0
+ movq %mm0, disp1(%edi)
+')
+
+ addl $UNROLL_BYTES, %edx
+ addl $UNROLL_BYTES, %edi
+ decl %ebx
+
+ jns L(top)
+
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 8))
+
+ testb $1, %al
+ psrlq %mm6, %mm2 C wanted rshifted in all cases below
+ movl SAVE_ESI, %esi
+
+ movd %mm5, %eax C return value
+
+ movl SAVE_EBX, %ebx
+ jz L(end_even)
+
+
+ C Size odd, destination was aligned.
+ C
+ C source
+ C edx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edi
+ C +-------+---------------+---------------+--
+ C | | | written |
+ C +-------+---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size odd, destination was unaligned.
+ C
+ C source
+ C edx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edi
+ C +---------------+---------------+--
+ C | | written |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword to store, and in the aligned case there's
+ C a further extra limb of dst to be formed.
+
+
+ movd disp0(%edx), %mm0
+ movq %mm0, %mm1
+
+ psllq %mm7, %mm0
+ testb $32, %cl
+
+ por %mm2, %mm0
+ psrlq %mm6, %mm1
+
+ movq %mm0, disp0(%edi)
+ jz L(finish_odd_unaligned)
+
+ movd %mm1, disp1(%edi)
+L(finish_odd_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+
+L(end_even):
+
+ C Size even, destination was aligned.
+ C
+ C source
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edi
+ C +---------------+---------------+--
+ C | | mm3 |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C Size even, destination was unaligned.
+ C
+ C source
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edi
+ C +-------+---------------+--
+ C | | mm3 |
+ C +-------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = 64-(shift+32)
+
+
+ C The movd for the unaligned case is the same data as the movq for
+ C the aligned case, it's just a choice between whether one or two
+ C limbs should be written.
+
+
+ testb $32, %cl
+ movd %mm2, disp0(%edi)
+
+ jz L(end_even_unaligned)
+
+ movq %mm2, disp0(%edi)
+L(end_even_unaligned):
+
+ movl SAVE_EDI, %edi
+ addl $SAVE_SIZE, %esp
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mul_1.asm b/rts/gmp/mpn/x86/k7/mul_1.asm
new file mode 100644
index 0000000000..07f7085b10
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mul_1.asm
@@ -0,0 +1,265 @@
+dnl AMD K7 mpn_mul_1 -- mpn by limb multiply.
+dnl
+dnl K7: 3.4 cycles/limb (at 16 limbs/loop).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 8 3.9
+dnl 16 3.4
+dnl 32 3.4
+dnl 64 3.35
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier, mp_limb_t carry);
+C
+C Multiply src,size by mult and store the result in dst,size.
+C Return the carry limb from the top of the result.
+C
+C mpn_mul_1c() accepts an initial carry for the calculation, it's added into
+C the low limb of the destination.
+C
+C Variations on the unrolled loop have been tried, with the current
+C registers or with the counter on the stack to free up ecx. The current
+C code is the fastest found.
+C
+C An interesting effect is that removing the stores "movl %ebx, disp0(%edi)"
+C from the unrolled loop actually slows it down to 5.0 cycles/limb. Code
+C with this change can be tested on sizes of the form UNROLL_COUNT*n+1
+C without having to change the computed jump. There's obviously something
+C fishy going on, perhaps with what execution units the mul needs.
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBP, -4)
+defframe(SAVE_EDI, -8)
+defframe(SAVE_ESI, -12)
+defframe(SAVE_EBX, -16)
+deflit(STACK_SPACE, 16)
+
+dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 7)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_mul_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+ jmp LF(mpn_mul_1,start_nc)
+EPILOGUE()
+
+
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+ xorl %edx, %edx C initial carry
+L(start_nc):
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME', STACK_SPACE)
+
+ movl %edi, SAVE_EDI
+ movl %ebx, SAVE_EBX
+ movl %edx, %ebx
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_DST, %edi
+ movl %ebp, SAVE_EBP
+ jae L(unroll)
+
+ leal (%esi,%ecx,4), %esi
+ leal (%edi,%ecx,4), %edi
+ negl %ecx
+
+ movl PARAM_MULTIPLIER, %ebp
+
+L(simple):
+ C eax scratch
+ C ebx carry
+ C ecx counter (negative)
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl %eax, (%edi,%ecx,4)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ incl %ecx
+ jnz L(simple)
+
+ movl %ebx, %eax
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C The mov to load the next source limb is done well ahead of the mul, this
+C is necessary for full speed. It leads to one limb handled separately
+C after the loop.
+C
+C When unrolling to 32 or more, an offset of +4 is used on the src pointer,
+C to avoid having an 0x80 displacement in the code for the last limb in the
+C unrolled loop. This is for a fair comparison between 16 and 32 unrolling.
+
+ifelse(eval(UNROLL_COUNT >= 32),1,`
+deflit(SRC_OFFSET,4)
+',`
+deflit(SRC_OFFSET,)
+')
+
+ C this is offset 0x62, so close enough to aligned
+L(unroll):
+ C eax
+ C ebx initial carry
+ C ecx size
+ C edx
+ C esi src
+ C edi dst
+ C ebp
+deflit(`FRAME', STACK_SPACE)
+
+ leal -1(%ecx), %edx C one limb handled at end
+ leal -2(%ecx), %ecx C and ecx is one less than edx
+ movl %ebp, SAVE_EBP
+
+ negl %edx
+ shrl $UNROLL_LOG2, %ecx C unrolled loop counter
+ movl (%esi), %eax C src low limb
+
+ andl $UNROLL_MASK, %edx
+ movl PARAM_DST, %edi
+
+ movl %edx, %ebp
+ shll $4, %edx
+
+ C 17 code bytes per limb
+ifdef(`PIC',`
+ call L(add_eip_to_edx)
+L(here):
+',`
+ leal L(entry) (%edx,%ebp), %edx
+')
+ negl %ebp
+
+ leal ifelse(UNROLL_BYTES,256,128+) SRC_OFFSET(%esi,%ebp,4), %esi
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebp,4), %edi
+ movl PARAM_MULTIPLIER, %ebp
+
+ jmp *%edx
+
+
+ifdef(`PIC',`
+L(add_eip_to_edx):
+ C See README.family about old gas bugs
+ leal (%edx,%ebp), %edx
+ addl $L(entry)-L(here), %edx
+ addl (%esp), %edx
+ ret
+')
+
+
+C ----------------------------------------------------------------------------
+ ALIGN(32)
+L(top):
+ C eax next src limb
+ C ebx carry
+ C ecx counter
+ C edx scratch
+ C esi src+4
+ C edi dst
+ C ebp multiplier
+ C
+ C 17 code bytes per limb processed
+
+L(entry):
+forloop(i, 0, UNROLL_COUNT-1, `
+ deflit(`disp_dst', eval(i*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp_src', eval(disp_dst + 4-(SRC_OFFSET-0)))
+
+ mull %ebp
+
+ addl %eax, %ebx
+Zdisp( movl, disp_src,(%esi), %eax)
+Zdisp( movl, %ebx, disp_dst,(%edi))
+
+ movl $0, %ebx
+ adcl %edx, %ebx
+')
+
+ decl %ecx
+
+ leal UNROLL_BYTES(%esi), %esi
+ leal UNROLL_BYTES(%edi), %edi
+ jns L(top)
+
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+
+ mull %ebp
+
+ addl %eax, %ebx
+ movl $0, %eax
+ movl SAVE_ESI, %esi
+
+ movl %ebx, disp0(%edi)
+ movl SAVE_EBX, %ebx
+ movl SAVE_EDI, %edi
+
+ adcl %edx, %eax
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/mul_basecase.asm b/rts/gmp/mpn/x86/k7/mul_basecase.asm
new file mode 100644
index 0000000000..c4be62e633
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/mul_basecase.asm
@@ -0,0 +1,593 @@
+dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
+dnl
+dnl K7: approx 4.42 cycles per cross product at around 20x20 limbs (16
+dnl limbs/loop unrolling).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl K7 UNROLL_COUNT cycles/product (at around 20x20)
+dnl 8 4.67
+dnl 16 4.59
+dnl 32 4.42
+dnl Maximum possible with the current code is 32.
+dnl
+dnl At 32 the typical 13-26 limb sizes from the karatsuba code will get
+dnl done with a straight run through a block of code, no inner loop. Using
+dnl 32 gives 1k of code, but the k7 has a 64k L1 code cache.
+
+deflit(UNROLL_COUNT, 32)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+C
+C Calculate xp,xsize multiplied by yp,ysize, storing the result in
+C wp,xsize+ysize.
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() startup
+C calculations only once. The saving is 15-25% on typical sizes coming from
+C the Karatsuba multiply code.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_XSIZE, %ecx
+ movl PARAM_YP, %eax
+
+ movl PARAM_XP, %edx
+ movl (%eax), %eax C yp low limb
+
+ cmpl $2, %ecx
+ ja L(xsize_more_than_two)
+ je L(two_by_something)
+
+
+ C one limb by one limb
+
+ mull (%edx)
+
+ movl PARAM_WP, %ecx
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+deflit(`FRAME',0)
+ decl PARAM_YSIZE
+ pushl %ebx defframe_pushl(`SAVE_EBX')
+ movl %eax, %ecx C yp low limb
+
+ movl PARAM_WP, %ebx
+ pushl %esi defframe_pushl(`SAVE_ESI')
+ movl %edx, %esi C xp
+
+ movl (%edx), %eax C xp low limb
+ jnz L(two_by_two)
+
+
+ C two limbs by one limb
+
+ mull %ecx
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+ movl %edx, %esi C carry
+
+ mull %ecx
+
+ addl %eax, %esi
+
+ movl %esi, 4(%ebx)
+ movl SAVE_ESI, %esi
+
+ adcl $0, %edx
+
+ movl %edx, 8(%ebx)
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C Could load yp earlier into another register.
+
+ ALIGN(16)
+L(two_by_two):
+ C eax xp low limb
+ C ebx wp
+ C ecx yp low limb
+ C edx
+ C esi xp
+ C edi
+ C ebp
+
+dnl FRAME carries on from previous
+
+ mull %ecx C xp[0] * yp[0]
+
+ push %edi defframe_pushl(`SAVE_EDI')
+ movl %edx, %edi C carry, for wp[1]
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+
+ mull %ecx C xp[1] * yp[0]
+
+ addl %eax, %edi
+ movl PARAM_YP, %ecx
+
+ adcl $0, %edx
+ movl 4(%ecx), %ecx C yp[1]
+ movl %edi, 4(%ebx)
+
+ movl 4(%esi), %eax C xp[1]
+ movl %edx, %edi C carry, for wp[2]
+
+ mull %ecx C xp[1] * yp[1]
+
+ addl %eax, %edi
+
+ adcl $0, %edx
+ movl (%esi), %eax C xp[0]
+
+ movl %edx, %esi C carry, for wp[3]
+
+ mull %ecx C xp[0] * yp[1]
+
+ addl %eax, 4(%ebx)
+ adcl %edx, %edi
+ movl %edi, 8(%ebx)
+
+ adcl $0, %esi
+ movl SAVE_EDI, %edi
+ movl %esi, 12(%ebx)
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(xsize_more_than_two):
+
+C The first limb of yp is processed with a simple mpn_mul_1 style loop
+C inline. Unrolling this doesn't seem worthwhile since it's only run once
+C (whereas the addmul below is run ysize-1 many times). A call to the
+C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
+C popping, and doesn't seem likely to be worthwhile on the typical 13-26
+C limb operations the Karatsuba code calls here with.
+
+ C eax yp[0]
+ C ebx
+ C ecx xsize
+ C edx xp
+ C esi
+ C edi
+ C ebp
+
+dnl FRAME doesn't carry on from previous, no pushes yet here
+defframe(`SAVE_EBX',-4)
+defframe(`SAVE_ESI',-8)
+defframe(`SAVE_EDI',-12)
+defframe(`SAVE_EBP',-16)
+deflit(`FRAME',0)
+
+ subl $16, %esp
+deflit(`FRAME',16)
+
+ movl %edi, SAVE_EDI
+ movl PARAM_WP, %edi
+
+ movl %ebx, SAVE_EBX
+ movl %ebp, SAVE_EBP
+ movl %eax, %ebp
+
+ movl %esi, SAVE_ESI
+ xorl %ebx, %ebx
+ leal (%edx,%ecx,4), %esi C xp end
+
+ leal (%edi,%ecx,4), %edi C wp end of mul1
+ negl %ecx
+
+
+L(mul1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, negative
+ C edx scratch
+ C esi xp end
+ C edi wp end of mul1
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl %eax, (%edi,%ecx,4)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ incl %ecx
+ jnz L(mul1)
+
+
+ movl PARAM_YSIZE, %edx
+ movl PARAM_XSIZE, %ecx
+
+ movl %ebx, (%edi) C final carry
+ decl %edx
+
+ jnz L(ysize_more_than_one)
+
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+
+ movl SAVE_EBP, %ebp
+ movl SAVE_ESI, %esi
+ addl $FRAME, %esp
+
+ ret
+
+
+L(ysize_more_than_one):
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_YP, %eax
+
+ jae L(unroll)
+
+
+C -----------------------------------------------------------------------------
+ C simple addmul looping
+ C
+ C eax yp
+ C ebx
+ C ecx xsize
+ C edx ysize-1
+ C esi xp end
+ C edi wp end of mul1
+ C ebp
+
+ leal 4(%eax,%edx,4), %ebp C yp end
+ negl %ecx
+ negl %edx
+
+ movl (%esi,%ecx,4), %eax C xp low limb
+ movl %edx, PARAM_YSIZE C -(ysize-1)
+ incl %ecx
+
+ xorl %ebx, %ebx C initial carry
+ movl %ecx, PARAM_XSIZE C -(xsize-1)
+ movl %ebp, PARAM_YP
+
+ movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier
+ jmp L(simple_outer_entry)
+
+
+ C this is offset 0x121 so close enough to aligned
+L(simple_outer_top):
+ C ebp ysize counter, negative
+
+ movl PARAM_YP, %edx
+ movl PARAM_XSIZE, %ecx C -(xsize-1)
+ xorl %ebx, %ebx C carry
+
+ movl %ebp, PARAM_YSIZE
+ addl $4, %edi C next position in wp
+
+ movl (%edx,%ebp,4), %ebp C yp limb - multiplier
+ movl -4(%esi,%ecx,4), %eax C xp low limb
+
+
+L(simple_outer_entry):
+
+L(simple_inner):
+ C eax xp limb
+ C ebx carry limb
+ C ecx loop counter (negative)
+ C edx scratch
+ C esi xp end
+ C edi wp end
+ C ebp multiplier
+
+ mull %ebp
+
+ addl %eax, %ebx
+ adcl $0, %edx
+
+ addl %ebx, (%edi,%ecx,4)
+ movl (%esi,%ecx,4), %eax
+ adcl $0, %edx
+
+ incl %ecx
+ movl %edx, %ebx
+ jnz L(simple_inner)
+
+
+ mull %ebp
+
+ movl PARAM_YSIZE, %ebp
+ addl %eax, %ebx
+
+ adcl $0, %edx
+ addl %ebx, (%edi)
+
+ adcl $0, %edx
+ incl %ebp
+
+ movl %edx, 4(%edi)
+ jnz L(simple_outer_top)
+
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
+C comments.
+C
+C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
+C increment xp and wp. This is used to adjust back xp and wp, and rshifted
+C to given an initial VAR_COUNTER at the top of the outer loop.
+C
+C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
+C up to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
+C outer loop to take care of xp, wp and the inner loop counter.
+
+defframe(VAR_COUNTER, -20)
+defframe(VAR_ADJUST, -24)
+defframe(VAR_JMP, -28)
+defframe(VAR_XP_LOW, -32)
+deflit(VAR_EXTRA_SPACE, 16)
+
+
+L(unroll):
+ C eax yp
+ C ebx
+ C ecx xsize
+ C edx ysize-1
+ C esi xp end
+ C edi wp end of mul1
+ C ebp
+
+ movl PARAM_XP, %esi
+ movl 4(%eax), %ebp C multiplier (yp second limb)
+ leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing
+
+ movl PARAM_WP, %edi
+ movl %eax, PARAM_YP
+ negl %edx
+
+ movl %edx, PARAM_YSIZE
+ leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1
+ decl %ecx C xsize-1
+
+ movl (%esi), %eax C xp low limb
+ andl $-UNROLL_MASK-1, %ebx
+ negl %ecx
+
+ subl $VAR_EXTRA_SPACE, %esp
+deflit(`FRAME',16+VAR_EXTRA_SPACE)
+ negl %ebx
+ andl $UNROLL_MASK, %ecx
+
+ movl %ebx, VAR_ADJUST
+ movl %ecx, %edx
+ shll $4, %ecx
+
+ sarl $UNROLL_LOG2, %ebx
+
+ C 17 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(unroll_here):
+',`
+ leal L(unroll_entry) (%ecx,%edx,1), %ecx
+')
+ negl %edx
+
+ movl %eax, VAR_XP_LOW
+ movl %ecx, VAR_JMP
+ leal 4(%edi,%edx,4), %edi C wp and xp, adjust for unrolling,
+ leal 4(%esi,%edx,4), %esi C and start at second limb
+ jmp L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See README.family about old gas bugs
+ leal (%ecx,%edx,1), %ecx
+ addl $L(unroll_entry)-L(unroll_here), %ecx
+ addl (%esp), %ecx
+ ret
+')
+
+
+C --------------------------------------------------------------------------
+ ALIGN(32)
+L(unroll_outer_top):
+ C ebp ysize counter, negative
+
+ movl VAR_ADJUST, %ebx
+ movl PARAM_YP, %edx
+
+ movl VAR_XP_LOW, %eax
+ movl %ebp, PARAM_YSIZE C store incremented ysize counter
+
+ leal 4(%edi,%ebx,4), %edi
+ leal (%esi,%ebx,4), %esi
+ sarl $UNROLL_LOG2, %ebx
+
+ movl (%edx,%ebp,4), %ebp C yp next multiplier
+ movl VAR_JMP, %ecx
+
+L(unroll_outer_entry):
+ mull %ebp
+
+ testb $1, %cl C and clear carry bit
+ movl %ebx, VAR_COUNTER
+ movl $0, %ebx
+
+ movl $0, %ecx
+ cmovz( %eax, %ecx) C eax into low carry, zero into high carry limb
+ cmovnz( %eax, %ebx)
+
+ C Extra fetch of VAR_JMP is bad, but registers are tight
+ jmp *VAR_JMP
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(32)
+L(unroll_top):
+ C eax xp limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi xp+8
+ C edi wp
+ C ebp yp multiplier limb
+ C
+ C VAR_COUNTER loop counter, negative
+ C
+ C 17 bytes each limb
+
+L(unroll_entry):
+
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%esi), %eax)
+ adcl %edx, %ebx
+
+ mull %ebp
+
+Zdisp( addl, %ecx, disp0,(%edi))
+ movl $0, %ecx
+
+ adcl %eax, %ebx
+
+
+ movl disp1(%esi), %eax
+ adcl %edx, %ecx
+
+ mull %ebp
+
+ addl %ebx, disp1(%edi)
+ movl $0, %ebx
+
+ adcl %eax, %ecx
+')
+
+
+ incl VAR_COUNTER
+ leal UNROLL_BYTES(%esi), %esi
+ leal UNROLL_BYTES(%edi), %edi
+
+ jnz L(unroll_top)
+
+
+ C eax
+ C ebx zero
+ C ecx low
+ C edx high
+ C esi
+ C edi wp, pointing at second last limb)
+ C ebp
+ C
+ C carry flag to be added to high
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 4))
+
+ movl PARAM_YSIZE, %ebp
+ adcl $0, %edx
+ addl %ecx, disp0(%edi)
+
+ adcl $0, %edx
+ incl %ebp
+
+ movl %edx, disp1(%edi)
+ jnz L(unroll_outer_top)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/k7/sqr_basecase.asm b/rts/gmp/mpn/x86/k7/sqr_basecase.asm
new file mode 100644
index 0000000000..84861ea66b
--- /dev/null
+++ b/rts/gmp/mpn/x86/k7/sqr_basecase.asm
@@ -0,0 +1,627 @@
+dnl AMD K7 mpn_sqr_basecase -- square an mpn number.
+dnl
+dnl K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
+dnl (measured on the speed difference between 25 and 50 limbs, which is
+dnl roughly the Karatsuba recursing range).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
+dnl some comments.
+
+deflit(KARATSUBA_SQR_THRESHOLD_MAX, 66)
+
+ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
+`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
+
+m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
+deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C With a KARATSUBA_SQR_THRESHOLD around 50 this code is about 1500 bytes,
+C which is quite a bit, but is considered good value since squares big
+C enough to use most of the code will be spending quite a few cycles in it.
+
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %eax
+ cmpl $2, %ecx
+
+ movl PARAM_DST, %edx
+ je L(two_limbs)
+ ja L(three_or_more)
+
+
+C------------------------------------------------------------------------------
+C one limb only
+ C eax src
+ C ecx size
+ C edx dst
+
+ movl (%eax), %eax
+ movl %edx, %ecx
+
+ mull %eax
+
+ movl %edx, 4(%ecx)
+ movl %eax, (%ecx)
+ ret
+
+
+C------------------------------------------------------------------------------
+C
+C Using the read/modify/write "add"s seems to be faster than saving and
+C restoring registers. Perhaps the loads for the first set hide under the
+C mul latency and the second gets store to load forwarding.
+
+ ALIGN(16)
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+deflit(`FRAME',0)
+
+ pushl %ebx FRAME_pushl()
+ movl %eax, %ebx C src
+ movl (%eax), %eax
+
+ movl %edx, %ecx C dst
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx) C dst[0]
+ movl 4(%ebx), %eax
+
+ movl %edx, 4(%ecx) C dst[1]
+
+ mull %eax C src[1]^2
+
+ movl %eax, 8(%ecx) C dst[2]
+ movl (%ebx), %eax
+
+ movl %edx, 12(%ecx) C dst[3]
+
+ mull 4(%ebx) C src[0]*src[1]
+
+ popl %ebx
+
+ addl %eax, 4(%ecx)
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+ ASSERT(nc)
+
+ addl %eax, 4(%ecx)
+ adcl %edx, 8(%ecx)
+ adcl $0, 12(%ecx)
+ ASSERT(nc)
+
+ ret
+
+
+C------------------------------------------------------------------------------
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(STACK_SPACE, 16)
+
+L(three_or_more):
+ subl $STACK_SPACE, %esp
+ cmpl $4, %ecx
+ jae L(four_or_more)
+deflit(`FRAME',STACK_SPACE)
+
+
+C------------------------------------------------------------------------------
+C Three limbs
+C
+C Writing out the loads and stores separately at the end of this code comes
+C out about 10 cycles faster than using adcls to memory.
+
+ C eax src
+ C ecx size
+ C edx dst
+
+ movl %ebx, SAVE_EBX
+ movl %eax, %ebx C src
+ movl (%eax), %eax
+
+ movl %edx, %ecx C dst
+ movl %esi, SAVE_ESI
+ movl %edi, SAVE_EDI
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl 4(%ebx), %eax
+ movl %edx, 4(%ecx)
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl 8(%ebx), %eax
+ movl %edx, 12(%ecx)
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl (%ebx), %eax
+ movl %edx, 20(%ecx)
+
+ mull 4(%ebx) C src[0] * src[1]
+
+ movl %eax, %esi
+ movl (%ebx), %eax
+ movl %edx, %edi
+
+ mull 8(%ebx) C src[0] * src[2]
+
+ addl %eax, %edi
+ movl %ebp, SAVE_EBP
+ movl $0, %ebp
+
+ movl 4(%ebx), %eax
+ adcl %edx, %ebp
+
+ mull 8(%ebx) C src[1] * src[2]
+
+ xorl %ebx, %ebx
+ addl %eax, %ebp
+
+ adcl $0, %edx
+
+ C eax
+ C ebx zero, will be dst[5]
+ C ecx dst
+ C edx dst[4]
+ C esi dst[1]
+ C edi dst[2]
+ C ebp dst[3]
+
+ adcl $0, %edx
+ addl %esi, %esi
+
+ adcl %edi, %edi
+ movl 4(%ecx), %eax
+
+ adcl %ebp, %ebp
+
+ adcl %edx, %edx
+
+ adcl $0, %ebx
+ addl %eax, %esi
+ movl 8(%ecx), %eax
+
+ adcl %eax, %edi
+ movl 12(%ecx), %eax
+ movl %esi, 4(%ecx)
+
+ adcl %eax, %ebp
+ movl 16(%ecx), %eax
+ movl %edi, 8(%ecx)
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EDI, %edi
+
+ adcl %eax, %edx
+ movl 20(%ecx), %eax
+ movl %ebp, 12(%ecx)
+
+ adcl %ebx, %eax
+ ASSERT(nc)
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+
+ movl %edx, 16(%ecx)
+ movl %eax, 20(%ecx)
+ addl $FRAME, %esp
+
+ ret
+
+
+C------------------------------------------------------------------------------
+L(four_or_more):
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+C Further products are added in rather than stored.
+
+ C eax src
+ C ebx
+ C ecx size
+ C edx dst
+ C esi
+ C edi
+ C ebp
+
+defframe(`VAR_COUNTER',-20)
+defframe(`VAR_JMP', -24)
+deflit(EXTRA_STACK_SPACE, 8)
+
+ movl %ebx, SAVE_EBX
+ movl %edi, SAVE_EDI
+ leal (%edx,%ecx,4), %edi C &dst[size]
+
+ movl %esi, SAVE_ESI
+ movl %ebp, SAVE_EBP
+ leal (%eax,%ecx,4), %esi C &src[size]
+
+ movl (%eax), %ebp C multiplier
+ movl $0, %ebx
+ decl %ecx
+
+ negl %ecx
+ subl $EXTRA_STACK_SPACE, %esp
+FRAME_subl_esp(EXTRA_STACK_SPACE)
+
+L(mul_1):
+ C eax scratch
+ C ebx carry
+ C ecx counter
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl %eax, (%edi,%ecx,4)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ incl %ecx
+ jnz L(mul_1)
+
+
+C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two products, which are the bottom right corner of the product
+C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as in mpn_addmul_1, see that routine for
+C some comments.
+C
+C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+C
+C K7 does branch prediction on indirect jumps, which is bad since it's a
+C different target each time. There seems no way to avoid this.
+
+dnl This value also hard coded in some shifts and adds
+deflit(CODE_BYTES_PER_LIMB, 17)
+
+dnl With the unmodified &src[size] and &dst[size] pointers, the
+dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
+dnl values up to 31, but above that an offset must be added to them.
+
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>31),1,
+eval((UNROLL_COUNT-31)*4),
+0))
+
+dnl Because the last chunk of code is generated differently, a label placed
+dnl at the end doesn't work. Instead calculate the implied end using the
+dnl start and how many chunks of code there are.
+
+deflit(UNROLL_INNER_END,
+`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')
+
+ C eax
+ C ebx carry
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+ movl %ebx, (%edi)
+
+ subl $4, %ecx
+ jz L(corner)
+
+ negl %ecx
+ifelse(OFFSET,0,,`subl $OFFSET, %edi')
+ifelse(OFFSET,0,,`subl $OFFSET, %esi')
+
+ movl %ecx, %edx
+ shll $4, %ecx
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+
+
+ C The calculated jump mustn't come out to before the start of the
+ C code available. This is the limit UNROLL_COUNT puts on the src
+ C operand size, but checked here directly using the jump address.
+ ASSERT(ae,
+ `movl_text_address(L(unroll_inner_start), %eax)
+ cmpl %eax, %ecx')
+
+
+C------------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll_outer_top):
+ C eax
+ C ebx high limb to store
+ C ecx VAR_JMP
+ C edx VAR_COUNTER, limbs, negative
+ C esi &src[size], constant
+ C edi dst ptr, high of last addmul
+ C ebp
+
+ movl -12+OFFSET(%esi,%edx,4), %ebp C next multiplier
+ movl -8+OFFSET(%esi,%edx,4), %eax C first of multiplicand
+
+ movl %edx, VAR_COUNTER
+
+ mull %ebp
+
+define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')
+
+ testb $1, %cl
+ movl %edx, %ebx C high carry
+ movl %ecx, %edx C jump
+
+ movl %eax, %ecx C low carry
+ cmovX( %ebx, %ecx) C high carry reverse
+ cmovX( %eax, %ebx) C low carry reverse
+
+ leal CODE_BYTES_PER_LIMB(%edx), %eax
+ xorl %edx, %edx
+ leal 4(%edi), %edi
+
+ movl %eax, VAR_JMP
+
+ jmp *%eax
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ addl (%esp), %ecx
+ addl $UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
+ addl %edx, %ecx
+ ret
+')
+
+
+ C Must be an even address to preserve the significance of the low
+ C bit of the jump address indicating which way around ecx/ebx should
+ C start.
+ ALIGN(2)
+
+L(unroll_inner_start):
+ C eax next limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+forloop(`i', UNROLL_COUNT, 1, `
+ deflit(`disp_src', eval(-i*4 + OFFSET))
+ deflit(`disp_dst', eval(disp_src - 4))
+
+ m4_assert(`disp_src>=-128 && disp_src<128')
+ m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp( movl, disp_src,(%esi), %eax)
+ adcl %edx, %ebx
+
+ mull %ebp
+
+Zdisp( addl, %ecx, disp_dst,(%edi))
+ movl $0, %ecx
+
+ adcl %eax, %ebx
+
+',`
+ dnl this bit comes out last
+Zdisp( movl, disp_src,(%esi), %eax)
+ adcl %edx, %ecx
+
+ mull %ebp
+
+dnl Zdisp( addl %ebx, disp_src,(%edi))
+ addl %ebx, disp_dst(%edi)
+ifelse(forloop_last,0,
+` movl $0, %ebx')
+
+ adcl %eax, %ecx
+')
+')
+
+ C eax next limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+ adcl $0, %edx
+ addl %ecx, -4+OFFSET(%edi)
+ movl VAR_JMP, %ecx
+
+ adcl $0, %edx
+
+ movl %edx, m4_empty_if_zero(OFFSET) (%edi)
+ movl VAR_COUNTER, %edx
+
+ incl %edx
+ jnz L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+ addl $OFFSET, %esi
+ addl $OFFSET, %edi
+')
+
+
+C------------------------------------------------------------------------------
+L(corner):
+ C esi &src[size]
+ C edi &dst[2*size-5]
+
+ movl -12(%esi), %ebp
+ movl -8(%esi), %eax
+ movl %eax, %ecx
+
+ mull %ebp
+
+ addl %eax, -4(%edi)
+ movl -4(%esi), %eax
+
+ adcl $0, %edx
+ movl %edx, %ebx
+ movl %eax, %esi
+
+ mull %ebp
+
+ addl %ebx, %eax
+
+ adcl $0, %edx
+ addl %eax, (%edi)
+ movl %esi, %eax
+
+ adcl $0, %edx
+ movl %edx, %ebx
+
+ mull %ecx
+
+ addl %ebx, %eax
+ movl %eax, 4(%edi)
+
+ adcl $0, %edx
+ movl %edx, 8(%edi)
+
+
+
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+L(lshift_start):
+ movl PARAM_SIZE, %eax
+ movl PARAM_DST, %edi
+ xorl %ecx, %ecx C clear carry
+
+ leal (%edi,%eax,8), %edi
+ notl %eax C -size-1, preserve carry
+
+ leal 2(%eax), %eax C -(size-1)
+
+L(lshift):
+ C eax counter, negative
+ C ebx
+ C ecx
+ C edx
+ C esi
+ C edi dst, pointing just after last limb
+ C ebp
+
+ rcll -4(%edi,%eax,8)
+ rcll (%edi,%eax,8)
+ incl %eax
+ jnz L(lshift)
+
+ setc %al
+
+ movl PARAM_SRC, %esi
+ movl %eax, -4(%edi) C dst most significant limb
+
+ movl PARAM_SIZE, %ecx
+
+
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+ movl (%esi), %eax C src[0]
+
+ mull %eax
+
+ leal (%esi,%ecx,4), %esi C src point just after last limb
+ negl %ecx
+
+ movl %eax, (%edi,%ecx,8) C dst[0]
+ incl %ecx
+
+L(diag):
+ C eax scratch
+ C ebx scratch
+ C ecx counter, negative
+ C edx carry
+ C esi src just after last limb
+ C edi dst just after last limb
+ C ebp
+
+ movl (%esi,%ecx,4), %eax
+ movl %edx, %ebx
+
+ mull %eax
+
+ addl %ebx, -4(%edi,%ecx,8)
+ adcl %eax, (%edi,%ecx,8)
+ adcl $0, %edx
+
+ incl %ecx
+ jnz L(diag)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+
+ addl %edx, -4(%edi) C dst most significant limb
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/lshift.asm b/rts/gmp/mpn/x86/lshift.asm
new file mode 100644
index 0000000000..4735335cbe
--- /dev/null
+++ b/rts/gmp/mpn/x86/lshift.asm
@@ -0,0 +1,90 @@
+dnl x86 mpn_lshift -- mpn left shift.
+
+dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_lshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+deflit(`FRAME',12)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%edx
+ movl PARAM_SHIFT,%ecx
+
+ subl $4,%esi C adjust src
+
+ movl (%esi,%edx,4),%ebx C read most significant limb
+ xorl %eax,%eax
+ shldl( %cl, %ebx, %eax) C compute carry limb
+ decl %edx
+ jz L(end)
+ pushl %eax C push carry limb onto stack
+ testb $1,%dl
+ jnz L(1) C enter loop in the middle
+ movl %ebx,%eax
+
+ ALIGN(8)
+L(oop): movl (%esi,%edx,4),%ebx C load next lower limb
+ shldl( %cl, %ebx, %eax) C compute result limb
+ movl %eax,(%edi,%edx,4) C store it
+ decl %edx
+L(1): movl (%esi,%edx,4),%eax
+ shldl( %cl, %eax, %ebx)
+ movl %ebx,(%edi,%edx,4)
+ decl %edx
+ jnz L(oop)
+
+ shll %cl,%eax C compute least significant limb
+ movl %eax,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+L(end): shll %cl,%ebx C compute least significant limb
+ movl %ebx,(%edi) C store it
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/mod_1.asm b/rts/gmp/mpn/x86/mod_1.asm
new file mode 100644
index 0000000000..3908161b3e
--- /dev/null
+++ b/rts/gmp/mpn/x86/mod_1.asm
@@ -0,0 +1,141 @@
+dnl x86 mpn_mod_1 -- mpn by limb remainder.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl cycles/limb
+dnl K6 20
+dnl P5 44
+dnl P6 39
+dnl 486 approx 42 maybe
+dnl
+dnl The following have their own optimized mod_1 implementations, but for
+dnl reference the code here runs as follows.
+dnl
+dnl P6MMX 39
+dnl K7 41
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C
+C Divide src,size by divisor and return the remainder. The quotient is
+C discarded.
+C
+C See mpn/x86/divrem_1.asm for some comments.
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+ .text
+ ALIGN(16)
+
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_DIVISOR, %esi
+ orl %ecx, %ecx
+
+ movl PARAM_CARRY, %edx
+ jnz LF(mpn_mod_1,top)
+
+ popl %esi
+ movl %edx, %eax
+
+ popl %ebx
+
+ ret
+
+EPILOGUE()
+
+
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ pushl %esi FRAME_pushl()
+
+ orl %ecx, %ecx
+ jz L(done_zero)
+
+ movl PARAM_DIVISOR, %esi
+ movl -4(%ebx,%ecx,4), %eax C src high limb
+
+ cmpl %esi, %eax
+
+ sbbl %edx, %edx C -1 if high<divisor
+
+ addl %edx, %ecx C skip one division if high<divisor
+ jz L(done_eax)
+
+ andl %eax, %edx C carry if high<divisor
+
+
+L(top):
+ C eax scratch (quotient)
+ C ebx src
+ C ecx counter
+ C edx carry (remainder)
+ C esi divisor
+ C edi
+ C ebp
+
+ movl -4(%ebx,%ecx,4), %eax
+
+ divl %esi
+
+ loop_or_decljnz L(top)
+
+
+ movl %edx, %eax
+L(done_eax):
+ popl %esi
+
+ popl %ebx
+
+ ret
+
+
+L(done_zero):
+ popl %esi
+ xorl %eax, %eax
+
+ popl %ebx
+
+ ret
+
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/mul_1.asm b/rts/gmp/mpn/x86/mul_1.asm
new file mode 100644
index 0000000000..8817f291bc
--- /dev/null
+++ b/rts/gmp/mpn/x86/mul_1.asm
@@ -0,0 +1,130 @@
+dnl x86 mpn_mul_1 (for 386, 486, and Pentium Pro) -- Multiply a limb vector
+dnl with a limb and store the result in a second limb vector.
+dnl
+dnl cycles/limb
+dnl P6: 5.5
+dnl
+dnl The following CPUs have their own optimized code, but for reference the
+dnl code here runs as follows.
+dnl
+dnl cycles/limb
+dnl P5: 12.5
+dnl K6: 10.5
+dnl K7: 4.5
+
+
+dnl Copyright (C) 1992, 1994, 1997, 1998, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%ecx
+
+ xorl %ebx,%ebx
+ andl $3,%ecx
+ jz L(end0)
+
+L(oop0):
+ movl (%esi),%eax
+ mull PARAM_MULTIPLIER
+ leal 4(%esi),%esi
+ addl %ebx,%eax
+ movl $0,%ebx
+ adcl %ebx,%edx
+ movl %eax,(%edi)
+ movl %edx,%ebx C propagate carry into cylimb
+
+ leal 4(%edi),%edi
+ decl %ecx
+ jnz L(oop0)
+
+L(end0):
+ movl PARAM_SIZE,%ecx
+ shrl $2,%ecx
+ jz L(end)
+
+
+ ALIGN(8)
+L(oop): movl (%esi),%eax
+ mull PARAM_MULTIPLIER
+ addl %eax,%ebx
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 4(%esi),%eax
+ mull PARAM_MULTIPLIER
+ movl %ebx,(%edi)
+ addl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ movl 8(%esi),%eax
+ mull PARAM_MULTIPLIER
+ movl %ebp,4(%edi)
+ addl %eax,%ebx C new lo + cylimb
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 12(%esi),%eax
+ mull PARAM_MULTIPLIER
+ movl %ebx,8(%edi)
+ addl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ movl %ebp,12(%edi)
+
+ leal 16(%esi),%esi
+ leal 16(%edi),%edi
+ decl %ecx
+ jnz L(oop)
+
+L(end): movl %ebx,%eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/mul_basecase.asm b/rts/gmp/mpn/x86/mul_basecase.asm
new file mode 100644
index 0000000000..3a9b73895b
--- /dev/null
+++ b/rts/gmp/mpn/x86/mul_basecase.asm
@@ -0,0 +1,209 @@
+dnl x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
+dnl in a third limb vector.
+
+
+dnl Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+C
+C This was written in haste since the Pentium optimized code that was used
+C for all x86 machines was slow for the Pentium II. This code would benefit
+C from some cleanup.
+C
+C To shave off some percentage of the run-time, one should make 4 variants
+C of the Louter loop, for the four different outcomes of un mod 4. That
+C would avoid Loop0 altogether. Code expansion would be > 4-fold for that
+C part of the function, but since it is not very large, that would be
+C acceptable.
+C
+C The mul loop (at L(oopM)) might need some tweaking. Its current speed is
+C unknown.
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+defframe(VAR_MULTIPLIER, -4)
+defframe(VAR_COUNTER, -8)
+deflit(VAR_STACK_SPACE, 8)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+ subl $VAR_STACK_SPACE,%esp
+ pushl %esi
+ pushl %ebp
+ pushl %edi
+deflit(`FRAME',eval(VAR_STACK_SPACE+12))
+
+ movl PARAM_XP,%esi
+ movl PARAM_WP,%edi
+ movl PARAM_YP,%ebp
+
+ movl (%esi),%eax C load xp[0]
+ mull (%ebp) C multiply by yp[0]
+ movl %eax,(%edi) C store to wp[0]
+ movl PARAM_XSIZE,%ecx C xsize
+ decl %ecx C If xsize = 1, ysize = 1 too
+ jz L(done)
+
+ pushl %ebx
+FRAME_pushl()
+ movl %edx,%ebx
+
+ leal 4(%esi),%esi
+ leal 4(%edi),%edi
+
+L(oopM):
+ movl (%esi),%eax C load next limb at xp[j]
+ leal 4(%esi),%esi
+ mull (%ebp)
+ addl %ebx,%eax
+ movl %edx,%ebx
+ adcl $0,%ebx
+ movl %eax,(%edi)
+ leal 4(%edi),%edi
+ decl %ecx
+ jnz L(oopM)
+
+ movl %ebx,(%edi) C most significant limb of product
+ addl $4,%edi C increment wp
+ movl PARAM_XSIZE,%eax
+ shll $2,%eax
+ subl %eax,%edi
+ subl %eax,%esi
+
+ movl PARAM_YSIZE,%eax C ysize
+ decl %eax
+ jz L(skip)
+ movl %eax,VAR_COUNTER C set index i to ysize
+
+L(outer):
+ movl PARAM_YP,%ebp C yp
+ addl $4,%ebp C make ebp point to next v limb
+ movl %ebp,PARAM_YP
+ movl (%ebp),%eax C copy y limb ...
+ movl %eax,VAR_MULTIPLIER C ... to stack slot
+ movl PARAM_XSIZE,%ecx
+
+ xorl %ebx,%ebx
+ andl $3,%ecx
+ jz L(end0)
+
+L(oop0):
+ movl (%esi),%eax
+ mull VAR_MULTIPLIER
+ leal 4(%esi),%esi
+ addl %ebx,%eax
+ movl $0,%ebx
+ adcl %ebx,%edx
+ addl %eax,(%edi)
+ adcl %edx,%ebx C propagate carry into cylimb
+
+ leal 4(%edi),%edi
+ decl %ecx
+ jnz L(oop0)
+
+L(end0):
+ movl PARAM_XSIZE,%ecx
+ shrl $2,%ecx
+ jz L(endX)
+
+ ALIGN(8)
+L(oopX):
+ movl (%esi),%eax
+ mull VAR_MULTIPLIER
+ addl %eax,%ebx
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 4(%esi),%eax
+ mull VAR_MULTIPLIER
+ addl %ebx,(%edi)
+ adcl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ movl 8(%esi),%eax
+ mull VAR_MULTIPLIER
+ addl %ebp,4(%edi)
+ adcl %eax,%ebx C new lo + cylimb
+ movl $0,%ebp
+ adcl %edx,%ebp
+
+ movl 12(%esi),%eax
+ mull VAR_MULTIPLIER
+ addl %ebx,8(%edi)
+ adcl %eax,%ebp C new lo + cylimb
+ movl $0,%ebx
+ adcl %edx,%ebx
+
+ addl %ebp,12(%edi)
+ adcl $0,%ebx C propagate carry into cylimb
+
+ leal 16(%esi),%esi
+ leal 16(%edi),%edi
+ decl %ecx
+ jnz L(oopX)
+
+L(endX):
+ movl %ebx,(%edi)
+ addl $4,%edi
+
+ C we incremented wp and xp in the loop above; compensate
+ movl PARAM_XSIZE,%eax
+ shll $2,%eax
+ subl %eax,%edi
+ subl %eax,%esi
+
+ movl VAR_COUNTER,%eax
+ decl %eax
+ movl %eax,VAR_COUNTER
+ jnz L(outer)
+
+L(skip):
+ popl %ebx
+ popl %edi
+ popl %ebp
+ popl %esi
+ addl $8,%esp
+ ret
+
+L(done):
+ movl %edx,4(%edi) C store to wp[1]
+ popl %edi
+ popl %ebp
+ popl %esi
+ addl $8,%esp
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/p6/README b/rts/gmp/mpn/x86/p6/README
new file mode 100644
index 0000000000..7dbc905a0d
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/README
@@ -0,0 +1,95 @@
+
+ INTEL P6 MPN SUBROUTINES
+
+
+
+This directory contains code optimized for Intel P6 class CPUs, meaning
+PentiumPro, Pentium II and Pentium III. The mmx and p3mmx subdirectories
+have routines using MMX instructions.
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache, are as follows.
+Some of these might be able to be improved.
+
+ cycles/limb
+
+ mpn_add_n/sub_n 3.7
+
+ mpn_copyi 0.75
+ mpn_copyd 2.4
+
+ mpn_divrem_1 39.0
+ mpn_mod_1 39.0
+ mpn_divexact_by3 8.5
+
+ mpn_mul_1 5.5
+ mpn_addmul/submul_1 6.35
+
+ mpn_l/rshift 2.5
+
+ mpn_mul_basecase 8.2 cycles/crossproduct (approx)
+ mpn_sqr_basecase 4.0 cycles/crossproduct (approx)
+ or 7.75 cycles/triangleproduct (approx)
+
+Pentium II and III have MMX and get the following improvements.
+
+ mpn_divrem_1 25.0 integer part, 17.5 fractional part
+ mpn_mod_1 24.0
+
+ mpn_l/rshift 1.75
+
+
+
+
+NOTES
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+
+Mispredicted branches have a penalty of between 9 and 15 cycles, and even up
+to 26 cycles depending how far speculative execution has gone. The 9 cycle
+minimum penalty comes from the issue pipeline being 9 stages.
+
+A copy with rep movs seems to copy 16 bytes at a time, since speeds for 4,
+5, 6 or 7 limb operations are all the same. The 0.75 cycles/limb would be 3
+cycles per 16 byte block.
+
+
+
+
+CODING
+
+Instructions in general code have been shown grouped if they can execute
+together, which means up to three instructions with no successive
+dependencies, and with only the first being a multiple micro-op.
+
+P6 has out-of-order execution, so the groupings are really only showing
+dependent paths where some shuffling might allow some latencies to be
+hidden.
+
+
+
+
+REFERENCES
+
+"Intel Architecture Optimization Reference Manual", 1999, revision 001 dated
+02/99, order number 245127 (order number 730795-001 is in the document too).
+Available on-line:
+
+ http://download.intel.com/design/PentiumII/manuals/245127.htm
+
+"Intel Architecture Optimization Manual", 1997, order number 242816. This
+is an older document mostly about P5 and not as good as the above.
+Available on-line:
+
+ http://download.intel.com/design/PentiumII/manuals/242816.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/mpn/x86/p6/aorsmul_1.asm b/rts/gmp/mpn/x86/p6/aorsmul_1.asm
new file mode 100644
index 0000000000..feb364ec0b
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/aorsmul_1.asm
@@ -0,0 +1,300 @@
+dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+dnl
+dnl P6: 6.35 cycles/limb (at 16 limbs/loop).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl P6 UNROLL_COUNT cycles/limb
+dnl 8 6.7
+dnl 16 6.35
+dnl 32 6.3
+dnl 64 6.3
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1', `
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+ define(M4_function_1c, mpn_addmul_1c)
+ define(M4_description, add it to)
+ define(M4_desc_retval, carry)
+',`ifdef(`OPERATION_submul_1', `
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+ define(M4_function_1c, mpn_submul_1c)
+ define(M4_description, subtract it from)
+ define(M4_desc_retval, borrow)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C Calculate src,size multiplied by mult and M4_description dst,size.
+C Return the M4_desc_retval limb from the top of the result.
+C
+C This code is pretty much the same as the K6 code. The unrolled loop is
+C the same, but there's just a few scheduling tweaks in the setups and the
+C simple loop.
+C
+C A number of variations have been tried for the unrolled loop, with one or
+C two carries, and with loads scheduled earlier, but nothing faster than 6
+C cycles/limb has been found.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+
+PROLOGUE(M4_function_1c)
+ pushl %ebx
+deflit(`FRAME',4)
+ movl PARAM_CARRY, %ebx
+ jmp LF(M4_function_1,start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function_1)
+ push %ebx
+deflit(`FRAME',4)
+ xorl %ebx, %ebx C initial carry
+
+L(start_nc):
+ movl PARAM_SIZE, %ecx
+ pushl %esi
+deflit(`FRAME',8)
+
+ movl PARAM_SRC, %esi
+ pushl %edi
+deflit(`FRAME',12)
+
+ movl PARAM_DST, %edi
+ pushl %ebp
+deflit(`FRAME',16)
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_MULTIPLIER, %ebp
+ jae L(unroll)
+
+
+ C simple loop
+ C this is offset 0x22, so close enough to aligned
+L(simple):
+ C eax scratch
+ C ebx carry
+ C ecx counter
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+ movl (%esi), %eax
+ addl $4, %edi
+
+ mull %ebp
+
+ addl %ebx, %eax
+ adcl $0, %edx
+
+ M4_inst %eax, -4(%edi)
+ movl %edx, %ebx
+
+ adcl $0, %ebx
+ decl %ecx
+
+ leal 4(%esi), %esi
+ jnz L(simple)
+
+
+ popl %ebp
+ popl %edi
+
+ popl %esi
+ movl %ebx, %eax
+
+ popl %ebx
+ ret
+
+
+
+C------------------------------------------------------------------------------
+C VAR_JUMP holds the computed jump temporarily because there's not enough
+C registers when doing the mul for the initial two carry limbs.
+C
+C The add/adc for the initial carry in %ebx is necessary only for the
+C mpn_add/submul_1c entry points. Duplicating the startup code to
+C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
+C idea.
+
+dnl overlapping with parameters already fetched
+define(VAR_COUNTER,`PARAM_SIZE')
+define(VAR_JUMP, `PARAM_DST')
+
+ C this is offset 0x43, so close enough to aligned
+L(unroll):
+ C eax
+ C ebx initial carry
+ C ecx size
+ C edx
+ C esi src
+ C edi dst
+ C ebp
+
+ movl %ecx, %edx
+ decl %ecx
+
+ subl $2, %edx
+ negl %ecx
+
+ shrl $UNROLL_LOG2, %edx
+ andl $UNROLL_MASK, %ecx
+
+ movl %edx, VAR_COUNTER
+ movl %ecx, %edx
+
+ C 15 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ shll $4, %edx
+ negl %ecx
+
+ leal L(entry) (%edx,%ecx,1), %edx
+')
+ movl (%esi), %eax C src low limb
+
+ movl %edx, VAR_JUMP
+ leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
+
+ mull %ebp
+
+ addl %ebx, %eax C initial carry (from _1c)
+ adcl $0, %edx
+
+ movl %edx, %ebx C high carry
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
+
+ movl VAR_JUMP, %edx
+ testl $1, %ecx
+ movl %eax, %ecx C low carry
+
+ cmovnz( %ebx, %ecx) C high,low carry other way around
+ cmovnz( %eax, %ebx)
+
+ jmp *%edx
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ shll $4, %edx
+ negl %ecx
+
+ C See README.family about old gas bugs
+ leal (%edx,%ecx,1), %edx
+ addl $L(entry)-L(here), %edx
+
+ addl (%esp), %edx
+
+ ret
+')
+
+
+C -----------------------------------------------------------
+ ALIGN(32)
+L(top):
+deflit(`FRAME',16)
+ C eax scratch
+ C ebx carry hi
+ C ecx carry lo
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+ C
+ C VAR_COUNTER loop counter
+ C
+ C 15 code bytes per limb
+
+ addl $UNROLL_BYTES, %edi
+
+L(entry):
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%esi), %eax)
+ mull %ebp
+Zdisp( M4_inst,%ecx, disp0,(%edi))
+ adcl %eax, %ebx
+ movl %edx, %ecx
+ adcl $0, %ecx
+
+ movl disp1(%esi), %eax
+ mull %ebp
+ M4_inst %ebx, disp1(%edi)
+ adcl %eax, %ecx
+ movl %edx, %ebx
+ adcl $0, %ebx
+')
+
+ decl VAR_COUNTER
+ leal UNROLL_BYTES(%esi), %esi
+
+ jns L(top)
+
+
+deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
+
+ M4_inst %ecx, disp0(%edi)
+ movl %ebx, %eax
+
+ popl %ebp
+ popl %edi
+
+ popl %esi
+ popl %ebx
+ adcl $0, %eax
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/p6/diveby3.asm b/rts/gmp/mpn/x86/p6/diveby3.asm
new file mode 100644
index 0000000000..a77703ea89
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/diveby3.asm
@@ -0,0 +1,37 @@
+dnl Intel P6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
+dnl
+dnl P6: 8.5 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl The P5 code runs well on P6, in fact better than anything else found so
+dnl far. An imul is 4 cycles, meaning the two cmp/sbbl pairs on the
+dnl dependent path are taking 4.5 cycles.
+dnl
+dnl The destination cache line prefetching is unnecessary on P6, but
+dnl removing it is a 2 cycle slowdown (approx), so it must be inducing
+dnl something good in the out of order execution.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_divexact_by3c)
+include_mpn(`x86/pentium/diveby3.asm')
diff --git a/rts/gmp/mpn/x86/p6/gmp-mparam.h b/rts/gmp/mpn/x86/p6/gmp-mparam.h
new file mode 100644
index 0000000000..d7bfb6d60c
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/gmp-mparam.h
@@ -0,0 +1,96 @@
+/* Intel P6 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 5 /* cycles */
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME 39 /* cycles */
+#endif
+
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 2 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-06. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 23
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 139
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 52
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 166
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 116
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 66
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 20
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 54
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 592, 1440, 2688, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 608
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 5888
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 656, 1504, 2944, 6656, 18432, 57344, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 672
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 5888
+#endif
diff --git a/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm b/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm
new file mode 100644
index 0000000000..f1b011b623
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/mmx/divrem_1.asm
@@ -0,0 +1,677 @@
+dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
+dnl
+dnl P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C
+C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
+C see that file for some comments. It's likely what's here can be improved.
+
+
+dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
+dnl
+dnl The different speeds of the integer and fraction parts means that using
+dnl xsize+size isn't quite right. The threshold wants to be a bit higher
+dnl for the integer part and a bit lower for the fraction part. (Or what's
+dnl really wanted is to speed up the integer part!)
+dnl
+dnl The threshold is set to make the integer part right. At 4 limbs the
+dnl div and mul are about the same there, but on the fractional part the
+dnl mul is much faster.
+
+deflit(MUL_THRESHOLD, 4)
+
+
+defframe(PARAM_CARRY, 24)
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC, 12)
+defframe(PARAM_XSIZE, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC, -28)
+defframe(VAR_DST, -32)
+defframe(VAR_DST_STOP,-36)
+
+deflit(STACK_SPACE, 36)
+
+ .text
+ ALIGN(16)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ leal -4(%edi,%ebx,4), %edi
+ jmp LF(mpn_divrem_1,start_1c)
+
+EPILOGUE()
+
+
+ C offset 0x31, close enough to aligned
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl $0, %edx C initial carry (if can't skip a div)
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ orl %ecx, %ecx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+ jz L(no_skip_div)
+
+ movl -4(%esi,%ecx,4), %eax C src high limb
+ cmpl %ebp, %eax C one less div if high<divisor
+ jnb L(no_skip_div)
+
+ movl $0, (%edi,%ecx,4) C dst high limb
+ decl %ecx C size-1
+ movl %eax, %edx C src high limb as initial carry
+L(no_skip_div):
+
+
+L(start_1c):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ leal (%ebx,%ecx), %eax C size+xsize
+ cmpl $MUL_THRESHOLD, %eax
+ jae L(mul_by_inverse)
+
+ orl %ecx, %ecx
+ jz L(divide_no_integer)
+
+L(divide_integer):
+ C eax scratch (quotient)
+ C ebx xsize
+ C ecx counter
+ C edx scratch (remainder)
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl %ebp
+
+ movl %eax, (%edi,%ecx,4)
+ decl %ecx
+ jnz L(divide_integer)
+
+
+L(divide_no_integer):
+ movl PARAM_DST, %edi
+ orl %ebx, %ebx
+ jnz L(divide_fraction)
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBX, %ebx
+ movl %edx, %eax
+
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+L(divide_fraction):
+ C eax scratch (quotient)
+ C ebx counter
+ C ecx
+ C edx scratch (remainder)
+ C esi
+ C edi dst
+ C ebp divisor
+
+ movl $0, %eax
+
+ divl %ebp
+
+ movl %eax, -4(%edi,%ebx,4)
+ decl %ebx
+ jnz L(divide_fraction)
+
+ jmp L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ leal 12(%edi), %ebx
+
+ movl %ebx, VAR_DST_STOP
+ leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
+
+ movl %edi, VAR_DST
+ movl %ecx, %ebx C size
+
+ bsrl %ebp, %ecx C 31-l
+ movl %edx, %edi C carry
+
+ leal 1(%ecx), %eax C 32-l
+ xorl $31, %ecx C l
+
+ movl %ecx, VAR_NORM
+ movl $-1, %edx
+
+ shll %cl, %ebp C d normalized
+ movd %eax, %mm7
+
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+ movl %eax, VAR_INVERSE
+ orl %ebx, %ebx C size
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ movl %eax, VAR_SRC
+ jz L(start_zero)
+
+ movl 8(%eax), %esi C src high limb
+ cmpl $1, %ebx
+ jz L(start_one)
+
+L(start_two_or_more):
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ cmpl $2, %ebx
+ je L(integer_two_left)
+ jmp L(integer_top)
+
+
+L(start_one):
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shll %cl, %esi C n10 = high << l
+ jmp L(integer_one_left)
+
+
+L(start_zero):
+ shll %cl, %edi C n2 = carry << l
+ movl $0, %esi C n10 = 0
+
+ C we're here because xsize+size>=MUL_THRESHOLD, so with size==0 then
+ C must have xsize!=0
+ jmp L(fraction_some)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C This loop runs at about 25 cycles, which is probably sub-optimal, and
+C certainly more than the dependent chain would suggest. A better loop, or
+C a better rough analysis of what's possible, would be welcomed.
+C
+C In the current implementation, the following successively dependent
+C micro-ops seem to exist.
+C
+C uops
+C n2+n1 1 (addl)
+C mul 5
+C q1+1 3 (addl/adcl)
+C mul 5
+C sub 3 (subl/sbbl)
+C addback 2 (cmov)
+C ---
+C 19
+C
+C Lack of registers hinders explicit scheduling and it might be that the
+C normal out of order execution isn't able to hide enough under the mul
+C latencies.
+C
+C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
+C cmov (and takes one uop off the dependent chain). A sarl/andl/addl
+C combination was tried for the addback (despite the fact it would lengthen
+C the dependent chain) but found to be no faster.
+
+
+ ALIGN(16)
+L(integer_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp d
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+ movl VAR_SRC, %ecx
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+ movq (%ecx), %mm0 C next src limb and the one below it
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ subl $4, %ecx
+
+ movl %ecx, VAR_SRC
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ movl %ebp, %eax C d
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff)
+
+ mull %ebx C (q1+1)*d
+
+ movl VAR_DST, %ecx
+ psrlq %mm7, %mm0
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+ movl VAR_DST_STOP, %eax
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ movd %mm0, %esi
+
+ sbbl $0, %ebx C q
+ subl $4, %ecx
+
+ movl %ebx, (%ecx)
+ cmpl %eax, %ecx
+
+ movl %ecx, VAR_DST
+ jne L(integer_top)
+
+
+L(integer_loop_done):
+
+
+C -----------------------------------------------------------------------------
+C
+C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
+C q1_ff special case. This makes the code a bit smaller and simpler, and
+C costs only 2 cycles (each).
+
+L(integer_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+ movl PARAM_SRC, %ecx
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd (%ecx), %mm0 C src low limb
+
+ movl VAR_DST_STOP, %ecx
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ movd %mm0, %esi
+
+ sbbl $0, %ebx C q
+
+ movl %ebx, -4(%ecx)
+
+
+C -----------------------------------------------------------------------------
+L(integer_one_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+ movl VAR_DST_STOP, %ecx
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx C q1 if q1+1 overflowed
+
+ mull %ebx
+
+ C
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+ movl PARAM_XSIZE, %eax
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+
+ sbbl $0, %ebx C q
+
+ movl %ebx, -8(%ecx)
+ subl $8, %ecx
+
+
+
+ orl %eax, %eax C xsize
+ jnz L(fraction_some)
+
+ movl %edi, %eax
+L(fraction_done):
+ movl VAR_NORM, %ecx
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EBX, %ebx
+ addl $STACK_SPACE, %esp
+
+ shrl %cl, %eax
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx
+ C edx
+ C esi n10
+ C edi n2
+ C ebp divisor
+
+ movl VAR_DST, %ecx
+ movl VAR_DST_STOP, %edx
+ subl $4, %ecx
+
+ movl %ecx, VAR_DST
+ psrlq %mm7, %mm0
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+
+ movl $-1, (%ecx)
+ movd %mm0, %esi C next n10
+
+ cmpl %ecx, %edx
+ jne L(integer_top)
+
+ jmp L(integer_loop_done)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C In the current implementation, the following successively dependent
+C micro-ops seem to exist.
+C
+C uops
+C mul 5
+C q1+1 1 (addl)
+C mul 5
+C sub 3 (negl/sbbl)
+C addback 2 (cmov)
+C ---
+C 16
+C
+C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for
+C the addback was found to be a touch slower.
+
+
+ ALIGN(16)
+L(fraction_some):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi
+ C edi carry
+ C ebp divisor
+
+ movl PARAM_DST, %esi
+ movl VAR_DST_STOP, %ecx
+ movl %edi, %eax
+
+ subl $8, %ecx
+
+
+ ALIGN(16)
+L(fraction_top):
+ C eax n2, then scratch
+ C ebx scratch (nadj, q1)
+ C ecx dst, decrementing
+ C edx scratch
+ C esi dst stop point
+ C edi n2
+ C ebp divisor
+
+ mull VAR_INVERSE C m*n2
+
+ movl %ebp, %eax C d
+ subl $4, %ecx C dst
+ leal 1(%edi), %ebx
+
+ C
+
+ C
+
+ C
+
+ addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
+
+ mull %ebx C (q1+1)*d
+
+ C
+
+ C
+
+ C
+
+ C
+
+ negl %eax C low of n - (q1+1)*d
+
+ sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
+ leal (%ebp,%eax), %edx
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+
+ sbbl $0, %ebx C q
+ movl %eax, %edi C remainder->n2
+ cmpl %esi, %ecx
+
+ movl %ebx, (%ecx) C previous q
+ jne L(fraction_top)
+
+
+ jmp L(fraction_done)
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/p6/mmx/mod_1.asm b/rts/gmp/mpn/x86/p6/mmx/mod_1.asm
new file mode 100644
index 0000000000..e7d8d94d33
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/mmx/mod_1.asm
@@ -0,0 +1,444 @@
+dnl Intel Pentium-II mpn_mod_1 -- mpn by limb remainder.
+dnl
+dnl P6MMX: 24.0 cycles/limb.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t carry);
+C
+C The code here very similar to mpn_divrem_1, but with the quotient
+C discarded. What's here probably isn't optimal.
+C
+C See mpn/x86/p6/mmx/divrem_1.asm and mpn/x86/k7/mmx/mod_1.asm for some
+C comments.
+
+
+dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
+dnl used, rather than plain "divl"s. Minimum value 2.
+
+deflit(MUL_THRESHOLD, 4)
+
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC_STOP,-28)
+
+deflit(STACK_SPACE, 28)
+
+ .text
+ ALIGN(16)
+
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ jmp LF(mpn_mod_1,start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+
+ movl $0, %edx C initial carry (if can't skip a div)
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ orl %ecx, %ecx
+ jz L(divide_done)
+
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+ cmpl %ebp, %eax C carry flag if high<divisor
+
+ cmovc( %eax, %edx) C src high limb as initial carry
+ sbbl $0, %ecx C size-1 to skip one div
+ jz L(divide_done)
+
+
+ ALIGN(16)
+L(start_1c):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ cmpl $MUL_THRESHOLD, %ecx
+ jae L(mul_by_inverse)
+
+
+ orl %ecx, %ecx
+ jz L(divide_done)
+
+
+L(divide_top):
+ C eax scratch (quotient)
+ C ebx
+ C ecx counter, limbs, decrementing
+ C edx scratch (remainder)
+ C esi src
+ C edi
+ C ebp
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl %ebp
+
+ decl %ecx
+ jnz L(divide_top)
+
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+ movl %edx, %eax
+
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ movl %ebx, SAVE_EBX
+ leal -4(%esi), %ebx
+
+ movl %ebx, VAR_SRC_STOP
+ movl %ecx, %ebx C size
+
+ movl %edi, SAVE_EDI
+ movl %edx, %edi C carry
+
+ bsrl %ebp, %ecx C 31-l
+ movl $-1, %edx
+
+ leal 1(%ecx), %eax C 32-l
+ xorl $31, %ecx C l
+
+ movl %ecx, VAR_NORM
+ shll %cl, %ebp C d normalized
+
+ movd %eax, %mm7
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+ C
+
+ movl %eax, VAR_INVERSE
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ movl 8(%eax), %esi C src high limb
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ movl %eax, %ecx C &src[size-3]
+
+
+ifelse(MUL_THRESHOLD,2,`
+ cmpl $2, %ebx
+ je L(inverse_two_left)
+')
+
+
+C The dependent chain here is the same as in mpn_divrem_1, but a few
+C instructions are saved by not needing to store the quotient limbs. This
+C gets it down to 24 c/l, which is still a bit away from a theoretical 19
+C c/l.
+
+ ALIGN(16)
+L(inverse_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx src pointer, decrementing
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movq (%ecx), %mm0 C next src limb and the one below it
+ subl $4, %ecx
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff)
+
+ mull %ebx C (q1+1)*d
+
+ psrlq %mm7, %mm0
+ movl VAR_SRC_STOP, %ebx
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ movd %mm0, %esi
+ cmpl %ebx, %ecx
+
+ jne L(inverse_top)
+
+
+L(inverse_loop_done):
+
+
+C -----------------------------------------------------------------------------
+
+L(inverse_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx &src[-1]
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 scratch (src dword)
+ C mm7 rshift
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd 4(%ecx), %mm0 C src low limb
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+ movl %ebp, %eax C d
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ movd %mm0, %esi
+
+
+C One limb left
+
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm0 src limb, shifted
+ C mm7 rshift
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movl VAR_NORM, %ecx C for final denorm
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2<<32 + m*(n2+n1))
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+ movl %ebp, %eax C d
+
+ mull %ebx C (q1+1)*d
+
+ movl SAVE_EBX, %ebx
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ leal (%ebp,%esi), %edx
+ movl SAVE_EBP, %ebp
+
+ movl %esi, %eax C remainder
+ movl SAVE_ESI, %esi
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+ movl SAVE_EDI, %edi
+
+ shrl %cl, %eax C denorm remainder
+ addl $STACK_SPACE, %esp
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx src pointer
+ C edx
+ C esi n10
+ C edi (n2)
+ C ebp divisor
+
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+ movl VAR_SRC_STOP, %edx
+ psrlq %mm7, %mm0
+
+ movd %mm0, %esi C next n10
+ cmpl %ecx, %edx
+ jne L(inverse_top)
+
+ jmp L(inverse_loop_done)
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/p6/mmx/popham.asm b/rts/gmp/mpn/x86/p6/mmx/popham.asm
new file mode 100644
index 0000000000..50f9a11218
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/mmx/popham.asm
@@ -0,0 +1,31 @@
+dnl Intel Pentium-II mpn_popcount, mpn_hamdist -- population count and
+dnl hamming distance.
+dnl
+dnl P6MMX: popcount 11 cycles/limb (approx), hamdist 11.5 cycles/limb
+dnl (approx)
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/rts/gmp/mpn/x86/p6/p3mmx/popham.asm b/rts/gmp/mpn/x86/p6/p3mmx/popham.asm
new file mode 100644
index 0000000000..e63fbf334b
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/p3mmx/popham.asm
@@ -0,0 +1,30 @@
+dnl Intel Pentium-III mpn_popcount, mpn_hamdist -- population count and
+dnl hamming distance.
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl Haven't actually measured it, but the K7 code with the psadbw should be
+dnl good on P-III.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k7/mmx/popham.asm')
diff --git a/rts/gmp/mpn/x86/p6/sqr_basecase.asm b/rts/gmp/mpn/x86/p6/sqr_basecase.asm
new file mode 100644
index 0000000000..174c78406a
--- /dev/null
+++ b/rts/gmp/mpn/x86/p6/sqr_basecase.asm
@@ -0,0 +1,641 @@
+dnl Intel P6 mpn_sqr_basecase -- square an mpn number.
+dnl
+dnl P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
+dnl product (measured on the speed difference between 20 and 40 limbs,
+dnl which is the Karatsuba recursing range).
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
+dnl a description. The only difference here is that UNROLL_COUNT can go up
+dnl to 64 (not 63) making KARATSUBA_SQR_THRESHOLD_MAX 67.
+
+deflit(KARATSUBA_SQR_THRESHOLD_MAX, 67)
+
+ifdef(`KARATSUBA_SQR_THRESHOLD_OVERRIDE',
+`define(`KARATSUBA_SQR_THRESHOLD',KARATSUBA_SQR_THRESHOLD_OVERRIDE)')
+
+m4_config_gmp_mparam(`KARATSUBA_SQR_THRESHOLD')
+deflit(UNROLL_COUNT, eval(KARATSUBA_SQR_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the given size
+C is small.
+C
+C The code size might look a bit excessive, but not all of it is executed so
+C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases
+C clearly apply only to those sizes; mid sizes like 10x10 only need part of
+C the unrolled addmul; and big sizes like 40x40 that do use the full
+C unrolling will at least be making good use of it, because 40x40 will take
+C something like 7000 cycles.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+
+ movl PARAM_SRC, %eax
+
+ cmpl $2, %edx
+ movl PARAM_DST, %ecx
+ je L(two_limbs)
+
+ movl (%eax), %eax
+ ja L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+ C eax src limb
+ C ebx
+ C ecx dst
+ C edx
+
+ mull %eax
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx dst
+ C edx
+
+defframe(SAVE_ESI, -4)
+defframe(SAVE_EBX, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(`STACK_SPACE',16)
+
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl %eax, %esi
+ movl (%eax), %eax
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx) C dst[0]
+ movl 4(%esi), %eax
+
+ movl %ebx, SAVE_EBX
+ movl %edx, %ebx C dst[1]
+
+ mull %eax C src[1]^2
+
+ movl %edi, SAVE_EDI
+ movl %eax, %edi C dst[2]
+ movl (%esi), %eax
+
+ movl %ebp, SAVE_EBP
+ movl %edx, %ebp C dst[3]
+
+ mull 4(%esi) C src[0]*src[1]
+
+ addl %eax, %ebx
+ movl SAVE_ESI, %esi
+
+ adcl %edx, %edi
+
+ adcl $0, %ebp
+ addl %ebx, %eax
+ movl SAVE_EBX, %ebx
+
+ adcl %edi, %edx
+ movl SAVE_EDI, %edi
+
+ adcl $0, %ebp
+
+ movl %eax, 4(%ecx)
+
+ movl %ebp, 12(%ecx)
+ movl SAVE_EBP, %ebp
+
+ movl %edx, 8(%ecx)
+ addl $FRAME, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(three_or_more):
+ C eax src low limb
+ C ebx
+ C ecx dst
+ C edx size
+deflit(`FRAME',0)
+
+ pushl %esi defframe_pushl(`SAVE_ESI')
+ cmpl $4, %edx
+
+ movl PARAM_SRC, %esi
+ jae L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+
+ C eax src low limb
+ C ebx
+ C ecx dst
+ C edx
+ C esi src
+ C edi
+ C ebp
+
+ pushl %ebp defframe_pushl(`SAVE_EBP')
+ pushl %edi defframe_pushl(`SAVE_EDI')
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ movl 4(%esi), %eax
+ xorl %ebp, %ebp
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl %edx, 12(%ecx)
+ movl 8(%esi), %eax
+
+ pushl %ebx defframe_pushl(`SAVE_EBX')
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl %edx, 20(%ecx)
+
+ movl (%esi), %eax
+
+ mull 4(%esi) C src[0] * src[1]
+
+ movl %eax, %ebx
+ movl %edx, %edi
+
+ movl (%esi), %eax
+
+ mull 8(%esi) C src[0] * src[2]
+
+ addl %eax, %edi
+ movl %edx, %ebp
+
+ adcl $0, %ebp
+ movl 4(%esi), %eax
+
+ mull 8(%esi) C src[1] * src[2]
+
+ xorl %esi, %esi
+ addl %eax, %ebp
+
+ C eax
+ C ebx dst[1]
+ C ecx dst
+ C edx dst[4]
+ C esi zero, will be dst[5]
+ C edi dst[2]
+ C ebp dst[3]
+
+ adcl $0, %edx
+ addl %ebx, %ebx
+
+ adcl %edi, %edi
+
+ adcl %ebp, %ebp
+
+ adcl %edx, %edx
+ movl 4(%ecx), %eax
+
+ adcl $0, %esi
+ addl %ebx, %eax
+
+ movl %eax, 4(%ecx)
+ movl 8(%ecx), %eax
+
+ adcl %edi, %eax
+ movl 12(%ecx), %ebx
+
+ adcl %ebp, %ebx
+ movl 16(%ecx), %edi
+
+ movl %eax, 8(%ecx)
+ movl SAVE_EBP, %ebp
+
+ movl %ebx, 12(%ecx)
+ movl SAVE_EBX, %ebx
+
+ adcl %edx, %edi
+ movl 20(%ecx), %eax
+
+ movl %edi, 16(%ecx)
+ movl SAVE_EDI, %edi
+
+ adcl %esi, %eax C no carry out of this
+ movl SAVE_ESI, %esi
+
+ movl %eax, 20(%ecx)
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+defframe(VAR_COUNTER,-20)
+defframe(VAR_JMP, -24)
+deflit(`STACK_SPACE',24)
+
+L(four_or_more):
+ C eax src low limb
+ C ebx
+ C ecx
+ C edx size
+ C esi src
+ C edi
+ C ebp
+deflit(`FRAME',4) dnl %esi already pushed
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+ subl $STACK_SPACE-FRAME, %esp
+deflit(`FRAME',STACK_SPACE)
+ movl $1, %ecx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebx, SAVE_EBX
+ subl %edx, %ecx C -(size-1)
+
+ movl %ebp, SAVE_EBP
+ movl $0, %ebx C initial carry
+
+ leal (%esi,%edx,4), %esi C &src[size]
+ movl %eax, %ebp C multiplier
+
+ leal -4(%edi,%edx,4), %edi C &dst[size-1]
+
+
+C This loop runs at just over 6 c/l.
+
+L(mul_1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, limbs, negative, -(size-1) to -1
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size-1]
+ C ebp multiplier
+
+ movl %ebp, %eax
+
+ mull (%esi,%ecx,4)
+
+ addl %ebx, %eax
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ movl %eax, 4(%edi,%ecx,4)
+
+ incl %ecx
+ jnz L(mul_1)
+
+
+ movl %ebx, 4(%edi)
+
+
+C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two addmuls, which are the bottom right corner of the product
+C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as mpn_addmul_1(), see that routine for some
+C comments.
+C
+C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+
+dnl This is also hard-coded in the address calculation below.
+deflit(CODE_BYTES_PER_LIMB, 15)
+
+dnl With &src[size] and &dst[size-1] pointers, the displacements in the
+dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
+dnl that an offset must be added to them.
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>32),1,
+eval((UNROLL_COUNT-32)*4),
+0))
+
+ C eax
+ C ebx carry
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[size-1]
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+
+ subl $4, %ecx
+ jz L(corner)
+
+ movl %ecx, %edx
+ negl %ecx
+
+ shll $4, %ecx
+ifelse(OFFSET,0,,`subl $OFFSET, %esi')
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+ negl %edx
+
+ifelse(OFFSET,0,,`subl $OFFSET, %edi')
+
+ C The calculated jump mustn't be before the start of the available
+ C code. This is the limit that UNROLL_COUNT puts on the src operand
+ C size, but checked here using the jump address directly.
+
+ ASSERT(ae,
+ `movl_text_address( L(unroll_inner_start), %eax)
+ cmpl %eax, %ecx')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll_outer_top):
+ C eax
+ C ebx high limb to store
+ C ecx VAR_JMP
+ C edx VAR_COUNTER, limbs, negative
+ C esi &src[size], constant
+ C edi dst ptr, second highest limb of last addmul
+ C ebp
+
+ movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier
+ movl %edx, VAR_COUNTER
+
+ movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand
+
+ mull %ebp
+
+define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')
+
+ testb $1, %cl
+
+ movl %edx, %ebx C high carry
+ leal 4(%edi), %edi
+
+ movl %ecx, %edx C jump
+
+ movl %eax, %ecx C low carry
+ leal CODE_BYTES_PER_LIMB(%edx), %edx
+
+ cmovX( %ebx, %ecx) C high carry reverse
+ cmovX( %eax, %ebx) C low carry reverse
+ movl %edx, VAR_JMP
+ jmp *%edx
+
+
+ C Must be on an even address here so the low bit of the jump address
+ C will indicate which way around ecx/ebx should start.
+
+ ALIGN(2)
+
+L(unroll_inner_start):
+ C eax scratch
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src pointer
+ C edi dst pointer
+ C ebp multiplier
+ C
+ C 15 code bytes each limb
+ C ecx/ebx reversed on each chunk
+
+forloop(`i', UNROLL_COUNT, 1, `
+ deflit(`disp_src', eval(-i*4 + OFFSET))
+ deflit(`disp_dst', eval(disp_src))
+
+ m4_assert(`disp_src>=-128 && disp_src<128')
+ m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp( movl, disp_src,(%esi), %eax)
+ mull %ebp
+Zdisp( addl, %ebx, disp_dst,(%edi))
+ adcl %eax, %ecx
+ movl %edx, %ebx
+ adcl $0, %ebx
+',`
+ dnl this one comes out last
+Zdisp( movl, disp_src,(%esi), %eax)
+ mull %ebp
+Zdisp( addl, %ecx, disp_dst,(%edi))
+ adcl %eax, %ebx
+ movl %edx, %ecx
+ adcl $0, %ecx
+')
+')
+L(unroll_inner_end):
+
+ addl %ebx, m4_empty_if_zero(OFFSET)(%edi)
+
+ movl VAR_COUNTER, %edx
+ adcl $0, %ecx
+
+ movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi)
+ movl VAR_JMP, %ecx
+
+ incl %edx
+ jnz L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+ addl $OFFSET, %esi
+ addl $OFFSET, %edi
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(corner):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[2*size-5]
+ C ebp
+
+ movl -12(%esi), %eax
+
+ mull -8(%esi)
+
+ addl %eax, (%edi)
+ movl -12(%esi), %eax
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+
+ mull -4(%esi)
+
+ addl %eax, %ebx
+ movl -8(%esi), %eax
+
+ adcl $0, %edx
+
+ addl %ebx, 4(%edi)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+
+ mull -4(%esi)
+
+ movl PARAM_SIZE, %ecx
+ addl %ebx, %eax
+
+ adcl $0, %edx
+
+ movl %eax, 8(%edi)
+
+ movl %edx, 12(%edi)
+ movl PARAM_DST, %edi
+
+
+C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
+
+ subl $1, %ecx C size-1
+ xorl %eax, %eax C ready for final adcl, and clear carry
+
+ movl %ecx, %edx
+ movl PARAM_SRC, %esi
+
+
+L(lshift):
+ C eax
+ C ebx
+ C ecx counter, size-1 to 1
+ C edx size-1 (for later use)
+ C esi src (for later use)
+ C edi dst, incrementing
+ C ebp
+
+ rcll 4(%edi)
+ rcll 8(%edi)
+
+ leal 8(%edi), %edi
+ decl %ecx
+ jnz L(lshift)
+
+
+ adcl %eax, %eax
+
+ movl %eax, 4(%edi) C dst most significant limb
+ movl (%esi), %eax C src[0]
+
+ leal 4(%esi,%edx,4), %esi C &src[size]
+ subl %edx, %ecx C -(size-1)
+
+
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+
+ mull %eax
+
+ movl %eax, (%edi,%ecx,8) C dst[0]
+
+
+L(diag):
+ C eax scratch
+ C ebx scratch
+ C ecx counter, negative
+ C edx carry
+ C esi &src[size]
+ C edi dst[2*size-2]
+ C ebp
+
+ movl (%esi,%ecx,4), %eax
+ movl %edx, %ebx
+
+ mull %eax
+
+ addl %ebx, 4(%edi,%ecx,8)
+ adcl %eax, 8(%edi,%ecx,8)
+ adcl $0, %edx
+
+ incl %ecx
+ jnz L(diag)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+
+ addl %edx, 4(%edi) C dst most significant limb
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ifdef(`PIC',`
+L(pic_calc):
+ addl (%esp), %ecx
+ addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
+ addl %edx, %ecx
+ ret
+')
+
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/README b/rts/gmp/mpn/x86/pentium/README
new file mode 100644
index 0000000000..3b9ec8ac6f
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/README
@@ -0,0 +1,77 @@
+
+ INTEL PENTIUM P5 MPN SUBROUTINES
+
+
+This directory contains mpn functions optimized for Intel Pentium (P5,P54)
+processors. The mmx subdirectory has code for Pentium with MMX (P55).
+
+
+STATUS
+
+ cycles/limb
+
+ mpn_add_n/sub_n 2.375
+
+ mpn_copyi/copyd 1.0
+
+ mpn_divrem_1 44.0
+ mpn_mod_1 44.0
+ mpn_divexact_by3 15.0
+
+ mpn_l/rshift 5.375 normal (6.0 on P54)
+ 1.875 special shift by 1 bit
+
+ mpn_mul_1 13.0
+ mpn_add/submul_1 14.0
+
+ mpn_mul_basecase 14.2 cycles/crossproduct (approx)
+
+ mpn_sqr_basecase 8 cycles/crossproduct (approx)
+ or 15.5 cycles/triangleproduct (approx)
+
+Pentium MMX gets the following improvements
+
+ mpn_l/rshift 1.75
+
+
+1. mpn_lshift and mpn_rshift run at about 6 cycles/limb on P5 and P54, but the
+documentation indicates that they should take only 43/8 = 5.375 cycles/limb,
+or 5 cycles/limb asymptotically. The P55 runs them at the expected speed.
+
+2. mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb. Due to loop
+overhead and other delays (cache refill?), they run at or near 2.5 cycles/limb.
+
+3. mpn_mul_1, mpn_addmul_1, mpn_submul_1 all run 1 cycle faster than they
+should. Intel documentation says a mul instruction is 10 cycles, but it
+measures 9 and the routines using it run with it as 9.
+
+
+
+RELEVANT OPTIMIZATION ISSUES
+
+1. Pentium doesn't allocate cache lines on writes, unlike most other modern
+processors. Since the functions in the mpn class do array writes, we have to
+handle allocating the destination cache lines by reading a word from it in the
+loops, to achieve the best performance.
+
+2. Pairing of memory operations requires that the two issued operations refer
+to different cache banks. The simplest way to ensure this is to read/write
+two words from the same object. If we make operations on different objects,
+they might or might not be to the same cache bank.
+
+
+
+REFERENCES
+
+"Intel Architecture Optimization Manual", 1997, order number 242816. This
+is mostly about P5, the parts about P6 aren't relevant. Available on-line:
+
+ http://download.intel.com/design/PentiumII/manuals/242816.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/rts/gmp/mpn/x86/pentium/aors_n.asm b/rts/gmp/mpn/x86/pentium/aors_n.asm
new file mode 100644
index 0000000000..a61082a456
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/aors_n.asm
@@ -0,0 +1,196 @@
+dnl Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+dnl
+dnl P5: 2.375 cycles/limb
+
+
+dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_add_n',`
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+
+',`ifdef(`OPERATION_sub_n',`
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(M4_function_nc)
+
+	pushl %edi		C save callee-saved registers
+	pushl %esi
+	pushl %ebx
+	pushl %ebp
+deflit(`FRAME',16)
+
+	movl PARAM_DST,%edi
+	movl PARAM_SRC1,%esi
+	movl PARAM_SRC2,%ebp
+	movl PARAM_SIZE,%ecx
+
+	movl (%ebp),%ebx	C src2 low limb
+
+	decl %ecx		C size-1
+	movl %ecx,%edx
+	shrl $3,%ecx		C (size-1)/8 unrolled chunks
+	andl $7,%edx		C (size-1)%8 leftover limbs
+	testl %ecx,%ecx		C zero carry flag
+	jz L(endgo)
+
+	pushl %edx
+FRAME_pushl()
+	movl PARAM_CARRY,%eax
+	shrl $1,%eax		C shift bit 0 into carry
+	jmp LF(M4_function_n,oop)	C share main unrolled loop
+
+L(endgo):
+deflit(`FRAME',16)
+	movl PARAM_CARRY,%eax
+	shrl $1,%eax		C shift bit 0 into carry
+	jmp LF(M4_function_n,end)	C share leftover-limb code
+
+EPILOGUE()
+
+
+	ALIGN(8)
+PROLOGUE(M4_function_n)
+
+	pushl %edi		C save callee-saved registers
+	pushl %esi
+	pushl %ebx
+	pushl %ebp
+deflit(`FRAME',16)
+
+	movl PARAM_DST,%edi
+	movl PARAM_SRC1,%esi
+	movl PARAM_SRC2,%ebp
+	movl PARAM_SIZE,%ecx
+
+	movl (%ebp),%ebx	C src2 low limb
+
+	decl %ecx		C size-1
+	movl %ecx,%edx
+	shrl $3,%ecx		C (size-1)/8 unrolled chunks
+	andl $7,%edx		C (size-1)%8 leftover limbs
+	testl %ecx,%ecx		C zero carry flag
+	jz L(end)
+	pushl %edx
+FRAME_pushl()
+
+	ALIGN(8)
+L(oop):	movl 28(%edi),%eax	C fetch destination cache line
+	leal 32(%edi),%edi
+
+L(1):	movl (%esi),%eax
+	movl 4(%esi),%edx
+	M4_inst %ebx,%eax	C add/sub src2 limb, propagating carry
+	movl 4(%ebp),%ebx
+	M4_inst %ebx,%edx
+	movl 8(%ebp),%ebx
+	movl %eax,-32(%edi)
+	movl %edx,-28(%edi)
+
+L(2):	movl 8(%esi),%eax
+	movl 12(%esi),%edx
+	M4_inst %ebx,%eax
+	movl 12(%ebp),%ebx
+	M4_inst %ebx,%edx
+	movl 16(%ebp),%ebx
+	movl %eax,-24(%edi)
+	movl %edx,-20(%edi)
+
+L(3):	movl 16(%esi),%eax
+	movl 20(%esi),%edx
+	M4_inst %ebx,%eax
+	movl 20(%ebp),%ebx
+	M4_inst %ebx,%edx
+	movl 24(%ebp),%ebx
+	movl %eax,-16(%edi)
+	movl %edx,-12(%edi)
+
+L(4):	movl 24(%esi),%eax
+	movl 28(%esi),%edx
+	M4_inst %ebx,%eax
+	movl 28(%ebp),%ebx
+	M4_inst %ebx,%edx
+	movl 32(%ebp),%ebx
+	movl %eax,-8(%edi)
+	movl %edx,-4(%edi)
+
+	leal 32(%esi),%esi	C leal preserves the carry flag
+	leal 32(%ebp),%ebp
+	decl %ecx
+	jnz L(oop)
+
+	popl %edx
+FRAME_popl()
+L(end):
+	decl %edx		C test %edx w/o clobbering carry
+	js L(end2)
+	incl %edx
+L(oop2):
+	leal 4(%edi),%edi
+	movl (%esi),%eax
+	M4_inst %ebx,%eax
+	movl 4(%ebp),%ebx
+	movl %eax,-4(%edi)
+	leal 4(%esi),%esi	C leal preserves the carry flag
+	leal 4(%ebp),%ebp
+	decl %edx
+	jnz L(oop2)
+L(end2):
+	movl (%esi),%eax	C final limb
+	M4_inst %ebx,%eax
+	movl %eax,(%edi)
+
+	sbbl %eax,%eax		C -1 if final carry/borrow, else 0
+	negl %eax		C return carry, 0 or 1
+
+	popl %ebp
+	popl %ebx
+	popl %esi
+	popl %edi
+	ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/aorsmul_1.asm b/rts/gmp/mpn/x86/pentium/aorsmul_1.asm
new file mode 100644
index 0000000000..147b55610f
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/aorsmul_1.asm
@@ -0,0 +1,99 @@
+dnl Intel Pentium mpn_addmul_1 -- mpn by limb multiplication.
+dnl
+dnl P5: 14.0 cycles/limb
+
+
+dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+ifdef(`OPERATION_addmul_1', `
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+
+',`ifdef(`OPERATION_submul_1', `
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(M4_function_1)
+
+	pushl %edi		C save callee-saved registers
+	pushl %esi
+	pushl %ebx
+	pushl %ebp
+deflit(`FRAME',16)
+
+	movl PARAM_DST, %edi
+	movl PARAM_SRC, %esi
+	movl PARAM_SIZE, %ecx
+	movl PARAM_MULTIPLIER, %ebp
+
+	leal (%edi,%ecx,4), %edi	C &dst[size]
+	leal (%esi,%ecx,4), %esi	C &src[size]
+	negl %ecx			C -size, counts up towards zero
+	xorl %ebx, %ebx			C carry limb 0, clears carry flag
+	ALIGN(8)
+
+L(oop):	adcl $0, %ebx		C fold carry flag from M4_inst into carry limb
+	movl (%esi,%ecx,4), %eax
+
+	mull %ebp		C edx:eax = src limb * multiplier
+
+	addl %ebx, %eax		C add carry limb to low product
+	movl (%edi,%ecx,4), %ebx	C dst limb
+
+	adcl $0, %edx
+	M4_inst %eax, %ebx	C add to or subtract from dst limb
+
+	movl %ebx, (%edi,%ecx,4)
+	incl %ecx
+
+	movl %edx, %ebx		C high product becomes next carry limb
+	jnz L(oop)
+
+	adcl $0, %ebx		C final carry flag from M4_inst
+	movl %ebx, %eax		C return carry limb
+	popl %ebp
+	popl %ebx
+	popl %esi
+	popl %edi
+	ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/diveby3.asm b/rts/gmp/mpn/x86/pentium/diveby3.asm
new file mode 100644
index 0000000000..dbac81642f
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/diveby3.asm
@@ -0,0 +1,183 @@
+dnl Intel P5 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
+dnl
+dnl P5: 15.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_divexact_by3c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t carry);
+
+defframe(PARAM_CARRY,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl multiplicative inverse of 3, modulo 2^32
+deflit(INVERSE_3, 0xAAAAAAAB)
+
+dnl ceil(b/3), ceil(b*2/3) and floor(b*2/3) where b=2^32
+deflit(ONE_THIRD_CEIL, 0x55555556)
+deflit(TWO_THIRDS_CEIL, 0xAAAAAAAB)
+deflit(TWO_THIRDS_FLOOR, 0xAAAAAAAA)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_divexact_by3c)
+deflit(`FRAME',0)
+
+	movl PARAM_SRC, %ecx
+	movl PARAM_SIZE, %edx
+
+	decl %edx		C size-1
+	jnz L(two_or_more)
+
+	movl (%ecx), %edx	C src low (only) limb
+	movl PARAM_CARRY, %eax	C risk of cache bank clash here
+
+	movl PARAM_DST, %ecx
+	subl %eax, %edx		C subtract carry-in
+
+	sbbl %eax, %eax		C 0 or -1
+
+	imull $INVERSE_3, %edx, %edx	C exact division via inverse mod 2^32
+
+	negl %eax		C 0 or 1
+	cmpl $ONE_THIRD_CEIL, %edx
+
+	sbbl $-1, %eax		C +1 if edx>=ceil(b/3)
+	cmpl $TWO_THIRDS_CEIL, %edx
+
+	sbbl $-1, %eax		C +1 if edx>=ceil(b*2/3)
+	movl %edx, (%ecx)
+
+	ret
+
+
+L(two_or_more):
+	C eax
+	C ebx
+	C ecx	src
+	C edx	size-1
+	C esi
+	C edi
+	C ebp
+
+	pushl %ebx	FRAME_pushl()
+	pushl %esi	FRAME_pushl()
+
+	pushl %edi	FRAME_pushl()
+	pushl %ebp	FRAME_pushl()
+
+	movl PARAM_DST, %edi
+	movl PARAM_CARRY, %esi
+
+	movl (%ecx), %eax	C src low limb
+	xorl %ebx, %ebx
+
+	sub %esi, %eax		C subtract carry-in
+	movl $TWO_THIRDS_FLOOR, %esi
+
+	leal (%ecx,%edx,4), %ecx	C &src[size-1]
+	leal (%edi,%edx,4), %edi	C &dst[size-1]
+
+	adcl $0, %ebx		C carry, 0 or 1
+	negl %edx		C -(size-1)
+
+
+C The loop needs a source limb ready at the top, which leads to one limb
+C handled separately at the end, and the special case above for size==1.
+C There doesn't seem to be any scheduling that would keep the speed but move
+C the source load and carry subtract up to the top.
+C
+C The destination cache line prefetching adds 1 cycle to the loop but is
+C considered worthwhile. The slowdown is a factor of 1.07, but will prevent
+C repeated write-throughs if the destination isn't in L1. A version using
+C an outer loop to prefetch only every 8 limbs (a cache line) proved to be
+C no faster, due to unavoidable branch mispredictions in the inner loop.
+C
+C setc is 2 cycles on P54, so an adcl is used instead. If the movl $0,%ebx
+C could be avoided then the src limb fetch could pair up and save a cycle.
+C This would probably mean going to a two limb loop with the carry limb
+C alternately positive or negative, since an sbbl %ebx,%ebx will leave a
+C value which is in the opposite sense to the preceding sbbl/adcl %ebx,%eax.
+C
+C A register is used for TWO_THIRDS_FLOOR because a cmp can't be done as
+C "cmpl %edx, $n" with the immediate as the second operand.
+C
+C The "4" source displacement is in the loop rather than the setup because
+C this gets L(top) aligned to 8 bytes at no cost.
+
+	ALIGN(8)
+L(top):
+	C eax	source limb, carry subtracted
+	C ebx	carry (0 or 1)
+	C ecx	&src[size-1]
+	C edx	counter, limbs, negative
+	C esi	TWO_THIRDS_FLOOR
+	C edi	&dst[size-1]
+	C ebp	scratch (result limb)
+
+	imull $INVERSE_3, %eax, %ebp	C quotient limb via inverse mod 2^32
+
+	cmpl $ONE_THIRD_CEIL, %ebp
+	movl (%edi,%edx,4), %eax	C dst cache line prefetch
+
+	sbbl $-1, %ebx		C +1 if ebp>=ceil(b/3)
+	cmpl %ebp, %esi
+
+	movl 4(%ecx,%edx,4), %eax	C next src limb
+
+	sbbl %ebx, %eax		C and further -1 if ebp>=ceil(b*2/3)
+	movl $0, %ebx
+
+	adcl $0, %ebx		C new carry
+	movl %ebp, (%edi,%edx,4)
+
+	incl %edx
+	jnz L(top)
+
+
+
+	imull $INVERSE_3, %eax, %edx	C last quotient limb
+
+	cmpl $ONE_THIRD_CEIL, %edx
+	movl %edx, (%edi)
+
+	sbbl $-1, %ebx		C +1 if edx>=ceil(b/3)
+	cmpl $TWO_THIRDS_CEIL, %edx
+
+	sbbl $-1, %ebx		C +1 if edx>=ceil(b*2/3)
+	popl %ebp
+
+	movl %ebx, %eax		C return carry limb
+	popl %edi
+
+	popl %esi
+	popl %ebx
+
+	ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/gmp-mparam.h b/rts/gmp/mpn/x86/pentium/gmp-mparam.h
new file mode 100644
index 0000000000..d3ed3d73ce
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/gmp-mparam.h
@@ -0,0 +1,97 @@
+/* Intel P54 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 9 /* cycles */
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME 41 /* cycles */
+#endif
+
+/* bsf takes 18-42 cycles; use an average for uniform random numbers */
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-06. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 14
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 179
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 22
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 153
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 46
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 110
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 13
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 4
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 25
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 496, 928, 1920, 4608, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 512
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 3840
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 496, 1184, 1920, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 512
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 3840
+#endif
diff --git a/rts/gmp/mpn/x86/pentium/lshift.asm b/rts/gmp/mpn/x86/pentium/lshift.asm
new file mode 100644
index 0000000000..e1e35d4c57
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/lshift.asm
@@ -0,0 +1,236 @@
+dnl Intel Pentium mpn_lshift -- mpn left shift.
+dnl
+dnl cycles/limb
+dnl P5,P54: 6.0
+dnl P55: 5.375
+
+
+dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_lshift)
+
+	pushl %edi		C save callee-saved registers
+	pushl %esi
+	pushl %ebx
+	pushl %ebp
+deflit(`FRAME',16)
+
+	movl PARAM_DST,%edi
+	movl PARAM_SRC,%esi
+	movl PARAM_SIZE,%ebp
+	movl PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+	cmp $1,%ecx
+	jne L(normal)
+	leal 4(%esi),%eax
+	cmpl %edi,%eax
+	jnc L(special)		C jump if s_ptr + 1 >= res_ptr
+	leal (%esi,%ebp,4),%eax
+	cmpl %eax,%edi
+	jnc L(special)		C jump if res_ptr >= s_ptr + size
+
+L(normal):
+	leal -4(%edi,%ebp,4),%edi	C &dst[size-1]
+	leal -4(%esi,%ebp,4),%esi	C &src[size-1]
+
+	movl (%esi),%edx
+	subl $4,%esi
+	xorl %eax,%eax
+	shldl(	%cl, %edx, %eax)	C compute carry limb
+	pushl %eax			C push carry limb onto stack
+
+	decl %ebp		C size-1
+	pushl %ebp
+	shrl $3,%ebp		C (size-1)/8 unrolled chunks
+	jz L(end)
+
+	movl (%edi),%eax	C fetch destination cache line
+
+	ALIGN(4)
+L(oop):	movl -28(%edi),%eax	C fetch destination cache line
+	movl %edx,%ebx
+
+	movl (%esi),%eax
+	movl -4(%esi),%edx
+	shldl(	%cl, %eax, %ebx)
+	shldl(	%cl, %edx, %eax)
+	movl %ebx,(%edi)
+	movl %eax,-4(%edi)
+
+	movl -8(%esi),%ebx
+	movl -12(%esi),%eax
+	shldl(	%cl, %ebx, %edx)
+	shldl(	%cl, %eax, %ebx)
+	movl %edx,-8(%edi)
+	movl %ebx,-12(%edi)
+
+	movl -16(%esi),%edx
+	movl -20(%esi),%ebx
+	shldl(	%cl, %edx, %eax)
+	shldl(	%cl, %ebx, %edx)
+	movl %eax,-16(%edi)
+	movl %edx,-20(%edi)
+
+	movl -24(%esi),%eax
+	movl -28(%esi),%edx
+	shldl(	%cl, %eax, %ebx)
+	shldl(	%cl, %edx, %eax)
+	movl %ebx,-24(%edi)
+	movl %eax,-28(%edi)
+
+	subl $32,%esi
+	subl $32,%edi
+	decl %ebp
+	jnz L(oop)
+
+L(end):	popl %ebp
+	andl $7,%ebp		C (size-1)%8 leftover limbs
+	jz L(end2)
+L(oop2):
+	movl (%esi),%eax
+	shldl( %cl,%eax,%edx)
+	movl %edx,(%edi)
+	movl %eax,%edx
+	subl $4,%esi
+	subl $4,%edi
+	decl %ebp
+	jnz L(oop2)
+
+L(end2):
+	shll %cl,%edx		C compute least significant limb
+	movl %edx,(%edi)	C store it
+
+	popl %eax		C pop carry limb
+
+	popl %ebp
+	popl %ebx
+	popl %esi
+	popl %edi
+	ret
+
+
+C We loop from least significant end of the arrays, which is only
+C permissible if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
+
+L(special):
+	movl (%esi),%edx
+	addl $4,%esi
+
+	decl %ebp		C size-1
+	pushl %ebp
+	shrl $3,%ebp		C (size-1)/8 unrolled chunks
+
+	addl %edx,%edx		C shift low limb left by 1, carry out in CF
+	incl %ebp
+	decl %ebp		C test %ebp w/o clobbering carry
+	jz L(Lend)
+
+	movl (%edi),%eax	C fetch destination cache line
+
+	ALIGN(4)
+L(Loop):
+	movl 28(%edi),%eax	C fetch destination cache line
+	movl %edx,%ebx
+
+	movl (%esi),%eax
+	movl 4(%esi),%edx
+	adcl %eax,%eax		C shift-by-1 via add-with-carry
+	movl %ebx,(%edi)
+	adcl %edx,%edx
+	movl %eax,4(%edi)
+
+	movl 8(%esi),%ebx
+	movl 12(%esi),%eax
+	adcl %ebx,%ebx
+	movl %edx,8(%edi)
+	adcl %eax,%eax
+	movl %ebx,12(%edi)
+
+	movl 16(%esi),%edx
+	movl 20(%esi),%ebx
+	adcl %edx,%edx
+	movl %eax,16(%edi)
+	adcl %ebx,%ebx
+	movl %edx,20(%edi)
+
+	movl 24(%esi),%eax
+	movl 28(%esi),%edx
+	adcl %eax,%eax
+	movl %ebx,24(%edi)
+	adcl %edx,%edx
+	movl %eax,28(%edi)
+
+	leal 32(%esi),%esi	C use leal not to clobber carry
+	leal 32(%edi),%edi
+	decl %ebp
+	jnz L(Loop)
+
+L(Lend):
+	popl %ebp
+	sbbl %eax,%eax		C save carry in %eax
+	andl $7,%ebp		C (size-1)%8 leftover limbs
+	jz L(Lend2)
+	addl %eax,%eax		C restore carry from eax
+L(Loop2):
+	movl %edx,%ebx
+	movl (%esi),%edx
+	adcl %edx,%edx
+	movl %ebx,(%edi)
+
+	leal 4(%esi),%esi	C use leal not to clobber carry
+	leal 4(%edi),%edi
+	decl %ebp
+	jnz L(Loop2)
+
+	jmp L(L1)
+L(Lend2):
+	addl %eax,%eax		C restore carry from eax
+L(L1):	movl %edx,(%edi)	C store last limb
+
+	sbbl %eax,%eax		C -1 if carry out, else 0
+	negl %eax		C return carry bit, 0 or 1
+
+	popl %ebp
+	popl %ebx
+	popl %esi
+	popl %edi
+	ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h b/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h
new file mode 100644
index 0000000000..2379077d0c
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mmx/gmp-mparam.h
@@ -0,0 +1,97 @@
+/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+
+#define BITS_PER_MP_LIMB 32
+#define BYTES_PER_MP_LIMB 4
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 32
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
+
+
+#ifndef UMUL_TIME
+#define UMUL_TIME 9 /* cycles */
+#endif
+#ifndef UDIV_TIME
+#define UDIV_TIME 41 /* cycles */
+#endif
+
+/* bsf takes 18-42 cycles; use an average for uniform random numbers */
+#ifndef COUNT_TRAILING_ZEROS_TIME
+#define COUNT_TRAILING_ZEROS_TIME 20 /* cycles */
+#endif
+
+
+/* Generated by tuneup.c, 2000-07-06. */
+
+#ifndef KARATSUBA_MUL_THRESHOLD
+#define KARATSUBA_MUL_THRESHOLD 14
+#endif
+#ifndef TOOM3_MUL_THRESHOLD
+#define TOOM3_MUL_THRESHOLD 99
+#endif
+
+#ifndef KARATSUBA_SQR_THRESHOLD
+#define KARATSUBA_SQR_THRESHOLD 22
+#endif
+#ifndef TOOM3_SQR_THRESHOLD
+#define TOOM3_SQR_THRESHOLD 89
+#endif
+
+#ifndef BZ_THRESHOLD
+#define BZ_THRESHOLD 40
+#endif
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 98
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD 13
+#endif
+
+#ifndef GCD_ACCEL_THRESHOLD
+#define GCD_ACCEL_THRESHOLD 5
+#endif
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD 25
+#endif
+
+#ifndef FFT_MUL_TABLE
+#define FFT_MUL_TABLE { 496, 1056, 1920, 4608, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_MUL_THRESHOLD
+#define FFT_MODF_MUL_THRESHOLD 512
+#endif
+#ifndef FFT_MUL_THRESHOLD
+#define FFT_MUL_THRESHOLD 3840
+#endif
+
+#ifndef FFT_SQR_TABLE
+#define FFT_SQR_TABLE { 496, 1184, 2176, 5632, 14336, 40960, 0 }
+#endif
+#ifndef FFT_MODF_SQR_THRESHOLD
+#define FFT_MODF_SQR_THRESHOLD 512
+#endif
+#ifndef FFT_SQR_THRESHOLD
+#define FFT_SQR_THRESHOLD 4352
+#endif
diff --git a/rts/gmp/mpn/x86/pentium/mmx/lshift.asm b/rts/gmp/mpn/x86/pentium/mmx/lshift.asm
new file mode 100644
index 0000000000..2225438658
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mmx/lshift.asm
@@ -0,0 +1,455 @@
+dnl Intel P5 mpn_lshift -- mpn left shift.
+dnl
+dnl P5: 1.75 cycles/limb.
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size left by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the right. Return the bits shifted out at the
+C left.
+C
+C The comments in mpn_rshift apply here too.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl minimum 5, because the unrolled loop can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_lshift)
+
+ pushl %ebx
+ pushl %edi
+deflit(`FRAME',8)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_DST, %edx
+
+ movl PARAM_SRC, %ebx
+ movl PARAM_SHIFT, %ecx
+
+ cmp $UNROLL_THRESHOLD, %eax
+ jae L(unroll)
+
+ movl -4(%ebx,%eax,4), %edi C src high limb
+ decl %eax
+
+ jnz L(simple)
+
+ shldl( %cl, %edi, %eax) C eax was decremented to zero
+
+ shll %cl, %edi
+
+ movl %edi, (%edx) C dst low limb
+ popl %edi C risk of data cache bank clash
+
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(simple):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx,%eax,4), %mm5 C src high limb
+
+ movd %ecx, %mm6 C lshift
+ negl %ecx
+
+ psllq %mm6, %mm5
+ addl $32, %ecx
+
+ movd %ecx, %mm7
+ psrlq $32, %mm5 C retval
+
+
+L(simple_top):
+ C eax counter, limbs, negative
+ C ebx src
+ C ecx
+ C edx dst
+ C esi
+ C edi
+ C
+ C mm0 scratch
+ C mm5 return value
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%ebx,%eax,4), %mm0
+ decl %eax
+
+ psrlq %mm7, %mm0
+
+ C
+
+ movd %mm0, 4(%edx,%eax,4)
+ jnz L(simple_top)
+
+
+ movd (%ebx), %mm0
+
+ movd %mm5, %eax
+ psllq %mm6, %mm0
+
+ popl %edi
+ popl %ebx
+
+ movd %mm0, (%edx)
+
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(unroll):
+ C eax size
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd -4(%ebx,%eax,4), %mm5 C src high limb
+ leal (%ebx,%eax,4), %edi
+
+ movd %ecx, %mm6 C lshift
+ andl $4, %edi
+
+ psllq %mm6, %mm5
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process high limb separately (marked xxx) to
+ C make it so.
+ C
+ C source -8(ebx,%eax,4)
+ C |
+ C +-------+-------+-------+--
+ C | |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+ C
+ C dest
+ C -4(edx,%eax,4)
+ C |
+ C +-------+-------+--
+ C | xxx | |
+ C +-------+-------+--
+
+ movq -8(%ebx,%eax,4), %mm0 C unaligned load
+
+ psllq %mm6, %mm0
+ decl %eax
+
+ psrlq $32, %mm0
+
+ C
+
+ movd %mm0, (%edx,%eax,4)
+L(start_src_aligned):
+
+ movq -8(%ebx,%eax,4), %mm1 C src high qword
+ leal (%edx,%eax,4), %edi
+
+ andl $4, %edi
+ psrlq $32, %mm5 C return value
+
+ movq -16(%ebx,%eax,4), %mm3 C src second highest qword
+ jz L(start_dst_aligned)
+
+ C dst isn't aligned, subtract 4 to make it so, and pretend the shift
+ C is 32 bits extra. High limb of dst (marked xxx) handled here
+ C separately.
+ C
+ C source -8(ebx,%eax,4)
+ C |
+ C +-------+-------+--
+ C | mm1 |
+ C +-------+-------+--
+ C 0mod8 4mod8
+ C
+ C dest
+ C -4(edx,%eax,4)
+ C |
+ C +-------+-------+-------+--
+ C | xxx | |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+
+ movq %mm1, %mm0
+ addl $32, %ecx C new shift
+
+ psllq %mm6, %mm0
+
+ movd %ecx, %mm6
+ psrlq $32, %mm0
+
+ C wasted cycle here waiting for %mm0
+
+ movd %mm0, -4(%edx,%eax,4)
+ subl $4, %edx
+L(start_dst_aligned):
+
+
+ psllq %mm6, %mm1
+ negl %ecx C -shift
+
+ addl $64, %ecx C 64-shift
+ movq %mm3, %mm2
+
+ movd %ecx, %mm7
+ subl $8, %eax C size-8
+
+ psrlq %mm7, %mm3
+
+ por %mm1, %mm3 C mm3 ready to store
+ jc L(finish)
+
+
+ C The comments in mpn_rshift apply here too.
+
+ ALIGN(8)
+L(unroll_loop):
+ C eax counter, limbs
+ C ebx src
+ C ecx
+ C edx dst
+ C esi
+ C edi
+ C
+ C mm0
+ C mm1
+ C mm2 src qword from 48(%ebx,%eax,4)
+ C mm3 dst qword ready to store to 56(%edx,%eax,4)
+ C
+ C mm5 return value
+ C mm6 lshift
+ C mm7 rshift
+
+ movq 8(%ebx,%eax,4), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ movq %mm3, 24(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq (%ebx,%eax,4), %mm3 C
+ psllq %mm6, %mm1 C
+
+ movq %mm0, 16(%edx,%eax,4)
+ movq %mm3, %mm2 C
+
+ psrlq %mm7, %mm3 C
+ subl $4, %eax
+
+ por %mm1, %mm3 C
+ jnc L(unroll_loop)
+
+
+
+L(finish):
+ C eax -4 to -1 representing respectively 0 to 3 limbs remaining
+
+ testb $2, %al
+
+ jz L(finish_no_two)
+
+ movq 8(%ebx,%eax,4), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ movq %mm3, 24(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq %mm1, %mm2
+ movq %mm0, %mm3
+
+ subl $2, %eax
+L(finish_no_two):
+
+
+ C eax -4 or -3 representing respectively 0 or 1 limbs remaining
+ C
+ C mm2 src prev qword, from 48(%ebx,%eax,4)
+ C mm3 dst qword, for 56(%edx,%eax,4)
+
+ testb $1, %al
+ movd %mm5, %eax C retval
+
+ popl %edi
+ jz L(finish_zero)
+
+
+ C One extra src limb, destination was aligned.
+ C
+ C source ebx
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edx+12 edx+4 edx
+ C --+---------------+---------------+-------+
+ C | mm3 | | |
+ C --+---------------+---------------+-------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C One extra src limb, destination was unaligned.
+ C
+ C source ebx
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edx+12 edx+4
+ C --+---------------+---------------+
+ C | mm3 | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword at 4(%edx), and in the aligned case
+ C there's an extra limb of dst to be formed from that extra src limb
+ C left shifted.
+
+
+ movd (%ebx), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm3, 12(%edx)
+ psllq $32, %mm0
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm2, %mm0
+ psllq %mm6, %mm1
+
+ movq %mm0, 4(%edx)
+ psrlq $32, %mm1
+
+ andl $32, %ecx
+ popl %ebx
+
+ jz L(finish_one_unaligned)
+
+ movd %mm1, (%edx)
+L(finish_one_unaligned):
+
+ emms
+
+ ret
+
+
+L(finish_zero):
+
+ C No extra src limbs, destination was aligned.
+ C
+ C source ebx
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edx+8 edx
+ C --+---------------+---------------+
+ C | mm3 | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C No extra src limbs, destination was unaligned.
+ C
+ C source ebx
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edx+8 edx+4
+ C --+---------------+-------+
+ C | mm3 | |
+ C --+---------------+-------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C The movd for the unaligned case writes the same data to 4(%edx)
+ C that the movq does for the aligned case.
+
+
+ movq %mm3, 8(%edx)
+ andl $32, %ecx
+
+ psllq %mm6, %mm2
+ jz L(finish_zero_unaligned)
+
+ movq %mm2, (%edx)
+L(finish_zero_unaligned):
+
+ psrlq $32, %mm2
+ popl %ebx
+
+ movd %mm5, %eax C retval
+
+ movd %mm2, 4(%edx)
+
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mmx/popham.asm b/rts/gmp/mpn/x86/pentium/mmx/popham.asm
new file mode 100644
index 0000000000..587a07ab3d
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mmx/popham.asm
@@ -0,0 +1,30 @@
+dnl Intel P55 mpn_popcount, mpn_hamdist -- population count and hamming
+dnl distance.
+dnl
+dnl P55: popcount 11.5 cycles/limb, hamdist 12.0 cycles/limb
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/rts/gmp/mpn/x86/pentium/mmx/rshift.asm b/rts/gmp/mpn/x86/pentium/mmx/rshift.asm
new file mode 100644
index 0000000000..7672630d57
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mmx/rshift.asm
@@ -0,0 +1,460 @@
+dnl Intel P5 mpn_rshift -- mpn right shift.
+dnl
+dnl P5: 1.75 cycles/limb.
+
+
+dnl Copyright (C) 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size right by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the left. Return the bits shifted out at the
+C right.
+C
+C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
+C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
+C
+C Full speed depends on source and destination being aligned. Unaligned mmx
+C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy
+C setups and finish-ups are done to ensure alignment for the loop.
+C
+C MMX shifts work out a bit faster even for the simple loop.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl Minimum 5, because the unrolled loop can't handle less.
+deflit(UNROLL_THRESHOLD, 5)
+
+ .text
+ ALIGN(8)
+
+PROLOGUE(mpn_rshift)
+
+ pushl %ebx
+ pushl %edi
+deflit(`FRAME',8)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_DST, %edx
+
+ movl PARAM_SRC, %ebx
+ movl PARAM_SHIFT, %ecx
+
+ cmp $UNROLL_THRESHOLD, %eax
+ jae L(unroll)
+
+ decl %eax
+ movl (%ebx), %edi C src low limb
+
+ jnz L(simple)
+
+ shrdl( %cl, %edi, %eax) C eax was decremented to zero
+
+ shrl %cl, %edi
+
+ movl %edi, (%edx) C dst low limb
+ popl %edi C risk of data cache bank clash
+
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(simple):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx), %mm5 C src[0]
+ leal (%ebx,%eax,4), %ebx C &src[size-1]
+
+ movd %ecx, %mm6 C rshift
+ leal -4(%edx,%eax,4), %edx C &dst[size-2]
+
+ psllq $32, %mm5
+ negl %eax
+
+
+C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
+C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4
+C cycles and would be 8 in a simple loop. Using mmx helps the return value
+C and last limb calculations too.
+
+L(simple_top):
+ C eax counter, limbs, negative
+ C ebx &src[size-1]
+ C ecx return value
+ C edx &dst[size-2]
+ C
+ C mm0 scratch
+ C mm5 return value
+ C mm6 shift
+
+ movq (%ebx,%eax,4), %mm0
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+ movd %mm0, (%edx,%eax,4)
+ jnz L(simple_top)
+
+
+ movd (%ebx), %mm0
+ psrlq %mm6, %mm5 C return value
+
+ psrlq %mm6, %mm0
+ popl %edi
+
+ movd %mm5, %eax
+ popl %ebx
+
+ movd %mm0, 4(%edx)
+
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(unroll):
+ C eax size
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx), %mm5 C src[0]
+ movl $4, %edi
+
+ movd %ecx, %mm6 C rshift
+ testl %edi, %ebx
+
+ psllq $32, %mm5
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process low limb separately (marked xxx) and
+ C step src and dst by one limb, making src aligned.
+ C
+ C source ebx
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+ C
+ C dest edx
+ C --+-------+-------+
+ C | | xxx |
+ C --+-------+-------+
+
+ movq (%ebx), %mm0 C unaligned load
+
+ psrlq %mm6, %mm0
+ addl $4, %ebx
+
+ decl %eax
+
+ movd %mm0, (%edx)
+ addl $4, %edx
+L(start_src_aligned):
+
+
+ movq (%ebx), %mm1
+ testl %edi, %edx
+
+ psrlq %mm6, %mm5 C retval
+ jz L(start_dst_aligned)
+
+ C dst isn't aligned, add 4 to make it so, and pretend the shift is
+ C 32 bits extra. Low limb of dst (marked xxx) handled here
+ C separately.
+ C
+ C source ebx
+ C --+-------+-------+
+ C | mm1 |
+ C --+-------+-------+
+ C 4mod8 0mod8
+ C
+ C dest edx
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+
+ movq %mm1, %mm0
+ addl $32, %ecx C new shift
+
+ psrlq %mm6, %mm0
+
+ movd %ecx, %mm6
+
+ movd %mm0, (%edx)
+ addl $4, %edx
+L(start_dst_aligned):
+
+
+ movq 8(%ebx), %mm3
+ negl %ecx
+
+ movq %mm3, %mm2 C mm2 src qword
+ addl $64, %ecx
+
+ movd %ecx, %mm7
+ psrlq %mm6, %mm1
+
+ leal -12(%ebx,%eax,4), %ebx
+ leal -20(%edx,%eax,4), %edx
+
+ psllq %mm7, %mm3
+ subl $7, %eax C size-7
+
+ por %mm1, %mm3 C mm3 ready to store
+ negl %eax C -(size-7)
+
+ jns L(finish)
+
+
+ C This loop is the important bit, the rest is just support. Careful
+ C instruction scheduling achieves the claimed 1.75 c/l. The
+ C relevant parts of the pairing rules are:
+ C
+ C - mmx loads and stores execute only in the U pipe
+ C - only one mmx shift in a pair
+ C - wait one cycle before storing an mmx register result
+ C - the usual address generation interlock
+ C
+ C Two qword calculations are slightly interleaved. The instructions
+ C marked "C" belong to the second qword, and the "C prev" one is for
+ C the second qword from the previous iteration.
+
+ ALIGN(8)
+L(unroll_loop):
+ C eax counter, limbs, negative
+ C ebx &src[size-12]
+ C ecx
+ C edx &dst[size-12]
+ C esi
+ C edi
+ C
+ C mm0
+ C mm1
+ C mm2 src qword from -8(%ebx,%eax,4)
+ C mm3 dst qword ready to store to -8(%edx,%eax,4)
+ C
+ C mm5 return value
+ C mm6 rshift
+ C mm7 lshift
+
+ movq (%ebx,%eax,4), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, -8(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq 8(%ebx,%eax,4), %mm3 C
+ psrlq %mm6, %mm1 C
+
+ movq %mm0, (%edx,%eax,4)
+ movq %mm3, %mm2 C
+
+ psllq %mm7, %mm3 C
+ addl $4, %eax
+
+ por %mm1, %mm3 C
+ js L(unroll_loop)
+
+
+L(finish):
+ C eax 0 to 3 representing respectively 3 to 0 limbs remaining
+
+ testb $2, %al
+
+ jnz L(finish_no_two)
+
+ movq (%ebx,%eax,4), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, -8(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq %mm1, %mm2
+ movq %mm0, %mm3
+
+ addl $2, %eax
+L(finish_no_two):
+
+
+ C eax 2 or 3 representing respectively 1 or 0 limbs remaining
+ C
+ C mm2 src prev qword, from -8(%ebx,%eax,4)
+ C mm3 dst qword, for -8(%edx,%eax,4)
+
+ testb $1, %al
+ popl %edi
+
+ movd %mm5, %eax C retval
+ jnz L(finish_zero)
+
+
+ C One extra limb, destination was aligned.
+ C
+ C source ebx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edx
+ C +-------+---------------+---------------+--
+ C | | | mm3 |
+ C +-------+---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C One extra limb, destination was unaligned.
+ C
+ C source ebx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edx
+ C +---------------+---------------+--
+ C | | mm3 |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword at 8(%edx), and in the aligned case
+ C there's a further extra limb of dst to be formed.
+
+
+ movd 8(%ebx), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, (%edx)
+ por %mm2, %mm0
+
+ psrlq %mm6, %mm1
+ andl $32, %ecx
+
+ popl %ebx
+ jz L(finish_one_unaligned)
+
+ C dst was aligned, must store one extra limb
+ movd %mm1, 16(%edx)
+L(finish_one_unaligned):
+
+ movq %mm0, 8(%edx)
+
+ emms
+
+ ret
+
+
+L(finish_zero):
+
+ C No extra limbs, destination was aligned.
+ C
+ C source ebx
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edx+4
+ C +---------------+---------------+--
+ C | | mm3 |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C No extra limbs, destination was unaligned.
+ C
+ C source ebx
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edx+4
+ C +-------+---------------+--
+ C | | mm3 |
+ C +-------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = 64-(shift+32)
+
+
+ C The movd for the unaligned case is clearly the same data as the
+ C movq for the aligned case, it's just a choice between whether one
+ C or two limbs should be written.
+
+
+ movq %mm3, 4(%edx)
+ psrlq %mm6, %mm2
+
+ movd %mm2, 12(%edx)
+ andl $32, %ecx
+
+ popl %ebx
+ jz L(finish_zero_unaligned)
+
+ movq %mm2, 12(%edx)
+L(finish_zero_unaligned):
+
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mul_1.asm b/rts/gmp/mpn/x86/pentium/mul_1.asm
new file mode 100644
index 0000000000..08639eca09
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mul_1.asm
@@ -0,0 +1,79 @@
+dnl Intel Pentium mpn_mul_1 -- mpn by limb multiplication.
+dnl
+dnl P5: 13.0 cycles/limb
+
+dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_mul_1)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST, %edi
+ movl PARAM_SRC, %esi
+ movl PARAM_SIZE, %ecx
+ movl PARAM_MULTIPLIER, %ebp
+
+ leal (%edi,%ecx,4), %edi
+ leal (%esi,%ecx,4), %esi
+ negl %ecx
+ xorl %ebx, %ebx
+ ALIGN(8)
+
+L(oop): adcl $0, %ebx
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %eax, %ebx
+
+ movl %ebx, (%edi,%ecx,4)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(oop)
+
+ adcl $0, %ebx
+ movl %ebx, %eax
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/mul_basecase.asm b/rts/gmp/mpn/x86/pentium/mul_basecase.asm
new file mode 100644
index 0000000000..d9f79a0831
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/mul_basecase.asm
@@ -0,0 +1,135 @@
+dnl Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication.
+dnl
+dnl P5: 14.2 cycles/crossproduct (approx)
+
+
+dnl Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+
+defframe(PARAM_YSIZE, 20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE, 12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+defframe(VAR_COUNTER, -4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_mul_basecase)
+
+ pushl %eax C dummy push for allocating stack slot
+ pushl %esi
+ pushl %ebp
+ pushl %edi
+deflit(`FRAME',16)
+
+ movl PARAM_XP,%esi
+ movl PARAM_WP,%edi
+ movl PARAM_YP,%ebp
+
+ movl (%esi),%eax C load xp[0]
+ mull (%ebp) C multiply by yp[0]
+ movl %eax,(%edi) C store to wp[0]
+ movl PARAM_XSIZE,%ecx C xsize
+ decl %ecx C If xsize = 1, ysize = 1 too
+ jz L(done)
+
+ movl PARAM_XSIZE,%eax
+ pushl %ebx
+FRAME_pushl()
+ movl %edx,%ebx
+ leal (%esi,%eax,4),%esi C make xp point at end
+ leal (%edi,%eax,4),%edi C offset wp by xsize
+ negl %ecx C negate j size/index for inner loop
+ xorl %eax,%eax C clear carry
+
+ ALIGN(8)
+L(oop1): adcl $0,%ebx
+ movl (%esi,%ecx,4),%eax C load next limb at xp[j]
+ mull (%ebp)
+ addl %ebx,%eax
+ movl %eax,(%edi,%ecx,4)
+ incl %ecx
+ movl %edx,%ebx
+ jnz L(oop1)
+
+ adcl $0,%ebx
+ movl PARAM_YSIZE,%eax
+ movl %ebx,(%edi) C most significant limb of product
+ addl $4,%edi C increment wp
+ decl %eax
+ jz L(skip)
+ movl %eax,VAR_COUNTER C set index i to ysize
+
+L(outer):
+ addl $4,%ebp C make ebp point to next y limb
+ movl PARAM_XSIZE,%ecx
+ negl %ecx
+ xorl %ebx,%ebx
+
+ C code at 0x61 here, close enough to aligned
+L(oop2):
+ adcl $0,%ebx
+ movl (%esi,%ecx,4),%eax
+ mull (%ebp)
+ addl %ebx,%eax
+ movl (%edi,%ecx,4),%ebx
+ adcl $0,%edx
+ addl %eax,%ebx
+ movl %ebx,(%edi,%ecx,4)
+ incl %ecx
+ movl %edx,%ebx
+ jnz L(oop2)
+
+ adcl $0,%ebx
+
+ movl %ebx,(%edi)
+ addl $4,%edi
+ movl VAR_COUNTER,%eax
+ decl %eax
+ movl %eax,VAR_COUNTER
+ jnz L(outer)
+
+L(skip):
+ popl %ebx
+ popl %edi
+ popl %ebp
+ popl %esi
+ addl $4,%esp
+ ret
+
+L(done):
+ movl %edx,4(%edi) C store to wp[1]
+ popl %edi
+ popl %ebp
+ popl %esi
+ popl %eax C dummy pop for deallocating stack slot
+ ret
+
+EPILOGUE()
+
diff --git a/rts/gmp/mpn/x86/pentium/rshift.asm b/rts/gmp/mpn/x86/pentium/rshift.asm
new file mode 100644
index 0000000000..e8f5ae8ec8
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/rshift.asm
@@ -0,0 +1,236 @@
+dnl Intel Pentium mpn_rshift -- mpn right shift.
+dnl
+dnl cycles/limb
+dnl P5,P54: 6.0
+dnl P55: 5.375
+
+
+dnl Copyright (C) 1992, 1994, 1995, 1996, 1999, 2000 Free Software
+dnl Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_rshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%ebp
+ movl PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+ cmp $1,%ecx
+ jne L(normal)
+ leal 4(%edi),%eax
+ cmpl %esi,%eax
+ jnc L(special) C jump if res_ptr + 1 >= s_ptr
+ leal (%edi,%ebp,4),%eax
+ cmpl %eax,%esi
+ jnc L(special) C jump if s_ptr >= res_ptr + size
+
+L(normal):
+ movl (%esi),%edx
+ addl $4,%esi
+ xorl %eax,%eax
+ shrdl( %cl, %edx, %eax) C compute carry limb
+ pushl %eax C push carry limb onto stack
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+ jz L(end)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(oop): movl 28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ shrdl( %cl, %eax, %ebx)
+ shrdl( %cl, %edx, %eax)
+ movl %ebx,(%edi)
+ movl %eax,4(%edi)
+
+ movl 8(%esi),%ebx
+ movl 12(%esi),%eax
+ shrdl( %cl, %ebx, %edx)
+ shrdl( %cl, %eax, %ebx)
+ movl %edx,8(%edi)
+ movl %ebx,12(%edi)
+
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ shrdl( %cl, %edx, %eax)
+ shrdl( %cl, %ebx, %edx)
+ movl %eax,16(%edi)
+ movl %edx,20(%edi)
+
+ movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ shrdl( %cl, %eax, %ebx)
+ shrdl( %cl, %edx, %eax)
+ movl %ebx,24(%edi)
+ movl %eax,28(%edi)
+
+ addl $32,%esi
+ addl $32,%edi
+ decl %ebp
+ jnz L(oop)
+
+L(end): popl %ebp
+ andl $7,%ebp
+ jz L(end2)
+L(oop2):
+ movl (%esi),%eax
+ shrdl( %cl,%eax,%edx) C compute result limb
+ movl %edx,(%edi)
+ movl %eax,%edx
+ addl $4,%esi
+ addl $4,%edi
+ decl %ebp
+ jnz L(oop2)
+
+L(end2):
+ shrl %cl,%edx C compute most significant limb
+ movl %edx,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+
+C We loop from least significant end of the arrays, which is only
+C permissible if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
+
+L(special):
+ leal -4(%edi,%ebp,4),%edi
+ leal -4(%esi,%ebp,4),%esi
+
+ movl (%esi),%edx
+ subl $4,%esi
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+
+ shrl %edx
+ incl %ebp
+ decl %ebp
+ jz L(Lend)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(Loop):
+ movl -28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl -4(%esi),%edx
+ rcrl %eax
+ movl %ebx,(%edi)
+ rcrl %edx
+ movl %eax,-4(%edi)
+
+ movl -8(%esi),%ebx
+ movl -12(%esi),%eax
+ rcrl %ebx
+ movl %edx,-8(%edi)
+ rcrl %eax
+ movl %ebx,-12(%edi)
+
+ movl -16(%esi),%edx
+ movl -20(%esi),%ebx
+ rcrl %edx
+ movl %eax,-16(%edi)
+ rcrl %ebx
+ movl %edx,-20(%edi)
+
+ movl -24(%esi),%eax
+ movl -28(%esi),%edx
+ rcrl %eax
+ movl %ebx,-24(%edi)
+ rcrl %edx
+ movl %eax,-28(%edi)
+
+ leal -32(%esi),%esi C use leal not to clobber carry
+ leal -32(%edi),%edi
+ decl %ebp
+ jnz L(Loop)
+
+L(Lend):
+ popl %ebp
+ sbbl %eax,%eax C save carry in %eax
+ andl $7,%ebp
+ jz L(Lend2)
+ addl %eax,%eax C restore carry from eax
+L(Loop2):
+ movl %edx,%ebx
+ movl (%esi),%edx
+ rcrl %edx
+ movl %ebx,(%edi)
+
+ leal -4(%esi),%esi C use leal not to clobber carry
+ leal -4(%edi),%edi
+ decl %ebp
+ jnz L(Loop2)
+
+ jmp L(L1)
+L(Lend2):
+ addl %eax,%eax C restore carry from eax
+L(L1): movl %edx,(%edi) C store last limb
+
+ movl $0,%eax
+ rcrl %eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/pentium/sqr_basecase.asm b/rts/gmp/mpn/x86/pentium/sqr_basecase.asm
new file mode 100644
index 0000000000..c8584df13c
--- /dev/null
+++ b/rts/gmp/mpn/x86/pentium/sqr_basecase.asm
@@ -0,0 +1,520 @@
+dnl Intel P5 mpn_sqr_basecase -- square an mpn number.
+dnl
+dnl P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
+dnl product at around 20x20 limbs.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Calculate src,size squared, storing the result in dst,2*size.
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the size is
+C small.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+ movl PARAM_SRC, %eax
+
+ cmpl $2, %edx
+ movl PARAM_DST, %ecx
+
+ je L(two_limbs)
+
+ movl (%eax), %eax
+ ja L(three_or_more)
+
+C -----------------------------------------------------------------------------
+C one limb only
+ C eax src
+ C ebx
+ C ecx dst
+ C edx
+
+ mull %eax
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ ret
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx dst
+ C edx size
+
+ pushl %ebp
+ pushl %edi
+
+ pushl %esi
+ pushl %ebx
+
+ movl %eax, %ebx
+ movl (%eax), %eax
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx) C dst[0]
+ movl %edx, %esi C dst[1]
+
+ movl 4(%ebx), %eax
+
+ mull %eax C src[1]^2
+
+ movl %eax, %edi C dst[2]
+ movl %edx, %ebp C dst[3]
+
+ movl (%ebx), %eax
+
+ mull 4(%ebx) C src[0]*src[1]
+
+ addl %eax, %esi
+ popl %ebx
+
+ adcl %edx, %edi
+
+ adcl $0, %ebp
+ addl %esi, %eax
+
+ adcl %edi, %edx
+ movl %eax, 4(%ecx)
+
+ adcl $0, %ebp
+ popl %esi
+
+ movl %edx, 8(%ecx)
+ movl %ebp, 12(%ecx)
+
+ popl %edi
+ popl %ebp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(three_or_more):
+ C eax src low limb
+ C ebx
+ C ecx dst
+ C edx size
+
+ cmpl $4, %edx
+ pushl %ebx
+deflit(`FRAME',4)
+
+ movl PARAM_SRC, %ebx
+ jae L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+ C eax src low limb
+ C ebx src
+ C ecx dst
+ C edx size
+
+ pushl %ebp
+ pushl %edi
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ movl 4(%ebx), %eax
+ xorl %ebp, %ebp
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl %edx, 12(%ecx)
+
+ movl 8(%ebx), %eax
+ pushl %esi C risk of cache bank clash
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl %edx, 20(%ecx)
+
+ movl (%ebx), %eax
+
+ mull 4(%ebx) C src[0] * src[1]
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ movl (%ebx), %eax
+
+ mull 8(%ebx) C src[0] * src[2]
+
+ addl %eax, %edi
+ movl %edx, %ebp
+
+ adcl $0, %ebp
+ movl 4(%ebx), %eax
+
+ mull 8(%ebx) C src[1] * src[2]
+
+ xorl %ebx, %ebx
+ addl %eax, %ebp
+
+ C eax
+ C ebx zero, will be dst[5]
+ C ecx dst
+ C edx dst[4]
+ C esi dst[1]
+ C edi dst[2]
+ C ebp dst[3]
+
+ adcl $0, %edx
+ addl %esi, %esi
+
+ adcl %edi, %edi
+
+ adcl %ebp, %ebp
+
+ adcl %edx, %edx
+ movl 4(%ecx), %eax
+
+ adcl $0, %ebx
+ addl %esi, %eax
+
+ movl %eax, 4(%ecx)
+ movl 8(%ecx), %eax
+
+ adcl %edi, %eax
+ movl 12(%ecx), %esi
+
+ adcl %ebp, %esi
+ movl 16(%ecx), %edi
+
+ movl %eax, 8(%ecx)
+ movl %esi, 12(%ecx)
+
+ adcl %edx, %edi
+ popl %esi
+
+ movl 20(%ecx), %eax
+ movl %edi, 16(%ecx)
+
+ popl %edi
+ popl %ebp
+
+ adcl %ebx, %eax C no carry out of this
+ popl %ebx
+
+ movl %eax, 20(%ecx)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(four_or_more):
+ C eax src low limb
+ C ebx src
+ C ecx dst
+ C edx size
+ C esi
+ C edi
+ C ebp
+ C
+ C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+deflit(`FRAME',4)
+
+ pushl %edi
+FRAME_pushl()
+ pushl %esi
+FRAME_pushl()
+
+ pushl %ebp
+FRAME_pushl()
+ leal (%ecx,%edx,4), %edi C dst end of this mul1
+
+ leal (%ebx,%edx,4), %esi C src end
+ movl %ebx, %ebp C src
+
+ negl %edx C -size
+ xorl %ebx, %ebx C clear carry limb and carry flag
+
+ leal 1(%edx), %ecx C -(size-1)
+
+L(mul1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, negative
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp src
+
+ adcl $0, %ebx
+ movl (%esi,%ecx,4), %eax
+
+ mull (%ebp)
+
+ addl %eax, %ebx
+
+ movl %ebx, (%edi,%ecx,4)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(mul1)
+
+
+ C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
+ C n=1..size-2.
+ C
+ C The last two products, which are the end corner of the product
+ C triangle, are handled separately to save looping overhead. These
+ C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
+ C If size is 4 then it's only these that need to be done.
+ C
+ C In the outer loop %esi is a constant, and %edi just advances by 1
+ C limb each time. The size of the operation decreases by 1 limb
+ C each time.
+
+ C eax
+ C ebx carry (needing carry flag added)
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+
+ adcl $0, %ebx
+ movl PARAM_SIZE, %edx
+
+ movl %ebx, (%edi)
+ subl $4, %edx
+
+ negl %edx
+ jz L(corner)
+
+
+L(outer):
+ C ebx previous carry limb to store
+ C edx outer loop counter (negative)
+ C esi &src[size]
+ C edi dst, pointing at stored carry limb of previous loop
+
+ pushl %edx C new outer loop counter
+ leal -2(%edx), %ecx
+
+ movl %ebx, (%edi)
+ addl $4, %edi
+
+ addl $4, %ebp
+ xorl %ebx, %ebx C initial carry limb, clear carry flag
+
+L(inner):
+ C eax scratch
+ C ebx carry (needing carry flag added)
+ C ecx counter, negative
+ C edx scratch
+ C esi &src[size]
+ C edi dst end of this addmul
+ C ebp &src[j]
+
+ adcl $0, %ebx
+ movl (%esi,%ecx,4), %eax
+
+ mull (%ebp)
+
+ addl %ebx, %eax
+ movl (%edi,%ecx,4), %ebx
+
+ adcl $0, %edx
+ addl %eax, %ebx
+
+ movl %ebx, (%edi,%ecx,4)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(inner)
+
+
+ adcl $0, %ebx
+ popl %edx C outer loop counter
+
+ incl %edx
+ jnz L(outer)
+
+
+ movl %ebx, (%edi)
+
+L(corner):
+ C esi &src[size]
+ C edi &dst[2*size-4]
+
+ movl -8(%esi), %eax
+ movl -4(%edi), %ebx C risk of data cache bank clash here
+
+ mull -12(%esi) C src[size-2]*src[size-3]
+
+ addl %eax, %ebx
+ movl %edx, %ecx
+
+ adcl $0, %ecx
+ movl -4(%esi), %eax
+
+ mull -12(%esi) C src[size-1]*src[size-3]
+
+ addl %ecx, %eax
+ movl (%edi), %ecx
+
+ adcl $0, %edx
+ movl %ebx, -4(%edi)
+
+ addl %eax, %ecx
+ movl %edx, %ebx
+
+ adcl $0, %ebx
+ movl -4(%esi), %eax
+
+ mull -8(%esi) C src[size-1]*src[size-2]
+
+ movl %ecx, 0(%edi)
+ addl %eax, %ebx
+
+ adcl $0, %edx
+ movl PARAM_SIZE, %eax
+
+ negl %eax
+ movl %ebx, 4(%edi)
+
+ addl $1, %eax C -(size-1) and clear carry
+ movl %edx, 8(%edi)
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+L(lshift):
+ C eax counter, negative
+ C ebx next limb
+ C ecx
+ C edx
+ C esi
+ C edi &dst[2*size-4]
+ C ebp
+
+ movl 12(%edi,%eax,8), %ebx
+
+ rcll %ebx
+ movl 16(%edi,%eax,8), %ecx
+
+ rcll %ecx
+ movl %ebx, 12(%edi,%eax,8)
+
+ movl %ecx, 16(%edi,%eax,8)
+ incl %eax
+
+ jnz L(lshift)
+
+
+ adcl %eax, %eax C high bit out
+ movl PARAM_SRC, %esi
+
+ movl PARAM_SIZE, %ecx C risk of cache bank clash
+ movl %eax, 12(%edi) C dst most significant limb
+
+
+C -----------------------------------------------------------------------------
+C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+ movl (%esi), %eax C src[0]
+ leal (%esi,%ecx,4), %esi C src end
+
+ negl %ecx
+
+ mull %eax
+
+ movl %eax, 16(%edi,%ecx,8) C dst[0]
+ movl %edx, %ebx
+
+ addl $1, %ecx C size-1 and clear carry
+
+L(diag):
+ C eax scratch (low product)
+ C ebx carry limb
+ C ecx counter, negative
+ C edx scratch (high product)
+ C esi &src[size]
+ C edi &dst[2*size-4]
+ C ebp scratch (fetched dst limbs)
+
+ movl (%esi,%ecx,4), %eax
+ adcl $0, %ebx
+
+ mull %eax
+
+ movl 16-4(%edi,%ecx,8), %ebp
+
+ addl %ebp, %ebx
+ movl 16(%edi,%ecx,8), %ebp
+
+ adcl %eax, %ebp
+ movl %ebx, 16-4(%edi,%ecx,8)
+
+ movl %ebp, 16(%edi,%ecx,8)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(diag)
+
+
+ adcl $0, %edx
+ movl 16-4(%edi), %eax C dst most significant limb
+
+ addl %eax, %edx
+ popl %ebp
+
+ movl %edx, 16-4(%edi)
+ popl %esi C risk of cache bank clash
+
+ popl %edi
+ popl %ebx
+
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/rshift.asm b/rts/gmp/mpn/x86/rshift.asm
new file mode 100644
index 0000000000..c9881fd966
--- /dev/null
+++ b/rts/gmp/mpn/x86/rshift.asm
@@ -0,0 +1,92 @@
+dnl x86 mpn_rshift -- mpn right shift.
+
+dnl Copyright (C) 1992, 1994, 1996, 1999, 2000 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ .text
+ ALIGN(8)
+PROLOGUE(mpn_rshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+deflit(`FRAME',12)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%edx
+ movl PARAM_SHIFT,%ecx
+
+ leal -4(%edi,%edx,4),%edi
+ leal (%esi,%edx,4),%esi
+ negl %edx
+
+ movl (%esi,%edx,4),%ebx C read least significant limb
+ xorl %eax,%eax
+ shrdl( %cl, %ebx, %eax) C compute carry limb
+ incl %edx
+ jz L(end)
+ pushl %eax C push carry limb onto stack
+ testb $1,%dl
+ jnz L(1) C enter loop in the middle
+ movl %ebx,%eax
+
+ ALIGN(8)
+L(oop): movl (%esi,%edx,4),%ebx C load next higher limb
+ shrdl( %cl, %ebx, %eax) C compute result limb
+ movl %eax,(%edi,%edx,4) C store it
+ incl %edx
+L(1): movl (%esi,%edx,4),%eax
+ shrdl( %cl, %eax, %ebx)
+ movl %ebx,(%edi,%edx,4)
+ incl %edx
+ jnz L(oop)
+
+ shrl %cl,%eax C compute most significant limb
+ movl %eax,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+L(end): shrl %cl,%ebx C compute most significant limb
+ movl %ebx,(%edi) C store it
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/udiv.asm b/rts/gmp/mpn/x86/udiv.asm
new file mode 100644
index 0000000000..9fe022b107
--- /dev/null
+++ b/rts/gmp/mpn/x86/udiv.asm
@@ -0,0 +1,44 @@
+dnl x86 mpn_udiv_qrnnd -- 2 by 1 limb division
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_udiv_qrnnd (mp_limb_t *remptr, mp_limb_t high, mp_limb_t low,
+C mp_limb_t divisor);
+
+defframe(PARAM_DIVISOR, 16)
+defframe(PARAM_LOW, 12)
+defframe(PARAM_HIGH, 8)
+defframe(PARAM_REMPTR, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_udiv_qrnnd)
+deflit(`FRAME',0)
+ movl PARAM_LOW, %eax
+ movl PARAM_HIGH, %edx
+ divl PARAM_DIVISOR
+ movl PARAM_REMPTR, %ecx
+ movl %edx, (%ecx)
+ ret
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/umul.asm b/rts/gmp/mpn/x86/umul.asm
new file mode 100644
index 0000000000..3d289d1784
--- /dev/null
+++ b/rts/gmp/mpn/x86/umul.asm
@@ -0,0 +1,43 @@
+dnl mpn_umul_ppmm -- 1x1->2 limb multiplication
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2);
+C
+
+defframe(PARAM_M2, 12)
+defframe(PARAM_M1, 8)
+defframe(PARAM_LOWPTR, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_umul_ppmm)
+deflit(`FRAME',0)
+ movl PARAM_LOWPTR, %ecx
+ movl PARAM_M1, %eax
+ mull PARAM_M2
+ movl %eax, (%ecx)
+ movl %edx, %eax
+ ret
+EPILOGUE()
diff --git a/rts/gmp/mpn/x86/x86-defs.m4 b/rts/gmp/mpn/x86/x86-defs.m4
new file mode 100644
index 0000000000..2dad698002
--- /dev/null
+++ b/rts/gmp/mpn/x86/x86-defs.m4
@@ -0,0 +1,713 @@
+divert(-1)
+
+dnl m4 macros for x86 assembler.
+
+
+dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 2.1 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public
+dnl License along with the GNU MP Library; see the file COPYING.LIB. If
+dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
+dnl Suite 330, Boston, MA 02111-1307, USA.
+
+
+dnl Notes:
+dnl
+dnl m4 isn't perfect for processing BSD style x86 assembler code, the main
+dnl problems are,
+dnl
+dnl 1. Doing define(foo,123) and then using foo in an addressing mode like
+dnl foo(%ebx) expands as a macro rather than a constant. This is worked
+dnl around by using deflit() from asm-defs.m4, instead of define().
+dnl
+dnl 2. Immediates in macro definitions need a space or `' to stop the $
+dnl looking like a macro parameter. For example,
+dnl
+dnl define(foo, `mov $ 123, %eax')
+dnl
+dnl This is only a problem in macro definitions, not in ordinary text,
+dnl nor in macro parameters like text passed to forloop() or ifdef().
+
+
+deflit(BYTES_PER_MP_LIMB, 4)
+
+
+dnl --------------------------------------------------------------------------
+dnl Replacement PROLOGUE/EPILOGUE with more sophisticated error checking.
+dnl Nesting and overlapping not allowed.
+dnl
+
+
+dnl Usage: PROLOGUE(functionname)
+dnl
+dnl Generate a function prologue. functionname gets GSYM_PREFIX added.
+dnl Examples,
+dnl
+dnl PROLOGUE(mpn_add_n)
+dnl PROLOGUE(somefun)
+
+define(`PROLOGUE',
+m4_assert_numargs(1)
+m4_assert_defined(`PROLOGUE_cpu')
+`ifdef(`PROLOGUE_current_function',
+`m4_error(`PROLOGUE'(`PROLOGUE_current_function') needs an `EPILOGUE'() before `PROLOGUE'($1)
+)')dnl
+m4_file_seen()dnl
+define(`PROLOGUE_current_function',`$1')dnl
+PROLOGUE_cpu(GSYM_PREFIX`'$1)')
+
+
+dnl Usage: EPILOGUE()
+dnl
+dnl Notice the function name is passed to EPILOGUE_cpu(), letting it use $1
+dnl instead of the long PROLOGUE_current_function symbol.
+
+define(`EPILOGUE',
+m4_assert_numargs(0)
+m4_assert_defined(`EPILOGUE_cpu')
+`ifdef(`PROLOGUE_current_function',,
+`m4_error(`EPILOGUE'() with no `PROLOGUE'()
+)')dnl
+EPILOGUE_cpu(GSYM_PREFIX`'PROLOGUE_current_function)`'dnl
+undefine(`PROLOGUE_current_function')')
+
+m4wrap_prepend(
+`ifdef(`PROLOGUE_current_function',
+`m4_error(`EPILOGUE() for PROLOGUE('PROLOGUE_current_function`) never seen
+')')')
+
+
+dnl Usage: PROLOGUE_assert_inside()
+dnl
+dnl Use this unquoted on a line on its own at the start of a macro
+dnl definition to add some code to check the macro is only used inside a
+dnl PROLOGUE/EPILOGUE pair, and that hence PROLOGUE_current_function is
+dnl defined.
+
+define(PROLOGUE_assert_inside,
+m4_assert_numargs(0)
+``PROLOGUE_assert_inside_internal'(m4_doublequote($`'0))`dnl '')
+
+define(PROLOGUE_assert_inside_internal,
+m4_assert_numargs(1)
+`ifdef(`PROLOGUE_current_function',,
+`m4_error(`$1 used outside a PROLOGUE / EPILOGUE pair
+')')')
+
+
+dnl Usage: L(labelname)
+dnl LF(functionname,labelname)
+dnl
+dnl Generate a local label in the current or given function. For LF(),
+dnl functionname gets GSYM_PREFIX added, the same as with PROLOGUE().
+dnl
+dnl For example, in a function mpn_add_n (and with MPN_PREFIX __gmpn),
+dnl
+dnl L(bar) => L__gmpn_add_n__bar
+dnl LF(somefun,bar) => Lsomefun__bar
+dnl
+dnl The funtion name and label name get two underscores between them rather
+dnl than one to guard against clashing with a separate external symbol that
+dnl happened to be called functionname_labelname. (Though this would only
+dnl happen if the local label prefix is is empty.) Underscores are used so
+dnl the whole label will still be a valid C identifier and so can be easily
+dnl used in gdb.
+
+dnl LSYM_PREFIX can be L$, so defn() is used to prevent L expanding as the
+dnl L macro and making an infinite recursion.
+define(LF,
+m4_assert_numargs(2)
+m4_assert_defined(`LSYM_PREFIX')
+`defn(`LSYM_PREFIX')GSYM_PREFIX`'$1`'__$2')
+
+define(`L',
+m4_assert_numargs(1)
+PROLOGUE_assert_inside()
+`LF(PROLOGUE_current_function,`$1')')
+
+
+dnl Called: PROLOGUE_cpu(gsym)
+dnl EPILOGUE_cpu(gsym)
+
+define(PROLOGUE_cpu,
+m4_assert_numargs(1)
+ `GLOBL $1
+ TYPE($1,`function')
+$1:')
+
+define(EPILOGUE_cpu,
+m4_assert_numargs(1)
+` SIZE($1,.-$1)')
+
+
+
+dnl --------------------------------------------------------------------------
+dnl Various x86 macros.
+dnl
+
+
+dnl Usage: ALIGN_OFFSET(bytes,offset)
+dnl
+dnl Align to `offset' away from a multiple of `bytes'.
+dnl
+dnl This is useful for testing, for example align to something very strict
+dnl and see what effect offsets from it have, "ALIGN_OFFSET(256,32)".
+dnl
+dnl Generally you wouldn't execute across the padding, but it's done with
+dnl nop's so it'll work.
+
+define(ALIGN_OFFSET,
+m4_assert_numargs(2)
+`ALIGN($1)
+forloop(`i',1,$2,` nop
+')')
+
+
+dnl Usage: defframe(name,offset)
+dnl
+dnl Make a definition like the following with which to access a parameter
+dnl or variable on the stack.
+dnl
+dnl define(name,`FRAME+offset(%esp)')
+dnl
+dnl Actually m4_empty_if_zero(FRAME+offset) is used, which will save one
+dnl byte if FRAME+offset is zero, by putting (%esp) rather than 0(%esp).
+dnl Use define(`defframe_empty_if_zero_disabled',1) if for some reason the
+dnl zero offset is wanted.
+dnl
+dnl The new macro also gets a check that when it's used FRAME is actually
+dnl defined, and that the final %esp offset isn't negative, which would
+dnl mean an attempt to access something below the current %esp.
+dnl
+dnl deflit() is used rather than a plain define(), so the new macro won't
+dnl delete any following parenthesized expression. name(%edi) will come
+dnl out say as 16(%esp)(%edi). This isn't valid assembler and should
+dnl provoke an error, which is better than silently giving just 16(%esp).
+dnl
+dnl See README.family for more on the suggested way to access the stack
+dnl frame.
+
+define(defframe,
+m4_assert_numargs(2)
+`deflit(`$1',
+m4_assert_defined(`FRAME')
+`defframe_check_notbelow(`$1',$2,FRAME)dnl
+defframe_empty_if_zero(FRAME+($2))(%esp)')')
+
+dnl Called: defframe_empty_if_zero(expression)
+define(defframe_empty_if_zero,
+`ifelse(defframe_empty_if_zero_disabled,1,
+`eval($1)',
+`m4_empty_if_zero($1)')')
+
+dnl Called: defframe_check_notbelow(`name',offset,FRAME)
+define(defframe_check_notbelow,
+m4_assert_numargs(3)
+`ifelse(eval(($3)+($2)<0),1,
+`m4_error(`$1 at frame offset $2 used when FRAME is only $3 bytes
+')')')
+
+
+dnl Usage: FRAME_pushl()
+dnl FRAME_popl()
+dnl FRAME_addl_esp(n)
+dnl FRAME_subl_esp(n)
+dnl
+dnl Adjust FRAME appropriately for a pushl or popl, or for an addl or subl
+dnl %esp of n bytes.
+dnl
+dnl Using these macros is completely optional. Sometimes it makes more
+dnl sense to put explicit deflit(`FRAME',N) forms, especially when there's
+dnl jumps and different sequences of FRAME values need to be used in
+dnl different places.
+
+define(FRAME_pushl,
+m4_assert_numargs(0)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME+4))')
+
+define(FRAME_popl,
+m4_assert_numargs(0)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME-4))')
+
+define(FRAME_addl_esp,
+m4_assert_numargs(1)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME-($1)))')
+
+define(FRAME_subl_esp,
+m4_assert_numargs(1)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME+($1)))')
+
+
+dnl Usage: defframe_pushl(name)
+dnl
+dnl Do a combination of a FRAME_pushl() and a defframe() to name the stack
+dnl location just pushed. This should come after a pushl instruction.
+dnl Putting it on the same line works and avoids lengthening the code. For
+dnl example,
+dnl
+dnl pushl %eax defframe_pushl(VAR_COUNTER)
+dnl
+dnl Notice the defframe() is done with an unquoted -FRAME thus giving its
+dnl current value without tracking future changes.
+
+define(defframe_pushl,
+`FRAME_pushl()defframe(`$1',-FRAME)')
+
+
+dnl --------------------------------------------------------------------------
+dnl Assembler instruction macros.
+dnl
+
+
+dnl Usage: emms_or_femms
+dnl femms_available_p
+dnl
+dnl femms_available_p expands to 1 or 0 according to whether the AMD 3DNow
+dnl femms instruction is available. emms_or_femms expands to femms if
+dnl available, or emms if not.
+dnl
+dnl emms_or_femms is meant for use in the K6 directory where plain K6
+dnl (without femms) and K6-2 and K6-3 (with a slightly faster femms) are
+dnl supported together.
+dnl
+dnl On K7 femms is no longer faster and is just an alias for emms, so plain
+dnl emms may as well be used.
+
+define(femms_available_p,
+m4_assert_numargs(-1)
+`m4_ifdef_anyof_p(
+ `HAVE_TARGET_CPU_k62',
+ `HAVE_TARGET_CPU_k63',
+ `HAVE_TARGET_CPU_athlon')')
+
+define(emms_or_femms,
+m4_assert_numargs(-1)
+`ifelse(femms_available_p,1,`femms',`emms')')
+
+
+dnl Usage: femms
+dnl
+dnl The gas 2.9.1 that comes with FreeBSD 3.4 doesn't support femms, so the
+dnl following is a replacement using .byte.
+dnl
+dnl If femms isn't available, an emms is generated instead, for convenience
+dnl when testing on a machine without femms.
+
+define(femms,
+m4_assert_numargs(-1)
+`ifelse(femms_available_p,1,
+`.byte 15,14 C AMD 3DNow femms',
+`emms`'dnl
+m4_warning(`warning, using emms in place of femms, use for testing only
+')')')
+
+
+dnl Usage: jadcl0(op)
+dnl
+dnl Issue a jnc/incl as a substitute for adcl $0,op. This isn't an exact
+dnl replacement, since it doesn't set the flags like adcl does.
+dnl
+dnl This finds a use in K6 mpn_addmul_1, mpn_submul_1, mpn_mul_basecase and
+dnl mpn_sqr_basecase because on K6 an adcl is slow, the branch
+dnl misprediction penalty is small, and the multiply algorithm used leads
+dnl to a carry bit on average only 1/4 of the time.
+dnl
+dnl jadcl0_disabled can be set to 1 to instead issue an ordinary adcl for
+dnl comparison. For example,
+dnl
+dnl define(`jadcl0_disabled',1)
+dnl
+dnl When using a register operand, eg. "jadcl0(%edx)", the jnc/incl code is
+dnl the same size as an adcl. This makes it possible to use the exact same
+dnl computed jump code when testing the relative speed of jnc/incl and adcl
+dnl with jadcl0_disabled.
+
+define(jadcl0,
+m4_assert_numargs(1)
+`ifelse(jadcl0_disabled,1,
+ `adcl $`'0, $1',
+ `jnc 1f
+ incl $1
+1:dnl')')
+
+
+dnl Usage: cmov_available_p
+dnl
+dnl Expand to 1 if cmov is available, 0 if not.
+
+define(cmov_available_p,
+`m4_ifdef_anyof_p(
+ `HAVE_TARGET_CPU_pentiumpro',
+ `HAVE_TARGET_CPU_pentium2',
+ `HAVE_TARGET_CPU_pentium3',
+ `HAVE_TARGET_CPU_athlon')')
+
+
+dnl Usage: x86_lookup(target, key,value, key,value, ...)
+dnl x86_lookup_p(target, key,value, key,value, ...)
+dnl
+dnl Look for `target' among the `key' parameters.
+dnl
+dnl x86_lookup expands to the corresponding `value', or generates an error
+dnl if `target' isn't found.
+dnl
+dnl x86_lookup_p expands to 1 if `target' is found, or 0 if not.
+
+define(x86_lookup,
+`ifelse(eval($#<3),1,
+`m4_error(`unrecognised part of x86 instruction: $1
+')',
+`ifelse(`$1',`$2', `$3',
+`x86_lookup(`$1',shift(shift(shift($@))))')')')
+
+define(x86_lookup_p,
+`ifelse(eval($#<3),1, `0',
+`ifelse(`$1',`$2', `1',
+`x86_lookup_p(`$1',shift(shift(shift($@))))')')')
+
+
+dnl Usage: x86_opcode_reg32(reg)
+dnl x86_opcode_reg32_p(reg)
+dnl
+dnl x86_opcode_reg32 expands to the standard 3 bit encoding for the given
+dnl 32-bit register, eg. `%ebp' turns into 5.
+dnl
+dnl x86_opcode_reg32_p expands to 1 if reg is a valid 32-bit register, or 0
+dnl if not.
+
+define(x86_opcode_reg32,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_reg32_list)')
+
+define(x86_opcode_reg32_p,
+m4_assert_onearg()
+`x86_lookup_p(`$1',x86_opcode_reg32_list)')
+
+define(x86_opcode_reg32_list,
+``%eax',0,
+`%ecx',1,
+`%edx',2,
+`%ebx',3,
+`%esp',4,
+`%ebp',5,
+`%esi',6,
+`%edi',7')
+
+
+dnl Usage: x86_opcode_tttn(cond)
+dnl
+dnl Expand to the 4-bit "tttn" field value for the given x86 branch
+dnl condition (like `c', `ae', etc).
+
+define(x86_opcode_tttn,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_ttn_list)')
+
+define(x86_opcode_tttn_list,
+``o', 0,
+`no', 1,
+`b', 2, `c', 2, `nae',2,
+`nb', 3, `nc', 3, `ae', 3,
+`e', 4, `z', 4,
+`ne', 5, `nz', 5,
+`be', 6, `na', 6,
+`nbe', 7, `a', 7,
+`s', 8,
+`ns', 9,
+`p', 10, `pe', 10, `npo',10,
+`np', 11, `npe',11, `po', 11,
+`l', 12, `nge',12,
+`nl', 13, `ge', 13,
+`le', 14, `ng', 14,
+`nle',15, `g', 15')
+
+
+dnl Usage: cmovCC(srcreg,dstreg)
+dnl
+dnl Generate a cmov instruction if the target supports cmov, or simulate it
+dnl with a conditional jump if not (the latter being meant only for
+dnl testing). For example,
+dnl
+dnl cmovz( %eax, %ebx)
+dnl
+dnl cmov instructions are generated using .byte sequences, since only
+dnl recent versions of gas know cmov.
+dnl
+dnl The source operand can only be a plain register. (m4 code implementing
+dnl full memory addressing modes exists, believe it or not, but isn't
+dnl currently needed and isn't included.)
+dnl
+dnl All the standard conditions are defined. Attempting to use one without
+dnl the macro parentheses, such as just "cmovbe %eax, %ebx", will provoke
+dnl an error. This ensures the necessary .byte sequences aren't
+dnl accidentally missed.
+
+dnl Called: define_cmov_many(cond,tttn,cond,tttn,...)
+define(define_cmov_many,
+`ifelse(m4_length(`$1'),0,,
+`define_cmov(`$1',`$2')define_cmov_many(shift(shift($@)))')')
+
+dnl Called: define_cmov(cond,tttn)
+define(define_cmov,
+m4_assert_numargs(2)
+`define(`cmov$1',
+m4_instruction_wrapper()
+m4_assert_numargs(2)
+`cmov_internal'(m4_doublequote($`'0),``$1',`$2'',dnl
+m4_doublequote($`'1),m4_doublequote($`'2)))')
+
+define_cmov_many(x86_opcode_tttn_list)
+
+
+dnl Called: cmov_internal(name,cond,tttn,src,dst)
+define(cmov_internal,
+m4_assert_numargs(5)
+`ifelse(cmov_available_p,1,
+`cmov_bytes_tttn(`$1',`$3',`$4',`$5')',
+`m4_warning(`warning, simulating cmov with jump, use for testing only
+')cmov_simulate(`$2',`$4',`$5')')')
+
+dnl Called: cmov_simulate(cond,src,dst)
+dnl If this is going to be used with memory operands for the source it will
+dnl need to be changed to do a fetch even if the condition is false, so as
+dnl to trigger exceptions the same way a real cmov does.
+define(cmov_simulate,
+m4_assert_numargs(3)
+ `j$1 1f C cmov$1 $2, $3
+ jmp 2f
+1: movl $2, $3
+2:')
+
+dnl Called: cmov_bytes_tttn(name,tttn,src,dst)
+define(cmov_bytes_tttn,
+m4_assert_numargs(4)
+`.byte dnl
+15, dnl
+eval(64+$2), dnl
+eval(192+8*x86_opcode_reg32(`$4')+x86_opcode_reg32(`$3')) dnl
+ C `$1 $3, $4'')
+
+
+dnl Usage: loop_or_decljnz label
+dnl
+dnl Generate either a "loop" instruction or a "decl %ecx / jnz", whichever
+dnl is better. "loop" is better on K6 and probably on 386, on other chips
+dnl separate decl/jnz is better.
+dnl
+dnl This macro is just for mpn/x86/divrem_1.asm and mpn/x86/mod_1.asm where
+dnl this loop_or_decljnz variation is enough to let the code be shared by
+dnl all chips.
+
+define(loop_or_decljnz,
+`ifelse(loop_is_better_p,1,
+ `loop',
+ `decl %ecx
+ jnz')')
+
+define(loop_is_better_p,
+`m4_ifdef_anyof_p(`HAVE_TARGET_CPU_k6',
+ `HAVE_TARGET_CPU_k62',
+ `HAVE_TARGET_CPU_k63',
+ `HAVE_TARGET_CPU_i386')')
+
+
+dnl Usage: Zdisp(inst,op,op,op)
+dnl
+dnl Generate explicit .byte sequences if necessary to force a byte-sized
+dnl zero displacement on an instruction. For example,
+dnl
+dnl Zdisp( movl, 0,(%esi), %eax)
+dnl
+dnl expands to
+dnl
+dnl .byte 139,70,0 C movl 0(%esi), %eax
+dnl
+dnl If the displacement given isn't 0, then normal assembler code is
+dnl generated. For example,
+dnl
+dnl Zdisp( movl, 4,(%esi), %eax)
+dnl
+dnl expands to
+dnl
+dnl movl 4(%esi), %eax
+dnl
+dnl This means a single Zdisp() form can be used with an expression for the
+dnl displacement, and .byte will be used only if necessary. The
+dnl displacement argument is eval()ed.
+dnl
+dnl Because there aren't many places a 0(reg) form is wanted, Zdisp is
+dnl implemented with a table of instructions and encodings. A new entry is
+dnl needed for any different operation or registers.
+
+define(Zdisp,
+`define(`Zdisp_found',0)dnl
+Zdisp_match( movl, %eax, 0,(%edi), `137,71,0', $@)`'dnl
+Zdisp_match( movl, %ebx, 0,(%edi), `137,95,0', $@)`'dnl
+Zdisp_match( movl, %esi, 0,(%edi), `137,119,0', $@)`'dnl
+Zdisp_match( movl, 0,(%ebx), %eax, `139,67,0', $@)`'dnl
+Zdisp_match( movl, 0,(%ebx), %esi, `139,115,0', $@)`'dnl
+Zdisp_match( movl, 0,(%esi), %eax, `139,70,0', $@)`'dnl
+Zdisp_match( movl, 0,(%esi,%ecx,4), %eax, `0x8b,0x44,0x8e,0x00', $@)`'dnl
+Zdisp_match( addl, %ebx, 0,(%edi), `1,95,0', $@)`'dnl
+Zdisp_match( addl, %ecx, 0,(%edi), `1,79,0', $@)`'dnl
+Zdisp_match( addl, %esi, 0,(%edi), `1,119,0', $@)`'dnl
+Zdisp_match( subl, %ecx, 0,(%edi), `41,79,0', $@)`'dnl
+Zdisp_match( adcl, 0,(%edx), %esi, `19,114,0', $@)`'dnl
+Zdisp_match( sbbl, 0,(%edx), %esi, `27,114,0', $@)`'dnl
+Zdisp_match( movq, 0,(%eax,%ecx,8), %mm0, `0x0f,0x6f,0x44,0xc8,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%ebx,%eax,4), %mm0, `0x0f,0x6f,0x44,0x83,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%ebx,%eax,4), %mm2, `0x0f,0x6f,0x54,0x83,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%esi), %mm0, `15,111,70,0', $@)`'dnl
+Zdisp_match( movq, %mm0, 0,(%edi), `15,127,71,0', $@)`'dnl
+Zdisp_match( movq, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7f,0x54,0x81,0x00', $@)`'dnl
+Zdisp_match( movq, %mm2, 0,(%edx,%eax,4), `0x0f,0x7f,0x54,0x82,0x00', $@)`'dnl
+Zdisp_match( movq, %mm0, 0,(%edx,%ecx,8), `0x0f,0x7f,0x44,0xca,0x00', $@)`'dnl
+Zdisp_match( movd, 0,(%eax,%ecx,8), %mm1, `0x0f,0x6e,0x4c,0xc8,0x00', $@)`'dnl
+Zdisp_match( movd, 0,(%edx,%ecx,8), %mm0, `0x0f,0x6e,0x44,0xca,0x00', $@)`'dnl
+Zdisp_match( movd, %mm0, 0,(%eax,%ecx,4), `0x0f,0x7e,0x44,0x88,0x00', $@)`'dnl
+Zdisp_match( movd, %mm0, 0,(%ecx,%eax,4), `0x0f,0x7e,0x44,0x81,0x00', $@)`'dnl
+Zdisp_match( movd, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7e,0x54,0x81,0x00', $@)`'dnl
+ifelse(Zdisp_found,0,
+`m4_error(`unrecognised instruction in Zdisp: $1 $2 $3 $4
+')')')
+
+define(Zdisp_match,
+`ifelse(eval(m4_stringequal_p(`$1',`$6')
+ && m4_stringequal_p(`$2',0)
+ && m4_stringequal_p(`$3',`$8')
+ && m4_stringequal_p(`$4',`$9')),1,
+`define(`Zdisp_found',1)dnl
+ifelse(eval(`$7'),0,
+` .byte $5 C `$1 0$3, $4'',
+` $6 $7$8, $9')',
+
+`ifelse(eval(m4_stringequal_p(`$1',`$6')
+ && m4_stringequal_p(`$2',`$7')
+ && m4_stringequal_p(`$3',0)
+ && m4_stringequal_p(`$4',`$9')),1,
+`define(`Zdisp_found',1)dnl
+ifelse(eval(`$8'),0,
+` .byte $5 C `$1 $2, 0$4'',
+` $6 $7, $8$9')')')')
+
+
+dnl Usage: shldl(count,src,dst)
+dnl shrdl(count,src,dst)
+dnl shldw(count,src,dst)
+dnl shrdw(count,src,dst)
+dnl
+dnl Generate a double-shift instruction, possibly omitting a %cl count
+dnl parameter if that's what the assembler requires, as indicated by
+dnl WANT_SHLDL_CL in config.m4. For example,
+dnl
+dnl shldl( %cl, %eax, %ebx)
+dnl
+dnl turns into either
+dnl
+dnl shldl %cl, %eax, %ebx
+dnl or
+dnl shldl %eax, %ebx
+dnl
+dnl Immediate counts are always passed through unchanged. For example,
+dnl
+dnl shrdl( $2, %esi, %edi)
+dnl becomes
+dnl shrdl $2, %esi, %edi
+dnl
+dnl
+dnl If you forget to use the macro form "shldl( ...)" and instead write
+dnl just a plain "shldl ...", an error results. This ensures the necessary
+dnl variant treatment of %cl isn't accidentally bypassed.
+
+define(define_shd_instruction,
+`define($1,
+m4_instruction_wrapper()
+m4_assert_numargs(3)
+`shd_instruction'(m4_doublequote($`'0),m4_doublequote($`'1),dnl
+m4_doublequote($`'2),m4_doublequote($`'3)))')
+
+dnl Effectively: define(shldl,`shd_instruction(`$0',`$1',`$2',`$3')') etc
+define_shd_instruction(shldl)
+define_shd_instruction(shrdl)
+define_shd_instruction(shldw)
+define_shd_instruction(shrdw)
+
+dnl Called: shd_instruction(op,count,src,dst)
+define(shd_instruction,
+m4_assert_numargs(4)
+m4_assert_defined(`WANT_SHLDL_CL')
+`ifelse(eval(m4_stringequal_p(`$2',`%cl') && !WANT_SHLDL_CL),1,
+``$1' `$3', `$4'',
+``$1' `$2', `$3', `$4'')')
+
+
+dnl Usage: ASSERT(cond, instructions)
+dnl
+dnl If WANT_ASSERT is 1, output the given instructions and expect the given
+dnl flags condition to then be satisfied. For example,
+dnl
+dnl ASSERT(ne, `cmpl %eax, %ebx')
+dnl
+dnl The instructions can be omitted to just assert a flags condition with
+dnl no extra calculation. For example,
+dnl
+dnl ASSERT(nc)
+dnl
+dnl When `instructions' is not empty, a pushf/popf is added to preserve the
+dnl flags, but the instructions themselves must preserve any registers that
+dnl matter. FRAME is adjusted for the push and pop, so the instructions
+dnl given can use defframe() stack variables.
+
+define(ASSERT,
+m4_assert_numargs_range(1,2)
+`ifelse(WANT_ASSERT,1,
+ `C ASSERT
+ifelse(`$2',,,` pushf ifdef(`FRAME',`FRAME_pushl()')')
+ $2
+ j`$1' 1f
+ ud2 C assertion failed
+1:
+ifelse(`$2',,,` popf ifdef(`FRAME',`FRAME_popl()')')
+')')
+
+
+dnl Usage: movl_text_address(label,register)
+dnl
+dnl Get the address of a text segment label, using either a plain movl or a
+dnl position-independent calculation, as necessary. For example,
+dnl
+dnl	movl_text_address(L(foo),%eax)
+dnl
+dnl This macro is only meant for use in ASSERT()s or when testing, since
+dnl the PIC sequence it generates will want to be done with a ret balancing
+dnl the call on CPUs with return address branch prediction.
+dnl
+dnl The addl generated here has a backward reference to 1b, and so won't
+dnl suffer from the two forwards references bug in old gas (described in
+dnl mpn/x86/README.family).
+
+define(movl_text_address,
+`ifdef(`PIC',
+ `call 1f
+1: popl $2 C %eip
+ addl `$'$1-1b, $2',
+ `movl `$'$1, $2')')
+
+
+divert`'dnl
diff --git a/rts/gmp/mpn/z8000/add_n.s b/rts/gmp/mpn/z8000/add_n.s
new file mode 100644
index 0000000000..3a136107fe
--- /dev/null
+++ b/rts/gmp/mpn/z8000/add_n.s
@@ -0,0 +1,53 @@
+! Z8000 __gmpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! s2_ptr r5
+! size r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp. We'd
+! then add 2x the number of words written to r7...
+
+ unseg
+ .text
+ even
+ global ___gmpn_add_n
+___gmpn_add_n:
+ pop r0,@r6
+ pop r1,@r5
+ add r0,r1
+ ld @r7,r0
+ dec r4
+ jr eq,Lend
+Loop: pop r0,@r6
+ pop r1,@r5
+ adc r0,r1
+ inc r7,#2
+ ld @r7,r0
+ dec r4
+ jr ne,Loop
+Lend: ld r2,r4 ! use 0 already in r4
+ adc r2,r2
+ ret t
diff --git a/rts/gmp/mpn/z8000/gmp-mparam.h b/rts/gmp/mpn/z8000/gmp-mparam.h
new file mode 100644
index 0000000000..4216df673c
--- /dev/null
+++ b/rts/gmp/mpn/z8000/gmp-mparam.h
@@ -0,0 +1,27 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#define BITS_PER_MP_LIMB 16
+#define BYTES_PER_MP_LIMB 2
+#define BITS_PER_LONGINT 32
+#define BITS_PER_INT 16
+#define BITS_PER_SHORTINT 16
+#define BITS_PER_CHAR 8
diff --git a/rts/gmp/mpn/z8000/mul_1.s b/rts/gmp/mpn/z8000/mul_1.s
new file mode 100644
index 0000000000..20fadd340a
--- /dev/null
+++ b/rts/gmp/mpn/z8000/mul_1.s
@@ -0,0 +1,68 @@
+! Z8000 __gmpn_mul_1 -- Multiply a limb vector with a limb and store
+! the result in a second limb vector.
+
+! Copyright (C) 1993, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! size r5
+! s2_limb r4
+
+ unseg
+ .text
+ even
+ global ___gmpn_mul_1
+___gmpn_mul_1:
+ sub r2,r2 ! zero carry limb
+ and r4,r4
+ jr mi,Lneg
+
+Lpos: pop r1,@r6
+ ld r9,r1
+ mult rr8,r4
+ and r1,r1 ! shift msb of loaded limb into cy
+ jr mi,Lp ! branch if loaded limb's msb is set
+ add r8,r4 ! hi_limb += sign_comp2
+Lp: add r9,r2 ! lo_limb += cy_limb
+ xor r2,r2
+ adc r2,r8
+ ld @r7,r9
+ inc r7,#2
+ dec r5
+ jr ne,Lpos
+ ret t
+
+Lneg: pop r1,@r6
+ ld r9,r1
+ mult rr8,r4
+ add r8,r1 ! hi_limb += sign_comp1
+ and r1,r1
+ jr mi,Ln
+ add r8,r4 ! hi_limb += sign_comp2
+Ln: add r9,r2 ! lo_limb += cy_limb
+ xor r2,r2
+ adc r2,r8
+ ld @r7,r9
+ inc r7,#2
+ dec r5
+ jr ne,Lneg
+ ret t
diff --git a/rts/gmp/mpn/z8000/sub_n.s b/rts/gmp/mpn/z8000/sub_n.s
new file mode 100644
index 0000000000..bd9a7ad409
--- /dev/null
+++ b/rts/gmp/mpn/z8000/sub_n.s
@@ -0,0 +1,54 @@
+! Z8000 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+! store difference in a third limb vector.
+
+! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! s2_ptr r5
+! size r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp. We'd
+! then add 2x the number of words written to r7...
+
+ unseg
+ .text
+ even
+ global ___gmpn_sub_n
+___gmpn_sub_n:
+ pop r0,@r6
+ pop r1,@r5
+ sub r0,r1
+ ld @r7,r0
+ dec r4
+ jr eq,Lend
+Loop: pop r0,@r6
+ pop r1,@r5
+ sbc r0,r1
+ inc r7,#2
+ ld @r7,r0
+ dec r4
+ jr ne,Loop
+Lend: ld r2,r4 ! use 0 already in r4
+ adc r2,r2
+ ret t
diff --git a/rts/gmp/mpn/z8000x/add_n.s b/rts/gmp/mpn/z8000x/add_n.s
new file mode 100644
index 0000000000..7f130785c5
--- /dev/null
+++ b/rts/gmp/mpn/z8000x/add_n.s
@@ -0,0 +1,56 @@
+! Z8000 (32 bit limb version) __gmpn_add_n -- Add two limb vectors of equal,
+! non-zero length.
+
+! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! s2_ptr r5
+! size r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp. We'd
+! then add 2x the number of words written to r7...
+
+ segm
+ .text
+ even
+ global ___gmpn_add_n
+___gmpn_add_n:
+ popl rr0,@r6
+ popl rr8,@r5
+ addl rr0,rr8
+ ldl @r7,rr0
+ dec r4
+ jr eq,Lend
+Loop: popl rr0,@r6
+ popl rr8,@r5
+ adc r1,r9
+ adc r0,r8
+ inc r7,#4
+ ldl @r7,rr0
+ dec r4
+ jr ne,Loop
+Lend: ld r2,r4 ! use 0 already in r4
+ ld r3,r4
+ adc r2,r2
+ ret t
diff --git a/rts/gmp/mpn/z8000x/sub_n.s b/rts/gmp/mpn/z8000x/sub_n.s
new file mode 100644
index 0000000000..f416d1d6eb
--- /dev/null
+++ b/rts/gmp/mpn/z8000x/sub_n.s
@@ -0,0 +1,56 @@
+! Z8000 (32 bit limb version) __gmpn_sub_n -- Subtract two limb vectors of the
+! same length > 0 and store difference in a third limb vector.
+
+! Copyright (C) 1993, 1994, 2000 Free Software Foundation, Inc.
+
+! This file is part of the GNU MP Library.
+
+! The GNU MP Library is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at your
+! option) any later version.
+
+! The GNU MP Library is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+! License for more details.
+
+! You should have received a copy of the GNU Lesser General Public License
+! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+! MA 02111-1307, USA.
+
+
+! INPUT PARAMETERS
+! res_ptr r7
+! s1_ptr r6
+! s2_ptr r5
+! size r4
+
+! If we are really crazy, we can use push to write a few result words
+! backwards, using push just because it is faster than reg+disp. We'd
+! then add 2x the number of words written to r7...
+
+ segm
+ .text
+ even
+ global ___gmpn_sub_n
+___gmpn_sub_n:
+ popl rr0,@r6
+ popl rr8,@r5
+ subl rr0,rr8
+ ldl @r7,rr0
+ dec r4
+ jr eq,Lend
+Loop: popl rr0,@r6
+ popl rr8,@r5
+ sbc r1,r9
+ sbc r0,r8
+ inc r7,#4
+ ldl @r7,rr0
+ dec r4
+ jr ne,Loop
+Lend: ld r2,r4 ! use 0 already in r4
+ ld r3,r4
+ adc r2,r2
+ ret t
diff --git a/rts/gmp/mpz/Makefile.am b/rts/gmp/mpz/Makefile.am
new file mode 100644
index 0000000000..cd6fec4e21
--- /dev/null
+++ b/rts/gmp/mpz/Makefile.am
@@ -0,0 +1,58 @@
+## Process this file with automake to generate Makefile.in
+
+# Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+AUTOMAKE_OPTIONS = gnu no-dependencies
+
+SUBDIRS = tests
+
+INCLUDES = -I$(top_srcdir) -DOPERATION_$*
+
+noinst_LTLIBRARIES = libmpz.la
+libmpz_la_SOURCES = \
+ abs.c add.c add_ui.c addmul_ui.c and.c array_init.c \
+ bin_ui.c bin_uiui.c cdiv_q.c \
+ cdiv_q_ui.c cdiv_qr.c cdiv_qr_ui.c cdiv_r.c cdiv_r_ui.c cdiv_ui.c \
+ clear.c clrbit.c cmp.c cmp_si.c cmp_ui.c cmpabs.c cmpabs_ui.c com.c \
+ divexact.c dump.c fac_ui.c fdiv_q.c fdiv_q_2exp.c fdiv_q_ui.c \
+ fdiv_qr.c fdiv_qr_ui.c fdiv_r.c fdiv_r_2exp.c fdiv_r_ui.c fdiv_ui.c \
+ fib_ui.c fits_sint_p.c fits_slong_p.c fits_sshort_p.c fits_uint_p.c \
+ fits_ulong_p.c fits_ushort_p.c gcd.c gcd_ui.c gcdext.c get_d.c get_si.c \
+ get_str.c get_ui.c getlimbn.c hamdist.c init.c inp_raw.c inp_str.c \
+ invert.c ior.c iset.c iset_d.c iset_si.c iset_str.c iset_ui.c \
+ jacobi.c kronsz.c kronuz.c kronzs.c kronzu.c \
+ lcm.c legendre.c mod.c mul.c mul_2exp.c neg.c nextprime.c \
+ out_raw.c out_str.c perfpow.c perfsqr.c popcount.c pow_ui.c powm.c \
+ powm_ui.c pprime_p.c random.c random2.c realloc.c remove.c root.c rrandomb.c \
+ scan0.c scan1.c set.c set_d.c set_f.c set_q.c set_si.c set_str.c \
+ set_ui.c setbit.c size.c sizeinbase.c sqrt.c sqrtrem.c sub.c \
+ sub_ui.c swap.c tdiv_ui.c tdiv_q.c tdiv_q_2exp.c tdiv_q_ui.c tdiv_qr.c \
+ tdiv_qr_ui.c tdiv_r.c tdiv_r_2exp.c tdiv_r_ui.c tstbit.c ui_pow_ui.c \
+ urandomb.c urandomm.c xor.c
+
+EXTRA_DIST = mul_siui.c
+nodist_libmpz_la_SOURCES = mul_si.c mul_ui.c
+CLEANFILES = $(nodist_libmpz_la_SOURCES)
+
+mul_si.c: $(srcdir)/mul_siui.c
+ cp $(srcdir)/mul_siui.c mul_si.c
+mul_ui.c: $(srcdir)/mul_siui.c
+ cp $(srcdir)/mul_siui.c mul_ui.c
diff --git a/rts/gmp/mpz/Makefile.in b/rts/gmp/mpz/Makefile.in
new file mode 100644
index 0000000000..e0f2cdc133
--- /dev/null
+++ b/rts/gmp/mpz/Makefile.in
@@ -0,0 +1,457 @@
+# Makefile.in generated automatically by automake 1.4a from Makefile.am
+
+# Copyright (C) 1994, 1995-8, 1999 Free Software Foundation, Inc.
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+SHELL = @SHELL@
+
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+VPATH = @srcdir@
+prefix = @prefix@
+exec_prefix = @exec_prefix@
+
+bindir = @bindir@
+sbindir = @sbindir@
+libexecdir = @libexecdir@
+datadir = @datadir@
+sysconfdir = @sysconfdir@
+sharedstatedir = @sharedstatedir@
+localstatedir = @localstatedir@
+libdir = @libdir@
+infodir = @infodir@
+mandir = @mandir@
+includedir = @includedir@
+oldincludedir = /usr/include
+
+DESTDIR =
+
+pkgdatadir = $(datadir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+
+top_builddir = ..
+
+ACLOCAL = @ACLOCAL@
+AUTOCONF = @AUTOCONF@
+AUTOMAKE = @AUTOMAKE@
+AUTOHEADER = @AUTOHEADER@
+
+INSTALL = @INSTALL@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_FLAG =
+transform = @program_transform_name@
+
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+
+@SET_MAKE@
+build_alias = @build_alias@
+build_triplet = @build@
+host_alias = @host_alias@
+host_triplet = @host@
+target_alias = @target_alias@
+target_triplet = @target@
+AMDEP = @AMDEP@
+AMTAR = @AMTAR@
+AR = @AR@
+AS = @AS@
+AWK = @AWK@
+CALLING_CONVENTIONS_OBJS = @CALLING_CONVENTIONS_OBJS@
+CC = @CC@
+CCAS = @CCAS@
+CPP = @CPP@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+DEPDIR = @DEPDIR@
+DLLTOOL = @DLLTOOL@
+EXEEXT = @EXEEXT@
+LIBTOOL = @LIBTOOL@
+LN_S = @LN_S@
+M4 = @M4@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+PACKAGE = @PACKAGE@
+RANLIB = @RANLIB@
+SPEED_CYCLECOUNTER_OBJS = @SPEED_CYCLECOUNTER_OBJS@
+STRIP = @STRIP@
+U = @U@
+VERSION = @VERSION@
+gmp_srclinks = @gmp_srclinks@
+install_sh = @install_sh@
+mpn_objects = @mpn_objects@
+mpn_objs_in_libgmp = @mpn_objs_in_libgmp@
+
+# Copyright (C) 1996, 1998, 1999, 2000 Free Software Foundation, Inc.
+#
+# This file is part of the GNU MP Library.
+#
+# The GNU MP Library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at your
+# option) any later version.
+#
+# The GNU MP Library is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+# MA 02111-1307, USA.
+
+
+AUTOMAKE_OPTIONS = gnu no-dependencies
+
+SUBDIRS =
+
+INCLUDES = -I$(top_srcdir) -DOPERATION_$*
+
+noinst_LTLIBRARIES = libmpz.la
+libmpz_la_SOURCES = \
+ abs.c add.c add_ui.c addmul_ui.c and.c array_init.c \
+ bin_ui.c bin_uiui.c cdiv_q.c \
+ cdiv_q_ui.c cdiv_qr.c cdiv_qr_ui.c cdiv_r.c cdiv_r_ui.c cdiv_ui.c \
+ clear.c clrbit.c cmp.c cmp_si.c cmp_ui.c cmpabs.c cmpabs_ui.c com.c \
+ divexact.c dump.c fac_ui.c fdiv_q.c fdiv_q_2exp.c fdiv_q_ui.c \
+ fdiv_qr.c fdiv_qr_ui.c fdiv_r.c fdiv_r_2exp.c fdiv_r_ui.c fdiv_ui.c \
+ fib_ui.c fits_sint_p.c fits_slong_p.c fits_sshort_p.c fits_uint_p.c \
+ fits_ulong_p.c fits_ushort_p.c gcd.c gcd_ui.c gcdext.c get_d.c get_si.c \
+ get_str.c get_ui.c getlimbn.c hamdist.c init.c inp_raw.c inp_str.c \
+ invert.c ior.c iset.c iset_d.c iset_si.c iset_str.c iset_ui.c \
+ jacobi.c kronsz.c kronuz.c kronzs.c kronzu.c \
+ lcm.c legendre.c mod.c mul.c mul_2exp.c neg.c nextprime.c \
+ out_raw.c out_str.c perfpow.c perfsqr.c popcount.c pow_ui.c powm.c \
+ powm_ui.c pprime_p.c random.c random2.c realloc.c remove.c root.c rrandomb.c \
+ scan0.c scan1.c set.c set_d.c set_f.c set_q.c set_si.c set_str.c \
+ set_ui.c setbit.c size.c sizeinbase.c sqrt.c sqrtrem.c sub.c \
+ sub_ui.c swap.c tdiv_ui.c tdiv_q.c tdiv_q_2exp.c tdiv_q_ui.c tdiv_qr.c \
+ tdiv_qr_ui.c tdiv_r.c tdiv_r_2exp.c tdiv_r_ui.c tstbit.c ui_pow_ui.c \
+ urandomb.c urandomm.c xor.c
+
+
+EXTRA_DIST = mul_siui.c
+nodist_libmpz_la_SOURCES = mul_si.c mul_ui.c
+CLEANFILES = $(nodist_libmpz_la_SOURCES)
+subdir = mpz
+mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs
+CONFIG_HEADER = ../config.h
+CONFIG_CLEAN_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+
+
+DEFS = @DEFS@ -I. -I$(srcdir) -I..
+CPPFLAGS = @CPPFLAGS@
+LDFLAGS = @LDFLAGS@
+LIBS = @LIBS@
+libmpz_la_LDFLAGS =
+libmpz_la_LIBADD =
+am_libmpz_la_OBJECTS = abs.lo add.lo add_ui.lo addmul_ui.lo and.lo \
+array_init.lo bin_ui.lo bin_uiui.lo cdiv_q.lo cdiv_q_ui.lo cdiv_qr.lo \
+cdiv_qr_ui.lo cdiv_r.lo cdiv_r_ui.lo cdiv_ui.lo clear.lo clrbit.lo \
+cmp.lo cmp_si.lo cmp_ui.lo cmpabs.lo cmpabs_ui.lo com.lo divexact.lo \
+dump.lo fac_ui.lo fdiv_q.lo fdiv_q_2exp.lo fdiv_q_ui.lo fdiv_qr.lo \
+fdiv_qr_ui.lo fdiv_r.lo fdiv_r_2exp.lo fdiv_r_ui.lo fdiv_ui.lo \
+fib_ui.lo fits_sint_p.lo fits_slong_p.lo fits_sshort_p.lo \
+fits_uint_p.lo fits_ulong_p.lo fits_ushort_p.lo gcd.lo gcd_ui.lo \
+gcdext.lo get_d.lo get_si.lo get_str.lo get_ui.lo getlimbn.lo \
+hamdist.lo init.lo inp_raw.lo inp_str.lo invert.lo ior.lo iset.lo \
+iset_d.lo iset_si.lo iset_str.lo iset_ui.lo jacobi.lo kronsz.lo \
+kronuz.lo kronzs.lo kronzu.lo lcm.lo legendre.lo mod.lo mul.lo \
+mul_2exp.lo neg.lo nextprime.lo out_raw.lo out_str.lo perfpow.lo \
+perfsqr.lo popcount.lo pow_ui.lo powm.lo powm_ui.lo pprime_p.lo \
+random.lo random2.lo realloc.lo remove.lo root.lo rrandomb.lo scan0.lo \
+scan1.lo set.lo set_d.lo set_f.lo set_q.lo set_si.lo set_str.lo \
+set_ui.lo setbit.lo size.lo sizeinbase.lo sqrt.lo sqrtrem.lo sub.lo \
+sub_ui.lo swap.lo tdiv_ui.lo tdiv_q.lo tdiv_q_2exp.lo tdiv_q_ui.lo \
+tdiv_qr.lo tdiv_qr_ui.lo tdiv_r.lo tdiv_r_2exp.lo tdiv_r_ui.lo \
+tstbit.lo ui_pow_ui.lo urandomb.lo urandomm.lo xor.lo
+nodist_libmpz_la_OBJECTS = mul_si.lo mul_ui.lo
+libmpz_la_OBJECTS = $(am_libmpz_la_OBJECTS) $(nodist_libmpz_la_OBJECTS)
+COMPILE = $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CFLAGS = @CFLAGS@
+CCLD = $(CC)
+LINK = $(LIBTOOL) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+DIST_SOURCES = $(libmpz_la_SOURCES)
+DIST_COMMON = README Makefile.am Makefile.in
+
+
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+
+GZIP_ENV = --best
+depcomp =
+SOURCES = $(libmpz_la_SOURCES) $(nodist_libmpz_la_SOURCES)
+OBJECTS = $(am_libmpz_la_OBJECTS) $(nodist_libmpz_la_OBJECTS)
+
+all: all-redirect
+.SUFFIXES:
+.SUFFIXES: .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4)
+ cd $(top_srcdir) && $(AUTOMAKE) --gnu mpz/Makefile
+
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+ cd $(top_builddir) \
+ && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
+
+
+mostlyclean-noinstLTLIBRARIES:
+
+clean-noinstLTLIBRARIES:
+ -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+
+distclean-noinstLTLIBRARIES:
+
+maintainer-clean-noinstLTLIBRARIES:
+
+mostlyclean-compile:
+ -rm -f *.o core *.core
+ -rm -f *.$(OBJEXT)
+
+clean-compile:
+
+distclean-compile:
+ -rm -f *.tab.c
+
+maintainer-clean-compile:
+
+mostlyclean-libtool:
+ -rm -f *.lo
+
+clean-libtool:
+ -rm -rf .libs _libs
+
+distclean-libtool:
+
+maintainer-clean-libtool:
+
+libmpz.la: $(libmpz_la_OBJECTS) $(libmpz_la_DEPENDENCIES)
+ $(LINK) $(libmpz_la_LDFLAGS) $(libmpz_la_OBJECTS) $(libmpz_la_LIBADD) $(LIBS)
+.c.o:
+ $(COMPILE) -c $<
+.c.obj:
+ $(COMPILE) -c `cygpath -w $<`
+.c.lo:
+ $(LTCOMPILE) -c -o $@ $<
+
+# This directory's subdirectories are mostly independent; you can cd
+# into them and run `make' without going through this Makefile.
+# To change the values of `make' variables: instead of editing Makefiles,
+# (1) if the variable is set in `config.status', edit `config.status'
+# (which will cause the Makefiles to be regenerated when you run `make');
+# (2) otherwise, pass the desired values on the `make' command line.
+
+all-recursive install-data-recursive install-exec-recursive \
+installdirs-recursive install-recursive uninstall-recursive \
+check-recursive installcheck-recursive info-recursive dvi-recursive:
+ @set fnord $(MAKEFLAGS); amf=$$2; \
+ dot_seen=no; \
+ target=`echo $@ | sed s/-recursive//`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ echo "Making $$target in $$subdir"; \
+ if test "$$subdir" = "."; then \
+ dot_seen=yes; \
+ local_target="$$target-am"; \
+ else \
+ local_target="$$target"; \
+ fi; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
+ done; \
+ if test "$$dot_seen" = "no"; then \
+ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
+ fi; test -z "$$fail"
+
+mostlyclean-recursive clean-recursive distclean-recursive \
+maintainer-clean-recursive:
+ @set fnord $(MAKEFLAGS); amf=$$2; \
+ dot_seen=no; \
+ rev=''; list='$(SUBDIRS)'; for subdir in $$list; do \
+ rev="$$subdir $$rev"; \
+ if test "$$subdir" = "."; then dot_seen=yes; else :; fi; \
+ done; \
+ test "$$dot_seen" = "no" && rev=". $$rev"; \
+ target=`echo $@ | sed s/-recursive//`; \
+ for subdir in $$rev; do \
+ echo "Making $$target in $$subdir"; \
+ if test "$$subdir" = "."; then \
+ local_target="$$target-am"; \
+ else \
+ local_target="$$target"; \
+ fi; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
+ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
+ done && test -z "$$fail"
+tags-recursive:
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
+ done
+
+tags: TAGS
+
+ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
+ list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ mkid -f$$here/ID $$unique $(LISP)
+
+TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \
+ $(TAGS_FILES) $(LISP)
+ tags=; \
+ here=`pwd`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ if test "$$subdir" = .; then :; else \
+ test -f $$subdir/TAGS && tags="$$tags -i $$here/$$subdir/TAGS"; \
+ fi; \
+ done; \
+ list='$(SOURCES) $(HEADERS) $(TAGS_FILES)'; \
+ unique=`for i in $$list; do \
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+ done | \
+ $(AWK) ' { files[$$0] = 1; } \
+ END { for (i in files) print i; }'`; \
+ test -z "$(ETAGS_ARGS)$$unique$(LISP)$$tags" \
+ || etags $(ETAGS_ARGS) $$tags $$unique $(LISP)
+
+mostlyclean-tags:
+
+clean-tags:
+
+distclean-tags:
+ -rm -f TAGS ID
+
+maintainer-clean-tags:
+
+distdir = $(top_builddir)/$(PACKAGE)-$(VERSION)/$(subdir)
+
+distdir: $(DISTFILES)
+ @for file in $(DISTFILES); do \
+ d=$(srcdir); \
+ if test -d $$d/$$file; then \
+ cp -pR $$d/$$file $(distdir); \
+ else \
+ test -f $(distdir)/$$file \
+ || cp -p $$d/$$file $(distdir)/$$file || :; \
+ fi; \
+ done
+ for subdir in $(SUBDIRS); do \
+ if test "$$subdir" = .; then :; else \
+ test -d $(distdir)/$$subdir \
+ || mkdir $(distdir)/$$subdir \
+ || exit 1; \
+ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) top_distdir=../$(top_distdir) distdir=../$(distdir)/$$subdir distdir) \
+ || exit 1; \
+ fi; \
+ done
+info-am:
+info: info-recursive
+dvi-am:
+dvi: dvi-recursive
+check-am: all-am
+check: check-recursive
+installcheck-am:
+installcheck: installcheck-recursive
+install-exec-am:
+install-exec: install-exec-recursive
+
+install-data-am:
+install-data: install-data-recursive
+
+install-am: all-am
+ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+install: install-recursive
+uninstall-am:
+uninstall: uninstall-recursive
+all-am: Makefile $(LTLIBRARIES)
+all-redirect: all-recursive
+install-strip:
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_STRIP_FLAG=-s install
+installdirs: installdirs-recursive
+installdirs-am:
+
+
+mostlyclean-generic:
+
+clean-generic:
+ -test -z "$(CLEANFILES)" || rm -f $(CLEANFILES)
+
+distclean-generic:
+ -rm -f Makefile $(CONFIG_CLEAN_FILES)
+ -rm -f config.cache config.log stamp-h stamp-h[0-9]*
+
+maintainer-clean-generic:
+ -rm -f Makefile.in
+mostlyclean-am: mostlyclean-noinstLTLIBRARIES mostlyclean-compile \
+ mostlyclean-libtool mostlyclean-tags \
+ mostlyclean-generic
+
+mostlyclean: mostlyclean-recursive
+
+clean-am: clean-noinstLTLIBRARIES clean-compile clean-libtool \
+ clean-tags clean-generic mostlyclean-am
+
+clean: clean-recursive
+
+distclean-am: distclean-noinstLTLIBRARIES distclean-compile \
+ distclean-libtool distclean-tags distclean-generic \
+ clean-am
+ -rm -f libtool
+
+distclean: distclean-recursive
+
+maintainer-clean-am: maintainer-clean-noinstLTLIBRARIES \
+ maintainer-clean-compile maintainer-clean-libtool \
+ maintainer-clean-tags maintainer-clean-generic \
+ distclean-am
+ @echo "This command is intended for maintainers to use;"
+ @echo "it deletes files that may require special tools to rebuild."
+
+maintainer-clean: maintainer-clean-recursive
+
+.PHONY: mostlyclean-noinstLTLIBRARIES distclean-noinstLTLIBRARIES \
+clean-noinstLTLIBRARIES maintainer-clean-noinstLTLIBRARIES \
+mostlyclean-compile distclean-compile clean-compile \
+maintainer-clean-compile mostlyclean-libtool distclean-libtool \
+clean-libtool maintainer-clean-libtool install-recursive \
+uninstall-recursive install-data-recursive uninstall-data-recursive \
+install-exec-recursive uninstall-exec-recursive installdirs-recursive \
+uninstalldirs-recursive all-recursive check-recursive \
+installcheck-recursive info-recursive dvi-recursive \
+mostlyclean-recursive distclean-recursive clean-recursive \
+maintainer-clean-recursive tags tags-recursive mostlyclean-tags \
+distclean-tags clean-tags maintainer-clean-tags distdir info-am info \
+dvi-am dvi check check-am installcheck-am installcheck install-exec-am \
+install-exec install-data-am install-data install-am install \
+uninstall-am uninstall all-redirect all-am all install-strip \
+installdirs-am installdirs mostlyclean-generic distclean-generic \
+clean-generic maintainer-clean-generic clean mostlyclean distclean \
+maintainer-clean
+
+
+mul_si.c: $(srcdir)/mul_siui.c
+ cp $(srcdir)/mul_siui.c mul_si.c
+mul_ui.c: $(srcdir)/mul_siui.c
+ cp $(srcdir)/mul_siui.c mul_ui.c
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/rts/gmp/mpz/README b/rts/gmp/mpz/README
new file mode 100644
index 0000000000..06b481d770
--- /dev/null
+++ b/rts/gmp/mpz/README
@@ -0,0 +1,23 @@
+This directory contains functions for GMP's integer function layer.
+
+In this version of GMP, integers are represented like in the figure below.
+(Please note that the format might change between every version, and that
+depending on the internal format in any way is a bad idea.)
+
+ most least
+significant significant
+ limb limb
+
+ _mp_d
+ /
+ /
+ \/
+ ____ ____ ____ ____ ____
+ |____|____|____|____|____|
+
+ <------- _mp_size ------->
+
+
+The most significant limb will be non-zero. The _mp_size field's sign
+reflects the sign of the number. Its absolute value is the count of limbs
+in the number.
diff --git a/rts/gmp/mpz/abs.c b/rts/gmp/mpz/abs.c
new file mode 100644
index 0000000000..0b5eab1ce6
--- /dev/null
+++ b/rts/gmp/mpz/abs.c
@@ -0,0 +1,51 @@
+/* mpz_abs(dst, src) -- Assign the absolute value of SRC to DST.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_abs (mpz_ptr w, mpz_srcptr u)
+#else
+mpz_abs (w, u)
+ mpz_ptr w;
+ mpz_srcptr u;
+#endif
+{
+ mp_ptr wp, up;
+ mp_size_t size;
+
+ size = ABS (u->_mp_size);
+
+ if (u != w)
+ {
+ if (w->_mp_alloc < size)
+ _mpz_realloc (w, size);
+
+ wp = w->_mp_d;
+ up = u->_mp_d;
+
+ MPN_COPY (wp, up, size);
+ }
+
+ w->_mp_size = size;
+}
diff --git a/rts/gmp/mpz/add.c b/rts/gmp/mpz/add.c
new file mode 100644
index 0000000000..a22c3778fb
--- /dev/null
+++ b/rts/gmp/mpz/add.c
@@ -0,0 +1,123 @@
+/* mpz_add -- Add two integers.
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+
+#ifndef BERKELEY_MP
+void
+#if __STDC__
+mpz_add (mpz_ptr w, mpz_srcptr u, mpz_srcptr v)
+#else
+mpz_add (w, u, v)
+ mpz_ptr w;
+ mpz_srcptr u;
+ mpz_srcptr v;
+#endif
+#else /* BERKELEY_MP */
+void
+#if __STDC__
+madd (mpz_srcptr u, mpz_srcptr v, mpz_ptr w)
+#else
+madd (u, v, w)
+ mpz_srcptr u;
+ mpz_srcptr v;
+ mpz_ptr w;
+#endif
+#endif /* BERKELEY_MP */
+{
+ mp_srcptr up, vp;
+ mp_ptr wp;
+ mp_size_t usize, vsize, wsize;
+ mp_size_t abs_usize;
+ mp_size_t abs_vsize;
+
+ usize = u->_mp_size;
+ vsize = v->_mp_size;
+ abs_usize = ABS (usize);
+ abs_vsize = ABS (vsize);
+
+ if (abs_usize < abs_vsize)
+ {
+ /* Swap U and V. */
+ MPZ_SRCPTR_SWAP (u, v);
+ MP_SIZE_T_SWAP (usize, vsize);
+ MP_SIZE_T_SWAP (abs_usize, abs_vsize);
+ }
+
+ /* True: ABS_USIZE >= ABS_VSIZE. */
+
+ /* If not space for w (and possible carry), increase space. */
+ wsize = abs_usize + 1;
+ if (w->_mp_alloc < wsize)
+ _mpz_realloc (w, wsize);
+
+ /* These must be after realloc (u or v may be the same as w). */
+ up = u->_mp_d;
+ vp = v->_mp_d;
+ wp = w->_mp_d;
+
+ if ((usize ^ vsize) < 0)
+ {
+ /* U and V have different sign. Need to compare them to determine
+ which operand to subtract from which. */
+
+ /* This test is right since ABS_USIZE >= ABS_VSIZE. */
+ if (abs_usize != abs_vsize)
+ {
+ mpn_sub (wp, up, abs_usize, vp, abs_vsize);
+ wsize = abs_usize;
+ MPN_NORMALIZE (wp, wsize);
+ if (usize < 0)
+ wsize = -wsize;
+ }
+ else if (mpn_cmp (up, vp, abs_usize) < 0)
+ {
+ mpn_sub_n (wp, vp, up, abs_usize);
+ wsize = abs_usize;
+ MPN_NORMALIZE (wp, wsize);
+ if (usize >= 0)
+ wsize = -wsize;
+ }
+ else
+ {
+ mpn_sub_n (wp, up, vp, abs_usize);
+ wsize = abs_usize;
+ MPN_NORMALIZE (wp, wsize);
+ if (usize < 0)
+ wsize = -wsize;
+ }
+ }
+ else
+ {
+ /* U and V have same sign. Add them. */
+ mp_limb_t cy_limb = mpn_add (wp, up, abs_usize, vp, abs_vsize);
+ wp[abs_usize] = cy_limb;
+ wsize = abs_usize + cy_limb;
+ if (usize < 0)
+ wsize = -wsize;
+ }
+
+ w->_mp_size = wsize;
+}
diff --git a/rts/gmp/mpz/add_ui.c b/rts/gmp/mpz/add_ui.c
new file mode 100644
index 0000000000..28dbd71f45
--- /dev/null
+++ b/rts/gmp/mpz/add_ui.c
@@ -0,0 +1,84 @@
+/* mpz_add_ui -- Add an mpz_t and an unsigned one-word integer.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_add_ui (mpz_ptr w, mpz_srcptr u, unsigned long int v)
+#else
+mpz_add_ui (w, u, v)
+ mpz_ptr w;
+ mpz_srcptr u;
+ unsigned long int v;
+#endif
+{
+ mp_srcptr up;
+ mp_ptr wp;
+ mp_size_t usize, wsize;
+ mp_size_t abs_usize;
+
+ usize = u->_mp_size;
+ abs_usize = ABS (usize);
+
+ /* If not space for W (and possible carry), increase space. */
+ wsize = abs_usize + 1;
+ if (w->_mp_alloc < wsize)
+ _mpz_realloc (w, wsize);
+
+ /* These must be after realloc (U may be the same as W). */
+ up = u->_mp_d;
+ wp = w->_mp_d;
+
+ if (abs_usize == 0)
+ {
+ wp[0] = v;
+ w->_mp_size = v != 0;
+ return;
+ }
+
+ if (usize >= 0)
+ {
+ mp_limb_t cy;
+ cy = mpn_add_1 (wp, up, abs_usize, (mp_limb_t) v);
+ wp[abs_usize] = cy;
+ wsize = abs_usize + cy;
+ }
+ else
+ {
+ /* The signs are different. Need exact comparison to determine
+ which operand to subtract from which. */
+ if (abs_usize == 1 && up[0] < v)
+ {
+ wp[0] = v - up[0];
+ wsize = 1;
+ }
+ else
+ {
+ mpn_sub_1 (wp, up, abs_usize, (mp_limb_t) v);
+ /* Size can decrease with at most one limb. */
+ wsize = -(abs_usize - (wp[abs_usize - 1] == 0));
+ }
+ }
+
+ w->_mp_size = wsize;
+}
diff --git a/rts/gmp/mpz/addmul_ui.c b/rts/gmp/mpz/addmul_ui.c
new file mode 100644
index 0000000000..7b38d3624d
--- /dev/null
+++ b/rts/gmp/mpz/addmul_ui.c
@@ -0,0 +1,214 @@
+/* mpz_addmul_ui(prodsum, multiplier, small_multiplicand) --
+ Add MULTIPLIER times SMALL_MULTIPLICAND to PRODSUM.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+static mp_limb_t mpn_neg1 _PROTO ((mp_ptr, mp_size_t));
+
+#if 0
+#undef MPN_NORMALIZE
+#define MPN_NORMALIZE(DST, NLIMBS) \
+ do { \
+ while (--(NLIMBS) >= 0 && (DST)[NLIMBS] == 0) \
+ ; \
+ (NLIMBS)++; \
+ } while (0)
+#undef MPN_NORMALIZE_NOT_ZERO
+#define MPN_NORMALIZE_NOT_ZERO(DST, NLIMBS) \
+ do { \
+ while ((DST)[--(NLIMBS)] == 0) \
+ ; \
+ (NLIMBS)++; \
+ } while (0)
+#endif
+
+void
+#if __STDC__
+mpz_addmul_ui (mpz_ptr rz, mpz_srcptr az, unsigned long int bu)
+#else
+mpz_addmul_ui (rz, az, bu)
+ mpz_ptr rz;
+ mpz_srcptr az;
+ unsigned long int bu;
+#endif
+{
+ mp_size_t rn, an;
+ mp_ptr rp, ap;
+
+ an = SIZ (az);
+
+ /* If either multiplier is zero, result is unaffected. */
+ if (bu == 0 || an == 0)
+ return;
+
+ rn = SIZ (rz);
+
+ if (rn == 0)
+ {
+ mp_limb_t cy;
+
+ an = ABS (an);
+ if (ALLOC (rz) <= an)
+ _mpz_realloc (rz, an + 1);
+ rp = PTR (rz);
+ ap = PTR (az);
+ cy = mpn_mul_1 (rp, ap, an, (mp_limb_t) bu);
+ rp[an] = cy;
+ an += cy != 0;
+ SIZ (rz) = SIZ (az) >= 0 ? an : -an;
+ return;
+ }
+
+ if ((an ^ rn) >= 0)
+ {
+ /* Sign of operands are the same--really add. */
+ an = ABS (an);
+ rn = ABS (rn);
+ if (rn > an)
+ {
+ mp_limb_t cy;
+ if (ALLOC (rz) <= rn)
+ _mpz_realloc (rz, rn + 1);
+ rp = PTR (rz);
+ ap = PTR (az);
+ cy = mpn_addmul_1 (rp, ap, an, (mp_limb_t) bu);
+ cy = mpn_add_1 (rp + an, rp + an, rn - an, cy);
+ rp[rn] = cy;
+ rn += cy != 0;
+ SIZ (rz) = SIZ (rz) >= 0 ? rn : -rn;
+ return;
+ }
+ else
+ {
+ mp_limb_t cy;
+ if (ALLOC (rz) <= an)
+ _mpz_realloc (rz, an + 1);
+ rp = PTR (rz);
+ ap = PTR (az);
+ cy = mpn_addmul_1 (rp, ap, rn, (mp_limb_t) bu);
+ if (an != rn)
+ {
+ mp_limb_t cy2;
+ cy2 = mpn_mul_1 (rp + rn, ap + rn, an - rn, (mp_limb_t) bu);
+ cy = cy2 + mpn_add_1 (rp + rn, rp + rn, an - rn, cy);
+ }
+ rn = an;
+ rp[rn] = cy;
+ rn += cy != 0;
+ SIZ (rz) = SIZ (rz) >= 0 ? rn : -rn;
+ return;
+ }
+ }
+ else
+ {
+ /* Sign of operands are different--actually subtract. */
+ an = ABS (an);
+ rn = ABS (rn);
+ if (rn > an)
+ {
+ mp_limb_t cy;
+ rp = PTR (rz);
+ ap = PTR (az);
+ cy = mpn_submul_1 (rp, ap, an, (mp_limb_t) bu);
+ cy = mpn_sub_1 (rp + an, rp + an, rn - an, cy);
+ if (cy != 0)
+ {
+ mpn_neg1 (rp, rn);
+ MPN_NORMALIZE_NOT_ZERO (rp, rn);
+ }
+ else
+ {
+ MPN_NORMALIZE (rp, rn);
+ rn = -rn;
+ }
+
+ SIZ (rz) = SIZ (rz) >= 0 ? -rn : rn;
+ return;
+ }
+ else
+ {
+ /* Tricky case. We need to subtract an operand that might be larger
+ than the minuend. To avoid allocating temporary space, we compute
+ a*b-r instead of r-a*b and then negate. */
+ mp_limb_t cy;
+ if (ALLOC (rz) <= an)
+ _mpz_realloc (rz, an + 1);
+ rp = PTR (rz);
+ ap = PTR (az);
+ cy = mpn_submul_1 (rp, ap, rn, (mp_limb_t) bu);
+ if (an != rn)
+ {
+ mp_limb_t cy2;
+ cy -= mpn_neg1 (rp, rn);
+ cy2 = mpn_mul_1 (rp + rn, ap + rn, an - rn, (mp_limb_t) bu);
+ if (cy == ~(mp_limb_t) 0)
+ cy = cy2 - mpn_sub_1 (rp + rn, rp + rn, an - rn, (mp_limb_t) 1);
+ else
+ cy = cy2 + mpn_add_1 (rp + rn, rp + rn, an - rn, cy);
+ rp[an] = cy;
+ rn = an + (cy != 0);
+ rn -= rp[rn - 1] == 0;
+ }
+ else if (cy != 0)
+ {
+ cy -= mpn_neg1 (rp, rn);
+ rp[an] = cy;
+ rn = an + 1;
+ MPN_NORMALIZE_NOT_ZERO (rp, rn);
+ }
+ else
+ {
+ rn = an;
+ MPN_NORMALIZE (rp, rn);
+ rn = -rn;
+ }
+
+ SIZ (rz) = SIZ (rz) >= 0 ? -rn : rn;
+ return;
+ }
+ }
+}
+
+static mp_limb_t
+#if __STDC__
+mpn_neg1 (mp_ptr rp, mp_size_t rn)
+#else
+mpn_neg1 (rp, rn)
+ mp_ptr rp;
+ mp_size_t rn;
+#endif
+{
+ mp_size_t i;
+
+ while (rn != 0 && rp[0] == 0)
+ rp++, rn--;
+
+ if (rn != 0)
+ {
+ rp[0] = -rp[0];
+ for (i = 1; i < rn; i++)
+ rp[i] = ~rp[i];
+ return 1;
+ }
+ return 0;
+}
diff --git a/rts/gmp/mpz/and.c b/rts/gmp/mpz/and.c
new file mode 100644
index 0000000000..354e9455bf
--- /dev/null
+++ b/rts/gmp/mpz/and.c
@@ -0,0 +1,278 @@
+/* mpz_and -- Logical and.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_and (mpz_ptr res, mpz_srcptr op1, mpz_srcptr op2)
+#else
+mpz_and (res, op1, op2)
+ mpz_ptr res;
+ mpz_srcptr op1;
+ mpz_srcptr op2;
+#endif
+{
+ mp_srcptr op1_ptr, op2_ptr;
+ mp_size_t op1_size, op2_size;
+ mp_ptr res_ptr;
+ mp_size_t res_size;
+ mp_size_t i;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ op1_size = op1->_mp_size;
+ op2_size = op2->_mp_size;
+
+ op1_ptr = op1->_mp_d;
+ op2_ptr = op2->_mp_d;
+ res_ptr = res->_mp_d;
+
+ if (op1_size >= 0)
+ {
+ if (op2_size >= 0)
+ {
+ res_size = MIN (op1_size, op2_size);
+ /* First loop finds the size of the result. */
+ for (i = res_size - 1; i >= 0; i--)
+ if ((op1_ptr[i] & op2_ptr[i]) != 0)
+ break;
+ res_size = i + 1;
+
+ /* Handle allocation, now that we know exactly how much space is
+ needed for the result. */
+ if (res->_mp_alloc < res_size)
+ {
+ _mpz_realloc (res, res_size);
+ op1_ptr = op1->_mp_d;
+ op2_ptr = op2->_mp_d;
+ res_ptr = res->_mp_d;
+ }
+
+ /* Second loop computes the real result. */
+ for (i = res_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] & op2_ptr[i];
+
+ res->_mp_size = res_size;
+ return;
+ }
+ else /* op2_size < 0 */
+ {
+ /* Fall through to the code at the end of the function. */
+ }
+ }
+ else
+ {
+ if (op2_size < 0)
+ {
+ mp_ptr opx;
+ mp_limb_t cy;
+ mp_size_t res_alloc;
+
+ /* Both operands are negative, so will be the result.
+ -((-OP1) & (-OP2)) = -(~(OP1 - 1) & ~(OP2 - 1)) =
+ = ~(~(OP1 - 1) & ~(OP2 - 1)) + 1 =
+ = ((OP1 - 1) | (OP2 - 1)) + 1 */
+
+ /* It might seem as we could end up with an (invalid) result with
+ a leading zero-limb here when one of the operands is of the
+ type 1,,0,,..,,.0. But some analysis shows that we surely
+ would get carry into the zero-limb in this situation... */
+
+ op1_size = -op1_size;
+ op2_size = -op2_size;
+
+ res_alloc = 1 + MAX (op1_size, op2_size);
+
+ opx = (mp_ptr) TMP_ALLOC (op1_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op1_ptr, op1_size, (mp_limb_t) 1);
+ op1_ptr = opx;
+
+ opx = (mp_ptr) TMP_ALLOC (op2_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op2_ptr, op2_size, (mp_limb_t) 1);
+ op2_ptr = opx;
+
+ if (res->_mp_alloc < res_alloc)
+ {
+ _mpz_realloc (res, res_alloc);
+ res_ptr = res->_mp_d;
+ /* Don't re-read OP1_PTR and OP2_PTR. They point to
+ temporary space--never to the space RES->_mp_d used
+ to point to before reallocation. */
+ }
+
+ if (op1_size >= op2_size)
+ {
+ MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size,
+ op1_size - op2_size);
+ for (i = op2_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] | op2_ptr[i];
+ res_size = op1_size;
+ }
+ else
+ {
+ MPN_COPY (res_ptr + op1_size, op2_ptr + op1_size,
+ op2_size - op1_size);
+ for (i = op1_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] | op2_ptr[i];
+ res_size = op2_size;
+ }
+
+ cy = mpn_add_1 (res_ptr, res_ptr, res_size, (mp_limb_t) 1);
+ if (cy)
+ {
+ res_ptr[res_size] = cy;
+ res_size++;
+ }
+
+ res->_mp_size = -res_size;
+ TMP_FREE (marker);
+ return;
+ }
+ else
+ {
+ /* We should compute -OP1 & OP2. Swap OP1 and OP2 and fall
+ through to the code that handles OP1 & -OP2. */
+ MPZ_SRCPTR_SWAP (op1, op2);
+ MPN_SRCPTR_SWAP (op1_ptr,op1_size, op2_ptr,op2_size);
+ }
+
+ }
+
+ {
+#if ANDNEW
+ mp_size_t op2_lim;
+ mp_size_t count;
+
+ /* OP2 must be negated as with infinite precision.
+
+ Scan from the low end for a non-zero limb. The first non-zero
+ limb is simply negated (two's complement). Any subsequent
+ limbs are one's complemented. Of course, we don't need to
+ handle more limbs than there are limbs in the other, positive
+ operand as the result for those limbs is going to become zero
+ anyway. */
+
+ /* Scan for the least significant non-zero OP2 limb, and zero the
+ result meanwhile for those limb positions. (We will surely
+ find a non-zero limb, so we can write the loop with one
+ termination condition only.) */
+ for (i = 0; op2_ptr[i] == 0; i++)
+ res_ptr[i] = 0;
+ op2_lim = i;
+
+ op2_size = -op2_size;
+
+ if (op1_size <= op2_size)
+ {
+ /* The ones-extended OP2 is >= than the zero-extended OP1.
+ RES_SIZE <= OP1_SIZE. Find the exact size. */
+ for (i = op1_size - 1; i > op2_lim; i--)
+ if ((op1_ptr[i] & ~op2_ptr[i]) != 0)
+ break;
+ res_size = i + 1;
+ for (i = res_size - 1; i > op2_lim; i--)
+ res_ptr[i] = op1_ptr[i] & ~op2_ptr[i];
+ res_ptr[op2_lim] = op1_ptr[op2_lim] & -op2_ptr[op2_lim];
+ /* Yes, this *can* happen! */
+ MPN_NORMALIZE (res_ptr, res_size);
+ }
+ else
+ {
+ /* The ones-extended OP2 is < than the zero-extended OP1.
+ RES_SIZE == OP1_SIZE, since OP1 is normalized. */
+ res_size = op1_size;
+ MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size, op1_size - op2_size);
+ for (i = op2_size - 1; i > op2_lim; i--)
+ res_ptr[i] = op1_ptr[i] & ~op2_ptr[i];
+ res_ptr[op2_lim] = op1_ptr[op2_lim] & -op2_ptr[op2_lim];
+ }
+
+ res->_mp_size = res_size;
+#else
+
+ /* OP1 is positive and zero-extended,
+ OP2 is negative and ones-extended.
+ The result will be positive.
+ OP1 & -OP2 = OP1 & ~(OP2 - 1). */
+
+ mp_ptr opx;
+
+ op2_size = -op2_size;
+ opx = (mp_ptr) TMP_ALLOC (op2_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op2_ptr, op2_size, (mp_limb_t) 1);
+ op2_ptr = opx;
+
+ if (op1_size > op2_size)
+ {
+ /* The result has the same size as OP1, since OP1 is normalized
+ and longer than the ones-extended OP2. */
+ res_size = op1_size;
+
+ /* Handle allocation, now that we know exactly how much space is
+ needed for the result. */
+ if (res->_mp_alloc < res_size)
+ {
+ _mpz_realloc (res, res_size);
+ res_ptr = res->_mp_d;
+ op1_ptr = op1->_mp_d;
+ /* Don't re-read OP2_PTR. It points to temporary space--never
+ to the space RES->_mp_d used to point to before reallocation. */
+ }
+
+ MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size,
+ res_size - op2_size);
+ for (i = op2_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] & ~op2_ptr[i];
+
+ res->_mp_size = res_size;
+ }
+ else
+ {
+ /* Find out the exact result size. Ignore the high limbs of OP2,
+ OP1 is zero-extended and would make the result zero. */
+ for (i = op1_size - 1; i >= 0; i--)
+ if ((op1_ptr[i] & ~op2_ptr[i]) != 0)
+ break;
+ res_size = i + 1;
+
+ /* Handle allocation, now that we know exactly how much space is
+ needed for the result. */
+ if (res->_mp_alloc < res_size)
+ {
+ _mpz_realloc (res, res_size);
+ res_ptr = res->_mp_d;
+ op1_ptr = op1->_mp_d;
+ /* Don't re-read OP2_PTR. It points to temporary space--never
+ to the space RES->_mp_d used to point to before reallocation. */
+ }
+
+ for (i = res_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] & ~op2_ptr[i];
+
+ res->_mp_size = res_size;
+ }
+#endif
+ }
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/array_init.c b/rts/gmp/mpz/array_init.c
new file mode 100644
index 0000000000..1c22046986
--- /dev/null
+++ b/rts/gmp/mpz/array_init.c
@@ -0,0 +1,48 @@
+/* mpz_array_init (array, array_size, size_per_elem) --
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_array_init (mpz_ptr arr, mp_size_t arr_size, mp_size_t nbits)
+#else
+mpz_array_init (arr, arr_size, nbits)
+ mpz_ptr arr;
+ mp_size_t arr_size;
+ mp_size_t nbits;
+#endif
+{
+ register mp_ptr p;
+ register size_t i;
+ mp_size_t nlimbs;
+
+ nlimbs = (nbits + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB;
+ p = (mp_ptr) (*_mp_allocate_func) (arr_size * nlimbs * BYTES_PER_MP_LIMB);
+
+ for (i = 0; i < arr_size; i++)
+ {
+ arr[i]._mp_alloc = nlimbs + 1; /* Yes, lie a little... */
+ arr[i]._mp_size = 0;
+ arr[i]._mp_d = p + i * nlimbs;
+ }
+}
diff --git a/rts/gmp/mpz/bin_ui.c b/rts/gmp/mpz/bin_ui.c
new file mode 100644
index 0000000000..a7a6c98218
--- /dev/null
+++ b/rts/gmp/mpz/bin_ui.c
@@ -0,0 +1,141 @@
+/* mpz_bin_ui - compute n over k.
+
+Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* This is a poor implementation. Look at bin_uiui.c for improvement ideas.
+ In fact consider calling mpz_bin_uiui() when the arguments fit, leaving
+ the code here only for big n.
+
+ The identity bin(n,k) = (-1)^k * bin(-n+k-1,k) can be found in Knuth vol
+ 1 section 1.2.6 part G. */
+
+
+/* Enhancement: use mpn_divexact_1 when it exists */
+#define DIVIDE() \
+ ASSERT (SIZ(r) > 0); \
+ ASSERT_NOCARRY (mpn_divrem_1 (PTR(r), (mp_size_t) 0, \
+ PTR(r), SIZ(r), kacc)); \
+ SIZ(r) -= (PTR(r)[SIZ(r)-1] == 0);
+
+void
+#if __STDC__
+mpz_bin_ui (mpz_ptr r, mpz_srcptr n, unsigned long int k)
+#else
+mpz_bin_ui (r, n, k)
+ mpz_ptr r;
+ mpz_srcptr n;
+ unsigned long int k;
+#endif
+{
+ mpz_t ni;
+ mp_limb_t i;
+ mpz_t nacc;
+ mp_limb_t kacc;
+ mp_size_t negate;
+
+ if (mpz_sgn (n) < 0)
+ {
+ /* bin(n,k) = (-1)^k * bin(-n+k-1,k), and set ni = -n+k-1 - k = -n-1 */
+ mpz_init (ni);
+ mpz_neg (ni, n);
+ mpz_sub_ui (ni, ni, 1L);
+ negate = (k & 1); /* (-1)^k */
+ }
+ else
+ {
+ /* bin(n,k) == 0 if k>n
+ (no test for this under the n<0 case, since -n+k-1 >= k there) */
+ if (mpz_cmp_ui (n, k) < 0)
+ {
+ mpz_set_ui (r, 0L);
+ return;
+ }
+
+ /* set ni = n-k */
+ mpz_init (ni);
+ mpz_sub_ui (ni, n, k);
+ negate = 0;
+ }
+
+ /* Now wanting bin(ni+k,k), with ni positive, and "negate" is the sign (0
+ for positive, 1 for negative). */
+ mpz_set_ui (r, 1L);
+
+ /* Rewrite bin(n,k) as bin(n,n-k) if that is smaller. In this case it's
+ whether ni+k-k < k meaning ni<k, and if so change to denominator ni+k-k
+ = ni, and new ni of ni+k-ni = k. */
+ if (mpz_cmp_ui (ni, k) < 0)
+ {
+ unsigned long tmp;
+ tmp = k;
+ k = mpz_get_ui (ni);
+ mpz_set_ui (ni, tmp);
+ }
+
+ kacc = 1;
+ mpz_init_set_ui (nacc, 1);
+
+ for (i = 1; i <= k; i++)
+ {
+ mp_limb_t k1, k0;
+
+#if 0
+ mp_limb_t nacclow;
+ int c;
+
+ nacclow = PTR(nacc)[0];
+ for (c = 0; (((kacc | nacclow) & 1) == 0); c++)
+ {
+ kacc >>= 1;
+ nacclow >>= 1;
+ }
+ mpz_div_2exp (nacc, nacc, c);
+#endif
+
+ mpz_add_ui (ni, ni, 1);
+ mpz_mul (nacc, nacc, ni);
+ umul_ppmm (k1, k0, kacc, i);
+ if (k1 != 0)
+ {
+ /* Accumulator overflow. Perform bignum step. */
+ mpz_mul (r, r, nacc);
+ mpz_set_ui (nacc, 1);
+ DIVIDE ();
+ kacc = i;
+ }
+ else
+ {
+ /* Save new products in accumulators to keep accumulating. */
+ kacc = k0;
+ }
+ }
+
+ mpz_mul (r, r, nacc);
+ DIVIDE ();
+ SIZ(r) = (SIZ(r) ^ -negate) + negate;
+
+ mpz_clear (nacc);
+ mpz_clear (ni);
+}
diff --git a/rts/gmp/mpz/bin_uiui.c b/rts/gmp/mpz/bin_uiui.c
new file mode 100644
index 0000000000..b37541ba54
--- /dev/null
+++ b/rts/gmp/mpz/bin_uiui.c
@@ -0,0 +1,120 @@
+/* mpz_bin_uiui - compute n over k.
+
+Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Avoid reallocs by rounding up any new size */
+#define ROUNDUP_MASK 15
+
+/* Enhancement: use mpn_divexact_1 when it exists */
+#define MULDIV() \
+ MPZ_REALLOC (r, (SIZ(r)+1)|ROUNDUP_MASK); \
+ PTR(r)[SIZ(r)] = mpn_mul_1 (PTR(r), PTR(r), SIZ(r), nacc); \
+ ASSERT_NOCARRY (mpn_divrem_1 (PTR(r), (mp_size_t) 0, \
+ PTR(r), SIZ(r)+1, kacc)); \
+ SIZ(r) += (PTR(r)[SIZ(r)] != 0);
+
+void
+#if __STDC__
+mpz_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
+#else
+mpz_bin_uiui (r, n, k)
+ mpz_ptr r;
+ unsigned long int n;
+ unsigned long int k;
+#endif
+{
+ unsigned long int i, j;
+ mp_limb_t nacc, kacc;
+ unsigned long int cnt;
+
+ /* bin(n,k) = 0 if k>n. */
+ if (n < k)
+ {
+ mpz_set_ui (r, 0);
+ return;
+ }
+
+ /* Rewrite bin(n,k) as bin(n,n-k) if that is smaller. */
+ k = MIN (k, n-k);
+
+ /* bin(n,0) = 1 */
+ if (k == 0)
+ {
+ mpz_set_ui (r, 1);
+ return;
+ }
+
+ j = n - k + 1;
+ mpz_set_ui (r, j);
+
+ /* Initialize accumulators. */
+ nacc = 1;
+ kacc = 1;
+
+ cnt = 0;
+ for (i = 2; i <= k; i++)
+ {
+ mp_limb_t n1, n0, k1, k0;
+
+ j++;
+#if 0
+ /* Remove common multiples of 2. This will allow us to accumulate
+ more in nacc and kacc before we need a bignum step. It would make
+ sense to cancel factors of 3, 5, etc too, but this would be best
+ handled by sieving out factors. Alternatively, we could perform a
+ gcd of the accumulators just as they have overflown, and keep
+ accumulating until the gcd doesn't remove a significant factor. */
+ while (((nacc | kacc) & 1) == 0)
+ {
+ nacc >>= 1;
+ kacc >>= 1;
+ }
+#else
+ cnt = ((nacc | kacc) & 1) ^ 1;
+ nacc >>= cnt;
+ kacc >>= cnt;
+#endif
+ /* Accumulate next multiples. */
+ umul_ppmm (n1, n0, nacc, j);
+ umul_ppmm (k1, k0, kacc, i);
+ if (n1 != 0)
+ {
+ /* Accumulator overflow. Perform bignum step. */
+ MULDIV ();
+ nacc = j;
+ kacc = i;
+ }
+ else
+ {
+ if (k1 != 0) abort ();
+ /* Save new products in accumulators to keep accumulating. */
+ nacc = n0;
+ kacc = k0;
+ }
+ }
+
+ /* Take care of whatever is left in accumulators. */
+ MULDIV ();
+}
diff --git a/rts/gmp/mpz/cdiv_q.c b/rts/gmp/mpz/cdiv_q.c
new file mode 100644
index 0000000000..b15ba8aaa9
--- /dev/null
+++ b/rts/gmp/mpz/cdiv_q.c
@@ -0,0 +1,51 @@
+/* mpz_cdiv_q -- Division rounding the quotient towards +infinity. The
+ remainder gets the opposite sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_cdiv_q (mpz_ptr quot, mpz_srcptr dividend, mpz_srcptr divisor)
+#else
+mpz_cdiv_q (quot, dividend, divisor)
+ mpz_ptr quot;
+ mpz_srcptr dividend;
+ mpz_srcptr divisor;
+#endif
+{
+ mp_size_t dividend_size = dividend->_mp_size;
+ mp_size_t divisor_size = divisor->_mp_size;
+ mpz_t rem;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ MPZ_TMP_INIT (rem, ABS (divisor_size));
+
+ mpz_tdiv_qr (quot, rem, dividend, divisor);
+
+ if ((divisor_size ^ dividend_size) >= 0 && rem->_mp_size != 0)
+ mpz_add_ui (quot, quot, 1L);
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/cdiv_q_ui.c b/rts/gmp/mpz/cdiv_q_ui.c
new file mode 100644
index 0000000000..74f3a90b83
--- /dev/null
+++ b/rts/gmp/mpz/cdiv_q_ui.c
@@ -0,0 +1,67 @@
+/* mpz_cdiv_q_ui -- Division rounding the quotient towards +infinity. The
+ remainder gets the opposite sign as the denominator. In order to make it
+ always fit into the return type, the negative of the true remainder is
+ returned.
+
+Copyright (C) 1994, 1996, 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_cdiv_q_ui (mpz_ptr quot, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_cdiv_q_ui (quot, dividend, divisor)
+ mpz_ptr quot;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_ptr quot_ptr;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ if (quot->_mp_alloc < size)
+ _mpz_realloc (quot, size);
+
+ quot_ptr = quot->_mp_d;
+
+ remainder_limb = mpn_divmod_1 (quot_ptr, dividend->_mp_d, size,
+ (mp_limb_t) divisor);
+
+ if (remainder_limb != 0 && dividend_size >= 0)
+ {
+ mpn_incr_u (quot_ptr, (mp_limb_t) 1);
+ remainder_limb = divisor - remainder_limb;
+ }
+
+ size -= size != 0 && quot_ptr[size - 1] == 0;
+ quot->_mp_size = dividend_size >= 0 ? size : -size;
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/cdiv_qr.c b/rts/gmp/mpz/cdiv_qr.c
new file mode 100644
index 0000000000..29c7c41a4e
--- /dev/null
+++ b/rts/gmp/mpz/cdiv_qr.c
@@ -0,0 +1,64 @@
+/* mpz_cdiv_qr -- Division rounding the quotient towards +infinity. The
+ remainder gets the opposite sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_cdiv_qr (mpz_ptr quot, mpz_ptr rem, mpz_srcptr dividend, mpz_srcptr divisor)
+#else
+mpz_cdiv_qr (quot, rem, dividend, divisor)
+ mpz_ptr quot;
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ mpz_srcptr divisor;
+#endif
+{
+ mp_size_t divisor_size = divisor->_mp_size;
+ mp_size_t xsize;
+ mpz_t temp_divisor; /* N.B.: lives until function returns! */
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* We need the original value of the divisor after the quotient and
+ remainder have been preliminary calculated. We have to copy it to
+ temporary space if it's the same variable as either QUOT or REM. */
+ if (quot == divisor || rem == divisor)
+ {
+ MPZ_TMP_INIT (temp_divisor, ABS (divisor_size));
+ mpz_set (temp_divisor, divisor);
+ divisor = temp_divisor;
+ }
+
+ xsize = dividend->_mp_size ^ divisor_size;;
+ mpz_tdiv_qr (quot, rem, dividend, divisor);
+
+ if (xsize >= 0 && rem->_mp_size != 0)
+ {
+ mpz_add_ui (quot, quot, 1L);
+ mpz_sub (rem, rem, divisor);
+ }
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/cdiv_qr_ui.c b/rts/gmp/mpz/cdiv_qr_ui.c
new file mode 100644
index 0000000000..a7873c6e20
--- /dev/null
+++ b/rts/gmp/mpz/cdiv_qr_ui.c
@@ -0,0 +1,71 @@
+/* mpz_cdiv_qr_ui -- Division rounding the quotient towards +infinity. The
+ remainder gets the opposite sign as the denominator. In order to make it
+ always fit into the return type, the negative of the true remainder is
+ returned.
+
+Copyright (C) 1994, 1995, 1996, 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_cdiv_qr_ui (mpz_ptr quot, mpz_ptr rem, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_cdiv_qr_ui (quot, rem, dividend, divisor)
+ mpz_ptr quot;
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_ptr quot_ptr;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ if (quot->_mp_alloc < size)
+ _mpz_realloc (quot, size);
+
+ quot_ptr = quot->_mp_d;
+
+ remainder_limb = mpn_divmod_1 (quot_ptr, dividend->_mp_d, size,
+ (mp_limb_t) divisor);
+
+ if (remainder_limb != 0 && dividend_size >= 0)
+ {
+ mpn_incr_u (quot_ptr, (mp_limb_t) 1);
+ remainder_limb = divisor - remainder_limb;
+ }
+
+ size -= size != 0 && quot_ptr[size - 1] == 0;
+ quot->_mp_size = dividend_size >= 0 ? size : -size;
+
+ rem->_mp_d[0] = remainder_limb;
+ rem->_mp_size = -(remainder_limb != 0);
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/cdiv_r.c b/rts/gmp/mpz/cdiv_r.c
new file mode 100644
index 0000000000..e96ce7e677
--- /dev/null
+++ b/rts/gmp/mpz/cdiv_r.c
@@ -0,0 +1,59 @@
+/* mpz_cdiv_r -- Division rounding the quotient towards +infinity. The
+ remainder gets the opposite sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_cdiv_r (mpz_ptr rem, mpz_srcptr dividend, mpz_srcptr divisor)
+#else
+mpz_cdiv_r (rem, dividend, divisor)
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ mpz_srcptr divisor;
+#endif
+{
+ mp_size_t divisor_size = divisor->_mp_size;
+ mpz_t temp_divisor; /* N.B.: lives until function returns! */
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* We need the original value of the divisor after the remainder has been
+ preliminary calculated. We have to copy it to temporary space if it's
+ the same variable as REM. */
+ if (rem == divisor)
+ {
+
+ MPZ_TMP_INIT (temp_divisor, ABS (divisor_size));
+ mpz_set (temp_divisor, divisor);
+ divisor = temp_divisor;
+ }
+
+ mpz_tdiv_r (rem, dividend, divisor);
+
+ if ((divisor_size ^ dividend->_mp_size) >= 0 && rem->_mp_size != 0)
+ mpz_sub (rem, rem, divisor);
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/cdiv_r_ui.c b/rts/gmp/mpz/cdiv_r_ui.c
new file mode 100644
index 0000000000..e17e2381c0
--- /dev/null
+++ b/rts/gmp/mpz/cdiv_r_ui.c
@@ -0,0 +1,57 @@
+/* mpz_cdiv_r_ui -- Division rounding the quotient towards +infinity. The
+ remainder gets the opposite sign as the denominator. In order to make it
+ always fit into the return type, the negative of the true remainder is
+ returned.
+
+Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_cdiv_r_ui (mpz_ptr rem, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_cdiv_r_ui (rem, dividend, divisor)
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ remainder_limb = mpn_mod_1 (dividend->_mp_d, size, (mp_limb_t) divisor);
+
+ if (remainder_limb != 0 && dividend_size >= 0)
+ remainder_limb = divisor - remainder_limb;
+
+ rem->_mp_d[0] = remainder_limb;
+ rem->_mp_size = -(remainder_limb != 0);
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/cdiv_ui.c b/rts/gmp/mpz/cdiv_ui.c
new file mode 100644
index 0000000000..63547a78c0
--- /dev/null
+++ b/rts/gmp/mpz/cdiv_ui.c
@@ -0,0 +1,50 @@
+/* mpz_cdiv_ui -- Division rounding the quotient towards +infinity. The
+ remainder gets the opposite sign as the denominator. In order to make it
+ always fit into the return type, the negative of the true remainder is
+ returned.
+
+Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_cdiv_ui (mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_cdiv_ui (dividend, divisor)
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_limb_t remainder_limb;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ remainder_limb = mpn_mod_1 (dividend->_mp_d, size, (mp_limb_t) divisor);
+
+ if (remainder_limb != 0 && dividend_size >= 0)
+ remainder_limb = divisor - remainder_limb;
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/clear.c b/rts/gmp/mpz/clear.c
new file mode 100644
index 0000000000..5224553f9e
--- /dev/null
+++ b/rts/gmp/mpz/clear.c
@@ -0,0 +1,35 @@
+/* mpz_clear -- de-allocate the space occupied by the dynamic digit space of
+ an integer.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_clear (mpz_ptr m)
+#else
+mpz_clear (m)
+ mpz_ptr m;
+#endif
+{
+ (*_mp_free_func) (m->_mp_d, m->_mp_alloc * BYTES_PER_MP_LIMB);
+}
diff --git a/rts/gmp/mpz/clrbit.c b/rts/gmp/mpz/clrbit.c
new file mode 100644
index 0000000000..865d84902f
--- /dev/null
+++ b/rts/gmp/mpz/clrbit.c
@@ -0,0 +1,114 @@
+/* mpz_clrbit -- clear a specified bit.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_clrbit (mpz_ptr d, unsigned long int bit_index)
+#else
+mpz_clrbit (d, bit_index)
+ mpz_ptr d;
+ unsigned long int bit_index;
+#endif
+{
+ mp_size_t dsize = d->_mp_size;
+ mp_ptr dp = d->_mp_d;
+ mp_size_t limb_index;
+
+ limb_index = bit_index / BITS_PER_MP_LIMB;
+ if (dsize >= 0)
+ {
+ if (limb_index < dsize)
+ {
+ dp[limb_index] &= ~((mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB));
+ MPN_NORMALIZE (dp, dsize);
+ d->_mp_size = dsize;
+ }
+ else
+ ;
+ }
+ else
+ {
+ mp_size_t zero_bound;
+
+ /* Simulate two's complement arithmetic, i.e. simulate
+ 1. Set OP = ~(OP - 1) [with infinitely many leading ones].
+ 2. clear the bit.
+ 3. Set OP = ~OP + 1. */
+
+ dsize = -dsize;
+
+ /* No upper bound on this loop, we're sure there's a non-zero limb
+ sooner or later. */
+ for (zero_bound = 0; ; zero_bound++)
+ if (dp[zero_bound] != 0)
+ break;
+
+ if (limb_index > zero_bound)
+ {
+ if (limb_index < dsize)
+ dp[limb_index] |= (mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB);
+ else
+ {
+ /* Ugh. The bit should be cleared outside of the end of the
+ number. We have to increase the size of the number. */
+ if (d->_mp_alloc < limb_index + 1)
+ {
+ _mpz_realloc (d, limb_index + 1);
+ dp = d->_mp_d;
+ }
+ MPN_ZERO (dp + dsize, limb_index - dsize);
+ dp[limb_index] = (mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB);
+ d->_mp_size = -(limb_index + 1);
+ }
+ }
+ else if (limb_index == zero_bound)
+ {
+ dp[limb_index] = ((dp[limb_index] - 1)
+ | ((mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB))) + 1;
+ if (dp[limb_index] == 0)
+ {
+ mp_size_t i;
+ for (i = limb_index + 1; i < dsize; i++)
+ {
+ dp[i] += 1;
+ if (dp[i] != 0)
+ goto fin;
+ }
+ /* We got carry all way out beyond the end of D. Increase
+ its size (and allocation if necessary). */
+ dsize++;
+ if (d->_mp_alloc < dsize)
+ {
+ _mpz_realloc (d, dsize);
+ dp = d->_mp_d;
+ }
+ dp[i] = 1;
+ d->_mp_size = -dsize;
+ fin:;
+ }
+ }
+ else
+ ;
+ }
+}
diff --git a/rts/gmp/mpz/cmp.c b/rts/gmp/mpz/cmp.c
new file mode 100644
index 0000000000..60628348e5
--- /dev/null
+++ b/rts/gmp/mpz/cmp.c
@@ -0,0 +1,75 @@
+/* mpz_cmp(u,v) -- Compare U, V. Return positive, zero, or negative
+ based on if U > V, U == V, or U < V.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+#include "gmp.h"
+#include "gmp-impl.h"
+
+#ifndef BERKELEY_MP
+int
+#if __STDC__
+mpz_cmp (mpz_srcptr u, mpz_srcptr v)
+#else
+mpz_cmp (u, v)
+ mpz_srcptr u;
+ mpz_srcptr v;
+#endif
+#else /* BERKELEY_MP */
+int
+#if __STDC__
+mcmp (mpz_srcptr u, mpz_srcptr v)
+#else
+mcmp (u, v)
+ mpz_srcptr u;
+ mpz_srcptr v;
+#endif
+#endif /* BERKELEY_MP */
+{
+ mp_size_t usize = u->_mp_size;
+ mp_size_t vsize = v->_mp_size;
+ mp_size_t size;
+ mp_srcptr up, vp;
+ int cmp;
+
+ if (usize != vsize)
+ return usize - vsize;
+
+ if (usize == 0)
+ return 0;
+
+ size = ABS (usize);
+
+ up = u->_mp_d;
+ vp = v->_mp_d;
+
+ cmp = mpn_cmp (up, vp, size);
+
+ if (cmp == 0)
+ return 0;
+
+ if ((cmp < 0) == (usize < 0))
+ return 1;
+ else
+ return -1;
+}
diff --git a/rts/gmp/mpz/cmp_si.c b/rts/gmp/mpz/cmp_si.c
new file mode 100644
index 0000000000..0c2212fbe9
--- /dev/null
+++ b/rts/gmp/mpz/cmp_si.c
@@ -0,0 +1,64 @@
+/* mpz_cmp_si(u,v) -- Compare an integer U with a single-word int V.
+ Return positive, zero, or negative based on if U > V, U == V, or U < V.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+_mpz_cmp_si (mpz_srcptr u, signed long int v_digit)
+#else
+_mpz_cmp_si (u, v_digit)
+ mpz_srcptr u;
+ signed long int v_digit;
+#endif
+{
+ mp_size_t usize = u->_mp_size;
+ mp_size_t vsize;
+ mp_limb_t u_digit;
+
+ vsize = 0;
+ if (v_digit > 0)
+ vsize = 1;
+ else if (v_digit < 0)
+ {
+ vsize = -1;
+ v_digit = -v_digit;
+ }
+
+ if (usize != vsize)
+ return usize - vsize;
+
+ if (usize == 0)
+ return 0;
+
+ u_digit = u->_mp_d[0];
+
+ if (u_digit == (mp_limb_t) (unsigned long) v_digit)
+ return 0;
+
+ if (u_digit > (mp_limb_t) (unsigned long) v_digit)
+ return usize;
+ else
+ return -usize;
+}
diff --git a/rts/gmp/mpz/cmp_ui.c b/rts/gmp/mpz/cmp_ui.c
new file mode 100644
index 0000000000..fd84f301c1
--- /dev/null
+++ b/rts/gmp/mpz/cmp_ui.c
@@ -0,0 +1,53 @@
+/* mpz_cmp_ui.c -- Compare a mpz_t a with an mp_limb_t b. Return positive,
+ zero, or negative based on if a > b, a == b, or a < b.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+_mpz_cmp_ui (mpz_srcptr u, unsigned long int v_digit)
+#else
+_mpz_cmp_ui (u, v_digit)
+ mpz_srcptr u;
+ unsigned long int v_digit;
+#endif
+{
+ mp_size_t usize = u->_mp_size;
+
+ if (usize == 0)
+ return -(v_digit != 0);
+
+ if (usize == 1)
+ {
+ mp_limb_t u_digit;
+
+ u_digit = u->_mp_d[0];
+ if (u_digit > v_digit)
+ return 1;
+ if (u_digit < v_digit)
+ return -1;
+ return 0;
+ }
+
+ return (usize > 0) ? 1 : -1;
+}
diff --git a/rts/gmp/mpz/cmpabs.c b/rts/gmp/mpz/cmpabs.c
new file mode 100644
index 0000000000..037d7a9145
--- /dev/null
+++ b/rts/gmp/mpz/cmpabs.c
@@ -0,0 +1,57 @@
+/* mpz_cmpabs(u,v) -- Compare U, V. Return positive, zero, or negative
+ based on if U > V, U == V, or U < V.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_cmpabs (mpz_srcptr u, mpz_srcptr v)
+#else
+mpz_cmpabs (u, v)
+ mpz_srcptr u;
+ mpz_srcptr v;
+#endif
+{
+ mp_size_t usize = u->_mp_size;
+ mp_size_t vsize = v->_mp_size;
+ mp_size_t size;
+ mp_srcptr up, vp;
+ int cmp;
+
+ usize = ABS (usize);
+ vsize = ABS (vsize);
+
+ if (usize != vsize)
+ return usize - vsize;
+
+ if (usize == 0)
+ return 0;
+
+ up = u->_mp_d;
+ vp = v->_mp_d;
+
+ cmp = mpn_cmp (up, vp, usize);
+
+ return cmp;
+}
diff --git a/rts/gmp/mpz/cmpabs_ui.c b/rts/gmp/mpz/cmpabs_ui.c
new file mode 100644
index 0000000000..db816b5820
--- /dev/null
+++ b/rts/gmp/mpz/cmpabs_ui.c
@@ -0,0 +1,56 @@
+/* mpz_cmpabs_ui.c -- Compare a mpz_t a with an mp_limb_t b. Return positive,
+ zero, or negative based on if a > b, a == b, or a < b.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_cmpabs_ui (mpz_srcptr u, unsigned long int v_digit)
+#else
+mpz_cmpabs_ui (u, v_digit)
+ mpz_srcptr u;
+ unsigned long int v_digit;
+#endif
+{
+ mp_size_t usize = u->_mp_size;
+
+ if (usize == 0)
+ return -(v_digit != 0);
+
+ usize = ABS (usize);
+
+ if (usize == 1)
+ {
+ mp_limb_t u_digit;
+
+ u_digit = u->_mp_d[0];
+ if (u_digit > v_digit)
+ return 1;
+ if (u_digit < v_digit)
+ return -1;
+ return 0;
+ }
+
+ return 1;
+}
diff --git a/rts/gmp/mpz/com.c b/rts/gmp/mpz/com.c
new file mode 100644
index 0000000000..18d6427779
--- /dev/null
+++ b/rts/gmp/mpz/com.c
@@ -0,0 +1,93 @@
+/* mpz_com(mpz_ptr dst, mpz_ptr src) -- Assign the bit-complemented value of
+ SRC to DST.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_com (mpz_ptr dst, mpz_srcptr src)
+#else
+mpz_com (dst, src)
+ mpz_ptr dst;
+ mpz_srcptr src;
+#endif
+{
+ mp_size_t size = src->_mp_size;
+ mp_srcptr src_ptr;
+ mp_ptr dst_ptr;
+
+ if (size >= 0)
+ {
+ /* As with infinite precision: one's complement, two's complement.
+ But this can be simplified using the identity -x = ~x + 1.
+ So we're going to compute (~~x) + 1 = x + 1! */
+
+ if (dst->_mp_alloc < size + 1)
+ _mpz_realloc (dst, size + 1);
+
+ src_ptr = src->_mp_d;
+ dst_ptr = dst->_mp_d;
+
+ if (size == 0)
+ {
+ /* Special case, as mpn_add wants the first arg's size >= the
+ second arg's size. */
+ dst_ptr[0] = 1;
+ dst->_mp_size = -1;
+ return;
+ }
+
+ {
+ mp_limb_t cy;
+
+ cy = mpn_add_1 (dst_ptr, src_ptr, size, (mp_limb_t) 1);
+ if (cy)
+ {
+ dst_ptr[size] = cy;
+ size++;
+ }
+ }
+
+ /* Store a negative size, to indicate ones-extension. */
+ dst->_mp_size = -size;
+ }
+ else
+ {
+ /* As with infinite precision: two's complement, then one's complement.
+ But that can be simplified using the identity -x = ~(x - 1).
+ So we're going to compute ~~(x - 1) = x - 1! */
+ size = -size;
+
+ if (dst->_mp_alloc < size)
+ _mpz_realloc (dst, size);
+
+ src_ptr = src->_mp_d;
+ dst_ptr = dst->_mp_d;
+
+ mpn_sub_1 (dst_ptr, src_ptr, size, (mp_limb_t) 1);
+ size -= dst_ptr[size - 1] == 0;
+
+ /* Store a positive size, to indicate zero-extension. */
+ dst->_mp_size = size;
+ }
+}
diff --git a/rts/gmp/mpz/divexact.c b/rts/gmp/mpz/divexact.c
new file mode 100644
index 0000000000..c2970454fd
--- /dev/null
+++ b/rts/gmp/mpz/divexact.c
@@ -0,0 +1,125 @@
+/* mpz_divexact -- finds quotient when known that quot * den == num && den != 0.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* Ken Weber (kweber@mat.ufrgs.br, kweber@mcs.kent.edu)
+
+ Funding for this work has been partially provided by Conselho Nacional
+ de Desenvolvimento Cienti'fico e Tecnolo'gico (CNPq) do Brazil, Grant
+ 301314194-2, and was done while I was a visiting researcher in the Instituto
+ de Matema'tica at Universidade Federal do Rio Grande do Sul (UFRGS).
+
+ References:
+ T. Jebelean, An algorithm for exact division, Journal of Symbolic
+ Computation, v. 15, 1993, pp. 169-180. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+#if __STDC__
+mpz_divexact (mpz_ptr quot, mpz_srcptr num, mpz_srcptr den)
+#else
+mpz_divexact (quot, num, den)
+ mpz_ptr quot;
+ mpz_srcptr num;
+ mpz_srcptr den;
+#endif
+{
+ mp_ptr qp, tp;
+ mp_size_t qsize, tsize;
+ mp_srcptr np, dp;
+ mp_size_t nsize, dsize;
+ TMP_DECL (marker);
+
+ nsize = ABS (num->_mp_size);
+ dsize = ABS (den->_mp_size);
+
+ qsize = nsize - dsize + 1;
+ if (quot->_mp_alloc < qsize)
+ _mpz_realloc (quot, qsize);
+
+ np = num->_mp_d;
+ dp = den->_mp_d;
+ qp = quot->_mp_d;
+
+ if (nsize == 0)
+ {
+ if (dsize == 0)
+ DIVIDE_BY_ZERO;
+ quot->_mp_size = 0;
+ return;
+ }
+
+ if (dsize <= 1)
+ {
+ if (dsize == 1)
+ {
+ mpn_divmod_1 (qp, np, nsize, dp[0]);
+ qsize -= qp[qsize - 1] == 0;
+ quot->_mp_size = (num->_mp_size ^ den->_mp_size) >= 0 ? qsize : -qsize;
+ return;
+ }
+
+ /* Generate divide-by-zero error since dsize == 0. */
+ DIVIDE_BY_ZERO;
+ }
+
+ TMP_MARK (marker);
+
+ /* QUOT <-- NUM/2^r, T <-- DEN/2^r where = r number of twos in DEN. */
+ while (dp[0] == 0)
+ np += 1, nsize -= 1, dp += 1, dsize -= 1;
+ tsize = MIN (qsize, dsize);
+ if ((dp[0] & 1) != 0)
+ {
+ if (quot == den) /* QUOT and DEN overlap. */
+ {
+ tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+ MPN_COPY (tp, dp, tsize);
+ }
+ else
+ tp = (mp_ptr) dp;
+ if (qp != np)
+ MPN_COPY_INCR (qp, np, qsize);
+ }
+ else
+ {
+ unsigned int r;
+ tp = (mp_ptr) TMP_ALLOC (tsize * BYTES_PER_MP_LIMB);
+ count_trailing_zeros (r, dp[0]);
+ mpn_rshift (tp, dp, tsize, r);
+ if (dsize > tsize)
+ tp[tsize - 1] |= dp[tsize] << (BITS_PER_MP_LIMB - r);
+ mpn_rshift (qp, np, qsize, r);
+ if (nsize > qsize)
+ qp[qsize - 1] |= np[qsize] << (BITS_PER_MP_LIMB - r);
+ }
+
+ /* Now QUOT <-- QUOT/T. */
+ mpn_bdivmod (qp, qp, qsize, tp, tsize, qsize * BITS_PER_MP_LIMB);
+ MPN_NORMALIZE (qp, qsize);
+
+ quot->_mp_size = (num->_mp_size ^ den->_mp_size) >= 0 ? qsize : -qsize;
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/dump.c b/rts/gmp/mpz/dump.c
new file mode 100644
index 0000000000..dc318ac8cf
--- /dev/null
+++ b/rts/gmp/mpz/dump.c
@@ -0,0 +1,44 @@
+/* mpz_dump - Dump an integer to stdout.
+
+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS NOT SAFE TO
+ CALL THIS FUNCTION DIRECTLY. IN FACT, IT IS ALMOST GUARANTEED THAT THIS
+ FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_dump (mpz_srcptr u)
+#else
+mpz_dump (u)
+ mpz_srcptr u;
+#endif
+{
+ char *str;
+
+ str = mpz_get_str (0, 10, u);
+ printf ("%s\n", str);
+ (*_mp_free_func) (str, 0);/* ??? broken alloc interface, pass what size ??? */
+}
diff --git a/rts/gmp/mpz/fac_ui.c b/rts/gmp/mpz/fac_ui.c
new file mode 100644
index 0000000000..85f40f271c
--- /dev/null
+++ b/rts/gmp/mpz/fac_ui.c
@@ -0,0 +1,157 @@
+/* mpz_fac_ui(result, n) -- Set RESULT to N!.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#ifdef DBG
+#include <stdio.h>
+#endif
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+#if __STDC__
+mpz_fac_ui (mpz_ptr result, unsigned long int n)
+#else
+mpz_fac_ui (result, n)
+ mpz_ptr result;
+ unsigned long int n;
+#endif
+{
+#if SIMPLE_FAC
+
+ /* Be silly. Just multiply the numbers in ascending order. O(n**2). */
+
+ unsigned long int k;
+
+ mpz_set_ui (result, 1L);
+
+ for (k = 2; k <= n; k++)
+ mpz_mul_ui (result, result, k);
+#else
+
+ /* Be smarter. Multiply groups of numbers in ascending order until the
+ product doesn't fit in a limb. Multiply these partial product in a
+ balanced binary tree fashion, to make the operand have as equal sizes
+ as possible. When the operands have about the same size, mpn_mul
+ becomes faster. */
+
+ unsigned long int p, k;
+ mp_limb_t p1, p0;
+
+ /* Stack of partial products, used to make the computation balanced
+ (i.e. make the sizes of the multiplication operands equal). The
+ topmost position of MP_STACK will contain a one-limb partial product,
+ the second topmost will contain a two-limb partial product, and so
+ on. MP_STACK[0] will contain a partial product with 2**t limbs.
+ To compute n! MP_STACK needs to be less than
+ log(n)**2/log(BITS_PER_MP_LIMB), so 30 is surely enough. */
+#define MP_STACK_SIZE 30
+ mpz_t mp_stack[MP_STACK_SIZE];
+
+ /* TOP is an index into MP_STACK, giving the topmost element.
+ TOP_LIMIT_SO_FAR is the largets value it has taken so far. */
+ int top, top_limit_so_far;
+
+ /* Count of the total number of limbs put on MP_STACK so far. This
+ variable plays an essential role in making the compututation balanced.
+ See below. */
+ unsigned int tree_cnt;
+
+ top = top_limit_so_far = -1;
+ tree_cnt = 0;
+ p = 1;
+ for (k = 2; k <= n; k++)
+ {
+ /* Multiply the partial product in P with K. */
+ umul_ppmm (p1, p0, (mp_limb_t) p, (mp_limb_t) k);
+
+ /* Did we get overflow into the high limb, i.e. is the partial
+ product now more than one limb? */
+ if (p1 != 0)
+ {
+ tree_cnt++;
+
+ if (tree_cnt % 2 == 0)
+ {
+ mp_size_t i;
+
+ /* TREE_CNT is even (i.e. we have generated an even number of
+ one-limb partial products), which means that we have a
+ single-limb product on the top of MP_STACK. */
+
+ mpz_mul_ui (mp_stack[top], mp_stack[top], p);
+
+ /* If TREE_CNT is divisable by 4, 8,..., we have two
+ similar-sized partial products with 2, 4,... limbs at
+ the topmost two positions of MP_STACK. Multiply them
+ to form a new partial product with 4, 8,... limbs. */
+ for (i = 4; (tree_cnt & (i - 1)) == 0; i <<= 1)
+ {
+ mpz_mul (mp_stack[top - 1],
+ mp_stack[top], mp_stack[top - 1]);
+ top--;
+ }
+ }
+ else
+ {
+ /* Put the single-limb partial product in P on the stack.
+ (The next time we get a single-limb product, we will
+ multiply the two together.) */
+ top++;
+ if (top > top_limit_so_far)
+ {
+ if (top > MP_STACK_SIZE)
+ abort();
+ /* The stack is now bigger than ever, initialize the top
+ element. */
+ mpz_init_set_ui (mp_stack[top], p);
+ top_limit_so_far++;
+ }
+ else
+ mpz_set_ui (mp_stack[top], p);
+ }
+
+ /* We ignored the last result from umul_ppmm. Put K in P as the
+ first component of the next single-limb partial product. */
+ p = k;
+ }
+ else
+ /* We didn't get overflow in umul_ppmm. Put p0 in P and try
+ with one more value of K. */
+ p = p0; /* bogus if long != mp_limb_t */
+ }
+
+ /* We have partial products in mp_stack[0..top], in descending order.
+ We also have a small partial product in p.
+ Their product is the final result. */
+ if (top < 0)
+ mpz_set_ui (result, p);
+ else
+ mpz_mul_ui (result, mp_stack[top--], p);
+ while (top >= 0)
+ mpz_mul (result, result, mp_stack[top--]);
+
+ /* Free the storage allocated for MP_STACK. */
+ for (top = top_limit_so_far; top >= 0; top--)
+ mpz_clear (mp_stack[top]);
+#endif
+}
diff --git a/rts/gmp/mpz/fdiv_q.c b/rts/gmp/mpz/fdiv_q.c
new file mode 100644
index 0000000000..9d75ca33d2
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_q.c
@@ -0,0 +1,51 @@
+/* mpz_fdiv_q -- Division rounding the quotient towards -infinity.
+ The remainder gets the same sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_fdiv_q (mpz_ptr quot, mpz_srcptr dividend, mpz_srcptr divisor)
+#else
+mpz_fdiv_q (quot, dividend, divisor)
+ mpz_ptr quot;
+ mpz_srcptr dividend;
+ mpz_srcptr divisor;
+#endif
+{
+ mp_size_t dividend_size = dividend->_mp_size;
+ mp_size_t divisor_size = divisor->_mp_size;
+ mpz_t rem;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ MPZ_TMP_INIT (rem, ABS (divisor_size));
+
+ mpz_tdiv_qr (quot, rem, dividend, divisor);
+
+ if ((divisor_size ^ dividend_size) < 0 && rem->_mp_size != 0)
+ mpz_sub_ui (quot, quot, 1L);
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/fdiv_q_2exp.c b/rts/gmp/mpz/fdiv_q_2exp.c
new file mode 100644
index 0000000000..8e02180ecc
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_q_2exp.c
@@ -0,0 +1,104 @@
+/* mpz_fdiv_q_2exp -- Divide an integer by 2**CNT. Round the quotient
+ towards -infinity.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1998, 1999 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_fdiv_q_2exp (mpz_ptr w, mpz_srcptr u, unsigned long int cnt)
+#else
+mpz_fdiv_q_2exp (w, u, cnt)
+ mpz_ptr w;
+ mpz_srcptr u;
+ unsigned long int cnt;
+#endif
+{
+ mp_size_t usize = u->_mp_size;
+ mp_size_t wsize;
+ mp_size_t abs_usize = ABS (usize);
+ mp_size_t limb_cnt;
+ mp_ptr wp;
+ mp_limb_t round = 0;
+
+ limb_cnt = cnt / BITS_PER_MP_LIMB;
+ wsize = abs_usize - limb_cnt;
+ if (wsize <= 0)
+ {
+ wp = w->_mp_d;
+ wsize = 0;
+ /* Set ROUND since we know we skip some non-zero words in this case.
+ Well, if U is zero, we don't, but then this will be taken care of
+ below, since rounding only really takes place for negative U. */
+ round = 1;
+ wp[0] = 1;
+ w->_mp_size = -(usize < 0);
+ return;
+ }
+ else
+ {
+ mp_size_t i;
+ mp_ptr up;
+
+ /* Make sure there is enough space. We make an extra limb
+ here to account for possible rounding at the end. */
+ if (w->_mp_alloc < wsize + 1)
+ _mpz_realloc (w, wsize + 1);
+
+ wp = w->_mp_d;
+ up = u->_mp_d;
+
+ /* Set ROUND if we are about skip some non-zero limbs. */
+ for (i = 0; i < limb_cnt && round == 0; i++)
+ round = up[i];
+
+ cnt %= BITS_PER_MP_LIMB;
+ if (cnt != 0)
+ {
+ round |= mpn_rshift (wp, up + limb_cnt, wsize, cnt);
+ wsize -= wp[wsize - 1] == 0;
+ }
+ else
+ {
+ MPN_COPY_INCR (wp, up + limb_cnt, wsize);
+ }
+ }
+
+ if (usize < 0 && round != 0)
+ {
+ mp_limb_t cy;
+ if (wsize != 0)
+ {
+ cy = mpn_add_1 (wp, wp, wsize, (mp_limb_t) 1);
+ wp[wsize] = cy;
+ wsize += cy;
+ }
+ else
+ {
+ /* We shifted something negative to zero. The result is -1. */
+ wp[0] = 1;
+ wsize = 1;
+ }
+ }
+ w->_mp_size = usize >= 0 ? wsize : -wsize;
+}
diff --git a/rts/gmp/mpz/fdiv_q_ui.c b/rts/gmp/mpz/fdiv_q_ui.c
new file mode 100644
index 0000000000..55d2498693
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_q_ui.c
@@ -0,0 +1,65 @@
+/* mpz_fdiv_q_ui -- Division rounding the quotient towards -infinity.
+ The remainder gets the same sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996, 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_fdiv_q_ui (mpz_ptr quot, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_fdiv_q_ui (quot, dividend, divisor)
+ mpz_ptr quot;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_ptr quot_ptr;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ if (quot->_mp_alloc < size)
+ _mpz_realloc (quot, size);
+
+ quot_ptr = quot->_mp_d;
+
+ remainder_limb = mpn_divmod_1 (quot_ptr, dividend->_mp_d, size,
+ (mp_limb_t) divisor);
+
+ if (remainder_limb != 0 && dividend_size < 0)
+ {
+ mpn_incr_u (quot_ptr, (mp_limb_t) 1);
+ remainder_limb = divisor - remainder_limb;
+ }
+
+ size -= size != 0 && quot_ptr[size - 1] == 0;
+ quot->_mp_size = dividend_size >= 0 ? size : -size;
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/fdiv_qr.c b/rts/gmp/mpz/fdiv_qr.c
new file mode 100644
index 0000000000..06ce50607b
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_qr.c
@@ -0,0 +1,64 @@
+/* mpz_fdiv_qr -- Division rounding the quotient towards -infinity.
+ The remainder gets the same sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_fdiv_qr (mpz_ptr quot, mpz_ptr rem, mpz_srcptr dividend, mpz_srcptr divisor)
+#else
+mpz_fdiv_qr (quot, rem, dividend, divisor)
+ mpz_ptr quot;
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ mpz_srcptr divisor;
+#endif
+{
+ mp_size_t divisor_size = divisor->_mp_size;
+ mp_size_t xsize;
+ mpz_t temp_divisor; /* N.B.: lives until function returns! */
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* We need the original value of the divisor after the quotient and
+ remainder have been preliminary calculated. We have to copy it to
+ temporary space if it's the same variable as either QUOT or REM. */
+ if (quot == divisor || rem == divisor)
+ {
+ MPZ_TMP_INIT (temp_divisor, ABS (divisor_size));
+ mpz_set (temp_divisor, divisor);
+ divisor = temp_divisor;
+ }
+
+ xsize = dividend->_mp_size ^ divisor_size;;
+ mpz_tdiv_qr (quot, rem, dividend, divisor);
+
+ if (xsize < 0 && rem->_mp_size != 0)
+ {
+ mpz_sub_ui (quot, quot, 1L);
+ mpz_add (rem, rem, divisor);
+ }
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/fdiv_qr_ui.c b/rts/gmp/mpz/fdiv_qr_ui.c
new file mode 100644
index 0000000000..600c0dacfc
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_qr_ui.c
@@ -0,0 +1,69 @@
+/* mpz_fdiv_qr_ui -- Division rounding the quotient towards -infinity.
+ The remainder gets the same sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996, 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_fdiv_qr_ui (mpz_ptr quot, mpz_ptr rem, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_fdiv_qr_ui (quot, rem, dividend, divisor)
+ mpz_ptr quot;
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_ptr quot_ptr;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ if (quot->_mp_alloc < size)
+ _mpz_realloc (quot, size);
+
+ quot_ptr = quot->_mp_d;
+
+ remainder_limb = mpn_divmod_1 (quot_ptr, dividend->_mp_d, size,
+ (mp_limb_t) divisor);
+
+ if (remainder_limb != 0 && dividend_size < 0)
+ {
+ mpn_incr_u (quot_ptr, (mp_limb_t) 1);
+ remainder_limb = divisor - remainder_limb;
+ }
+
+ size -= size != 0 && quot_ptr[size - 1] == 0;
+ quot->_mp_size = dividend_size >= 0 ? size : -size;
+
+ rem->_mp_d[0] = remainder_limb;
+ rem->_mp_size = remainder_limb != 0;
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/fdiv_r.c b/rts/gmp/mpz/fdiv_r.c
new file mode 100644
index 0000000000..a3652838d2
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_r.c
@@ -0,0 +1,58 @@
+/* mpz_fdiv_r -- Division rounding the quotient towards -infinity.
+ The remainder gets the same sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_fdiv_r (mpz_ptr rem, mpz_srcptr dividend, mpz_srcptr divisor)
+#else
+mpz_fdiv_r (rem, dividend, divisor)
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ mpz_srcptr divisor;
+#endif
+{
+ mp_size_t divisor_size = divisor->_mp_size;
+ mpz_t temp_divisor; /* N.B.: lives until function returns! */
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* We need the original value of the divisor after the remainder has been
+ preliminary calculated. We have to copy it to temporary space if it's
+ the same variable as REM. */
+ if (rem == divisor)
+ {
+ MPZ_TMP_INIT (temp_divisor, ABS (divisor_size));
+ mpz_set (temp_divisor, divisor);
+ divisor = temp_divisor;
+ }
+
+ mpz_tdiv_r (rem, dividend, divisor);
+
+ if ((divisor_size ^ dividend->_mp_size) < 0 && rem->_mp_size != 0)
+ mpz_add (rem, rem, divisor);
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/fdiv_r_2exp.c b/rts/gmp/mpz/fdiv_r_2exp.c
new file mode 100644
index 0000000000..081ce19203
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_r_2exp.c
@@ -0,0 +1,156 @@
+/* mpz_fdiv_r_2exp -- Divide a integer by 2**CNT and produce a remainder.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1998, 1999, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_fdiv_r_2exp (mpz_ptr res, mpz_srcptr in, unsigned long int cnt)
+#else
+mpz_fdiv_r_2exp (res, in, cnt)
+ mpz_ptr res;
+ mpz_srcptr in;
+ unsigned long int cnt;
+#endif
+{
+ mp_size_t in_size = ABS (in->_mp_size);
+ mp_size_t res_size;
+ mp_size_t limb_cnt = cnt / BITS_PER_MP_LIMB;
+ mp_srcptr in_ptr = in->_mp_d;
+
+ if (in_size > limb_cnt)
+ {
+ /* The input operand is (probably) greater than 2**CNT. */
+ mp_limb_t x;
+
+ x = in_ptr[limb_cnt] & (((mp_limb_t) 1 << cnt % BITS_PER_MP_LIMB) - 1);
+ if (x != 0)
+ {
+ res_size = limb_cnt + 1;
+ if (res->_mp_alloc < res_size)
+ _mpz_realloc (res, res_size);
+
+ res->_mp_d[limb_cnt] = x;
+ }
+ else
+ {
+ res_size = limb_cnt;
+ MPN_NORMALIZE (in_ptr, res_size);
+
+ if (res->_mp_alloc < res_size)
+ _mpz_realloc (res, res_size);
+
+ limb_cnt = res_size;
+ }
+ }
+ else
+ {
+ /* The input operand is smaller than 2**CNT. We perform a no-op,
+ apart from that we might need to copy IN to RES, and may need
+ to round the result. */
+ res_size = in_size;
+ if (res->_mp_alloc < res_size)
+ _mpz_realloc (res, res_size);
+
+ limb_cnt = res_size;
+ }
+
+ if (res != in)
+ MPN_COPY (res->_mp_d, in->_mp_d, limb_cnt);
+ in_size = in->_mp_size;
+ res->_mp_size = res_size;
+ if (in_size < 0 && res_size != 0)
+ {
+ /* Result should be 2^CNT - RES */
+ mpz_t tmp;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+ MPZ_TMP_INIT (tmp, cnt/BITS_PER_MP_LIMB + 2);
+ mpz_set_ui (tmp, 1L);
+ mpz_mul_2exp (tmp, tmp, cnt);
+ mpz_sub (res, tmp, res);
+ TMP_FREE (marker);
+ }
+}
+
+/* This is an alternative ending of the above function using just low-level
+ functions. Tested, but perhaps excessive? */
+#if 0
+ if (in->_mp_size < 0 && res_size != 0)
+ {
+ /* Result should be 2^CNT - RES */
+
+ mp_ptr rp;
+
+ limb_cnt = cnt / BITS_PER_MP_LIMB;
+
+ if (res->_mp_alloc <= limb_cnt)
+ _mpz_realloc (res, limb_cnt + 1);
+ rp = PTR(res);
+ if (res_size > limb_cnt)
+ {
+ mpn_nz_neg (rp, rp, res_size);
+ rp[limb_cnt] &= ~(~(mp_limb_t) 0 << cnt % BITS_PER_MP_LIMB);
+ MPN_NORMALIZE_NOT_ZERO (rp, res_size);
+ }
+ else
+ {
+ mp_size_t i;
+ mpn_nz_neg (rp, rp, res_size);
+ for (i = res_size; i < limb_cnt; i++)
+ rp[i] = ~ (mp_limb_t) 0;
+ res_size = limb_cnt;
+ if (cnt % BITS_PER_MP_LIMB != 0)
+ {
+ rp[res_size] = ((mp_limb_t) 1 << (cnt % BITS_PER_MP_LIMB)) - 1;
+ res_size++;
+ }
+ else
+ MPN_NORMALIZE_NOT_ZERO (rp, res_size);
+ }
+ }
+ SIZ(res) = res_size;
+}
+
+static void
+mpn_nz_neg (rp, sp, n)
+ mp_ptr rp, sp;
+ mp_size_t n;
+{
+ mp_size_t i;
+ mp_limb_t x;
+
+ x = sp[0];
+ rp[0] = -x;
+ for (i = 1; x == 0; i++)
+ {
+ x = sp[i];
+ rp[i] = -x;
+ }
+
+ for (; i < n; i++)
+ {
+ rp[i] = ~sp[i];
+ }
+}
+#endif
diff --git a/rts/gmp/mpz/fdiv_r_ui.c b/rts/gmp/mpz/fdiv_r_ui.c
new file mode 100644
index 0000000000..dd5c743d27
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_r_ui.c
@@ -0,0 +1,55 @@
+/* mpz_fdiv_r_ui -- Division rounding the quotient towards -infinity.
+ The remainder gets the same sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_fdiv_r_ui (mpz_ptr rem, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_fdiv_r_ui (rem, dividend, divisor)
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ remainder_limb = mpn_mod_1 (dividend->_mp_d, size, (mp_limb_t) divisor);
+
+ if (remainder_limb != 0 && dividend_size < 0)
+ remainder_limb = divisor - remainder_limb;
+
+ rem->_mp_d[0] = remainder_limb;
+ rem->_mp_size = remainder_limb != 0;
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/fdiv_ui.c b/rts/gmp/mpz/fdiv_ui.c
new file mode 100644
index 0000000000..f937b5f6d0
--- /dev/null
+++ b/rts/gmp/mpz/fdiv_ui.c
@@ -0,0 +1,48 @@
+/* mpz_fdiv_ui -- Division rounding the quotient towards -infinity.
+ The remainder gets the same sign as the denominator.
+
+Copyright (C) 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_fdiv_ui (mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_fdiv_ui (dividend, divisor)
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_limb_t remainder_limb;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ remainder_limb = mpn_mod_1 (dividend->_mp_d, size, (mp_limb_t) divisor);
+
+ if (remainder_limb != 0 && dividend_size < 0)
+ remainder_limb = divisor - remainder_limb;
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/fib_ui.c b/rts/gmp/mpz/fib_ui.c
new file mode 100644
index 0000000000..4bebb80d94
--- /dev/null
+++ b/rts/gmp/mpz/fib_ui.c
@@ -0,0 +1,165 @@
+/* mpz_fib_ui(result, n) -- Set RESULT to the Nth Fibonacci number.
+
+Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* This is fast, but could be made somewhat faster and neater.
+ The timing is somewhat fluctuating for even/odd sizes because
+ of the extra hair used to save variables and operations. Here
+ are a few things one might want to address:
+ 1. Avoid using 4 intermediate variables in mpz_fib_bigcase.
+ 2. Call mpn functions directly. Straightforward for these functions.
+ 3. Merge the three functions into one.
+
+Said by Kevin:
+ Consider using the Lucas numbers L[n] as an auxiliary sequence, making
+ it possible to do the "doubling" operation in mpz_fib_bigcase with two
+ squares rather than two multiplies. The formulas are a little more
+ complicated, something like the following (untested).
+
+ F[2n] = ((F[n]+L[n])^2 - 6*F[n]^2 - 4*(-1)^n) / 2
+ L[2n] = 5*F[n]^2 + 2*(-1)^n
+
+ F[2n+1] = (F[2n] + L[2n]) / 2
+ L[2n+1] = (5*F[2n] + L[2n]) / 2
+
+ The Lucas number that comes for free here could even be returned.
+
+ Maybe there's formulas with two squares using just F[n], but I don't
+ know of any.
+*/
+
+/* Determine the needed storage for Fib(n). */
+#define FIB_SIZE(n) (((mp_size_t) ((n)*0.695)) / BITS_PER_MP_LIMB + 2)
+
+static void mpz_fib_bigcase _PROTO ((mpz_t, mpz_t, unsigned long int));
+static void mpz_fib_basecase _PROTO ((mpz_t, mpz_t, unsigned long int));
+
+
+#ifndef FIB_THRESHOLD
+#define FIB_THRESHOLD 60
+#endif
+
+void
+#if __STDC__
+mpz_fib_ui (mpz_t r, unsigned long int n)
+#else
+mpz_fib_ui (r, n)
+ mpz_t r;
+ unsigned long int n;
+#endif
+{
+ if (n == 0)
+ mpz_set_ui (r, 0);
+ else
+ {
+ mpz_t t1;
+ mpz_init (t1);
+ if (n < FIB_THRESHOLD)
+ mpz_fib_basecase (t1, r, n);
+ else
+ mpz_fib_bigcase (t1, r, n);
+ mpz_clear (t1);
+ }
+}
+
+static void
+#if __STDC__
+mpz_fib_basecase (mpz_t t1, mpz_t t2, unsigned long int n)
+#else
+mpz_fib_basecase (t1, t2, n)
+ mpz_t t1;
+ mpz_t t2;
+ unsigned long int n;
+#endif
+{
+ unsigned long int m, i;
+
+ mpz_set_ui (t1, 0);
+ mpz_set_ui (t2, 1);
+ m = n/2;
+ for (i = 0; i < m; i++)
+ {
+ mpz_add (t1, t1, t2);
+ mpz_add (t2, t1, t2);
+ }
+ if ((n & 1) == 0)
+ {
+ mpz_sub (t1, t2, t1);
+ mpz_sub (t2, t2, t1); /* trick: recover t1 value just overwritten */
+ }
+}
+
+static void
+#if __STDC__
+mpz_fib_bigcase (mpz_t t1, mpz_t t2, unsigned long int n)
+#else
+mpz_fib_bigcase (t1, t2, n)
+ mpz_t t1;
+ mpz_t t2;
+ unsigned long int n;
+#endif
+{
+ unsigned long int n2;
+ int ni, i;
+ mpz_t x1, x2, u1, u2;
+
+ ni = 0;
+ for (n2 = n; n2 >= FIB_THRESHOLD; n2 /= 2)
+ ni++;
+
+ mpz_fib_basecase (t1, t2, n2);
+
+ mpz_init (x1);
+ mpz_init (x2);
+ mpz_init (u1);
+ mpz_init (u2);
+
+ for (i = ni - 1; i >= 0; i--)
+ {
+ mpz_mul_2exp (x1, t1, 1);
+ mpz_mul_2exp (x2, t2, 1);
+
+ mpz_add (x1, x1, t2);
+ mpz_sub (x2, x2, t1);
+
+ mpz_mul (u1, t2, x1);
+ mpz_mul (u2, t1, x2);
+
+ if (((n >> i) & 1) == 0)
+ {
+ mpz_sub (t1, u1, u2);
+ mpz_set (t2, u1);
+ }
+ else
+ {
+ mpz_set (t1, u1);
+ mpz_mul_2exp (t2, u1, 1);
+ mpz_sub (t2, t2, u2);
+ }
+ }
+
+ mpz_clear (x1);
+ mpz_clear (x2);
+ mpz_clear (u1);
+ mpz_clear (u2);
+}
diff --git a/rts/gmp/mpz/fits_sint_p.c b/rts/gmp/mpz/fits_sint_p.c
new file mode 100644
index 0000000000..82e32a24d5
--- /dev/null
+++ b/rts/gmp/mpz/fits_sint_p.c
@@ -0,0 +1,50 @@
+/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_fits_sint_p (mpz_srcptr src)
+#else
+mpz_fits_sint_p (src)
+ mpz_srcptr src;
+#endif
+{
+ mp_size_t size;
+ mp_limb_t mpl;
+
+ mpl = PTR(src)[0];
+ size = SIZ(src);
+ if (size > 0)
+ {
+ if (size > 1)
+ return 0;
+ return mpl < ~((~(unsigned int) 0) >> 1);
+ }
+ else
+ {
+ if (size < -1)
+ return 0;
+ return mpl <= ~((~(unsigned int) 0) >> 1);
+ }
+}
diff --git a/rts/gmp/mpz/fits_slong_p.c b/rts/gmp/mpz/fits_slong_p.c
new file mode 100644
index 0000000000..e0669b5aaa
--- /dev/null
+++ b/rts/gmp/mpz/fits_slong_p.c
@@ -0,0 +1,50 @@
+/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_fits_slong_p (mpz_srcptr src)
+#else
+mpz_fits_slong_p (src)
+ mpz_srcptr src;
+#endif
+{
+ mp_size_t size;
+ mp_limb_t mpl;
+
+ mpl = PTR(src)[0];
+ size = SIZ(src);
+ if (size > 0)
+ {
+ if (size > 1)
+ return 0;
+ return mpl < ~((~(unsigned long int) 0) >> 1);
+ }
+ else
+ {
+ if (size < -1)
+ return 0;
+ return mpl <= ~((~(unsigned long int) 0) >> 1);
+ }
+}
diff --git a/rts/gmp/mpz/fits_sshort_p.c b/rts/gmp/mpz/fits_sshort_p.c
new file mode 100644
index 0000000000..5b8e31afae
--- /dev/null
+++ b/rts/gmp/mpz/fits_sshort_p.c
@@ -0,0 +1,50 @@
+/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_fits_sshort_p (mpz_srcptr src)
+#else
+mpz_fits_sshort_p (src)
+ mpz_srcptr src;
+#endif
+{
+ mp_size_t size;
+ mp_limb_t mpl;
+
+ mpl = PTR(src)[0];
+ size = SIZ(src);
+ if (size > 0)
+ {
+ if (size > 1)
+ return 0;
+ return mpl <= (((unsigned short int) ~(unsigned int) 0) >> 1);
+ }
+ else
+ {
+ if (size < -1)
+ return 0;
+ return mpl <= (((unsigned short int) ~(unsigned int) 0) >> 1) + 1;
+ }
+}
diff --git a/rts/gmp/mpz/fits_uint_p.c b/rts/gmp/mpz/fits_uint_p.c
new file mode 100644
index 0000000000..72f62fa723
--- /dev/null
+++ b/rts/gmp/mpz/fits_uint_p.c
@@ -0,0 +1,41 @@
+/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_fits_uint_p (mpz_srcptr src)
+#else
+mpz_fits_uint_p (src)
+ mpz_srcptr src;
+#endif
+{
+ mp_size_t size;
+ mp_limb_t mpl;
+
+ mpl = PTR(src)[0];
+ size = SIZ(src);
+ if (size < 0 || size > 1)
+ return 0;
+ return mpl <= (~(unsigned int) 0);
+}
diff --git a/rts/gmp/mpz/fits_ulong_p.c b/rts/gmp/mpz/fits_ulong_p.c
new file mode 100644
index 0000000000..92eb42e86e
--- /dev/null
+++ b/rts/gmp/mpz/fits_ulong_p.c
@@ -0,0 +1,41 @@
+/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_fits_ulong_p (mpz_srcptr src)
+#else
+mpz_fits_ulong_p (src)
+ mpz_srcptr src;
+#endif
+{
+ mp_size_t size;
+ mp_limb_t mpl;
+
+ mpl = PTR(src)[0];
+ size = SIZ(src);
+ if (size < 0 || size > 1)
+ return 0;
+ return mpl <= (~(unsigned long int) 0);
+}
diff --git a/rts/gmp/mpz/fits_ushort_p.c b/rts/gmp/mpz/fits_ushort_p.c
new file mode 100644
index 0000000000..bde0edae6e
--- /dev/null
+++ b/rts/gmp/mpz/fits_ushort_p.c
@@ -0,0 +1,41 @@
+/* int mpz_fits_X_p (mpz_t src) -- Return whether src fits the C type X.
+
+Copyright (C) 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_fits_ushort_p (mpz_srcptr src)
+#else
+mpz_fits_ushort_p (src)
+ mpz_srcptr src;
+#endif
+{
+ mp_size_t size;
+ mp_limb_t mpl;
+
+ mpl = PTR(src)[0];
+ size = SIZ(src);
+ if (size < 0 || size > 1)
+ return 0;
+ return mpl <= ((unsigned short int) ~(unsigned int) 0);
+}
diff --git a/rts/gmp/mpz/gcd.c b/rts/gmp/mpz/gcd.c
new file mode 100644
index 0000000000..0d950dd609
--- /dev/null
+++ b/rts/gmp/mpz/gcd.c
@@ -0,0 +1,180 @@
+/* mpz/gcd.c: Calculate the greatest common divisor of two integers.
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+
+
+#ifndef BERKELEY_MP
+void
+#if __STDC__
+mpz_gcd (mpz_ptr g, mpz_srcptr u, mpz_srcptr v)
+#else
+mpz_gcd (g, u, v)
+ mpz_ptr g;
+ mpz_srcptr u;
+ mpz_srcptr v;
+#endif
+#else /* BERKELEY_MP */
+void
+#if __STDC__
+gcd (mpz_srcptr u, mpz_srcptr v, mpz_ptr g)
+#else
+gcd (u, v, g)
+ mpz_ptr g;
+ mpz_srcptr u;
+ mpz_srcptr v;
+#endif
+#endif /* BERKELEY_MP */
+
+{
+ unsigned long int g_zero_bits, u_zero_bits, v_zero_bits;
+ mp_size_t g_zero_limbs, u_zero_limbs, v_zero_limbs;
+ mp_ptr tp;
+ mp_ptr up = u->_mp_d;
+ mp_size_t usize = ABS (u->_mp_size);
+ mp_ptr vp = v->_mp_d;
+ mp_size_t vsize = ABS (v->_mp_size);
+ mp_size_t gsize;
+ TMP_DECL (marker);
+
+ /* GCD(0, V) == V. */
+ if (usize == 0)
+ {
+ g->_mp_size = vsize;
+ if (g == v)
+ return;
+ if (g->_mp_alloc < vsize)
+ _mpz_realloc (g, vsize);
+ MPN_COPY (g->_mp_d, vp, vsize);
+ return;
+ }
+
+ /* GCD(U, 0) == U. */
+ if (vsize == 0)
+ {
+ g->_mp_size = usize;
+ if (g == u)
+ return;
+ if (g->_mp_alloc < usize)
+ _mpz_realloc (g, usize);
+ MPN_COPY (g->_mp_d, up, usize);
+ return;
+ }
+
+ if (usize == 1)
+ {
+ g->_mp_size = 1;
+ g->_mp_d[0] = mpn_gcd_1 (vp, vsize, up[0]);
+ return;
+ }
+
+ if (vsize == 1)
+ {
+ g->_mp_size = 1;
+ g->_mp_d[0] = mpn_gcd_1 (up, usize, vp[0]);
+ return;
+ }
+
+ TMP_MARK (marker);
+
+ /* Eliminate low zero bits from U and V and move to temporary storage. */
+ while (*up == 0)
+ up++;
+ u_zero_limbs = up - u->_mp_d;
+ usize -= u_zero_limbs;
+ count_trailing_zeros (u_zero_bits, *up);
+ tp = up;
+ up = (mp_ptr) TMP_ALLOC (usize * BYTES_PER_MP_LIMB);
+ if (u_zero_bits != 0)
+ {
+ mpn_rshift (up, tp, usize, u_zero_bits);
+ usize -= up[usize - 1] == 0;
+ }
+ else
+ MPN_COPY (up, tp, usize);
+
+ while (*vp == 0)
+ vp++;
+ v_zero_limbs = vp - v->_mp_d;
+ vsize -= v_zero_limbs;
+ count_trailing_zeros (v_zero_bits, *vp);
+ tp = vp;
+ vp = (mp_ptr) TMP_ALLOC (vsize * BYTES_PER_MP_LIMB);
+ if (v_zero_bits != 0)
+ {
+ mpn_rshift (vp, tp, vsize, v_zero_bits);
+ vsize -= vp[vsize - 1] == 0;
+ }
+ else
+ MPN_COPY (vp, tp, vsize);
+
+ if (u_zero_limbs > v_zero_limbs)
+ {
+ g_zero_limbs = v_zero_limbs;
+ g_zero_bits = v_zero_bits;
+ }
+ else if (u_zero_limbs < v_zero_limbs)
+ {
+ g_zero_limbs = u_zero_limbs;
+ g_zero_bits = u_zero_bits;
+ }
+ else /* Equal. */
+ {
+ g_zero_limbs = u_zero_limbs;
+ g_zero_bits = MIN (u_zero_bits, v_zero_bits);
+ }
+
+ /* Call mpn_gcd. The 2nd argument must not have more bits than the 1st. */
+ vsize = (usize < vsize || (usize == vsize && up[usize-1] < vp[vsize-1]))
+ ? mpn_gcd (vp, vp, vsize, up, usize)
+ : mpn_gcd (vp, up, usize, vp, vsize);
+
+ /* Here G <-- V << (g_zero_limbs*BITS_PER_MP_LIMB + g_zero_bits). */
+ gsize = vsize + g_zero_limbs;
+ if (g_zero_bits != 0)
+ {
+ mp_limb_t cy_limb;
+ gsize += (vp[vsize - 1] >> (BITS_PER_MP_LIMB - g_zero_bits)) != 0;
+ if (g->_mp_alloc < gsize)
+ _mpz_realloc (g, gsize);
+ MPN_ZERO (g->_mp_d, g_zero_limbs);
+
+ tp = g->_mp_d + g_zero_limbs;
+ cy_limb = mpn_lshift (tp, vp, vsize, g_zero_bits);
+ if (cy_limb != 0)
+ tp[vsize] = cy_limb;
+ }
+ else
+ {
+ if (g->_mp_alloc < gsize)
+ _mpz_realloc (g, gsize);
+ MPN_ZERO (g->_mp_d, g_zero_limbs);
+ MPN_COPY (g->_mp_d + g_zero_limbs, vp, vsize);
+ }
+
+ g->_mp_size = gsize;
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/gcd_ui.c b/rts/gmp/mpz/gcd_ui.c
new file mode 100644
index 0000000000..f3bec58829
--- /dev/null
+++ b/rts/gmp/mpz/gcd_ui.c
@@ -0,0 +1,65 @@
+/* mpz_gcd_ui -- Calculate the greatest common divisior of two integers.
+
+Copyright (C) 1994, 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_gcd_ui (mpz_ptr w, mpz_srcptr u, unsigned long int v)
+#else
+mpz_gcd_ui (w, u, v)
+ mpz_ptr w;
+ mpz_srcptr u;
+ unsigned long int v;
+#endif
+{
+ mp_size_t size;
+ mp_limb_t res;
+
+ size = ABS (u->_mp_size);
+
+ if (size == 0)
+ res = v;
+ else if (v == 0)
+ {
+ if (w != NULL && u != w)
+ {
+ if (w->_mp_alloc < size)
+ _mpz_realloc (w, size);
+
+ MPN_COPY (w->_mp_d, u->_mp_d, size);
+ }
+ w->_mp_size = size;
+ /* We can't return any useful result for gcd(big,0). */
+ return size > 1 ? 0 : w->_mp_d[0];
+ }
+ else
+ res = mpn_gcd_1 (u->_mp_d, size, (mp_limb_t) v);
+
+ if (w != NULL)
+ {
+ w->_mp_d[0] = res;
+ w->_mp_size = 1;
+ }
+ return res;
+}
diff --git a/rts/gmp/mpz/gcdext.c b/rts/gmp/mpz/gcdext.c
new file mode 100644
index 0000000000..3ba04c84ff
--- /dev/null
+++ b/rts/gmp/mpz/gcdext.c
@@ -0,0 +1,137 @@
+/* mpz_gcdext(g, s, t, a, b) -- Set G to gcd(a, b), and S and T such that
+ g = as + bt.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_gcdext (mpz_ptr g, mpz_ptr s, mpz_ptr t, mpz_srcptr a, mpz_srcptr b)
+#else
+mpz_gcdext (g, s, t, a, b)
+ mpz_ptr g;
+ mpz_ptr s;
+ mpz_ptr t;
+ mpz_srcptr a;
+ mpz_srcptr b;
+#endif
+{
+ mp_size_t asize, bsize, usize, vsize;
+ mp_srcptr ap, bp;
+ mp_ptr up, vp;
+ mp_size_t gsize, ssize, tmp_ssize;
+ mp_ptr gp, sp, tmp_gp, tmp_sp;
+ mpz_srcptr u, v;
+ mpz_ptr ss, tt;
+ __mpz_struct stmp, gtmp;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* mpn_gcdext requires that U >= V. Therefore, we often have to swap U and
+ V. This in turn leads to a lot of complications. The computed cofactor
+ will be the wrong one, so we have to fix that up at the end. */
+
+ asize = ABS (SIZ (a));
+ bsize = ABS (SIZ (b));
+ ap = PTR (a);
+ bp = PTR (b);
+ if (asize > bsize || (asize == bsize && mpn_cmp (ap, bp, asize) > 0))
+ {
+ usize = asize;
+ vsize = bsize;
+ up = (mp_ptr) TMP_ALLOC ((usize + 1) * BYTES_PER_MP_LIMB);
+ vp = (mp_ptr) TMP_ALLOC ((vsize + 1) * BYTES_PER_MP_LIMB);
+ MPN_COPY (up, ap, usize);
+ MPN_COPY (vp, bp, vsize);
+ u = a;
+ v = b;
+ ss = s;
+ tt = t;
+ }
+ else
+ {
+ usize = bsize;
+ vsize = asize;
+ up = (mp_ptr) TMP_ALLOC ((usize + 1) * BYTES_PER_MP_LIMB);
+ vp = (mp_ptr) TMP_ALLOC ((vsize + 1) * BYTES_PER_MP_LIMB);
+ MPN_COPY (up, bp, usize);
+ MPN_COPY (vp, ap, vsize);
+ u = b;
+ v = a;
+ ss = t;
+ tt = s;
+ }
+
+ tmp_gp = (mp_ptr) TMP_ALLOC ((usize + 1) * BYTES_PER_MP_LIMB);
+ tmp_sp = (mp_ptr) TMP_ALLOC ((usize + 1) * BYTES_PER_MP_LIMB);
+
+ if (vsize == 0)
+ {
+ tmp_sp[0] = 1;
+ tmp_ssize = 1;
+ MPN_COPY (tmp_gp, up, usize);
+ gsize = usize;
+ }
+ else
+ gsize = mpn_gcdext (tmp_gp, tmp_sp, &tmp_ssize, up, usize, vp, vsize);
+ ssize = ABS (tmp_ssize);
+
+ PTR (&gtmp) = tmp_gp;
+ SIZ (&gtmp) = gsize;
+
+ PTR (&stmp) = tmp_sp;
+ SIZ (&stmp) = (tmp_ssize ^ SIZ (u)) >= 0 ? ssize : -ssize;
+
+ if (tt != NULL)
+ {
+ if (SIZ (v) == 0)
+ SIZ (tt) = 0;
+ else
+ {
+ mpz_t x;
+ MPZ_TMP_INIT (x, ssize + usize + 1);
+ mpz_mul (x, &stmp, u);
+ mpz_sub (x, &gtmp, x);
+ mpz_tdiv_q (tt, x, v);
+ }
+ }
+
+ if (ss != NULL)
+ {
+ if (ALLOC (ss) < ssize)
+ _mpz_realloc (ss, ssize);
+ sp = PTR (ss);
+ MPN_COPY (sp, tmp_sp, ssize);
+ SIZ (ss) = SIZ (&stmp);
+ }
+
+ if (ALLOC (g) < gsize)
+ _mpz_realloc (g, gsize);
+ gp = PTR (g);
+ MPN_COPY (gp, tmp_gp, gsize);
+ SIZ (g) = gsize;
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/get_d.c b/rts/gmp/mpz/get_d.c
new file mode 100644
index 0000000000..6a7c5856bb
--- /dev/null
+++ b/rts/gmp/mpz/get_d.c
@@ -0,0 +1,128 @@
+/* double mpz_get_d (mpz_t src) -- Return the double approximation to SRC.
+
+Copyright (C) 1996, 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+static int
+#if __STDC__
+mpn_zero_p (mp_ptr p, mp_size_t n)
+#else
+mpn_zero_p (p, n)
+ mp_ptr p;
+ mp_size_t n;
+#endif
+{
+ mp_size_t i;
+
+ for (i = 0; i < n; i++)
+ {
+ if (p[i] != 0)
+ return 0;
+ }
+
+ return 1;
+}
+
+
+double
+#if __STDC__
+mpz_get_d (mpz_srcptr src)
+#else
+mpz_get_d (src)
+ mpz_srcptr src;
+#endif
+{
+ double res;
+ mp_size_t size;
+ int negative;
+ mp_ptr qp;
+ mp_limb_t hz, lz;
+ int cnt;
+
+ size = SIZ(src);
+ if (size == 0)
+ return 0.0;
+
+ negative = size < 0;
+ size = ABS (size);
+ qp = PTR(src);
+
+ if (size == 1)
+ {
+ res = qp[size - 1];
+ }
+ else if (size == 2)
+ {
+ res = MP_BASE_AS_DOUBLE * qp[size - 1] + qp[size - 2];
+ }
+ else
+ {
+ count_leading_zeros (cnt, qp[size - 1]);
+
+#if BITS_PER_MP_LIMB == 32
+ if (cnt == 0)
+ {
+ hz = qp[size - 1];
+ lz = qp[size - 2];
+ }
+ else
+ {
+ hz = (qp[size - 1] << cnt) | (qp[size - 2] >> BITS_PER_MP_LIMB - cnt);
+ lz = (qp[size - 2] << cnt) | (qp[size - 3] >> BITS_PER_MP_LIMB - cnt);
+ }
+#if _GMP_IEEE_FLOATS
+ /* Take bits from less significant limbs, but only if they may affect
+ the result. */
+ if ((lz & 0x7ff) == 0x400)
+ {
+ if (cnt != 0)
+ lz += ((qp[size - 3] << cnt) != 0 || ! mpn_zero_p (qp, size - 3));
+ else
+ lz += (! mpn_zero_p (qp, size - 2));
+ }
+#endif
+ res = MP_BASE_AS_DOUBLE * hz + lz;
+ res = __gmp_scale2 (res, (size - 2) * BITS_PER_MP_LIMB - cnt);
+#endif
+#if BITS_PER_MP_LIMB == 64
+ if (cnt == 0)
+ hz = qp[size - 1];
+ else
+ hz = (qp[size - 1] << cnt) | (qp[size - 2] >> BITS_PER_MP_LIMB - cnt);
+#if _GMP_IEEE_FLOATS
+ if ((hz & 0x7ff) == 0x400)
+ {
+ if (cnt != 0)
+ hz += ((qp[size - 2] << cnt) != 0 || ! mpn_zero_p (qp, size - 2));
+ else
+ hz += (! mpn_zero_p (qp, size - 1));
+ }
+#endif
+ res = hz;
+ res = __gmp_scale2 (res, (size - 1) * BITS_PER_MP_LIMB - cnt);
+#endif
+ }
+
+ return negative ? -res : res;
+}
diff --git a/rts/gmp/mpz/get_si.c b/rts/gmp/mpz/get_si.c
new file mode 100644
index 0000000000..8a5d0e4803
--- /dev/null
+++ b/rts/gmp/mpz/get_si.c
@@ -0,0 +1,43 @@
+/* mpz_get_si(integer) -- Return the least significant digit from INTEGER.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+signed long int
+#if __STDC__
+mpz_get_si (mpz_srcptr op)
+#else
+mpz_get_si (op)
+ mpz_srcptr op;
+#endif
+{
+ mp_size_t size = op->_mp_size;
+ mp_limb_t low_limb = op->_mp_d[0];
+
+ if (size > 0)
+ return low_limb % ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1));
+ else if (size < 0)
+ /* This convoluted expression is necessary to properly handle 0x80000000 */
+ return ~((low_limb - 1) % ((mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1)));
+ else
+ return 0;
+}
diff --git a/rts/gmp/mpz/get_str.c b/rts/gmp/mpz/get_str.c
new file mode 100644
index 0000000000..c7278afb52
--- /dev/null
+++ b/rts/gmp/mpz/get_str.c
@@ -0,0 +1,118 @@
+/* mpz_get_str (string, base, mp_src) -- Convert the multiple precision
+ number MP_SRC to a string STRING of base BASE. If STRING is NULL
+ allocate space for the result. In any case, return a pointer to the
+ result. If STRING is not NULL, the caller must ensure enough space is
+ available to store the result.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+char *
+#if __STDC__
+mpz_get_str (char *res_str, int base, mpz_srcptr x)
+#else
+mpz_get_str (res_str, base, x)
+ char *res_str;
+ int base;
+ mpz_srcptr x;
+#endif
+{
+ mp_ptr xp;
+ mp_size_t x_size = x->_mp_size;
+ unsigned char *str;
+ char *return_str;
+ size_t str_size;
+ char *num_to_text;
+ int i;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ if (base >= 0)
+ {
+ if (base == 0)
+ base = 10;
+ num_to_text = "0123456789abcdefghijklmnopqrstuvwxyz";
+ }
+ else
+ {
+ base = -base;
+ num_to_text = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+ }
+
+ /* We allways allocate space for the string. If the caller passed a
+ NULL pointer for RES_STR, we allocate permanent space and return
+ a pointer to that to the caller. */
+ str_size = ((size_t) (ABS (x_size) * BITS_PER_MP_LIMB
+ * __mp_bases[base].chars_per_bit_exactly)) + 3;
+ if (res_str == 0)
+ {
+ /* We didn't get a string from the user. Allocate one (and return
+ a pointer to it). */
+ res_str = (char *) (*_mp_allocate_func) (str_size);
+ /* Make str, the variable used for raw result from mpn_get_str,
+ point to the same string, but just after a possible minus sign. */
+ str = (unsigned char *) res_str + 1;
+ }
+ else
+ {
+ /* Use TMP_ALLOC to get temporary space, since we need a few extra bytes
+ that we can't expect to caller to supply us with. */
+ str = (unsigned char *) TMP_ALLOC (str_size);
+ }
+
+ return_str = res_str;
+
+ if (x_size == 0)
+ {
+ res_str[0] = '0';
+ res_str[1] = 0;
+ TMP_FREE (marker);
+ return res_str;
+ }
+ if (x_size < 0)
+ {
+ *res_str++ = '-';
+ x_size = -x_size;
+ }
+
+ /* Move the number to convert into temporary space, since mpn_get_str
+ clobbers its argument + needs one extra high limb.... */
+ xp = (mp_ptr) TMP_ALLOC ((x_size + 1) * BYTES_PER_MP_LIMB);
+ MPN_COPY (xp, x->_mp_d, x_size);
+
+ str_size = mpn_get_str (str, base, xp, x_size);
+
+ /* mpn_get_str might make some leading zeros. Skip them. */
+ while (*str == 0)
+ {
+ str_size--;
+ str++;
+ }
+
+ /* Translate result to printable chars and move result to RES_STR. */
+ for (i = 0; i < str_size; i++)
+ res_str[i] = num_to_text[str[i]];
+ res_str[str_size] = 0;
+
+ TMP_FREE (marker);
+ return return_str;
+}
diff --git a/rts/gmp/mpz/get_ui.c b/rts/gmp/mpz/get_ui.c
new file mode 100644
index 0000000000..a8ec9e01a4
--- /dev/null
+++ b/rts/gmp/mpz/get_ui.c
@@ -0,0 +1,37 @@
+/* mpz_get_ui(integer) -- Return the least significant digit from INTEGER.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_get_ui (mpz_srcptr integer)
+#else
+mpz_get_ui (integer)
+ mpz_srcptr integer;
+#endif
+{
+ if (integer->_mp_size == 0)
+ return 0;
+ else
+ return integer->_mp_d[0];
+}
diff --git a/rts/gmp/mpz/getlimbn.c b/rts/gmp/mpz/getlimbn.c
new file mode 100644
index 0000000000..b772ed05c4
--- /dev/null
+++ b/rts/gmp/mpz/getlimbn.c
@@ -0,0 +1,38 @@
+/* mpz_getlimbn(integer,n) -- Return the N:th limb from INTEGER.
+
+Copyright (C) 1993, 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+mp_limb_t
+#if __STDC__
+mpz_getlimbn (mpz_srcptr integer, mp_size_t n)
+#else
+mpz_getlimbn (integer, n)
+ mpz_srcptr integer;
+ mp_size_t n;
+#endif
+{
+ if (ABS (integer->_mp_size) <= n || n < 0)
+ return 0;
+ else
+ return integer->_mp_d[n];
+}
diff --git a/rts/gmp/mpz/hamdist.c b/rts/gmp/mpz/hamdist.c
new file mode 100644
index 0000000000..b039a653d2
--- /dev/null
+++ b/rts/gmp/mpz/hamdist.c
@@ -0,0 +1,62 @@
+/* mpz_hamdist(mpz_ptr op1, mpz_ptr op2) -- Compute the hamming distance
+ between OP1 and OP2. If one of the operands is negative, return ~0. (We
+ could make the function well-defined when both operands are negative, but
+ that would probably not be worth the trouble.
+
+Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_hamdist (mpz_srcptr u, mpz_srcptr v)
+#else
+mpz_hamdist (u, v)
+ mpz_srcptr u;
+ mpz_srcptr v;
+#endif
+{
+ mp_srcptr up, vp;
+ mp_size_t usize, vsize, size;
+ unsigned long int count;
+
+ usize = u->_mp_size;
+ vsize = v->_mp_size;
+
+ if ((usize | vsize) < 0)
+ return ~ (unsigned long int) 0;
+
+ up = u->_mp_d;
+ vp = v->_mp_d;
+
+ if (usize > vsize)
+ {
+ count = mpn_popcount (up + vsize, usize - vsize);
+ size = vsize;
+ }
+ else
+ {
+ count = mpn_popcount (vp + usize, vsize - usize);
+ size = usize;
+ }
+
+ return count + mpn_hamdist (up, vp, size);
+}
diff --git a/rts/gmp/mpz/init.c b/rts/gmp/mpz/init.c
new file mode 100644
index 0000000000..2e8e4d2cbd
--- /dev/null
+++ b/rts/gmp/mpz/init.c
@@ -0,0 +1,36 @@
+/* mpz_init() -- Make a new multiple precision number with value 0.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_init (mpz_ptr x)
+#else
+mpz_init (x)
+ mpz_ptr x;
+#endif
+{
+ x->_mp_alloc = 1;
+ x->_mp_d = (mp_ptr) (*_mp_allocate_func) (BYTES_PER_MP_LIMB);
+ x->_mp_size = 0;
+}
diff --git a/rts/gmp/mpz/inp_raw.c b/rts/gmp/mpz/inp_raw.c
new file mode 100644
index 0000000000..15e601229d
--- /dev/null
+++ b/rts/gmp/mpz/inp_raw.c
@@ -0,0 +1,101 @@
+/* mpz_inp_raw -- Input a mpz_t in raw, but endianess, and wordsize
+ independent format (as output by mpz_out_raw).
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h>
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+size_t
+#if __STDC__
+mpz_inp_raw (mpz_ptr x, FILE *stream)
+#else
+mpz_inp_raw (x, stream)
+ mpz_ptr x;
+ FILE *stream;
+#endif
+{
+ int i;
+ mp_size_t s;
+ mp_size_t xsize;
+ mp_ptr xp;
+ unsigned int c;
+ mp_limb_t x_limb;
+ mp_size_t in_bytesize;
+ int neg_flag;
+
+ if (stream == 0)
+ stream = stdin;
+
+ /* Read 4-byte size */
+ in_bytesize = 0;
+ for (i = 4 - 1; i >= 0; i--)
+ {
+ c = fgetc (stream);
+ in_bytesize = (in_bytesize << BITS_PER_CHAR) | c;
+ }
+
+ /* Size is stored as a 32 bit word; sign extend in_bytesize for non-32 bit
+ machines. */
+ if (sizeof (mp_size_t) > 4)
+ in_bytesize |= (-(in_bytesize < 0)) << 31;
+
+ neg_flag = in_bytesize < 0;
+ in_bytesize = ABS (in_bytesize);
+ xsize = (in_bytesize + BYTES_PER_MP_LIMB - 1) / BYTES_PER_MP_LIMB;
+
+ if (xsize == 0)
+ {
+ x->_mp_size = 0;
+ return 4; /* we've read 4 bytes */
+ }
+
+ if (x->_mp_alloc < xsize)
+ _mpz_realloc (x, xsize);
+ xp = x->_mp_d;
+
+ x_limb = 0;
+ for (i = (in_bytesize - 1) % BYTES_PER_MP_LIMB; i >= 0; i--)
+ {
+ c = fgetc (stream);
+ x_limb = (x_limb << BITS_PER_CHAR) | c;
+ }
+ xp[xsize - 1] = x_limb;
+
+ for (s = xsize - 2; s >= 0; s--)
+ {
+ x_limb = 0;
+ for (i = BYTES_PER_MP_LIMB - 1; i >= 0; i--)
+ {
+ c = fgetc (stream);
+ x_limb = (x_limb << BITS_PER_CHAR) | c;
+ }
+ xp[s] = x_limb;
+ }
+
+ if (c == EOF)
+ return 0; /* error */
+
+ MPN_NORMALIZE (xp, xsize);
+ x->_mp_size = neg_flag ? -xsize : xsize;
+ return in_bytesize + 4;
+}
diff --git a/rts/gmp/mpz/inp_str.c b/rts/gmp/mpz/inp_str.c
new file mode 100644
index 0000000000..7aa5e1fc30
--- /dev/null
+++ b/rts/gmp/mpz/inp_str.c
@@ -0,0 +1,167 @@
+/* mpz_inp_str(dest_integer, stream, base) -- Input a number in base
+ BASE from stdio stream STREAM and store the result in DEST_INTEGER.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1998, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h>
+#include <ctype.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+static int
+#if __STDC__
+digit_value_in_base (int c, int base)
+#else
+digit_value_in_base (c, base)
+ int c;
+ int base;
+#endif
+{
+ int digit;
+
+ if (isdigit (c))
+ digit = c - '0';
+ else if (islower (c))
+ digit = c - 'a' + 10;
+ else if (isupper (c))
+ digit = c - 'A' + 10;
+ else
+ return -1;
+
+ if (digit < base)
+ return digit;
+ return -1;
+}
+
+size_t
+#if __STDC__
+mpz_inp_str (mpz_ptr x, FILE *stream, int base)
+#else
+mpz_inp_str (x, stream, base)
+ mpz_ptr x;
+ FILE *stream;
+ int base;
+#endif
+{
+ char *str;
+ size_t alloc_size, str_size;
+ int c;
+ int negative;
+ mp_size_t xsize;
+ size_t nread;
+
+ if (stream == 0)
+ stream = stdin;
+
+ nread = 0;
+
+ /* Skip whitespace. */
+ do
+ {
+ c = getc (stream);
+ nread++;
+ }
+ while (isspace (c));
+
+ negative = 0;
+ if (c == '-')
+ {
+ negative = 1;
+ c = getc (stream);
+ nread++;
+ }
+
+ if (digit_value_in_base (c, base == 0 ? 10 : base) < 0)
+ return 0; /* error if no digits */
+
+ /* If BASE is 0, try to find out the base by looking at the initial
+ characters. */
+ if (base == 0)
+ {
+ base = 10;
+ if (c == '0')
+ {
+ base = 8;
+ c = getc (stream);
+ nread++;
+ if (c == 'x' || c == 'X')
+ {
+ base = 16;
+ c = getc (stream);
+ nread++;
+ }
+ else if (c == 'b' || c == 'B')
+ {
+ base = 2;
+ c = getc (stream);
+ nread++;
+ }
+ }
+ }
+
+ /* Skip leading zeros. */
+ while (c == '0')
+ {
+ c = getc (stream);
+ nread++;
+ }
+
+ alloc_size = 100;
+ str = (char *) (*_mp_allocate_func) (alloc_size);
+ str_size = 0;
+
+ for (;;)
+ {
+ int dig;
+ if (str_size >= alloc_size)
+ {
+ size_t old_alloc_size = alloc_size;
+ alloc_size = alloc_size * 3 / 2;
+ str = (char *) (*_mp_reallocate_func) (str, old_alloc_size, alloc_size);
+ }
+ dig = digit_value_in_base (c, base);
+ if (dig < 0)
+ break;
+ str[str_size++] = dig;
+ c = getc (stream);
+ }
+
+ ungetc (c, stream);
+
+ /* Make sure the string is not empty, mpn_set_str would fail. */
+ if (str_size == 0)
+ {
+ x->_mp_size = 0;
+ (*_mp_free_func) (str, alloc_size);
+ return nread;
+ }
+
+ xsize = (((mp_size_t) (str_size / __mp_bases[base].chars_per_bit_exactly))
+ / BITS_PER_MP_LIMB + 2);
+ if (x->_mp_alloc < xsize)
+ _mpz_realloc (x, xsize);
+
+ /* Convert the byte array in base BASE to our bignum format. */
+ xsize = mpn_set_str (x->_mp_d, (unsigned char *) str, str_size, base);
+ x->_mp_size = negative ? -xsize : xsize;
+
+ (*_mp_free_func) (str, alloc_size);
+ return str_size + nread;
+}
diff --git a/rts/gmp/mpz/invert.c b/rts/gmp/mpz/invert.c
new file mode 100644
index 0000000000..749a0969fc
--- /dev/null
+++ b/rts/gmp/mpz/invert.c
@@ -0,0 +1,77 @@
+/* mpz_invert (inv, x, n). Find multiplicative inverse of X in Z(N).
+ If X has an inverse, return non-zero and store inverse in INVERSE,
+ otherwise, return 0 and put garbage in INVERSE.
+
+Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_invert (mpz_ptr inverse, mpz_srcptr x, mpz_srcptr n)
+#else
+mpz_invert (inverse, x, n)
+ mpz_ptr inverse;
+ mpz_srcptr x, n;
+#endif
+{
+ mpz_t gcd, tmp;
+ mp_size_t xsize, nsize, size;
+ TMP_DECL (marker);
+
+ xsize = SIZ (x);
+ nsize = SIZ (n);
+ xsize = ABS (xsize);
+ nsize = ABS (nsize);
+ size = MAX (xsize, nsize) + 1;
+
+ /* No inverse exists if the leftside operand is 0. Likewise, no
+ inverse exists if the mod operand is 1. */
+ if (xsize == 0 || (nsize == 1 && (PTR (n))[0] == 1))
+ return 0;
+
+ TMP_MARK (marker);
+
+ MPZ_TMP_INIT (gcd, size);
+ MPZ_TMP_INIT (tmp, size);
+ mpz_gcdext (gcd, tmp, (mpz_ptr) 0, x, n);
+
+ /* If no inverse existed, return with an indication of that. */
+ if (gcd->_mp_size != 1 || (gcd->_mp_d)[0] != 1)
+ {
+ TMP_FREE (marker);
+ return 0;
+ }
+
+ /* Make sure we return a positive inverse. */
+ if (SIZ (tmp) < 0)
+ {
+ if (SIZ (n) < 0)
+ mpz_sub (inverse, tmp, n);
+ else
+ mpz_add (inverse, tmp, n);
+ }
+ else
+ mpz_set (inverse, tmp);
+
+ TMP_FREE (marker);
+ return 1;
+}
diff --git a/rts/gmp/mpz/ior.c b/rts/gmp/mpz/ior.c
new file mode 100644
index 0000000000..0bb5a806dc
--- /dev/null
+++ b/rts/gmp/mpz/ior.c
@@ -0,0 +1,244 @@
+/* mpz_ior -- Logical inclusive or.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_ior (mpz_ptr res, mpz_srcptr op1, mpz_srcptr op2)
+#else
+mpz_ior (res, op1, op2)
+ mpz_ptr res;
+ mpz_srcptr op1;
+ mpz_srcptr op2;
+#endif
+{
+ mp_srcptr op1_ptr, op2_ptr;
+ mp_size_t op1_size, op2_size;
+ mp_ptr res_ptr;
+ mp_size_t res_size;
+ mp_size_t i;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ op1_size = op1->_mp_size;
+ op2_size = op2->_mp_size;
+
+ op1_ptr = op1->_mp_d;
+ op2_ptr = op2->_mp_d;
+ res_ptr = res->_mp_d;
+
+ if (op1_size >= 0)
+ {
+ if (op2_size >= 0)
+ {
+ if (op1_size >= op2_size)
+ {
+ if (res->_mp_alloc < op1_size)
+ {
+ _mpz_realloc (res, op1_size);
+ op1_ptr = op1->_mp_d;
+ op2_ptr = op2->_mp_d;
+ res_ptr = res->_mp_d;
+ }
+
+ if (res_ptr != op1_ptr)
+ MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size,
+ op1_size - op2_size);
+ for (i = op2_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] | op2_ptr[i];
+ res_size = op1_size;
+ }
+ else
+ {
+ if (res->_mp_alloc < op2_size)
+ {
+ _mpz_realloc (res, op2_size);
+ op1_ptr = op1->_mp_d;
+ op2_ptr = op2->_mp_d;
+ res_ptr = res->_mp_d;
+ }
+
+ if (res_ptr != op2_ptr)
+ MPN_COPY (res_ptr + op1_size, op2_ptr + op1_size,
+ op2_size - op1_size);
+ for (i = op1_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] | op2_ptr[i];
+ res_size = op2_size;
+ }
+
+ res->_mp_size = res_size;
+ return;
+ }
+ else /* op2_size < 0 */
+ {
+ /* Fall through to the code at the end of the function. */
+ }
+ }
+ else
+ {
+ if (op2_size < 0)
+ {
+ mp_ptr opx;
+ mp_limb_t cy;
+
+ /* Both operands are negative, so will be the result.
+ -((-OP1) | (-OP2)) = -(~(OP1 - 1) | ~(OP2 - 1)) =
+ = ~(~(OP1 - 1) | ~(OP2 - 1)) + 1 =
+ = ((OP1 - 1) & (OP2 - 1)) + 1 */
+
+ op1_size = -op1_size;
+ op2_size = -op2_size;
+
+ res_size = MIN (op1_size, op2_size);
+
+ /* Possible optimization: Decrease mpn_sub precision,
+ as we won't use the entire res of both. */
+ opx = (mp_ptr) TMP_ALLOC (res_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op1_ptr, res_size, (mp_limb_t) 1);
+ op1_ptr = opx;
+
+ opx = (mp_ptr) TMP_ALLOC (res_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op2_ptr, res_size, (mp_limb_t) 1);
+ op2_ptr = opx;
+
+ if (res->_mp_alloc < res_size)
+ {
+ _mpz_realloc (res, res_size);
+ res_ptr = res->_mp_d;
+ /* Don't re-read OP1_PTR and OP2_PTR. They point to
+ temporary space--never to the space RES->_mp_d used
+ to point to before reallocation. */
+ }
+
+ /* First loop finds the size of the result. */
+ for (i = res_size - 1; i >= 0; i--)
+ if ((op1_ptr[i] & op2_ptr[i]) != 0)
+ break;
+ res_size = i + 1;
+
+ if (res_size != 0)
+ {
+ /* Second loop computes the real result. */
+ for (i = res_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] & op2_ptr[i];
+
+ cy = mpn_add_1 (res_ptr, res_ptr, res_size, (mp_limb_t) 1);
+ if (cy)
+ {
+ res_ptr[res_size] = cy;
+ res_size++;
+ }
+ }
+ else
+ {
+ res_ptr[0] = 1;
+ res_size = 1;
+ }
+
+ res->_mp_size = -res_size;
+ TMP_FREE (marker);
+ return;
+ }
+ else
+ {
+ /* We should compute -OP1 | OP2. Swap OP1 and OP2 and fall
+ through to the code that handles OP1 | -OP2. */
+ MPZ_SRCPTR_SWAP (op1, op2);
+ MPN_SRCPTR_SWAP (op1_ptr,op1_size, op2_ptr,op2_size);
+ }
+ }
+
+ {
+ mp_ptr opx;
+ mp_limb_t cy;
+ mp_size_t res_alloc;
+ mp_size_t count;
+
+ /* Operand 2 negative, so will be the result.
+ -(OP1 | (-OP2)) = -(OP1 | ~(OP2 - 1)) =
+ = ~(OP1 | ~(OP2 - 1)) + 1 =
+ = (~OP1 & (OP2 - 1)) + 1 */
+
+ op2_size = -op2_size;
+
+ res_alloc = op2_size;
+
+ opx = (mp_ptr) TMP_ALLOC (op2_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op2_ptr, op2_size, (mp_limb_t) 1);
+ op2_ptr = opx;
+ op2_size -= op2_ptr[op2_size - 1] == 0;
+
+ if (res->_mp_alloc < res_alloc)
+ {
+ _mpz_realloc (res, res_alloc);
+ op1_ptr = op1->_mp_d;
+ res_ptr = res->_mp_d;
+ /* Don't re-read OP2_PTR. It points to temporary space--never
+ to the space RES->_mp_d used to point to before reallocation. */
+ }
+
+ if (op1_size >= op2_size)
+ {
+ /* We can just ignore the part of OP1 that stretches above OP2,
+ because the result limbs are zero there. */
+
+ /* First loop finds the size of the result. */
+ for (i = op2_size - 1; i >= 0; i--)
+ if ((~op1_ptr[i] & op2_ptr[i]) != 0)
+ break;
+ res_size = i + 1;
+ count = res_size;
+ }
+ else
+ {
+ res_size = op2_size;
+
+ /* Copy the part of OP2 that stretches above OP1, to RES. */
+ MPN_COPY (res_ptr + op1_size, op2_ptr + op1_size, op2_size - op1_size);
+ count = op1_size;
+ }
+
+ if (res_size != 0)
+ {
+ /* Second loop computes the real result. */
+ for (i = count - 1; i >= 0; i--)
+ res_ptr[i] = ~op1_ptr[i] & op2_ptr[i];
+
+ cy = mpn_add_1 (res_ptr, res_ptr, res_size, (mp_limb_t) 1);
+ if (cy)
+ {
+ res_ptr[res_size] = cy;
+ res_size++;
+ }
+ }
+ else
+ {
+ res_ptr[0] = 1;
+ res_size = 1;
+ }
+
+ res->_mp_size = -res_size;
+ }
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/iset.c b/rts/gmp/mpz/iset.c
new file mode 100644
index 0000000000..114bc2d542
--- /dev/null
+++ b/rts/gmp/mpz/iset.c
@@ -0,0 +1,49 @@
+/* mpz_init_set (src_integer) -- Make a new multiple precision number with
+ a value copied from SRC_INTEGER.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_init_set (mpz_ptr w, mpz_srcptr u)
+#else
+mpz_init_set (w, u)
+ mpz_ptr w;
+ mpz_srcptr u;
+#endif
+{
+ mp_ptr wp, up;
+ mp_size_t usize, size;
+
+ usize = u->_mp_size;
+ size = ABS (usize);
+
+ w->_mp_alloc = MAX (size, 1);
+ w->_mp_d = (mp_ptr) (*_mp_allocate_func) (w->_mp_alloc * BYTES_PER_MP_LIMB);
+
+ wp = w->_mp_d;
+ up = u->_mp_d;
+
+ MPN_COPY (wp, up, size);
+ w->_mp_size = usize;
+}
diff --git a/rts/gmp/mpz/iset_d.c b/rts/gmp/mpz/iset_d.c
new file mode 100644
index 0000000000..502a8933e2
--- /dev/null
+++ b/rts/gmp/mpz/iset_d.c
@@ -0,0 +1,39 @@
+/* mpz_init_set_d(integer, val) -- Initialize and assign INTEGER with a double
+ value VAL.
+
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_init_set_d (mpz_ptr dest, double val)
+#else
+mpz_init_set_d (dest, val)
+ mpz_ptr dest;
+ double val;
+#endif
+{
+ dest->_mp_alloc = 1;
+ dest->_mp_d = (mp_ptr) (*_mp_allocate_func) (BYTES_PER_MP_LIMB);
+ dest->_mp_size = 0;
+ mpz_set_d (dest, val);
+}
diff --git a/rts/gmp/mpz/iset_si.c b/rts/gmp/mpz/iset_si.c
new file mode 100644
index 0000000000..842db140ef
--- /dev/null
+++ b/rts/gmp/mpz/iset_si.c
@@ -0,0 +1,49 @@
+/* mpz_init_set_si(val) -- Make a new multiple precision number with
+ value val.
+
+Copyright (C) 1991, 1993, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_init_set_si (mpz_ptr x, signed long int val)
+#else
+mpz_init_set_si (x, val)
+ mpz_ptr x;
+ signed long int val;
+#endif
+{
+ x->_mp_alloc = 1;
+ x->_mp_d = (mp_ptr) (*_mp_allocate_func) (BYTES_PER_MP_LIMB);
+ if (val > 0)
+ {
+ x->_mp_d[0] = val;
+ x->_mp_size = 1;
+ }
+ else if (val < 0)
+ {
+ x->_mp_d[0] = (unsigned long) -val;
+ x->_mp_size = -1;
+ }
+ else
+ x->_mp_size = 0;
+}
diff --git a/rts/gmp/mpz/iset_str.c b/rts/gmp/mpz/iset_str.c
new file mode 100644
index 0000000000..dfb8c6b230
--- /dev/null
+++ b/rts/gmp/mpz/iset_str.c
@@ -0,0 +1,47 @@
+/* mpz_init_set_str(string, base) -- Convert the \0-terminated string
+ STRING in base BASE to a multiple precision integer. Return a MP_INT
+ structure representing the integer. Allow white space in the
+ string. If BASE == 0 determine the base in the C standard way,
+ i.e. 0xhh...h means base 16, 0oo...o means base 8, otherwise
+ assume base 10.
+
+Copyright (C) 1991, 1993, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_init_set_str (mpz_ptr x, const char *str, int base)
+#else
+mpz_init_set_str (x, str, base)
+ mpz_ptr x;
+ const char *str;
+ int base;
+#endif
+{
+ x->_mp_alloc = 1;
+ x->_mp_d = (mp_ptr) (*_mp_allocate_func) (BYTES_PER_MP_LIMB);
+
+ /* if str has no digits mpz_set_str leaves x->_mp_size unset */
+ x->_mp_size = 0;
+
+ return mpz_set_str (x, str, base);
+}
diff --git a/rts/gmp/mpz/iset_ui.c b/rts/gmp/mpz/iset_ui.c
new file mode 100644
index 0000000000..759182c556
--- /dev/null
+++ b/rts/gmp/mpz/iset_ui.c
@@ -0,0 +1,39 @@
+/* mpz_init_set_ui(val) -- Make a new multiple precision number with
+ value val.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_init_set_ui (mpz_ptr x, unsigned long int val)
+#else
+mpz_init_set_ui (x, val)
+ mpz_ptr x;
+ unsigned long int val;
+#endif
+{
+ x->_mp_alloc = 1;
+ x->_mp_d = (mp_ptr) (*_mp_allocate_func) (BYTES_PER_MP_LIMB);
+ x->_mp_d[0] = val;
+ x->_mp_size = val != 0;
+}
diff --git a/rts/gmp/mpz/jacobi.c b/rts/gmp/mpz/jacobi.c
new file mode 100644
index 0000000000..9d49e1d0c6
--- /dev/null
+++ b/rts/gmp/mpz/jacobi.c
@@ -0,0 +1,53 @@
+/* mpz_jacobi (op1, op2).
+ Contributed by Bennet Yee (bsy) at Carnegie-Mellon University
+
+Copyright (C) 1991, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+
+/* Precondition: both p and q are positive */
+
+int
+#if __STDC__
+mpz_jacobi (mpz_srcptr pi, mpz_srcptr qi)
+#else
+mpz_jacobi (pi, qi)
+ mpz_srcptr pi, qi;
+#endif
+{
+#if GCDCHECK
+ int retval;
+ mpz_t gcdval;
+
+ mpz_init (gcdval);
+ mpz_gcd (gcdval, pi, qi);
+ if (!mpz_cmp_ui (gcdval, 1L))
+ {
+ /* J(ab,cb) = J(ab,c)J(ab,b) = J(ab,c)J(0,b) = J(ab,c)*0 */
+ retval = 0;
+ }
+ else
+ retval = mpz_legendre (pi, qi);
+ mpz_clear (gcdval);
+ return retval;
+#else
+ return mpz_legendre (pi, qi);
+#endif
+}
diff --git a/rts/gmp/mpz/kronsz.c b/rts/gmp/mpz/kronsz.c
new file mode 100644
index 0000000000..c8c6752224
--- /dev/null
+++ b/rts/gmp/mpz/kronsz.c
@@ -0,0 +1,126 @@
+/* mpz_si_kronecker -- Kronecker/Jacobi symbol. */
+
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+int
+#if __STDC__
+mpz_si_kronecker (long a, mpz_srcptr b)
+#else
+mpz_si_kronecker (a, b)
+ long a;
+ mpz_srcptr b;
+#endif
+{
+ int b_abs_size;
+ mp_srcptr b_ptr;
+ mp_limb_t b_low;
+ int twos;
+ int result_bit1;
+
+ b_abs_size = ABSIZ (b);
+ if (b_abs_size == 0)
+ return JACOBI_S0 (a); /* (a/0) */
+
+ b_ptr = PTR(b);
+ b_low = b_ptr[0];
+
+ /* (0/b) = 1 if b=+/-1, 0 otherwise */
+ if (a == 0)
+ return (b_abs_size == 1) & (b_low == 1);
+
+ /* account for the effect of the sign of b, so can then ignore it */
+ result_bit1 = JACOBI_BSGN_SZ_BIT1 (a, b);
+
+ if ((b_low & 1) == 0)
+ {
+ /* b even */
+
+ if ((a & 1) == 0)
+ return 0; /* (a/b)=0 if both a,b even */
+
+ /* Require MP_BITS_PER_LIMB even, so that (a/2)^MP_BITS_PER_LIMB = 1,
+ and so that therefore there's no need to account for how many zero
+ limbs are stripped. */
+ ASSERT ((BITS_PER_MP_LIMB & 1) == 0);
+
+ MPN_STRIP_LOW_ZEROS_NOT_ZERO (b_ptr, b_abs_size);
+ b_low = b_ptr[0];
+
+ if ((b_low & 1) == 0)
+ {
+ /* odd a, even b */
+
+ mp_limb_t b_shl_bit1;
+
+ count_trailing_zeros (twos, b_low);
+
+ /* b_shl_bit1 is b>>twos, but with only bit 1 guaranteed */
+ if (twos == BITS_PER_MP_LIMB-1)
+ b_shl_bit1 = (b_abs_size == 1) ? 0 : (b_ptr[1] << 1);
+ else
+ b_shl_bit1 = (b_low >> twos);
+
+ result_bit1 ^= JACOBI_ASGN_SU_BIT1 (a, b_shl_bit1);
+ a = ABS(a);
+
+ if (a == 1)
+ return JACOBI_BIT1_TO_PN (result_bit1); /* (1/b)=1 */
+
+ /* twos (a/2), reciprocity to (b/a), and (b/a) = (b mod a / b) */
+ return mpn_jacobi_base (mpn_mod_1_rshift (b_ptr, b_abs_size,
+ twos, a),
+ a,
+ result_bit1
+ ^ JACOBI_TWOS_U_BIT1 (twos, a)
+ ^ JACOBI_RECIP_UU_BIT1 (a, b_shl_bit1));
+ }
+ }
+
+ /* b odd */
+
+ result_bit1 ^= JACOBI_ASGN_SU_BIT1 (a, b_low);
+ a = ABS(a);
+
+ /* (a/1) = 1 for any a */
+ if (b_abs_size == 1 && b_low == 1)
+ return JACOBI_BIT1_TO_PN (result_bit1);
+
+ /* Note a is cast to unsigned because 0x80..00 doesn't fit in a signed. */
+ if ((a & 1) == 0)
+ {
+ count_trailing_zeros (twos, a);
+ a = ((unsigned long) a) >> twos;
+ result_bit1 ^= JACOBI_TWOS_U_BIT1 (twos, b_low);
+ }
+
+ if (a == 1)
+ return JACOBI_BIT1_TO_PN (result_bit1); /* (1/b)=1 */
+
+ /* reciprocity to (b/a), and (b/a) == (b mod a / a) */
+ return mpn_jacobi_base (mpn_mod_1 (b_ptr, b_abs_size, a), a,
+ result_bit1 ^ JACOBI_RECIP_UU_BIT1 (a, b_low));
+}
diff --git a/rts/gmp/mpz/kronuz.c b/rts/gmp/mpz/kronuz.c
new file mode 100644
index 0000000000..b877e6f64c
--- /dev/null
+++ b/rts/gmp/mpz/kronuz.c
@@ -0,0 +1,115 @@
+/* mpz_ui_kronecker -- Kronecker/Jacobi symbol. */
+
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+int
+#if __STDC__
+mpz_ui_kronecker (unsigned long a, mpz_srcptr b)
+#else
+mpz_ui_kronecker (a, b)
+ unsigned long a;
+ mpz_srcptr b;
+#endif
+{
+ int b_abs_size;
+ mp_srcptr b_ptr;
+ mp_limb_t b_low;
+ int twos;
+ int result_bit1;
+
+ /* (a/0) */
+ b_abs_size = ABSIZ (b);
+ if (b_abs_size == 0)
+ return JACOBI_U0 (a);
+
+ /* (a/-1)=1 when a>=0, so the sign of b is ignored */
+ b_ptr = PTR(b);
+ b_low = b_ptr[0];
+
+ /* (0/1)=1; (0/-1)=1; (0/b)=0 for b!=+/-1
+ (1/b)=1, for any b */
+ if (a <= 1)
+ return (a == 1) | ((b_abs_size == 1) & (b_low == 1));
+
+ if (b_low & 1)
+ {
+ /* (a/1) = 1 for any a */
+ if (b_abs_size == 1 && b_low == 1)
+ return 1;
+
+ count_trailing_zeros (twos, a);
+ a >>= twos;
+ if (a == 1)
+ return JACOBI_TWOS_U (twos, b_low); /* powers of (2/b) only */
+
+ /* powers of (2/b); reciprocity to (b/a); (b/a) == (b mod a / a) */
+ return mpn_jacobi_base (mpn_mod_1 (b_ptr, b_abs_size, a),
+ a,
+ JACOBI_TWOS_U_BIT1 (twos, b_low)
+ ^ JACOBI_RECIP_UU_BIT1 (b_low, a));
+ }
+
+ /* b is even; (a/2)=0 if a is even */
+ if ((a & 1) == 0)
+ return 0;
+
+ /* Require MP_BITS_PER_LIMB even, so (a/2)^MP_BITS_PER_LIMB = 1, and so we
+ don't have to pay attention to how many trailing zero limbs are
+ stripped. */
+ ASSERT ((BITS_PER_MP_LIMB & 1) == 0);
+
+ MPN_STRIP_LOW_ZEROS_NOT_ZERO (b_ptr, b_abs_size);
+ b_low = b_ptr[0];
+
+ if (b_low & 1)
+ /* reciprocity to (b/a); (b/a) == (b mod a / a) */
+ return mpn_jacobi_base (mpn_mod_1 (b_ptr, b_abs_size, a),
+ a,
+ JACOBI_RECIP_UU_BIT1 (b_low, a));
+
+ count_trailing_zeros (twos, b_low);
+
+ /* reciprocity to get (b/a) */
+ if (twos == BITS_PER_MP_LIMB-1)
+ {
+ if (b_abs_size == 1)
+ {
+ /* b==0x800...00, one limb high bit only, so (a/2)^(BPML-1) */
+ return JACOBI_TWOS_U (BITS_PER_MP_LIMB-1, a);
+ }
+
+ /* b_abs_size > 1 */
+ result_bit1 = JACOBI_RECIP_UU_BIT1 (a, b_ptr[1] << 1);
+ }
+ else
+ result_bit1 = JACOBI_RECIP_UU_BIT1 (a, b_low >> twos);
+
+ /* powers of (a/2); reciprocity to (b/a); (b/a) == (b mod a / a) */
+ return mpn_jacobi_base (mpn_mod_1_rshift (b_ptr, b_abs_size, twos, a),
+ a,
+ JACOBI_TWOS_U_BIT1 (twos, a) ^ result_bit1);
+}
diff --git a/rts/gmp/mpz/kronzs.c b/rts/gmp/mpz/kronzs.c
new file mode 100644
index 0000000000..edfb465976
--- /dev/null
+++ b/rts/gmp/mpz/kronzs.c
@@ -0,0 +1,74 @@
+/* mpz_kronecker_si -- Kronecker/Jacobi symbol. */
+
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* This function is expected to be often used with b odd, so there's a test
+ for this before invoking count_trailing_zeros().
+
+ After the absolute value of b is established it's treated as an unsigned
+ long, because 0x80..00 doesn't fit in a signed long. */
+
+/* Compute the Kronecker/Jacobi symbol (a/b); returns -1, 0 or 1.
+   result_bit1 accumulates the sign using the JACOBI_*_BIT1 convention:
+   bit 1 set means the symbol is negated.  */
+int
+#if __STDC__
+mpz_kronecker_si (mpz_srcptr a, long b)
+#else
+mpz_kronecker_si (a, b)
+ mpz_srcptr a;
+ long b;
+#endif
+{
+ int result_bit1;
+ int twos;
+
+ /* (a/0) is 1 for a = +/-1 and 0 otherwise */
+ if (b == 0)
+ return JACOBI_Z0 (a);
+
+ /* fold the sign of b (with the sign of a) into the result bit, then
+    work with |b|; per the file header |b| is handled as unsigned since
+    -LONG_MIN does not fit in a signed long */
+ result_bit1 = JACOBI_BSGN_ZS_BIT1(a, b);
+ b = ABS (b);
+
+ if (b == 1)
+ return JACOBI_BIT1_TO_PN (result_bit1); /* (a/1) = 1 for any a */
+
+ /* b odd: reduce a mod b and finish in the mpn-level routine */
+ if (b & 1)
+ return mpn_jacobi_base (mpz_fdiv_ui (a, b), b, result_bit1);
+
+ /* result 0 if both a,b even */
+ if (mpz_even_p (a))
+ return 0;
+
+ /* (a/2)=(2/a) when a odd */
+ count_trailing_zeros (twos, b);
+ result_bit1 ^= JACOBI_TWOS_U_BIT1 (twos, PTR(a)[0]);
+
+ /* strip the factors of 2 from b; unsigned cast keeps the shift clean */
+ b = ((unsigned long) b) >> twos;
+ if (b == 1)
+ return JACOBI_BIT1_TO_PN (result_bit1);
+ else
+ return mpn_jacobi_base (mpz_fdiv_ui (a, b), b, result_bit1);
+}
+
+
diff --git a/rts/gmp/mpz/kronzu.c b/rts/gmp/mpz/kronzu.c
new file mode 100644
index 0000000000..749be5df07
--- /dev/null
+++ b/rts/gmp/mpz/kronzu.c
@@ -0,0 +1,66 @@
+/* mpz_kronecker_ui -- Kronecker/Jacobi symbol. */
+
+/*
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA.
+*/
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* This function is expected to be often used with b an odd prime, so the
+ code for odd b is nice and short. */
+
+/* Compute the Kronecker/Jacobi symbol (a/b) for unsigned long b;
+   returns -1, 0 or 1.  The odd-b case is tested first since (per the
+   header comment) b is expected to usually be an odd prime.  */
+int
+#if __STDC__
+mpz_kronecker_ui (mpz_srcptr a, unsigned long b)
+#else
+mpz_kronecker_ui (a, b)
+ mpz_srcptr a;
+ unsigned long b;
+#endif
+{
+ int twos;
+
+ if (b & 1)
+ {
+ if (b != 1)
+ return mpn_jacobi_base (mpz_fdiv_ui (a, b), b, 0);
+ else
+ return 1; /* (a/1)=1 for any a */
+ }
+
+ /* b is even from here on; (a/0) is 1 for a = +/-1, else 0 */
+ if (b == 0)
+ return JACOBI_Z0 (a);
+
+ /* (a/2)=0 if a even */
+ if (mpz_even_p (a))
+ return 0;
+
+ /* (a/2)=(2/a) when a odd */
+ count_trailing_zeros (twos, b);
+ b >>= twos;
+ if (b == 1)
+ return JACOBI_TWOS_U (twos, PTR(a)[0]); /* b was a pure power of 2 */
+
+ return mpn_jacobi_base (mpz_fdiv_ui (a, b), b,
+ JACOBI_TWOS_U_BIT1(twos, PTR(a)[0]));
+}
diff --git a/rts/gmp/mpz/lcm.c b/rts/gmp/mpz/lcm.c
new file mode 100644
index 0000000000..7495882ae5
--- /dev/null
+++ b/rts/gmp/mpz/lcm.c
@@ -0,0 +1,61 @@
+/* mpz/lcm.c: Calculate the least common multiple of two integers.
+
+Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void *_mpz_realloc ();
+
+/* mpz_lcm -- least common multiple: r = |u*v| / gcd(u,v).
+   lcm(0,x) = lcm(x,0) = 0.  The result is forced non-negative, as the
+   documented mpz_lcm contract requires, even for negative operands
+   (previously u's sign leaked through u/gcd into the product).  */
+void
+#if __STDC__
+mpz_lcm (mpz_ptr r, mpz_srcptr u, mpz_srcptr v)
+#else
+mpz_lcm (r, u, v)
+ mpz_ptr r;
+ mpz_srcptr u;
+ mpz_srcptr v;
+#endif
+{
+ mpz_t g;
+ mp_size_t usize, vsize, size;
+ TMP_DECL (marker);
+
+ usize = ABS (SIZ (u));
+ vsize = ABS (SIZ (v));
+
+ /* test the zero cases before TMP_MARK so this early return needs no
+    balancing TMP_FREE */
+ if (usize == 0 || vsize == 0)
+ {
+ SIZ (r) = 0;
+ return;
+ }
+
+ TMP_MARK (marker);
+
+ /* g holds gcd(u,v) and then u/gcd(u,v), both at most MAX(usize,vsize)
+    limbs */
+ size = MAX (usize, vsize);
+ MPZ_TMP_INIT (g, size);
+
+ mpz_gcd (g, u, v);
+ mpz_divexact (g, u, g);
+ mpz_mul (r, g, v);
+
+ /* gcd is non-negative, but u/gcd and v carry their own signs; the
+    conventional lcm is the absolute value */
+ SIZ (r) = ABS (SIZ (r));
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/legendre.c b/rts/gmp/mpz/legendre.c
new file mode 100644
index 0000000000..ab665f70d0
--- /dev/null
+++ b/rts/gmp/mpz/legendre.c
@@ -0,0 +1,184 @@
+/* mpz_legendre (op1, op2).
+ Contributed by Bennet Yee (bsy) at Carnegie-Mellon University
+
+Copyright (C) 1992, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+
+#if defined (DEBUG)
+#include <stdio.h>
+#endif
+
+/* Precondition: both p and q are positive */
+
+/* Compute the Legendre/Jacobi symbol (pi/qi) by repeated reduction:
+   strip powers of two, reduce p mod q, and apply quadratic reciprocity,
+   swapping p and q each round.  Precondition (per the header comment):
+   both p and q are positive.  Returns -1, 0 or 1.
+   Fix: the optional Q_MINUS_1 path wrote to the undeclared identifier
+   q_minus_q; it now correctly updates q_minus_1.  */
+int
+#if __STDC__
+mpz_legendre (mpz_srcptr pi, mpz_srcptr qi)
+#else
+mpz_legendre (pi, qi)
+mpz_srcptr pi, qi;
+#endif
+{
+ mpz_t p, q, qdiv2;
+#ifdef Q_MINUS_1
+ mpz_t q_minus_1;
+#endif
+ mpz_ptr mtmp;
+ register mpz_ptr pptr, qptr;
+ register int retval = 1;
+ register unsigned long int s;
+
+ /* work on local copies: the reduction below destroys its operands */
+ pptr = p;
+ mpz_init_set (pptr, pi);
+ qptr = q;
+ mpz_init_set (qptr, qi);
+
+#ifdef Q_MINUS_1
+ mpz_init (q_minus_1);
+#endif
+ mpz_init (qdiv2);
+
+tail_recurse2:
+#ifdef DEBUG
+ printf ("tail_recurse2: p=");
+ mpz_out_str (stdout, 10, pptr);
+ printf ("\nq=");
+ mpz_out_str (stdout, 10, qptr);
+ putchar ('\n');
+#endif
+ s = mpz_scan1 (qptr, 0);
+ if (s) mpz_tdiv_q_2exp (qptr, qptr, s); /* J(a,2) = 1 */
+#ifdef DEBUG
+ printf ("2 factor decomposition: p=");
+ mpz_out_str (stdout, 10, pptr);
+ printf ("\nq=");
+ mpz_out_str (stdout, 10, qptr);
+ putchar ('\n');
+#endif
+ /* postcondition q odd */
+ if (!mpz_cmp_ui (qptr, 1L)) /* J(a,1) = 1 */
+ goto done;
+ mpz_mod (pptr, pptr, qptr); /* J(a,q) = J(b,q) when a == b mod q */
+#ifdef DEBUG
+ printf ("mod out by q: p=");
+ mpz_out_str (stdout, 10, pptr);
+ printf ("\nq=");
+ mpz_out_str (stdout, 10, qptr);
+ putchar ('\n');
+#endif
+ /* quick calculation to get approximate size first */
+ /* precondition: p < q */
+ if ((mpz_sizeinbase (pptr, 2) + 1 >= mpz_sizeinbase (qptr,2))
+ && (mpz_tdiv_q_2exp (qdiv2, qptr, 1L), mpz_cmp (pptr, qdiv2) > 0))
+ {
+ /* p > q/2 */
+ mpz_sub (pptr, qptr, pptr);
+ /* J(-1,q) = (-1)^((q-1)/2), q odd */
+ if (mpz_get_ui (qptr) & 2)
+ retval = -retval;
+ }
+ /* p < q/2 */
+#ifdef Q_MINUS_1
+ mpz_sub_ui (q_minus_1, qptr, 1L); /* was q_minus_q: undeclared identifier */
+#endif
+tail_recurse: /* we use tail_recurse only if q has not changed */
+#ifdef DEBUG
+ printf ("tail_recurse1: p=");
+ mpz_out_str (stdout, 10, pptr);
+ printf ("\nq=");
+ mpz_out_str (stdout, 10, qptr);
+ putchar ('\n');
+#endif
+ /*
+ * J(0,q) = 0
+ * this occurs only if gcd(p,q) != 1 which is never true for
+ * Legendre function.
+ */
+ if (!mpz_cmp_ui (pptr, 0L))
+ {
+ retval = 0;
+ goto done;
+ }
+
+ if (!mpz_cmp_ui (pptr, 1L))
+ {
+ /* J(1,q) = 1 */
+ /* retval *= 1; */
+ goto done;
+ }
+#ifdef Q_MINUS_1
+ if (!mpz_cmp (pptr, q_minus_1))
+ {
+ /* J(-1,q) = (-1)^((q-1)/2) */
+ if (mpz_get_ui (qptr) & 2)
+ retval = -retval;
+ /* else retval *= 1; */
+ goto done;
+ }
+#endif
+ /*
+ * we do not handle J(xy,q) except for x==2
+ * since we do not want to factor
+ */
+ if ((s = mpz_scan1 (pptr, 0)) != 0)
+ {
+ /*
+ * J(2,q) = (-1)^((q^2-1)/8)
+ *
+ * Note that q odd guarantees that q^2-1 is divisible by 8:
+ * Let a: q=2a+1. q^2 = 4a^2+4a+1, (q^2-1)/8 = a(a+1)/2, qed
+ *
+ * Now, note that this means that the low two bits of _a_
+ * (or the low bits of q shifted over by 1 determines
+ * the factor).
+ */
+ mpz_tdiv_q_2exp (pptr, pptr, s);
+
+ /* even powers of 2 gives J(2,q)^{2n} = 1 */
+ if (s & 1)
+ {
+ s = mpz_get_ui (qptr) >> 1;
+ s = s * (s + 1);
+ if (s & 2)
+ retval = -retval;
+ }
+ goto tail_recurse;
+ }
+ /*
+ * we know p is odd since we have cast out 2s
+ * precondition that q is odd guarantees both odd.
+ *
+ * quadratic reciprocity
+ * J(p,q) = (-1)^((p-1)(q-1)/4) * J(q,p)
+ */
+ if ((s = mpz_scan1 (pptr, 1)) <= 2 && (s + mpz_scan1 (qptr, 1)) <= 2)
+ retval = -retval;
+
+ mtmp = pptr; pptr = qptr; qptr = mtmp;
+ goto tail_recurse2;
+done:
+ mpz_clear (p);
+ mpz_clear (q);
+ mpz_clear (qdiv2);
+#ifdef Q_MINUS_1
+ mpz_clear (q_minus_1);
+#endif
+ return retval;
+}
diff --git a/rts/gmp/mpz/mod.c b/rts/gmp/mpz/mod.c
new file mode 100644
index 0000000000..87033b333b
--- /dev/null
+++ b/rts/gmp/mpz/mod.c
@@ -0,0 +1,63 @@
+/* mpz_mod -- The mathematical mod function.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* mpz_mod -- mathematical mod: rem = dividend mod divisor with
+   0 <= rem < |divisor|.  Implemented as a truncating remainder
+   (mpz_tdiv_r) followed by an adjustment when the dividend was
+   negative, so the result is always non-negative.  */
+void
+#if __STDC__
+mpz_mod (mpz_ptr rem, mpz_srcptr dividend, mpz_srcptr divisor)
+#else
+mpz_mod (rem, dividend, divisor)
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ mpz_srcptr divisor;
+#endif
+{
+ mp_size_t divisor_size = divisor->_mp_size;
+ mpz_t temp_divisor; /* N.B.: lives until function returns! */
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* We need the original value of the divisor after the remainder has been
+ preliminary calculated. We have to copy it to temporary space if it's
+ the same variable as REM. */
+ if (rem == divisor)
+ {
+ MPZ_TMP_INIT (temp_divisor, ABS (divisor_size));
+ mpz_set (temp_divisor, divisor);
+ divisor = temp_divisor;
+ }
+
+ mpz_tdiv_r (rem, dividend, divisor);
+
+ if (rem->_mp_size != 0)
+ {
+ /* tdiv's remainder carries the dividend's sign; add |divisor| to a
+    negative remainder to shift it into [0, |divisor|) */
+ if (dividend->_mp_size < 0)
+ if (divisor->_mp_size < 0)
+ mpz_sub (rem, rem, divisor);
+ else
+ mpz_add (rem, rem, divisor);
+ }
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/mul.c b/rts/gmp/mpz/mul.c
new file mode 100644
index 0000000000..7854788e50
--- /dev/null
+++ b/rts/gmp/mpz/mul.c
@@ -0,0 +1,131 @@
+/* mpz_mul -- Multiply two integers.
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+
+/* Multiply: w = u * v (named mult(u,v,w) in the Berkeley MP
+   compatibility build).  Handles all aliasing of w with u and/or v,
+   and grows w's limb allocation when needed. */
+#ifndef BERKELEY_MP
+void
+#if __STDC__
+mpz_mul (mpz_ptr w, mpz_srcptr u, mpz_srcptr v)
+#else
+mpz_mul (w, u, v)
+ mpz_ptr w;
+ mpz_srcptr u;
+ mpz_srcptr v;
+#endif
+#else /* BERKELEY_MP */
+void
+#if __STDC__
+mult (mpz_srcptr u, mpz_srcptr v, mpz_ptr w)
+#else
+mult (u, v, w)
+ mpz_srcptr u;
+ mpz_srcptr v;
+ mpz_ptr w;
+#endif
+#endif /* BERKELEY_MP */
+{
+ mp_size_t usize = u->_mp_size;
+ mp_size_t vsize = v->_mp_size;
+ mp_size_t wsize;
+ mp_size_t sign_product;
+ mp_ptr up, vp;
+ mp_ptr wp;
+ mp_ptr free_me = NULL;
+ size_t free_me_size;
+ mp_limb_t cy_limb;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ /* signs live in the size fields, so the XOR is negative iff exactly
+    one operand is negative */
+ sign_product = usize ^ vsize;
+ usize = ABS (usize);
+ vsize = ABS (vsize);
+
+ /* mpn_mul below requires the operand with more limbs first */
+ if (usize < vsize)
+ {
+ /* Swap U and V. */
+ {const __mpz_struct *t = u; u = v; v = t;}
+ {mp_size_t t = usize; usize = vsize; vsize = t;}
+ }
+
+ up = u->_mp_d;
+ vp = v->_mp_d;
+ wp = w->_mp_d;
+
+ /* Ensure W has space enough to store the result. */
+ wsize = usize + vsize;
+ if (w->_mp_alloc < wsize)
+ {
+ /* if w shares limbs with an operand, defer freeing the old block
+    until after the multiply has read it */
+ if (wp == up || wp == vp)
+ {
+ free_me = wp;
+ free_me_size = w->_mp_alloc;
+ }
+ else
+ (*_mp_free_func) (wp, w->_mp_alloc * BYTES_PER_MP_LIMB);
+
+ w->_mp_alloc = wsize;
+ wp = (mp_ptr) (*_mp_allocate_func) (wsize * BYTES_PER_MP_LIMB);
+ w->_mp_d = wp;
+ }
+ else
+ {
+ /* Make U and V not overlap with W. */
+ if (wp == up)
+ {
+ /* W and U are identical. Allocate temporary space for U. */
+ up = (mp_ptr) TMP_ALLOC (usize * BYTES_PER_MP_LIMB);
+ /* Is V identical too? Keep it identical with U. */
+ if (wp == vp)
+ vp = up;
+ /* Copy to the temporary space. */
+ MPN_COPY (up, wp, usize);
+ }
+ else if (wp == vp)
+ {
+ /* W and V are identical. Allocate temporary space for V. */
+ vp = (mp_ptr) TMP_ALLOC (vsize * BYTES_PER_MP_LIMB);
+ /* Copy to the temporary space. */
+ MPN_COPY (vp, wp, vsize);
+ }
+ }
+
+ if (vsize == 0)
+ {
+ wsize = 0;
+ }
+ else
+ {
+ cy_limb = mpn_mul (wp, up, usize, vp, vsize);
+ wsize = usize + vsize;
+ /* drop the top limb if the product did not carry into it */
+ wsize -= cy_limb == 0;
+ }
+
+ w->_mp_size = sign_product < 0 ? -wsize : wsize;
+ if (free_me != NULL)
+ (*_mp_free_func) (free_me, free_me_size * BYTES_PER_MP_LIMB);
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/mul_2exp.c b/rts/gmp/mpz/mul_2exp.c
new file mode 100644
index 0000000000..abea5fed2c
--- /dev/null
+++ b/rts/gmp/mpz/mul_2exp.c
@@ -0,0 +1,76 @@
+/* mpz_mul_2exp -- Multiply a bignum by 2**CNT
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Set w = u * 2^cnt.  The shift splits into whole limbs (limb_cnt)
+   plus a within-limb bit shift; low result limbs are zeroed last so
+   that u == w aliasing works. */
+void
+#if __STDC__
+mpz_mul_2exp (mpz_ptr w, mpz_srcptr u, unsigned long int cnt)
+#else
+mpz_mul_2exp (w, u, cnt)
+ mpz_ptr w;
+ mpz_srcptr u;
+ unsigned long int cnt;
+#endif
+{
+ mp_size_t usize = u->_mp_size;
+ mp_size_t abs_usize = ABS (usize);
+ mp_size_t wsize;
+ mp_size_t limb_cnt;
+ mp_ptr wp;
+ mp_limb_t wlimb;
+
+ /* 0 * 2^cnt = 0 */
+ if (usize == 0)
+ {
+ w->_mp_size = 0;
+ return;
+ }
+
+ limb_cnt = cnt / BITS_PER_MP_LIMB;
+ /* +1 leaves room for a possible carry limb out of mpn_lshift */
+ wsize = abs_usize + limb_cnt + 1;
+ if (w->_mp_alloc < wsize)
+ _mpz_realloc (w, wsize);
+
+ wp = w->_mp_d;
+ wsize = abs_usize + limb_cnt;
+
+ cnt %= BITS_PER_MP_LIMB;
+ if (cnt != 0)
+ {
+ wlimb = mpn_lshift (wp + limb_cnt, u->_mp_d, abs_usize, cnt);
+ if (wlimb != 0)
+ {
+ wp[wsize] = wlimb;
+ wsize++;
+ }
+ }
+ else
+ {
+ /* pure whole-limb shift; decreasing copy handles the overlap
+    when u == w */
+ MPN_COPY_DECR (wp + limb_cnt, u->_mp_d, abs_usize);
+ }
+
+ /* Zero all whole limbs at low end. Do it here and not before calling
+ mpn_lshift, not to lose for U == W. */
+ MPN_ZERO (wp, limb_cnt);
+
+ /* the result keeps u's sign */
+ w->_mp_size = usize >= 0 ? wsize : -wsize;
+}
diff --git a/rts/gmp/mpz/mul_siui.c b/rts/gmp/mpz/mul_siui.c
new file mode 100644
index 0000000000..9849cd41b0
--- /dev/null
+++ b/rts/gmp/mpz/mul_siui.c
@@ -0,0 +1,81 @@
+/* mpz_mul_ui/si (product, multiplier, small_multiplicand) -- Set PRODUCT to
+ MULTIPLIER times SMALL_MULTIPLICAND.
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+/* One template source yields two functions: compiled with
+   OPERATION_mul_ui it becomes mpz_mul_ui, with OPERATION_mul_si it
+   becomes mpz_mul_si.  The macros select the function name, the
+   signedness of the small operand, and how its absolute value is
+   taken. */
+#ifdef OPERATION_mul_ui
+#define FUNCTION mpz_mul_ui
+#define MULTIPLICAND_UNSIGNED unsigned
+#define MULTIPLICAND_ABS(x) x
+#else
+#ifdef OPERATION_mul_si
+#define FUNCTION mpz_mul_si
+#define MULTIPLICAND_UNSIGNED
+#define MULTIPLICAND_ABS(x) ABS(x)
+#else
+Error, error, unrecognised OPERATION
+#endif
+#endif
+
+
+/* Set prod = mult * small_mult. */
+void
+#if __STDC__
+FUNCTION (mpz_ptr prod, mpz_srcptr mult,
+ MULTIPLICAND_UNSIGNED long int small_mult)
+#else
+FUNCTION (prod, mult, small_mult)
+ mpz_ptr prod;
+ mpz_srcptr mult;
+ MULTIPLICAND_UNSIGNED long int small_mult;
+#endif
+{
+ mp_size_t size = mult->_mp_size;
+ mp_size_t sign_product = size;
+ mp_limb_t cy;
+ mp_size_t prod_size;
+ mp_ptr prod_ptr;
+
+ /* 0 * x = x * 0 = 0 */
+ if (size == 0 || small_mult == 0)
+ {
+ prod->_mp_size = 0;
+ return;
+ }
+ size = ABS (size);
+
+ /* one extra limb for a possible carry out of mpn_mul_1 */
+ prod_size = size + 1;
+ if (prod->_mp_alloc < prod_size)
+ _mpz_realloc (prod, prod_size);
+
+ prod_ptr = prod->_mp_d;
+
+ /* NOTE(review): in the mpz_mul_si build MULTIPLICAND_ABS is ABS on a
+    signed long, which overflows for small_mult == LONG_MIN -- confirm
+    whether that input is excluded by contract. */
+ cy = mpn_mul_1 (prod_ptr, mult->_mp_d, size,
+ (mp_limb_t) MULTIPLICAND_ABS (small_mult));
+ if (cy != 0)
+ {
+ prod_ptr[size] = cy;
+ size++;
+ }
+
+ /* small_mult < 0 is constant-false in the unsigned (mul_ui) build */
+ prod->_mp_size = ((sign_product < 0) ^ (small_mult < 0)) ? -size : size;
+}
diff --git a/rts/gmp/mpz/neg.c b/rts/gmp/mpz/neg.c
new file mode 100644
index 0000000000..566c3a95aa
--- /dev/null
+++ b/rts/gmp/mpz/neg.c
@@ -0,0 +1,53 @@
+/* mpz_neg(mpz_ptr dst, mpz_ptr src) -- Assign the negated value of SRC to DST.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Set w = -u.  When w and u are distinct objects the limbs are copied
+   and the size field's sign flipped; when they alias, only the sign
+   flips.  Negating zero leaves zero (size 0 negates to 0). */
+void
+#if __STDC__
+mpz_neg (mpz_ptr w, mpz_srcptr u)
+#else
+mpz_neg (w, u)
+ mpz_ptr w;
+ mpz_srcptr u;
+#endif
+{
+ mp_ptr wp, up;
+ mp_size_t usize, size;
+
+ usize = u->_mp_size;
+
+ if (u != w)
+ {
+ size = ABS (usize);
+
+ if (w->_mp_alloc < size)
+ _mpz_realloc (w, size);
+
+ wp = w->_mp_d;
+ up = u->_mp_d;
+
+ MPN_COPY (wp, up, size);
+ }
+
+ /* sign lives in the size field */
+ w->_mp_size = -usize;
+}
diff --git a/rts/gmp/mpz/nextprime.c b/rts/gmp/mpz/nextprime.c
new file mode 100644
index 0000000000..f024dd1206
--- /dev/null
+++ b/rts/gmp/mpz/nextprime.c
@@ -0,0 +1,120 @@
+/* mpz_nextprime(p,t) - compute the next prime > t and store that in p.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Set p to the next prime greater than t.  Each candidate is checked
+   with mpz_probab_prime_p using 5 rounds, so the result is a probable
+   prime, not a certified one. */
+void
+#if __STDC__
+mpz_nextprime (mpz_ptr p, mpz_srcptr t)
+#else
+mpz_nextprime (p, t)
+ mpz_ptr p;
+ mpz_srcptr t;
+#endif
+{
+ /* scan t+1, t+2, ... until a candidate passes the test */
+ mpz_add_ui (p, t, 1L);
+ while (! mpz_probab_prime_p (p, 5))
+ mpz_add_ui (p, p, 1L);
+}
+
+#if 0
+/* This code is not yet tested.  Will be enabled in 3.1.
+   Sieving version: track the odd candidate's residues modulo a table
+   of small odd primes and only run Miller-Rabin on candidates none of
+   those primes divide.
+   Fixes relative to the draft: `status' was a typo for `static';
+   prime_limit was used but never declared; the unused `mpz_t tmp' is
+   removed; TMP_DECL/TMP_MARK/TMP_FREE now bracket the TMP_ALLOC. */
+
+static const unsigned short primes[] =
+{
+3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,
+101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,
+191,193,197,199,211,223,227,229,233,239,241,251,257,263,269,271,277,
+281,283,293,307,311,313,317,331,337,347,349,353,359,367,373,379,383,
+389,397,401,409,419,421,431,433,439,443,449,457,461,463,467,479,487,
+491,499,503,509,521,523,541,547,557,563,569,571,577,587,593,599,601,
+607,613,617,619,631,641,643,647,653,659,661,673,677,683,691,701,709,
+719,727,733,739,743,751,757,761,769,773,787,797,809,811,821,823,827,
+829,839,853,857,859,863,877,881,883,887,907,911,919,929,937,941,947,
+953,967,971,977,983,991,997
+};
+
+#define NUMBER_OF_PRIMES 167
+
+void
+#if __STDC__
+mpz_nextprime (mpz_ptr p, mpz_srcptr n)
+#else
+mpz_nextprime (p, n)
+ mpz_ptr p;
+ mpz_srcptr n;
+#endif
+{
+ unsigned short *moduli;
+ unsigned long difference;
+ int i;
+ int composite;
+ int prime_limit;
+ TMP_DECL (marker);
+
+ /* First handle tiny numbers */
+ if (mpz_cmp_ui (n, 2) < 0)
+ {
+ mpz_set_ui (p, 2);
+ return;
+ }
+ /* start from the first odd number at or above n+1 */
+ mpz_add_ui (p, n, 1);
+ mpz_setbit (p, 0);
+
+ if (mpz_cmp_ui (p, 7) <= 0)
+ return;
+
+ TMP_MARK (marker);
+
+ prime_limit = NUMBER_OF_PRIMES - 1;
+ if (mpz_cmp_ui (p, primes[prime_limit]) <= 0)
+ /* Just use first three entries (3,5,7) of table for small numbers */
+ prime_limit = 3;
+ if (prime_limit)
+ {
+ /* Compute residues modulo small odd primes */
+ moduli = (unsigned short *) TMP_ALLOC (prime_limit * sizeof moduli[0]);
+ for (i = 0; i < prime_limit; i++)
+ moduli[i] = mpz_fdiv_ui (p, primes[i]);
+ }
+ for (difference = 0; ; difference += 2)
+ {
+ composite = 0;
+
+ /* First check residues; advance each residue by 2 in step with the
+    candidate */
+ for (i = 0; i < prime_limit; i++)
+ {
+ int acc, pr;
+ composite |= (moduli[i] == 0);
+ acc = moduli[i] + 2;
+ pr = primes[i];
+ moduli[i] = acc >= pr ? acc - pr : acc;
+ }
+ if (composite)
+ continue;
+
+ mpz_add_ui (p, p, difference);
+ difference = 0;
+
+ /* Miller-Rabin test */
+ if (mpz_millerrabin (p, 2))
+ break;
+ }
+
+ TMP_FREE (marker);
+}
+#endif
diff --git a/rts/gmp/mpz/out_raw.c b/rts/gmp/mpz/out_raw.c
new file mode 100644
index 0000000000..62709479c5
--- /dev/null
+++ b/rts/gmp/mpz/out_raw.c
@@ -0,0 +1,89 @@
+/* mpz_out_raw -- Output a mpz_t in binary. Use an endianness and word size
+ independent format.
+
+Copyright (C) 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h>
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Write x to stream in a portable binary format: a 4-byte big-endian
+   byte count (negated when x < 0) followed by the magnitude bytes,
+   most significant first.  Returns the number of bytes written, or 0
+   on a write error. */
+size_t
+#if __STDC__
+mpz_out_raw (FILE *stream, mpz_srcptr x)
+#else
+mpz_out_raw (stream, x)
+ FILE *stream;
+ mpz_srcptr x;
+#endif
+{
+ int i;
+ mp_size_t s;
+ mp_size_t xsize = ABS (x->_mp_size);
+ mp_srcptr xp = x->_mp_d;
+ mp_size_t out_bytesize;
+ mp_limb_t hi_limb;
+ int n_bytes_in_hi_limb;
+
+ /* a null stream means stdout */
+ if (stream == 0)
+ stream = stdout;
+
+ /* zero is represented by a zero byte count alone */
+ if (xsize == 0)
+ {
+ for (i = 4 - 1; i >= 0; i--)
+ fputc (0, stream);
+ return ferror (stream) ? 0 : 4;
+ }
+
+ /* count the significant bytes in the most significant limb */
+ hi_limb = xp[xsize - 1];
+ for (i = BYTES_PER_MP_LIMB - 1; i > 0; i--)
+ {
+ if ((hi_limb >> i * BITS_PER_CHAR) != 0)
+ break;
+ }
+ n_bytes_in_hi_limb = i + 1;
+ out_bytesize = BYTES_PER_MP_LIMB * (xsize - 1) + n_bytes_in_hi_limb;
+ /* the sign of x is carried by negating the byte count */
+ if (x->_mp_size < 0)
+ out_bytesize = -out_bytesize;
+
+ /* Make the size 4 bytes on all machines, to make the format portable. */
+ for (i = 4 - 1; i >= 0; i--)
+ fputc ((out_bytesize >> (i * BITS_PER_CHAR)) % (1 << BITS_PER_CHAR),
+ stream);
+
+ /* Output from the most significant limb to the least significant limb,
+ with each limb also output in decreasing significance order. */
+
+ /* Output the most significant limb separately, since we will only
+ output some of its bytes. */
+ for (i = n_bytes_in_hi_limb - 1; i >= 0; i--)
+ fputc ((hi_limb >> (i * BITS_PER_CHAR)) % (1 << BITS_PER_CHAR), stream);
+
+ /* Output the remaining limbs. */
+ for (s = xsize - 2; s >= 0; s--)
+ {
+ mp_limb_t x_limb;
+
+ x_limb = xp[s];
+ for (i = BYTES_PER_MP_LIMB - 1; i >= 0; i--)
+ fputc ((x_limb >> (i * BITS_PER_CHAR)) % (1 << BITS_PER_CHAR), stream);
+ }
+ return ferror (stream) ? 0 : ABS (out_bytesize) + 4;
+}
diff --git a/rts/gmp/mpz/out_str.c b/rts/gmp/mpz/out_str.c
new file mode 100644
index 0000000000..bf971b0057
--- /dev/null
+++ b/rts/gmp/mpz/out_str.c
@@ -0,0 +1,108 @@
+/* mpz_out_str(stream, base, integer) -- Output to STREAM the multi prec.
+ integer INTEGER in base BASE.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Print x on stream in the given base and return the number of
+   characters written, or 0 on a write error.  base 0 means decimal;
+   a negative base selects upper-case digits. */
+size_t
+#if __STDC__
+mpz_out_str (FILE *stream, int base, mpz_srcptr x)
+#else
+mpz_out_str (stream, base, x)
+ FILE *stream;
+ int base;
+ mpz_srcptr x;
+#endif
+{
+ mp_ptr xp;
+ mp_size_t x_size = x->_mp_size;
+ unsigned char *str;
+ size_t str_size;
+ size_t i;
+ size_t written;
+ char *num_to_text;
+ TMP_DECL (marker);
+
+ /* a null stream means stdout */
+ if (stream == 0)
+ stream = stdout;
+
+ if (base >= 0)
+ {
+ if (base == 0)
+ base = 10;
+ num_to_text = "0123456789abcdefghijklmnopqrstuvwxyz";
+ }
+ else
+ {
+ /* negative base: same magnitude, upper-case digit alphabet */
+ base = -base;
+ num_to_text = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+ }
+
+ if (x_size == 0)
+ {
+ fputc ('0', stream);
+ return ferror (stream) ? 0 : 1;
+ }
+
+ written = 0;
+
+ if (x_size < 0)
+ {
+ fputc ('-', stream);
+ x_size = -x_size;
+ written = 1;
+ }
+
+ TMP_MARK (marker);
+ /* worst-case digit count via chars_per_bit_exactly, plus slack for
+    rounding and the terminating NUL below */
+ str_size = ((size_t) (x_size * BITS_PER_MP_LIMB
+ * __mp_bases[base].chars_per_bit_exactly)) + 3;
+ str = (unsigned char *) TMP_ALLOC (str_size);
+
+ /* Move the number to convert into temporary space, since mpn_get_str
+ clobbers its argument + needs one extra high limb.... */
+ xp = (mp_ptr) TMP_ALLOC ((x_size + 1) * BYTES_PER_MP_LIMB);
+ MPN_COPY (xp, x->_mp_d, x_size);
+
+ str_size = mpn_get_str (str, base, xp, x_size);
+
+ /* mpn_get_str might make some leading zeros. Skip them. */
+ while (*str == 0)
+ {
+ str_size--;
+ str++;
+ }
+
+ /* Translate to printable chars. */
+ for (i = 0; i < str_size; i++)
+ str[i] = num_to_text[str[i]];
+ str[str_size] = 0;
+
+ {
+ size_t fwret;
+ fwret = fwrite ((char *) str, 1, str_size, stream);
+ written += fwret;
+ }
+
+ TMP_FREE (marker);
+ return ferror (stream) ? 0 : written;
+}
diff --git a/rts/gmp/mpz/perfpow.c b/rts/gmp/mpz/perfpow.c
new file mode 100644
index 0000000000..e71670a0be
--- /dev/null
+++ b/rts/gmp/mpz/perfpow.c
@@ -0,0 +1,272 @@
+/* mpz_perfect_power_p(arg) -- Return non-zero if ARG is a perfect power,
+ zero otherwise.
+
+Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/*
+ We are to determine if c is a perfect power, c = a ^ b.
+ Assume c is divisible by 2^n and that codd = c/2^n is odd.
+ Assume a is divisible by 2^m and that aodd = a/2^m is odd.
+ It is always true that m divides n.
+
+ * If n is prime, either 1) a is 2*aodd and b = n
+ or 2) a = c and b = 1.
+ So for n prime, we readily have a solution.
+ * If n is factorable into the non-trivial factors p1,p2,...
+ Since m divides n, m has a subset of n's factors and b = n / m.
+
+ BUG: Should handle negative numbers, since they can be odd perfect powers.
+*/
+
+/* This is a naive approach to recognizing perfect powers.
+ Many things can be improved. In particular, we should use p-adic
+ arithmetic for computing possible roots. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+static unsigned long int gcd _PROTO ((unsigned long int a, unsigned long int b));
+static int isprime _PROTO ((unsigned long int t));
+
+static const unsigned short primes[] =
+{ 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53,
+ 59, 61, 67, 71, 73, 79, 83, 89, 97,101,103,107,109,113,127,131,
+ 137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,
+ 227,229,233,239,241,251,257,263,269,271,277,281,283,293,307,311,
+ 313,317,331,337,347,349,353,359,367,373,379,383,389,397,401,409,
+ 419,421,431,433,439,443,449,457,461,463,467,479,487,491,499,503,
+ 509,521,523,541,547,557,563,569,571,577,587,593,599,601,607,613,
+ 617,619,631,641,643,647,653,659,661,673,677,683,691,701,709,719,
+ 727,733,739,743,751,757,761,769,773,787,797,809,811,821,823,827,
+ 829,839,853,857,859,863,877,881,883,887,907,911,919,929,937,941,
+ 947,953,967,971,977,983,991,997,0
+};
+#define SMALLEST_OMITTED_PRIME 1009
+
+
+int
+#if __STDC__
+mpz_perfect_power_p (mpz_srcptr u)
+#else
+mpz_perfect_power_p (u)
+ mpz_srcptr u;
+#endif
+{
+ unsigned long int prime;
+ unsigned long int n, n2;
+ int i;
+ unsigned long int rem;
+ mpz_t u2, q;
+ int exact;
+ mp_size_t uns;
+ TMP_DECL (marker);
+
+ if (mpz_cmp_ui (u, 1) <= 0)
+ return 0;
+
+ n2 = mpz_scan1 (u, 0);
+ if (n2 == 1)
+ return 0;
+
+ TMP_MARK (marker);
+
+ uns = ABSIZ (u) - n2 / BITS_PER_MP_LIMB;
+ MPZ_TMP_INIT (q, uns);
+ MPZ_TMP_INIT (u2, uns);
+
+ mpz_tdiv_q_2exp (u2, u, n2);
+
+ if (isprime (n2))
+ goto n2prime;
+
+ for (i = 1; primes[i] != 0; i++)
+ {
+ prime = primes[i];
+ rem = mpz_tdiv_ui (u2, prime);
+ if (rem == 0) /* divisable? */
+ {
+ rem = mpz_tdiv_q_ui (q, u2, prime * prime);
+ if (rem != 0)
+ {
+ TMP_FREE (marker);
+ return 0;
+ }
+ mpz_swap (q, u2);
+ for (n = 2;;)
+ {
+ rem = mpz_tdiv_q_ui (q, u2, prime);
+ if (rem != 0)
+ break;
+ mpz_swap (q, u2);
+ n++;
+ }
+
+ n2 = gcd (n2, n);
+ if (n2 == 1)
+ {
+ TMP_FREE (marker);
+ return 0;
+ }
+
+ /* As soon as n2 becomes a prime number, stop factoring.
+ Either we have u=x^n2 or u is not a perfect power. */
+ if (isprime (n2))
+ goto n2prime;
+ }
+ }
+
+ if (mpz_cmp_ui (u2, 1) == 0)
+ {
+ TMP_FREE (marker);
+ return 1;
+ }
+
+ if (n2 == 0)
+ {
+ unsigned long int nth;
+ /* We did not find any factors above. We have to consider all values
+ of n. */
+ for (nth = 2;; nth++)
+ {
+ if (! isprime (nth))
+ continue;
+#if 0
+ exact = mpz_padic_root (q, u2, nth, PTH);
+ if (exact)
+#endif
+ exact = mpz_root (q, u2, nth);
+ if (exact)
+ {
+ TMP_FREE (marker);
+ return 1;
+ }
+ if (mpz_cmp_ui (q, SMALLEST_OMITTED_PRIME) < 0)
+ {
+ TMP_FREE (marker);
+ return 0;
+ }
+ }
+ }
+ else
+ {
+ unsigned long int nth;
+      /* We found some factors above.  We just need to consider values of n
+	 that divide n2.  */
+ for (nth = 2; nth <= n2; nth++)
+ {
+ if (! isprime (nth))
+ continue;
+ if (n2 % nth != 0)
+ continue;
+#if 0
+ exact = mpz_padic_root (q, u2, nth, PTH);
+ if (exact)
+#endif
+ exact = mpz_root (q, u2, nth);
+ if (exact)
+ {
+ TMP_FREE (marker);
+ return 1;
+ }
+ if (mpz_cmp_ui (q, SMALLEST_OMITTED_PRIME) < 0)
+ {
+ TMP_FREE (marker);
+ return 0;
+ }
+ }
+
+ TMP_FREE (marker);
+ return 0;
+ }
+
+n2prime:
+ exact = mpz_root (NULL, u2, n2);
+ TMP_FREE (marker);
+ return exact;
+}
+
+static unsigned long int
+#if __STDC__
+gcd (unsigned long int a, unsigned long int b)
+#else
+gcd (a, b)
+ unsigned long int a, b;
+#endif
+{
+ int an2, bn2, n2;
+
+ if (a == 0)
+ return b;
+ if (b == 0)
+ return a;
+
+ count_trailing_zeros (an2, a);
+ a >>= an2;
+
+ count_trailing_zeros (bn2, b);
+ b >>= bn2;
+
+ n2 = MIN (an2, bn2);
+
+ while (a != b)
+ {
+ if (a > b)
+ {
+ a -= b;
+ do
+ a >>= 1;
+ while ((a & 1) == 0);
+ }
+ else /* b > a. */
+ {
+ b -= a;
+ do
+ b >>= 1;
+ while ((b & 1) == 0);
+ }
+ }
+
+ return a << n2;
+}
+
+static int
+#if __STDC__
+isprime (unsigned long int t)
+#else
+isprime (t)
+ unsigned long int t;
+#endif
+{
+ unsigned long int q, r, d;
+
+ if (t < 3 || (t & 1) == 0)
+ return t == 2;
+
+ for (d = 3, r = 1; r != 0; d += 2)
+ {
+ q = t / d;
+ r = t - q * d;
+ if (q < d)
+ return 1;
+ }
+ return 0;
+}
diff --git a/rts/gmp/mpz/perfsqr.c b/rts/gmp/mpz/perfsqr.c
new file mode 100644
index 0000000000..92e8d08ea9
--- /dev/null
+++ b/rts/gmp/mpz/perfsqr.c
@@ -0,0 +1,45 @@
+/* mpz_perfect_square_p(arg) -- Return non-zero if ARG is a perfect square,
+ zero otherwise.
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_perfect_square_p (mpz_srcptr a)
+#else
+mpz_perfect_square_p (a)
+ mpz_srcptr a;
+#endif
+{
+ mp_size_t asize = a->_mp_size;
+
+ /* No negative numbers are perfect squares. */
+ if (asize < 0)
+ return 0;
+
+ /* Zero is a perfect square. */
+ if (asize == 0)
+ return 1;
+
+ return mpn_perfect_square_p (a->_mp_d, asize);
+}
diff --git a/rts/gmp/mpz/popcount.c b/rts/gmp/mpz/popcount.c
new file mode 100644
index 0000000000..3105258e26
--- /dev/null
+++ b/rts/gmp/mpz/popcount.c
@@ -0,0 +1,42 @@
+/* mpz_popcount(mpz_ptr op) -- Population count of OP. If the operand is
+ negative, return ~0 (a novel representation of infinity).
+
+Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_popcount (mpz_srcptr u)
+#else
+mpz_popcount (u)
+ mpz_srcptr u;
+#endif
+{
+ mp_size_t usize;
+
+ usize = u->_mp_size;
+
+ if ((usize) < 0)
+ return ~ (unsigned long int) 0;
+
+ return mpn_popcount (u->_mp_d, usize);
+}
diff --git a/rts/gmp/mpz/pow_ui.c b/rts/gmp/mpz/pow_ui.c
new file mode 100644
index 0000000000..96ca114e4d
--- /dev/null
+++ b/rts/gmp/mpz/pow_ui.c
@@ -0,0 +1,129 @@
+/* mpz_pow_ui(res, base, exp) -- Set RES to BASE**EXP.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef BERKELEY_MP
+void
+#if __STDC__
+mpz_pow_ui (mpz_ptr r, mpz_srcptr b, unsigned long int e)
+#else
+mpz_pow_ui (r, b, e)
+ mpz_ptr r;
+ mpz_srcptr b;
+ unsigned long int e;
+#endif
+#else /* BERKELEY_MP */
+void
+#if __STDC__
+rpow (const MINT *b, signed short int e, MINT *r)
+#else
+rpow (b, e, r)
+ const MINT *b;
+ signed short int e;
+ MINT *r;
+#endif
+#endif /* BERKELEY_MP */
+{
+ mp_ptr rp, bp, tp, xp;
+ mp_size_t ralloc, rsize, bsize;
+ int cnt, i;
+ mp_limb_t blimb;
+ TMP_DECL (marker);
+
+ bsize = ABS (b->_mp_size);
+
+ /* Single out cases that give result == 0 or 1. These tests are here
+ to simplify the general code below, not to optimize. */
+ if (e == 0)
+ {
+ r->_mp_d[0] = 1;
+ r->_mp_size = 1;
+ return;
+ }
+ if (bsize == 0
+#ifdef BERKELEY_MP
+ || e < 0
+#endif
+ )
+ {
+ r->_mp_size = 0;
+ return;
+ }
+
+ bp = b->_mp_d;
+
+ blimb = bp[bsize - 1];
+ if (bsize == 1 && blimb < 0x100)
+ {
+ /* Estimate space requirements accurately. Using the code from the
+ `else' path would over-estimate space requirements wildly. */
+ float lb = __mp_bases[blimb].chars_per_bit_exactly;
+ ralloc = 3 + ((mp_size_t) (e / lb) / BITS_PER_MP_LIMB);
+ }
+ else
+ {
+ /* Over-estimate space requirements somewhat. */
+ count_leading_zeros (cnt, blimb);
+ ralloc = bsize * e - cnt * e / BITS_PER_MP_LIMB + 2;
+ }
+
+ TMP_MARK (marker);
+
+  /* The two areas are used to alternately hold the input and receive the
+ product for mpn_mul. (This scheme is used to fulfill the requirements
+ of mpn_mul; that the product space may not be the same as any of the
+ input operands.) */
+ rp = (mp_ptr) TMP_ALLOC (ralloc * BYTES_PER_MP_LIMB);
+ tp = (mp_ptr) TMP_ALLOC (ralloc * BYTES_PER_MP_LIMB);
+
+ MPN_COPY (rp, bp, bsize);
+ rsize = bsize;
+ count_leading_zeros (cnt, e);
+
+ for (i = BITS_PER_MP_LIMB - cnt - 2; i >= 0; i--)
+ {
+ mpn_mul_n (tp, rp, rp, rsize);
+ rsize = 2 * rsize;
+ rsize -= tp[rsize - 1] == 0;
+ xp = tp; tp = rp; rp = xp;
+
+ if ((e & ((mp_limb_t) 1 << i)) != 0)
+ {
+ rsize = rsize + bsize - (mpn_mul (tp, rp, rsize, bp, bsize) == 0);
+ xp = tp; tp = rp; rp = xp;
+ }
+ }
+
+  /* Now that we know the exact space requirements, reallocate if
+ necessary. */
+ if (r->_mp_alloc < rsize)
+ _mpz_realloc (r, rsize);
+
+ MPN_COPY (r->_mp_d, rp, rsize);
+ r->_mp_size = (e & 1) == 0 || b->_mp_size >= 0 ? rsize : -rsize;
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/powm.c b/rts/gmp/mpz/powm.c
new file mode 100644
index 0000000000..e6af855a71
--- /dev/null
+++ b/rts/gmp/mpz/powm.c
@@ -0,0 +1,364 @@
+/* mpz_powm(res,base,exp,mod) -- Set RES to (base**exp) mod MOD.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation, Inc.
+Contributed by Paul Zimmermann.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+
+
+/* Set c <- (a*b)/R^n mod m; c must have at least 2n allocated limbs.  */
+static void
+#if __STDC__
+mpz_redc (mpz_ptr c, mpz_srcptr a, mpz_srcptr b, mpz_srcptr m, mp_limb_t Nprim)
+#else
+mpz_redc (c, a, b, m, Nprim)
+ mpz_ptr c;
+ mpz_srcptr a;
+ mpz_srcptr b;
+ mpz_srcptr m;
+ mp_limb_t Nprim;
+#endif
+{
+ mp_ptr cp, mp = PTR (m);
+ mp_limb_t cy, cout = 0;
+ mp_limb_t q;
+ size_t j, n = ABSIZ (m);
+
+ ASSERT (ALLOC (c) >= 2 * n);
+
+ mpz_mul (c, a, b);
+ cp = PTR (c);
+ j = ABSIZ (c);
+ MPN_ZERO (cp + j, 2 * n - j);
+ for (j = 0; j < n; j++)
+ {
+ q = cp[0] * Nprim;
+ cy = mpn_addmul_1 (cp, mp, n, q);
+ cout += mpn_add_1 (cp + n, cp + n, n - j, cy);
+ cp++;
+ }
+ cp -= n;
+ if (cout)
+ {
+ cy = cout - mpn_sub_n (cp, cp + n, mp, n);
+ while (cy)
+ cy -= mpn_sub_n (cp, cp, mp, n);
+ }
+ else
+ MPN_COPY (cp, cp + n, n);
+ MPN_NORMALIZE (cp, n);
+ SIZ (c) = SIZ (c) < 0 ? -n : n;
+}
+
+/* average number of calls to redc for an exponent of n bits
+ with the sliding window algorithm of base 2^k: the optimal is
+ obtained for the value of k which minimizes 2^(k-1)+n/(k+1):
+
+ n\k 4 5 6 7 8
+ 128 156* 159 171 200 261
+ 256 309 307* 316 343 403
+ 512 617 607* 610 632 688
+ 1024 1231 1204 1195* 1207 1256
+ 2048 2461 2399 2366 2360* 2396
+ 4096 4918 4787 4707 4665* 4670
+*/
+
+#ifndef BERKELEY_MP
+void
+#if __STDC__
+mpz_powm (mpz_ptr res, mpz_srcptr base, mpz_srcptr e, mpz_srcptr mod)
+#else
+mpz_powm (res, base, e, mod)
+ mpz_ptr res;
+ mpz_srcptr base;
+ mpz_srcptr e;
+ mpz_srcptr mod;
+#endif
+#else /* BERKELEY_MP */
+void
+#if __STDC__
+pow (mpz_srcptr base, mpz_srcptr e, mpz_srcptr mod, mpz_ptr res)
+#else
+pow (base, e, mod, res)
+ mpz_srcptr base;
+ mpz_srcptr e;
+ mpz_srcptr mod;
+ mpz_ptr res;
+#endif
+#endif /* BERKELEY_MP */
+{
+ mp_limb_t invm, *ep, c, mask;
+ mpz_t xx, *g;
+ mp_size_t n, i, K, j, l, k;
+ int sh;
+ int use_redc;
+
+#ifdef POWM_DEBUG
+ mpz_t exp;
+ mpz_init (exp);
+#endif
+
+ n = ABSIZ (mod);
+
+ if (n == 0)
+ DIVIDE_BY_ZERO;
+
+ if (SIZ (e) == 0)
+ {
+ /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0
+ depending on if MOD equals 1. */
+ SIZ(res) = (ABSIZ (mod) == 1 && (PTR(mod))[0] == 1) ? 0 : 1;
+ PTR(res)[0] = 1;
+ return;
+ }
+
+ /* Use REDC instead of usual reduction for sizes < POWM_THRESHOLD.
+ In REDC each modular multiplication costs about 2*n^2 limbs operations,
+ whereas using usual reduction it costs 3*K(n), where K(n) is the cost of a
+ multiplication using Karatsuba, and a division is assumed to cost 2*K(n),
+ for example using Burnikel-Ziegler's algorithm. This gives a theoretical
+ threshold of a*KARATSUBA_SQR_THRESHOLD, with a=(3/2)^(1/(2-ln(3)/ln(2))) ~
+ 2.66. */
+ /* For now, also disable REDC when MOD is even, as the inverse can't
+ handle that. */
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD ((8 * KARATSUBA_SQR_THRESHOLD) / 3)
+#endif
+
+ use_redc = (n < POWM_THRESHOLD && PTR(mod)[0] % 2 != 0);
+ if (use_redc)
+ {
+ /* invm = -1/m mod 2^BITS_PER_MP_LIMB, must have m odd */
+ modlimb_invert (invm, PTR(mod)[0]);
+ invm = -invm;
+ }
+
+ /* determines optimal value of k */
+ l = ABSIZ (e) * BITS_PER_MP_LIMB; /* number of bits of exponent */
+ k = 1;
+ K = 2;
+ while (2 * l > K * (2 + k * (3 + k)))
+ {
+ k++;
+ K *= 2;
+ }
+
+ g = (mpz_t *) (*_mp_allocate_func) (K / 2 * sizeof (mpz_t));
+ /* compute x*R^n where R=2^BITS_PER_MP_LIMB */
+ mpz_init (g[0]);
+ if (use_redc)
+ {
+ mpz_mul_2exp (g[0], base, n * BITS_PER_MP_LIMB);
+ mpz_mod (g[0], g[0], mod);
+ }
+ else
+ mpz_mod (g[0], base, mod);
+
+ /* compute xx^g for odd g < 2^k */
+ mpz_init (xx);
+ if (use_redc)
+ {
+ _mpz_realloc (xx, 2 * n);
+ mpz_redc (xx, g[0], g[0], mod, invm); /* xx = x^2*R^n */
+ }
+ else
+ {
+ mpz_mul (xx, g[0], g[0]);
+ mpz_mod (xx, xx, mod);
+ }
+ for (i = 1; i < K / 2; i++)
+ {
+ mpz_init (g[i]);
+ if (use_redc)
+ {
+ _mpz_realloc (g[i], 2 * n);
+ mpz_redc (g[i], g[i - 1], xx, mod, invm); /* g[i] = x^(2i+1)*R^n */
+ }
+ else
+ {
+ mpz_mul (g[i], g[i - 1], xx);
+ mpz_mod (g[i], g[i], mod);
+ }
+ }
+
+ /* now starts the real stuff */
+ mask = (mp_limb_t) ((1<<k) - 1);
+ ep = PTR (e);
+ i = ABSIZ (e) - 1; /* current index */
+ c = ep[i]; /* current limb */
+ count_leading_zeros (sh, c);
+ sh = BITS_PER_MP_LIMB - sh; /* significant bits in ep[i] */
+ sh -= k; /* index of lower bit of ep[i] to take into account */
+ if (sh < 0)
+ { /* k-sh extra bits are needed */
+ if (i > 0)
+ {
+ i--;
+ c = (c << (-sh)) | (ep[i] >> (BITS_PER_MP_LIMB + sh));
+ sh += BITS_PER_MP_LIMB;
+ }
+ }
+ else
+ c = c >> sh;
+#ifdef POWM_DEBUG
+ printf ("-1/m mod 2^%u = %lu\n", BITS_PER_MP_LIMB, invm);
+ mpz_set_ui (exp, c);
+#endif
+ j=0;
+ while (c % 2 == 0)
+ {
+ j++;
+ c = (c >> 1);
+ }
+ mpz_set (xx, g[c >> 1]);
+ while (j--)
+ {
+ if (use_redc)
+ mpz_redc (xx, xx, xx, mod, invm);
+ else
+ {
+ mpz_mul (xx, xx, xx);
+ mpz_mod (xx, xx, mod);
+ }
+ }
+
+#ifdef POWM_DEBUG
+ printf ("x^"); mpz_out_str (0, 10, exp);
+ printf ("*2^%u mod m = ", n * BITS_PER_MP_LIMB); mpz_out_str (0, 10, xx);
+ putchar ('\n');
+#endif
+
+ while (i > 0 || sh > 0)
+ {
+ c = ep[i];
+ sh -= k;
+ l = k; /* number of bits treated */
+ if (sh < 0)
+ {
+ if (i > 0)
+ {
+ i--;
+ c = (c << (-sh)) | (ep[i] >> (BITS_PER_MP_LIMB + sh));
+ sh += BITS_PER_MP_LIMB;
+ }
+ else
+ {
+ l += sh; /* may be less bits than k here */
+ c = c & ((1<<l) - 1);
+ }
+ }
+ else
+ c = c >> sh;
+ c = c & mask;
+
+ /* this while loop implements the sliding window improvement */
+ while ((c & (1 << (k - 1))) == 0 && (i > 0 || sh > 0))
+ {
+ if (use_redc) mpz_redc (xx, xx, xx, mod, invm);
+ else
+ {
+ mpz_mul (xx, xx, xx);
+ mpz_mod (xx, xx, mod);
+ }
+ if (sh)
+ {
+ sh--;
+ c = (c<<1) + ((ep[i]>>sh) & 1);
+ }
+ else
+ {
+ i--;
+ sh = BITS_PER_MP_LIMB - 1;
+ c = (c<<1) + (ep[i]>>sh);
+ }
+ }
+
+#ifdef POWM_DEBUG
+ printf ("l=%u c=%lu\n", l, c);
+ mpz_mul_2exp (exp, exp, k);
+ mpz_add_ui (exp, exp, c);
+#endif
+
+ /* now replace xx by xx^(2^k)*x^c */
+ if (c != 0)
+ {
+ j = 0;
+ while (c % 2 == 0)
+ {
+ j++;
+ c = c >> 1;
+ }
+ /* c0 = c * 2^j, i.e. xx^(2^k)*x^c = (A^(2^(k - j))*c)^(2^j) */
+ l -= j;
+ while (l--)
+ if (use_redc) mpz_redc (xx, xx, xx, mod, invm);
+ else
+ {
+ mpz_mul (xx, xx, xx);
+ mpz_mod (xx, xx, mod);
+ }
+ if (use_redc)
+ mpz_redc (xx, xx, g[c >> 1], mod, invm);
+ else
+ {
+ mpz_mul (xx, xx, g[c >> 1]);
+ mpz_mod (xx, xx, mod);
+ }
+ }
+ else
+ j = l; /* case c=0 */
+ while (j--)
+ {
+ if (use_redc)
+ mpz_redc (xx, xx, xx, mod, invm);
+ else
+ {
+ mpz_mul (xx, xx, xx);
+ mpz_mod (xx, xx, mod);
+ }
+ }
+#ifdef POWM_DEBUG
+ printf ("x^"); mpz_out_str (0, 10, exp);
+ printf ("*2^%u mod m = ", n * BITS_PER_MP_LIMB); mpz_out_str (0, 10, xx);
+ putchar ('\n');
+#endif
+ }
+
+ /* now convert back xx to xx/R^n */
+ if (use_redc)
+ {
+ mpz_set_ui (g[0], 1);
+ mpz_redc (xx, xx, g[0], mod, invm);
+ if (mpz_cmp (xx, mod) >= 0)
+ mpz_sub (xx, xx, mod);
+ }
+ mpz_set (res, xx);
+
+ mpz_clear (xx);
+ for (i = 0; i < K / 2; i++)
+ mpz_clear (g[i]);
+ (*_mp_free_func) (g, K / 2 * sizeof (mpz_t));
+}
diff --git a/rts/gmp/mpz/powm_ui.c b/rts/gmp/mpz/powm_ui.c
new file mode 100644
index 0000000000..00f70bd563
--- /dev/null
+++ b/rts/gmp/mpz/powm_ui.c
@@ -0,0 +1,248 @@
+/* mpz_powm_ui(res,base,exp,mod) -- Set RES to (base**exp) mod MOD.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+#if __STDC__
+mpz_powm_ui (mpz_ptr res, mpz_srcptr base, unsigned long int exp, mpz_srcptr mod)
+#else
+mpz_powm_ui (res, base, exp, mod)
+ mpz_ptr res;
+ mpz_srcptr base;
+ unsigned long int exp;
+ mpz_srcptr mod;
+#endif
+{
+ mp_ptr rp, mp, bp;
+ mp_size_t msize, bsize, rsize;
+ mp_size_t size;
+ int mod_shift_cnt;
+ int negative_result;
+ mp_limb_t *free_me = NULL;
+ size_t free_me_size;
+ TMP_DECL (marker);
+
+ msize = ABS (mod->_mp_size);
+ size = 2 * msize;
+
+ rp = res->_mp_d;
+
+ if (msize == 0)
+ DIVIDE_BY_ZERO;
+
+ if (exp == 0)
+ {
+ /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0
+ depending on if MOD equals 1. */
+ res->_mp_size = (msize == 1 && (mod->_mp_d)[0] == 1) ? 0 : 1;
+ rp[0] = 1;
+ return;
+ }
+
+ TMP_MARK (marker);
+
+ /* Normalize MOD (i.e. make its most significant bit set) as required by
+ mpn_divmod. This will make the intermediate values in the calculation
+ slightly larger, but the correct result is obtained after a final
+ reduction using the original MOD value. */
+
+ mp = (mp_ptr) TMP_ALLOC (msize * BYTES_PER_MP_LIMB);
+ count_leading_zeros (mod_shift_cnt, mod->_mp_d[msize - 1]);
+ if (mod_shift_cnt != 0)
+ mpn_lshift (mp, mod->_mp_d, msize, mod_shift_cnt);
+ else
+ MPN_COPY (mp, mod->_mp_d, msize);
+
+ bsize = ABS (base->_mp_size);
+ if (bsize > msize)
+ {
+ /* The base is larger than the module. Reduce it. */
+
+ /* Allocate (BSIZE + 1) with space for remainder and quotient.
+ (The quotient is (bsize - msize + 1) limbs.) */
+ bp = (mp_ptr) TMP_ALLOC ((bsize + 1) * BYTES_PER_MP_LIMB);
+ MPN_COPY (bp, base->_mp_d, bsize);
+ /* We don't care about the quotient, store it above the remainder,
+ at BP + MSIZE. */
+ mpn_divmod (bp + msize, bp, bsize, mp, msize);
+ bsize = msize;
+ /* Canonicalize the base, since we are going to multiply with it
+ quite a few times. */
+ MPN_NORMALIZE (bp, bsize);
+ }
+ else
+ bp = base->_mp_d;
+
+ if (bsize == 0)
+ {
+ res->_mp_size = 0;
+ TMP_FREE (marker);
+ return;
+ }
+
+ if (res->_mp_alloc < size)
+ {
+ /* We have to allocate more space for RES. If any of the input
+ parameters are identical to RES, defer deallocation of the old
+ space. */
+
+ if (rp == mp || rp == bp)
+ {
+ free_me = rp;
+ free_me_size = res->_mp_alloc;
+ }
+ else
+ (*_mp_free_func) (rp, res->_mp_alloc * BYTES_PER_MP_LIMB);
+
+ rp = (mp_ptr) (*_mp_allocate_func) (size * BYTES_PER_MP_LIMB);
+ res->_mp_alloc = size;
+ res->_mp_d = rp;
+ }
+ else
+ {
+ /* Make BASE, EXP and MOD not overlap with RES. */
+ if (rp == bp)
+ {
+ /* RES and BASE are identical. Allocate temp. space for BASE. */
+ bp = (mp_ptr) TMP_ALLOC (bsize * BYTES_PER_MP_LIMB);
+ MPN_COPY (bp, rp, bsize);
+ }
+ if (rp == mp)
+ {
+ /* RES and MOD are identical. Allocate temporary space for MOD. */
+ mp = (mp_ptr) TMP_ALLOC (msize * BYTES_PER_MP_LIMB);
+ MPN_COPY (mp, rp, msize);
+ }
+ }
+
+ MPN_COPY (rp, bp, bsize);
+ rsize = bsize;
+
+ {
+ mp_ptr xp = (mp_ptr) TMP_ALLOC (2 * (msize + 1) * BYTES_PER_MP_LIMB);
+ int c;
+ mp_limb_t e;
+ mp_limb_t carry_limb;
+
+ negative_result = (exp & 1) && base->_mp_size < 0;
+
+ e = exp;
+ count_leading_zeros (c, e);
+ e = (e << c) << 1; /* shift the exp bits to the left, lose msb */
+ c = BITS_PER_MP_LIMB - 1 - c;
+
+ /* Main loop.
+
+ Make the result be pointed to alternately by XP and RP. This
+ helps us avoid block copying, which would otherwise be necessary
+ with the overlap restrictions of mpn_divmod. With 50% probability
+ the result after this loop will be in the area originally pointed
+ by RP (==RES->_mp_d), and with 50% probability in the area originally
+ pointed to by XP. */
+
+ while (c != 0)
+ {
+ mp_ptr tp;
+ mp_size_t xsize;
+
+ mpn_mul_n (xp, rp, rp, rsize);
+ xsize = 2 * rsize;
+ xsize -= xp[xsize - 1] == 0;
+ if (xsize > msize)
+ {
+ mpn_divmod (xp + msize, xp, xsize, mp, msize);
+ xsize = msize;
+ }
+
+ tp = rp; rp = xp; xp = tp;
+ rsize = xsize;
+
+ if ((mp_limb_signed_t) e < 0)
+ {
+ mpn_mul (xp, rp, rsize, bp, bsize);
+ xsize = rsize + bsize;
+ xsize -= xp[xsize - 1] == 0;
+ if (xsize > msize)
+ {
+ mpn_divmod (xp + msize, xp, xsize, mp, msize);
+ xsize = msize;
+ }
+
+ tp = rp; rp = xp; xp = tp;
+ rsize = xsize;
+ }
+ e <<= 1;
+ c--;
+ }
+
+ /* We shifted MOD, the modulo reduction argument, left MOD_SHIFT_CNT
+ steps. Adjust the result by reducing it with the original MOD.
+
+ Also make sure the result is put in RES->_mp_d (where it already
+ might be, see above). */
+
+ if (mod_shift_cnt != 0)
+ {
+ carry_limb = mpn_lshift (res->_mp_d, rp, rsize, mod_shift_cnt);
+ rp = res->_mp_d;
+ if (carry_limb != 0)
+ {
+ rp[rsize] = carry_limb;
+ rsize++;
+ }
+ }
+ else
+ {
+ MPN_COPY (res->_mp_d, rp, rsize);
+ rp = res->_mp_d;
+ }
+
+ if (rsize >= msize)
+ {
+ mpn_divmod (rp + msize, rp, rsize, mp, msize);
+ rsize = msize;
+ }
+
+ /* Remove any leading zero words from the result. */
+ if (mod_shift_cnt != 0)
+ mpn_rshift (rp, rp, rsize, mod_shift_cnt);
+ MPN_NORMALIZE (rp, rsize);
+ }
+
+ if (negative_result && rsize != 0)
+ {
+ if (mod_shift_cnt != 0)
+ mpn_rshift (mp, mp, msize, mod_shift_cnt);
+ mpn_sub (rp, mp, msize, rp, rsize);
+ rsize = msize;
+ MPN_NORMALIZE (rp, rsize);
+ }
+ res->_mp_size = rsize;
+
+ if (free_me != NULL)
+ (*_mp_free_func) (free_me, free_me_size * BYTES_PER_MP_LIMB);
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/pprime_p.c b/rts/gmp/mpz/pprime_p.c
new file mode 100644
index 0000000000..82eb678238
--- /dev/null
+++ b/rts/gmp/mpz/pprime_p.c
@@ -0,0 +1,242 @@
+/* mpz_probab_prime_p --
+ An implementation of the probabilistic primality test found in Knuth's
+ Seminumerical Algorithms book. If the function mpz_probab_prime_p()
+ returns 0 then n is not prime. If it returns 1, then n is 'probably'
+ prime. If it returns 2, n is surely prime. The probability of a false
+ positive is (1/4)**reps, where reps is the number of internal passes of the
+ probabilistic algorithm. Knuth indicates that 25 passes are reasonable.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 1999, 2000 Free Software
+Foundation, Inc. Miller-Rabin code contributed by John Amanatides.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+static int isprime _PROTO ((unsigned long int t));
+static int mpz_millerrabin _PROTO ((mpz_srcptr n, int reps));
+
+int
+#if __STDC__
+mpz_probab_prime_p (mpz_srcptr n, int reps)
+#else
+mpz_probab_prime_p (n, reps)
+ mpz_srcptr n;
+ int reps;
+#endif
+{
+ mp_limb_t r;
+
+ /* Handle small and negative n. */
+ if (mpz_cmp_ui (n, 1000000L) <= 0)
+ {
+ int is_prime;
+ if (mpz_sgn (n) < 0)
+ {
+ /* Negative number. Negate and call ourselves. */
+ mpz_t n2;
+ mpz_init (n2);
+ mpz_neg (n2, n);
+ is_prime = mpz_probab_prime_p (n2, reps);
+ mpz_clear (n2);
+ return is_prime;
+ }
+ is_prime = isprime (mpz_get_ui (n));
+ return is_prime ? 2 : 0;
+ }
+
+ /* If n is now even, it is not a prime. */
+ if ((mpz_get_ui (n) & 1) == 0)
+ return 0;
+
+ /* Check if n has small factors. */
+ if (UDIV_TIME > (2 * UMUL_TIME + 6))
+ r = mpn_preinv_mod_1 (PTR(n), SIZ(n), (mp_limb_t) PP, (mp_limb_t) PP_INVERTED);
+ else
+ r = mpn_mod_1 (PTR(n), SIZ(n), (mp_limb_t) PP);
+ if (r % 3 == 0 || r % 5 == 0 || r % 7 == 0 || r % 11 == 0 || r % 13 == 0
+ || r % 17 == 0 || r % 19 == 0 || r % 23 == 0 || r % 29 == 0
+#if BITS_PER_MP_LIMB == 64
+ || r % 31 == 0 || r % 37 == 0 || r % 41 == 0 || r % 43 == 0
+ || r % 47 == 0 || r % 53 == 0
+#endif
+ )
+ {
+ return 0;
+ }
+
+ /* Do more dividing. We collect small primes, using umul_ppmm, until we
+ overflow a single limb. We divide our number by the small primes product,
+ and look for factors in the remainder. */
+ {
+ unsigned long int ln2;
+ unsigned long int q;
+ mp_limb_t p1, p0, p;
+ unsigned int primes[15];
+ int nprimes;
+
+ nprimes = 0;
+ p = 1;
+ ln2 = mpz_sizeinbase (n, 2) / 30; ln2 = ln2 * ln2;
+ for (q = BITS_PER_MP_LIMB == 64 ? 59 : 31; q < ln2; q += 2)
+ {
+ if (isprime (q))
+ {
+ umul_ppmm (p1, p0, p, q);
+ if (p1 != 0)
+ {
+ r = mpn_mod_1 (PTR(n), SIZ(n), p);
+ while (--nprimes >= 0)
+ if (r % primes[nprimes] == 0)
+ {
+ if (mpn_mod_1 (PTR(n), SIZ(n), (mp_limb_t) primes[nprimes]) != 0)
+ abort ();
+ return 0;
+ }
+ p = q;
+ nprimes = 0;
+ }
+ else
+ {
+ p = p0;
+ }
+ primes[nprimes++] = q;
+ }
+ }
+ }
+
+ /* Perform a number of Miller-Rabin tests. */
+ return mpz_millerrabin (n, reps);
+}
+
+static int
+#if __STDC__
+isprime (unsigned long int t)
+#else
+isprime (t)
+ unsigned long int t;
+#endif
+{
+ unsigned long int q, r, d;
+
+ if (t < 3 || (t & 1) == 0)
+ return t == 2;
+
+ for (d = 3, r = 1; r != 0; d += 2)
+ {
+ q = t / d;
+ r = t - q * d;
+ if (q < d)
+ return 1;
+ }
+ return 0;
+}
+
+static int millerrabin _PROTO ((mpz_srcptr n, mpz_srcptr nm1,
+ mpz_ptr x, mpz_ptr y,
+ mpz_srcptr q, unsigned long int k));
+
+static int
+#if __STDC__
+mpz_millerrabin (mpz_srcptr n, int reps)
+#else
+mpz_millerrabin (n, reps)
+ mpz_srcptr n;
+ int reps;
+#endif
+{
+ int r;
+ mpz_t nm1, x, y, q;
+ unsigned long int k;
+ gmp_randstate_t rstate;
+ int is_prime;
+ TMP_DECL (marker);
+ TMP_MARK (marker);
+
+ MPZ_TMP_INIT (nm1, SIZ (n) + 1);
+ mpz_sub_ui (nm1, n, 1L);
+
+ MPZ_TMP_INIT (x, SIZ (n));
+ MPZ_TMP_INIT (y, 2 * SIZ (n)); /* mpz_powm_ui needs excessive memory!!! */
+
+ /* Perform a Fermat test. */
+ mpz_set_ui (x, 210L);
+ mpz_powm (y, x, nm1, n);
+ if (mpz_cmp_ui (y, 1L) != 0)
+ {
+ TMP_FREE (marker);
+ return 0;
+ }
+
+ MPZ_TMP_INIT (q, SIZ (n));
+
+ /* Find q and k, where q is odd and n = 1 + 2**k * q. */
+ k = mpz_scan1 (nm1, 0L);
+ mpz_tdiv_q_2exp (q, nm1, k);
+
+ gmp_randinit (rstate, GMP_RAND_ALG_DEFAULT, 32L);
+
+ is_prime = 1;
+ for (r = 0; r < reps && is_prime; r++)
+ {
+ do
+ mpz_urandomb (x, rstate, mpz_sizeinbase (n, 2) - 1);
+ while (mpz_cmp_ui (x, 1L) <= 0);
+
+ is_prime = millerrabin (n, nm1, x, y, q, k);
+ }
+
+ gmp_randclear (rstate);
+
+ TMP_FREE (marker);
+ return is_prime;
+}
+
+static int
+#if __STDC__
+millerrabin (mpz_srcptr n, mpz_srcptr nm1, mpz_ptr x, mpz_ptr y,
+ mpz_srcptr q, unsigned long int k)
+#else
+millerrabin (n, nm1, x, y, q, k)
+ mpz_srcptr n;
+ mpz_srcptr nm1;
+ mpz_ptr x;
+ mpz_ptr y;
+ mpz_srcptr q;
+ unsigned long int k;
+#endif
+{
+ unsigned long int i;
+
+ mpz_powm (y, x, q, n);
+
+ if (mpz_cmp_ui (y, 1L) == 0 || mpz_cmp (y, nm1) == 0)
+ return 1;
+
+ for (i = 1; i < k; i++)
+ {
+ mpz_powm_ui (y, y, 2L, n);
+ if (mpz_cmp (y, nm1) == 0)
+ return 1;
+ if (mpz_cmp_ui (y, 1L) == 0)
+ return 0;
+ }
+ return 0;
+}
diff --git a/rts/gmp/mpz/random.c b/rts/gmp/mpz/random.c
new file mode 100644
index 0000000000..60d9113991
--- /dev/null
+++ b/rts/gmp/mpz/random.c
@@ -0,0 +1,56 @@
+/* mpz_random -- Generate a random mpz_t of specified size.
+ This function is non-portable and generates poor random numbers.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "urandom.h"
+
+void
+#if __STDC__
+mpz_random (mpz_ptr x, mp_size_t size)
+#else
+mpz_random (x, size)
+ mpz_ptr x;
+ mp_size_t size;
+#endif
+{
+ mp_size_t i;
+ mp_limb_t ran;
+ mp_ptr xp;
+ mp_size_t abs_size;
+
+ abs_size = ABS (size);
+
+ if (x->_mp_alloc < abs_size)
+ _mpz_realloc (x, abs_size);
+
+ xp = x->_mp_d;
+
+ for (i = 0; i < abs_size; i++)
+ {
+ ran = urandom ();
+ xp[i] = ran;
+ }
+
+ MPN_NORMALIZE (xp, abs_size);
+ x->_mp_size = size < 0 ? -abs_size : abs_size;
+}
diff --git a/rts/gmp/mpz/random2.c b/rts/gmp/mpz/random2.c
new file mode 100644
index 0000000000..a90af115e9
--- /dev/null
+++ b/rts/gmp/mpz/random2.c
@@ -0,0 +1,48 @@
+/* mpz_random2 -- Generate a positive random mpz_t of specified size, with
+ long runs of consecutive ones and zeros in the binary representation.
+ Meant for testing of other MP routines.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_random2 (mpz_ptr x, mp_size_t size)
+#else
+mpz_random2 (x, size)
+ mpz_ptr x;
+ mp_size_t size;
+#endif
+{
+ mp_size_t abs_size;
+
+ abs_size = ABS (size);
+ if (abs_size != 0)
+ {
+ if (x->_mp_alloc < abs_size)
+ _mpz_realloc (x, abs_size);
+
+ mpn_random2 (x->_mp_d, abs_size);
+ }
+
+ x->_mp_size = size;
+}
diff --git a/rts/gmp/mpz/realloc.c b/rts/gmp/mpz/realloc.c
new file mode 100644
index 0000000000..0b9e447ec3
--- /dev/null
+++ b/rts/gmp/mpz/realloc.c
@@ -0,0 +1,52 @@
+/* _mpz_realloc -- make the mpz_t have NEW_SIZE digits allocated.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void *
+#if __STDC__
+_mpz_realloc (mpz_ptr m, mp_size_t new_size)
+#else
+_mpz_realloc (m, new_size)
+ mpz_ptr m;
+ mp_size_t new_size;
+#endif
+{
+ /* Never allocate zero space. */
+ if (new_size == 0)
+ new_size = 1;
+
+ m->_mp_d = (mp_ptr) (*_mp_reallocate_func) (m->_mp_d,
+ m->_mp_alloc * BYTES_PER_MP_LIMB,
+ new_size * BYTES_PER_MP_LIMB);
+ m->_mp_alloc = new_size;
+
+#if 0
+ /* This might break some code that reads the size field after
+ reallocation, in the case the reallocated destination and a
+ source argument are identical. */
+ if (ABS (m->_mp_size) > new_size)
+ m->_mp_size = 0;
+#endif
+
+ return (void *) m->_mp_d;
+}
diff --git a/rts/gmp/mpz/remove.c b/rts/gmp/mpz/remove.c
new file mode 100644
index 0000000000..bc6675f972
--- /dev/null
+++ b/rts/gmp/mpz/remove.c
@@ -0,0 +1,93 @@
+/* mpz_remove -- divide out a factor and return its multiplicity.
+
+Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_remove (mpz_ptr dest, mpz_srcptr src, mpz_srcptr f)
+#else
+mpz_remove (dest, src, f)
+ mpz_ptr dest;
+ mpz_srcptr src;
+ mpz_srcptr f;
+#endif
+{
+ mpz_t fpow[40]; /* inexhaustible...until year 2020 or so */
+ mpz_t x, rem;
+ unsigned long int pwr;
+ int p;
+
+ if (mpz_cmp_ui (f, 1) <= 0 || mpz_sgn (src) == 0)
+ DIVIDE_BY_ZERO;
+ if (mpz_cmp_ui (f, 2) == 0)
+ {
+ unsigned long int s0;
+ s0 = mpz_scan1 (src, 0);
+ mpz_div_2exp (dest, src, s0);
+ return s0;
+ }
+
+ /* We could perhaps compute mpz_scan1(src,0)/mpz_scan1(f,0). It is an
+ upper bound of the result we're seeking. We could also shift down the
+ operands so that they become odd, to make intermediate values smaller. */
+
+ mpz_init (rem);
+ mpz_init (x);
+
+ pwr = 0;
+ mpz_init (fpow[0]);
+ mpz_set (fpow[0], f);
+ mpz_set (dest, src);
+
+ /* Divide by f, f^2, ..., f^(2^k) until we get a remainder for f^(2^k). */
+ for (p = 0;; p++)
+ {
+ mpz_tdiv_qr (x, rem, dest, fpow[p]);
+ if (SIZ (rem) != 0)
+ break;
+ mpz_init (fpow[p + 1]);
+ mpz_mul (fpow[p + 1], fpow[p], fpow[p]);
+ mpz_set (dest, x);
+ }
+
+ pwr = (1 << p) - 1;
+
+ mpz_clear (fpow[p]);
+
+ /* Divide by f^(2^(k-1)), f^(2^(k-2)), ..., f for all divisors that give a
+ zero remainder. */
+ while (--p >= 0)
+ {
+ mpz_tdiv_qr (x, rem, dest, fpow[p]);
+ if (SIZ (rem) == 0)
+ {
+ pwr += 1 << p;
+ mpz_set (dest, x);
+ }
+ mpz_clear (fpow[p]);
+ }
+
+ mpz_clear (x);
+ mpz_clear (rem);
+ return pwr;
+}
diff --git a/rts/gmp/mpz/root.c b/rts/gmp/mpz/root.c
new file mode 100644
index 0000000000..0920bf22d3
--- /dev/null
+++ b/rts/gmp/mpz/root.c
@@ -0,0 +1,183 @@
+/* mpz_root(root, u, nth) -- Set ROOT to floor(U^(1/nth)).
+ Return an indication if the result is exact.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+/* Naive implementation of nth root extraction. It would probably be a
+ better idea to use a division-free Newton iteration. It is insane
+ to use full precision from iteration 1. The mpz_scan1 trick compensates
+ to some extent. It would be natural to avoid representing the low zero
+ bits mpz_scan1 is counting, and at the same time call mpn directly. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+int
+#if __STDC__
+mpz_root (mpz_ptr r, mpz_srcptr c, unsigned long int nth)
+#else
+mpz_root (r, c, nth)
+ mpz_ptr r;
+ mpz_srcptr c;
+ unsigned long int nth;
+#endif
+{
+ mpz_t x, t0, t1, t2;
+ __mpz_struct ccs, *cc = &ccs;
+ unsigned long int nbits;
+ int bit;
+ int exact;
+ int i;
+ unsigned long int lowz;
+ unsigned long int rl;
+
+ /* even roots of negatives provoke an exception */
+ if (mpz_sgn (c) < 0 && (nth & 1) == 0)
+ SQRT_OF_NEGATIVE;
+
+ /* root extraction interpreted as c^(1/nth) means a zeroth root should
+ provoke a divide by zero, do this even if c==0 */
+ if (nth == 0)
+ DIVIDE_BY_ZERO;
+
+ if (mpz_sgn (c) == 0)
+ {
+ if (r != NULL)
+ mpz_set_ui (r, 0);
+ return 1; /* exact result */
+ }
+
+ PTR(cc) = PTR(c);
+ SIZ(cc) = ABSIZ(c);
+
+ nbits = (mpz_sizeinbase (cc, 2) - 1) / nth;
+ if (nbits == 0)
+ {
+ if (r != NULL)
+ mpz_set_ui (r, 1);
+ if (mpz_sgn (c) < 0)
+ {
+ if (r != NULL)
+ SIZ(r) = -SIZ(r);
+ return mpz_cmp_si (c, -1L) == 0;
+ }
+ return mpz_cmp_ui (c, 1L) == 0;
+ }
+
+ mpz_init (x);
+ mpz_init (t0);
+ mpz_init (t1);
+ mpz_init (t2);
+
+ /* Create a one-bit approximation. */
+ mpz_set_ui (x, 0);
+ mpz_setbit (x, nbits);
+
+ /* Make the approximation better, one bit at a time. This odd-looking
+ termination criteria makes large nth get better initial approximation,
+ which avoids slow convergence for such values. */
+ bit = nbits - 1;
+ for (i = 1; (nth >> i) != 0; i++)
+ {
+ mpz_setbit (x, bit);
+ mpz_tdiv_q_2exp (t0, x, bit);
+ mpz_pow_ui (t1, t0, nth);
+ mpz_mul_2exp (t1, t1, bit * nth);
+ if (mpz_cmp (cc, t1) < 0)
+ mpz_clrbit (x, bit);
+
+ bit--; /* check/set next bit */
+ if (bit < 0)
+ {
+ /* We're done. */
+ mpz_pow_ui (t1, x, nth);
+ goto done;
+ }
+ }
+ mpz_setbit (x, bit);
+ mpz_set_ui (t2, 0); mpz_setbit (t2, bit); mpz_add (x, x, t2);
+
+#if DEBUG
+ /* Check that the starting approximation is >= than the root. */
+ mpz_pow_ui (t1, x, nth);
+ if (mpz_cmp (cc, t1) >= 0)
+ abort ();
+#endif
+
+ mpz_add_ui (x, x, 1);
+
+ /* Main loop */
+ do
+ {
+ lowz = mpz_scan1 (x, 0);
+ mpz_tdiv_q_2exp (t0, x, lowz);
+ mpz_pow_ui (t1, t0, nth - 1);
+ mpz_mul_2exp (t1, t1, lowz * (nth - 1));
+ mpz_tdiv_q (t2, cc, t1);
+ mpz_sub (t2, x, t2);
+ rl = mpz_tdiv_q_ui (t2, t2, nth);
+ mpz_sub (x, x, t2);
+ }
+ while (mpz_sgn (t2) != 0);
+
+ /* If we got a non-zero remainder in the last division, we know our root
+ is too large. */
+ mpz_sub_ui (x, x, (mp_limb_t) (rl != 0));
+
+ /* Adjustment loop. If we spend more care on rounding in the loop above,
+ we could probably get rid of this, or greatly simplify it. */
+ {
+ int bad = 0;
+ lowz = mpz_scan1 (x, 0);
+ mpz_tdiv_q_2exp (t0, x, lowz);
+ mpz_pow_ui (t1, t0, nth);
+ mpz_mul_2exp (t1, t1, lowz * nth);
+ while (mpz_cmp (cc, t1) < 0)
+ {
+ bad++;
+ if (bad > 2)
+ abort (); /* abort if our root is far off */
+ mpz_sub_ui (x, x, 1);
+ lowz = mpz_scan1 (x, 0);
+ mpz_tdiv_q_2exp (t0, x, lowz);
+ mpz_pow_ui (t1, t0, nth);
+ mpz_mul_2exp (t1, t1, lowz * nth);
+ }
+ }
+
+ done:
+ exact = mpz_cmp (t1, cc) == 0;
+
+ if (r != NULL)
+ {
+ mpz_set (r, x);
+ if (mpz_sgn (c) < 0)
+ SIZ(r) = -SIZ(r);
+ }
+
+ mpz_clear (t2);
+ mpz_clear (t1);
+ mpz_clear (t0);
+ mpz_clear (x);
+
+ return exact;
+}
diff --git a/rts/gmp/mpz/rrandomb.c b/rts/gmp/mpz/rrandomb.c
new file mode 100644
index 0000000000..7d78243674
--- /dev/null
+++ b/rts/gmp/mpz/rrandomb.c
@@ -0,0 +1,117 @@
+/* mpz_rrandomb -- Generate a positive random mpz_t of specified bit size, with
+ long runs of consecutive ones and zeros in the binary representation.
+ Meant for testing of other MP routines.
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+static void gmp_rrandomb _PROTO ((mp_ptr rp, gmp_randstate_t rstate, unsigned long int nbits));
+
+void
+#if __STDC__
+mpz_rrandomb (mpz_ptr x, gmp_randstate_t rstate, unsigned long int nbits)
+#else
+mpz_rrandomb (x, rstate, nbits)
+ mpz_ptr x;
+ gmp_randstate_t rstate;
+ unsigned long int nbits;
+#endif
+{
+ mp_size_t nl = 0;
+
+ if (nbits != 0)
+ {
+ mp_ptr xp;
+ nl = (nbits + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB;
+ if (x->_mp_alloc < nl)
+ _mpz_realloc (x, nl);
+
+ xp = PTR(x);
+ gmp_rrandomb (xp, rstate, nbits);
+ MPN_NORMALIZE (xp, nl);
+ }
+
+ SIZ(x) = nl;
+}
+
+#define BITS_PER_CHUNK 4
+
+static void
+#if __STDC__
+gmp_rrandomb (mp_ptr rp, gmp_randstate_t rstate, unsigned long int nbits)
+#else
+gmp_rrandomb (rp, rstate, nbits)
+ mp_ptr rp;
+ gmp_randstate_t rstate;
+ unsigned long int nbits;
+#endif
+{
+ int nb;
+ int bit_pos;
+ mp_size_t limb_pos;
+ mp_limb_t ran, ranm;
+ mp_limb_t acc;
+ mp_size_t n;
+
+ bit_pos = nbits % BITS_PER_MP_LIMB;
+ limb_pos = nbits / BITS_PER_MP_LIMB;
+ if (bit_pos == 0)
+ {
+ bit_pos = BITS_PER_MP_LIMB;
+ limb_pos--;
+ }
+
+ acc = 0;
+ while (limb_pos >= 0)
+ {
+ _gmp_rand (&ranm, rstate, BITS_PER_CHUNK + 1);
+ ran = ranm;
+ nb = (ran >> 1) + 1;
+ if ((ran & 1) != 0)
+ {
+ /* Generate a string of ones. */
+ if (nb > bit_pos)
+ {
+ rp[limb_pos--] = acc | ((((mp_limb_t) 1) << bit_pos) - 1);
+ bit_pos += BITS_PER_MP_LIMB;
+ bit_pos -= nb;
+ acc = (~(mp_limb_t) 0) << bit_pos;
+ }
+ else
+ {
+ bit_pos -= nb;
+ acc |= ((((mp_limb_t) 1) << nb) - 1) << bit_pos;
+ }
+ }
+ else
+ {
+ /* Generate a string of zeroes. */
+ if (nb > bit_pos)
+ {
+ rp[limb_pos--] = acc;
+ acc = 0;
+ bit_pos += BITS_PER_MP_LIMB;
+ }
+ bit_pos -= nb;
+ }
+ }
+}
diff --git a/rts/gmp/mpz/scan0.c b/rts/gmp/mpz/scan0.c
new file mode 100644
index 0000000000..6c59cf8939
--- /dev/null
+++ b/rts/gmp/mpz/scan0.c
@@ -0,0 +1,35 @@
+/* mpz_scan0(op, startbit) -- Scan for the next set bit, starting at startbit.
+
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_scan0 (mpz_srcptr u, unsigned long int starting_bit)
+#else
+mpz_scan0 (u, starting_bit)
+ mpz_srcptr u;
+ unsigned long int starting_bit;
+#endif
+{
+ return mpn_scan0 (u->_mp_d, starting_bit);
+}
diff --git a/rts/gmp/mpz/scan1.c b/rts/gmp/mpz/scan1.c
new file mode 100644
index 0000000000..3b84e3420c
--- /dev/null
+++ b/rts/gmp/mpz/scan1.c
@@ -0,0 +1,35 @@
+/* mpz_scan1(op, startbit) -- Scan for the next set bit, starting at startbit.
+
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_scan1 (mpz_srcptr u, unsigned long int starting_bit)
+#else
+mpz_scan1 (u, starting_bit)
+ mpz_srcptr u;
+ unsigned long int starting_bit;
+#endif
+{
+ return mpn_scan1 (u->_mp_d, starting_bit);
+}
diff --git a/rts/gmp/mpz/set.c b/rts/gmp/mpz/set.c
new file mode 100644
index 0000000000..06b2eef511
--- /dev/null
+++ b/rts/gmp/mpz/set.c
@@ -0,0 +1,48 @@
+/* mpz_set (dest_integer, src_integer) -- Assign DEST_INTEGER from SRC_INTEGER.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_set (mpz_ptr w, mpz_srcptr u)
+#else
+mpz_set (w, u)
+ mpz_ptr w;
+ mpz_srcptr u;
+#endif
+{
+ mp_ptr wp, up;
+ mp_size_t usize, size;
+
+ usize = u->_mp_size;
+ size = ABS (usize);
+
+ if (w->_mp_alloc < size)
+ _mpz_realloc (w, size);
+
+ wp = w->_mp_d;
+ up = u->_mp_d;
+
+ MPN_COPY (wp, up, size);
+ w->_mp_size = usize;
+}
diff --git a/rts/gmp/mpz/set_d.c b/rts/gmp/mpz/set_d.c
new file mode 100644
index 0000000000..e90ed9bc2f
--- /dev/null
+++ b/rts/gmp/mpz/set_d.c
@@ -0,0 +1,96 @@
+/* mpz_set_d(integer, val) -- Assign INTEGER with a double value VAL.
+
+Copyright (C) 1995, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_set_d (mpz_ptr r, double d)
+#else
+mpz_set_d (r, d)
+ mpz_ptr r;
+ double d;
+#endif
+{
+ int negative;
+ mp_limb_t tp[3];
+ mp_ptr rp;
+ mp_size_t rn;
+
+ negative = d < 0;
+ d = ABS (d);
+
+ /* Handle small arguments quickly. */
+ if (d < MP_BASE_AS_DOUBLE)
+ {
+ mp_limb_t tmp;
+ tmp = d;
+ PTR(r)[0] = tmp;
+ SIZ(r) = negative ? -(tmp != 0) : (tmp != 0);
+ return;
+ }
+
+ rn = __gmp_extract_double (tp, d);
+
+ if (ALLOC(r) < rn)
+ _mpz_realloc (r, rn);
+
+ rp = PTR (r);
+
+#if BITS_PER_MP_LIMB == 32
+ switch (rn)
+ {
+ default:
+ MPN_ZERO (rp, rn - 3);
+ rp += rn - 3;
+ /* fall through */
+ case 3:
+ rp[2] = tp[2];
+ rp[1] = tp[1];
+ rp[0] = tp[0];
+ break;
+ case 2:
+ rp[1] = tp[2];
+ rp[0] = tp[1];
+ break;
+ case 1:
+ /* handled in "small aguments" case above */
+ abort ();
+ }
+#else
+ switch (rn)
+ {
+ default:
+ MPN_ZERO (rp, rn - 2);
+ rp += rn - 2;
+ /* fall through */
+ case 2:
+ rp[1] = tp[1], rp[0] = tp[0];
+ break;
+ case 1:
+ /* handled in "small aguments" case above */
+ abort ();
+ }
+#endif
+
+ SIZ(r) = negative ? -rn : rn;
+}
diff --git a/rts/gmp/mpz/set_f.c b/rts/gmp/mpz/set_f.c
new file mode 100644
index 0000000000..2273953dfd
--- /dev/null
+++ b/rts/gmp/mpz/set_f.c
@@ -0,0 +1,64 @@
+/* mpz_set_f (dest_integer, src_float) -- Assign DEST_INTEGER from SRC_FLOAT.
+
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_set_f (mpz_ptr w, mpf_srcptr u)
+#else
+mpz_set_f (w, u)
+ mpz_ptr w;
+ mpf_srcptr u;
+#endif
+{
+ mp_ptr wp, up;
+ mp_size_t usize, size;
+ mp_exp_t exp;
+
+ usize = SIZ (u);
+ size = ABS (usize);
+ exp = EXP (u);
+
+ if (w->_mp_alloc < exp)
+ _mpz_realloc (w, exp);
+
+ wp = w->_mp_d;
+ up = u->_mp_d;
+
+ if (exp <= 0)
+ {
+ SIZ (w) = 0;
+ return;
+ }
+ if (exp < size)
+ {
+ MPN_COPY (wp, up + size - exp, exp);
+ }
+ else
+ {
+ MPN_ZERO (wp, exp - size);
+ MPN_COPY (wp + exp - size, up, size);
+ }
+
+ w->_mp_size = usize >= 0 ? exp : -exp;
+}
diff --git a/rts/gmp/mpz/set_q.c b/rts/gmp/mpz/set_q.c
new file mode 100644
index 0000000000..72d3222a80
--- /dev/null
+++ b/rts/gmp/mpz/set_q.c
@@ -0,0 +1,36 @@
+/* mpz_set_q (dest_integer, src_rational) -- Assign DEST_INTEGER from
+ SRC_rational.
+
+Copyright (C) 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_set_q (mpz_ptr w, mpq_srcptr u)
+#else
+mpz_set_q (w, u)
+ mpz_ptr w;
+ mpq_srcptr u;
+#endif
+{
+ mpz_tdiv_q (w, mpq_numref (u), mpq_denref (u));
+}
diff --git a/rts/gmp/mpz/set_si.c b/rts/gmp/mpz/set_si.c
new file mode 100644
index 0000000000..9ba2fbaf30
--- /dev/null
+++ b/rts/gmp/mpz/set_si.c
@@ -0,0 +1,48 @@
+/* mpz_set_si(integer, val) -- Assign INTEGER with a small value VAL.
+
+Copyright (C) 1991, 1993, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_set_si (mpz_ptr dest, signed long int val)
+#else
+mpz_set_si (dest, val)
+ mpz_ptr dest;
+ signed long int val;
+#endif
+{
+ /* We don't check if the allocation is enough, since the rest of the
+ package ensures it's at least 1, which is what we need here. */
+ if (val > 0)
+ {
+ dest->_mp_d[0] = val;
+ dest->_mp_size = 1;
+ }
+ else if (val < 0)
+ {
+ dest->_mp_d[0] = (unsigned long) -val;
+ dest->_mp_size = -1;
+ }
+ else
+ dest->_mp_size = 0;
+}
diff --git a/rts/gmp/mpz/set_str.c b/rts/gmp/mpz/set_str.c
new file mode 100644
index 0000000000..3ab79c0e89
--- /dev/null
+++ b/rts/gmp/mpz/set_str.c
@@ -0,0 +1,157 @@
+/* mpz_set_str(mp_dest, string, base) -- Convert the \0-terminated
+ string STRING in base BASE to multiple precision integer in
+ MP_DEST. Allow white space in the string. If BASE == 0 determine
+ the base in the C standard way, i.e. 0xhh...h means base 16,
+ 0oo...o means base 8, otherwise assume base 10.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998, 2000 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <string.h>
+#include <ctype.h>
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+static int
+#if __STDC__
+digit_value_in_base (int c, int base)
+#else
+digit_value_in_base (c, base)
+ int c;
+ int base;
+#endif
+{
+ int digit;
+
+ if (isdigit (c))
+ digit = c - '0';
+ else if (islower (c))
+ digit = c - 'a' + 10;
+ else if (isupper (c))
+ digit = c - 'A' + 10;
+ else
+ return -1;
+
+ if (digit < base)
+ return digit;
+ return -1;
+}
+
+int
+#if __STDC__
+mpz_set_str (mpz_ptr x, const char *str, int base)
+#else
+mpz_set_str (x, str, base)
+ mpz_ptr x;
+ const char *str;
+ int base;
+#endif
+{
+ size_t str_size;
+ char *s, *begs;
+ size_t i;
+ mp_size_t xsize;
+ int c;
+ int negative;
+ TMP_DECL (marker);
+
+ /* Skip whitespace. */
+ do
+ c = *str++;
+ while (isspace (c));
+
+ negative = 0;
+ if (c == '-')
+ {
+ negative = 1;
+ c = *str++;
+ }
+
+ if (digit_value_in_base (c, base == 0 ? 10 : base) < 0)
+ return -1; /* error if no digits */
+
+ /* If BASE is 0, try to find out the base by looking at the initial
+ characters. */
+ if (base == 0)
+ {
+ base = 10;
+ if (c == '0')
+ {
+ base = 8;
+ c = *str++;
+ if (c == 'x' || c == 'X')
+ {
+ base = 16;
+ c = *str++;
+ }
+ else if (c == 'b' || c == 'B')
+ {
+ base = 2;
+ c = *str++;
+ }
+ }
+ }
+
+ /* Skip leading zeros. */
+ while (c == '0')
+ c = *str++;
+ /* Make sure the string does not become empty, mpn_set_str would fail. */
+ if (c == 0)
+ {
+ x->_mp_size = 0;
+ return 0;
+ }
+
+ TMP_MARK (marker);
+ str_size = strlen (str - 1);
+ s = begs = (char *) TMP_ALLOC (str_size + 1);
+
+ /* Remove spaces from the string and convert the result from ASCII to a
+ byte array. */
+ for (i = 0; i < str_size; i++)
+ {
+ if (!isspace (c))
+ {
+ int dig = digit_value_in_base (c, base);
+ if (dig < 0)
+ {
+ TMP_FREE (marker);
+ return -1;
+ }
+ *s++ = dig;
+ }
+ c = *str++;
+ }
+
+ str_size = s - begs;
+
+ xsize = (((mp_size_t) (str_size / __mp_bases[base].chars_per_bit_exactly))
+ / BITS_PER_MP_LIMB + 2);
+ if (x->_mp_alloc < xsize)
+ _mpz_realloc (x, xsize);
+
+ /* Convert the byte array in base BASE to our bignum format. */
+ xsize = mpn_set_str (x->_mp_d, (unsigned char *) begs, str_size, base);
+ x->_mp_size = negative ? -xsize : xsize;
+
+ TMP_FREE (marker);
+ return 0;
+}
diff --git a/rts/gmp/mpz/set_ui.c b/rts/gmp/mpz/set_ui.c
new file mode 100644
index 0000000000..d6097c170a
--- /dev/null
+++ b/rts/gmp/mpz/set_ui.c
@@ -0,0 +1,43 @@
+/* mpz_set_ui(integer, val) -- Assign INTEGER with a small value VAL.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_set_ui (mpz_ptr dest, unsigned long int val)
+#else
+mpz_set_ui (dest, val)
+ mpz_ptr dest;
+ unsigned long int val;
+#endif
+{
+ /* We don't check if the allocation is enough, since the rest of the
+ package ensures it's at least 1, which is what we need here. */
+ if (val > 0)
+ {
+ dest->_mp_d[0] = val;
+ dest->_mp_size = 1;
+ }
+ else
+ dest->_mp_size = 0;
+}
diff --git a/rts/gmp/mpz/setbit.c b/rts/gmp/mpz/setbit.c
new file mode 100644
index 0000000000..d4249a434e
--- /dev/null
+++ b/rts/gmp/mpz/setbit.c
@@ -0,0 +1,119 @@
+/* mpz_setbit -- set a specified bit.
+
+Copyright (C) 1991, 1993, 1994, 1995, 1997, 1999 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_setbit (mpz_ptr d, unsigned long int bit_index)
+#else
+mpz_setbit (d, bit_index)
+ mpz_ptr d;
+ unsigned long int bit_index;
+#endif
+{
+ mp_size_t dsize = d->_mp_size;
+ mp_ptr dp = d->_mp_d;
+ mp_size_t limb_index;
+
+ limb_index = bit_index / BITS_PER_MP_LIMB;
+ if (dsize >= 0)
+ {
+ if (limb_index < dsize)
+ {
+ dp[limb_index] |= (mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB);
+ d->_mp_size = dsize;
+ }
+ else
+ {
+ /* Ugh. The bit should be set outside of the end of the
+ number. We have to increase the size of the number. */
+ if (d->_mp_alloc < limb_index + 1)
+ {
+ _mpz_realloc (d, limb_index + 1);
+ dp = d->_mp_d;
+ }
+ MPN_ZERO (dp + dsize, limb_index - dsize);
+ dp[limb_index] = (mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB);
+ d->_mp_size = limb_index + 1;
+ }
+ }
+ else
+ {
+ mp_size_t zero_bound;
+
+ /* Simulate two's complement arithmetic, i.e. simulate
+ 1. Set OP = ~(OP - 1) [with infinitely many leading ones].
+ 2. Set the bit.
+ 3. Set OP = ~OP + 1. */
+
+ dsize = -dsize;
+
+ /* No upper bound on this loop, we're sure there's a non-zero limb
+     sooner or later. */
+ for (zero_bound = 0; ; zero_bound++)
+ if (dp[zero_bound] != 0)
+ break;
+
+ if (limb_index > zero_bound)
+ {
+ if (limb_index < dsize)
+ dp[limb_index] &= ~((mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB));
+ else
+ ;
+ }
+ else if (limb_index == zero_bound)
+ {
+ dp[limb_index] = ((dp[limb_index] - 1)
+ & ~((mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB))) + 1;
+ if (dp[limb_index] == 0)
+ {
+ mp_size_t i;
+ for (i = limb_index + 1; i < dsize; i++)
+ {
+ dp[i] += 1;
+ if (dp[i] != 0)
+ goto fin;
+ }
+ /* We got carry all way out beyond the end of D. Increase
+ its size (and allocation if necessary). */
+ dsize++;
+ if (d->_mp_alloc < dsize)
+ {
+ _mpz_realloc (d, dsize);
+ dp = d->_mp_d;
+ }
+ dp[i] = 1;
+ d->_mp_size = -dsize;
+ fin:;
+ }
+ }
+ else
+ {
+ mpn_decr_u (dp + limb_index,
+ (mp_limb_t) 1 << (bit_index % BITS_PER_MP_LIMB));
+ dsize -= dp[dsize - 1] == 0;
+ d->_mp_size = -dsize;
+ }
+ }
+}
diff --git a/rts/gmp/mpz/size.c b/rts/gmp/mpz/size.c
new file mode 100644
index 0000000000..6574756783
--- /dev/null
+++ b/rts/gmp/mpz/size.c
@@ -0,0 +1,35 @@
+/* mpz_size(x) -- return the number of limbs currently used by the
+ value of integer X.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+size_t
+#if __STDC__
+mpz_size (mpz_srcptr x)
+#else
+mpz_size (x)
+ mpz_srcptr x;
+#endif
+{
+ return ABS (x->_mp_size);
+}
diff --git a/rts/gmp/mpz/sizeinbase.c b/rts/gmp/mpz/sizeinbase.c
new file mode 100644
index 0000000000..734f9c4532
--- /dev/null
+++ b/rts/gmp/mpz/sizeinbase.c
@@ -0,0 +1,60 @@
+/* mpz_sizeinbase(x, base) -- return an approximation to the number of
+   characters the integer X would have printed in base BASE. The
+ approximation is never too small.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+size_t
+#if __STDC__
+mpz_sizeinbase (mpz_srcptr x, int base)
+#else
+mpz_sizeinbase (x, base)
+ mpz_srcptr x;
+ int base;
+#endif
+{
+ mp_size_t size = ABS (x->_mp_size);
+ int lb_base, cnt;
+ size_t totbits;
+
+ /* Special case for X == 0. */
+ if (size == 0)
+ return 1;
+
+ /* Calculate the total number of significant bits of X. */
+ count_leading_zeros (cnt, x->_mp_d[size - 1]);
+ totbits = size * BITS_PER_MP_LIMB - cnt;
+
+ if ((base & (base - 1)) == 0)
+ {
+ /* Special case for powers of 2, giving exact result. */
+
+ count_leading_zeros (lb_base, base);
+ lb_base = BITS_PER_MP_LIMB - lb_base - 1;
+
+ return (totbits + lb_base - 1) / lb_base;
+ }
+ else
+ return (size_t) (totbits * __mp_bases[base].chars_per_bit_exactly) + 1;
+}
diff --git a/rts/gmp/mpz/sqrt.c b/rts/gmp/mpz/sqrt.c
new file mode 100644
index 0000000000..fe82fe407a
--- /dev/null
+++ b/rts/gmp/mpz/sqrt.c
@@ -0,0 +1,86 @@
+/* mpz_sqrt(root, u) -- Set ROOT to floor(sqrt(U)).
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_sqrt (mpz_ptr root, mpz_srcptr op)
+#else
+mpz_sqrt (root, op)
+ mpz_ptr root;
+ mpz_srcptr op;
+#endif
+{
+ mp_size_t op_size, root_size;
+ mp_ptr root_ptr, op_ptr;
+ mp_ptr free_me = NULL;
+ mp_size_t free_me_size;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ op_size = op->_mp_size;
+ if (op_size < 0)
+ SQRT_OF_NEGATIVE;
+
+ /* The size of the root is accurate after this simple calculation. */
+ root_size = (op_size + 1) / 2;
+
+ root_ptr = root->_mp_d;
+ op_ptr = op->_mp_d;
+
+ if (root->_mp_alloc < root_size)
+ {
+ if (root_ptr == op_ptr)
+ {
+ free_me = root_ptr;
+ free_me_size = root->_mp_alloc;
+ }
+ else
+ (*_mp_free_func) (root_ptr, root->_mp_alloc * BYTES_PER_MP_LIMB);
+
+ root->_mp_alloc = root_size;
+ root_ptr = (mp_ptr) (*_mp_allocate_func) (root_size * BYTES_PER_MP_LIMB);
+ root->_mp_d = root_ptr;
+ }
+ else
+ {
+ /* Make OP not overlap with ROOT. */
+ if (root_ptr == op_ptr)
+ {
+ /* ROOT and OP are identical. Allocate temporary space for OP. */
+ op_ptr = (mp_ptr) TMP_ALLOC (op_size * BYTES_PER_MP_LIMB);
+ /* Copy to the temporary space. Hack: Avoid temporary variable
+ by using ROOT_PTR. */
+ MPN_COPY (op_ptr, root_ptr, op_size);
+ }
+ }
+
+ mpn_sqrtrem (root_ptr, NULL, op_ptr, op_size);
+
+ root->_mp_size = root_size;
+
+ if (free_me != NULL)
+ (*_mp_free_func) (free_me, free_me_size * BYTES_PER_MP_LIMB);
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/sqrtrem.c b/rts/gmp/mpz/sqrtrem.c
new file mode 100644
index 0000000000..99a6453122
--- /dev/null
+++ b/rts/gmp/mpz/sqrtrem.c
@@ -0,0 +1,111 @@
+/* mpz_sqrtrem(root,rem,x) -- Set ROOT to floor(sqrt(X)) and REM
+ to the remainder, i.e. X - ROOT**2.
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+
+#ifndef BERKELEY_MP
+void
+#if __STDC__
+mpz_sqrtrem (mpz_ptr root, mpz_ptr rem, mpz_srcptr op)
+#else
+mpz_sqrtrem (root, rem, op)
+ mpz_ptr root;
+ mpz_ptr rem;
+ mpz_srcptr op;
+#endif
+#else /* BERKELEY_MP */
+void
+#if __STDC__
+msqrt (mpz_srcptr op, mpz_ptr root, mpz_ptr rem)
+#else
+msqrt (op, root, rem)
+ mpz_srcptr op;
+ mpz_ptr root;
+ mpz_ptr rem;
+#endif
+#endif /* BERKELEY_MP */
+{
+ mp_size_t op_size, root_size, rem_size;
+ mp_ptr root_ptr, op_ptr;
+ mp_ptr free_me = NULL;
+ mp_size_t free_me_size;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ op_size = op->_mp_size;
+ if (op_size < 0)
+ SQRT_OF_NEGATIVE;
+
+ if (rem->_mp_alloc < op_size)
+ _mpz_realloc (rem, op_size);
+
+ /* The size of the root is accurate after this simple calculation. */
+ root_size = (op_size + 1) / 2;
+
+ root_ptr = root->_mp_d;
+ op_ptr = op->_mp_d;
+
+ if (root->_mp_alloc < root_size)
+ {
+ if (root_ptr == op_ptr)
+ {
+ free_me = root_ptr;
+ free_me_size = root->_mp_alloc;
+ }
+ else
+ (*_mp_free_func) (root_ptr, root->_mp_alloc * BYTES_PER_MP_LIMB);
+
+ root->_mp_alloc = root_size;
+ root_ptr = (mp_ptr) (*_mp_allocate_func) (root_size * BYTES_PER_MP_LIMB);
+ root->_mp_d = root_ptr;
+ }
+ else
+ {
+ /* Make OP not overlap with ROOT. */
+ if (root_ptr == op_ptr)
+ {
+ /* ROOT and OP are identical. Allocate temporary space for OP. */
+ op_ptr = (mp_ptr) TMP_ALLOC (op_size * BYTES_PER_MP_LIMB);
+ /* Copy to the temporary space. Hack: Avoid temporary variable
+ by using ROOT_PTR. */
+ MPN_COPY (op_ptr, root_ptr, op_size);
+ }
+ }
+
+ rem_size = mpn_sqrtrem (root_ptr, rem->_mp_d, op_ptr, op_size);
+
+ root->_mp_size = root_size;
+
+ /* Write remainder size last, to enable us to define this function to
+     give only the square root remainder, if the user calls it with
+ ROOT == REM. */
+ rem->_mp_size = rem_size;
+
+ if (free_me != NULL)
+ (*_mp_free_func) (free_me, free_me_size * BYTES_PER_MP_LIMB);
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/sub.c b/rts/gmp/mpz/sub.c
new file mode 100644
index 0000000000..f3ae7c23a0
--- /dev/null
+++ b/rts/gmp/mpz/sub.c
@@ -0,0 +1,123 @@
+/* mpz_sub -- Subtract two integers.
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+
+#ifndef BERKELEY_MP
+void
+#if __STDC__
+mpz_sub (mpz_ptr w, mpz_srcptr u, mpz_srcptr v)
+#else
+mpz_sub (w, u, v)
+ mpz_ptr w;
+ mpz_srcptr u;
+ mpz_srcptr v;
+#endif
+#else /* BERKELEY_MP */
+void
+#if __STDC__
+msub (mpz_srcptr u, mpz_srcptr v, mpz_ptr w)
+#else
+msub (u, v, w)
+ mpz_srcptr u;
+ mpz_srcptr v;
+ mpz_ptr w;
+#endif
+#endif /* BERKELEY_MP */
+{
+ mp_srcptr up, vp;
+ mp_ptr wp;
+ mp_size_t usize, vsize, wsize;
+ mp_size_t abs_usize;
+ mp_size_t abs_vsize;
+
+ usize = u->_mp_size;
+ vsize = -v->_mp_size; /* The "-" makes the difference from mpz_add */
+ abs_usize = ABS (usize);
+ abs_vsize = ABS (vsize);
+
+ if (abs_usize < abs_vsize)
+ {
+ /* Swap U and V. */
+ MPZ_SRCPTR_SWAP (u, v);
+ MP_SIZE_T_SWAP (usize, vsize);
+ MP_SIZE_T_SWAP (abs_usize, abs_vsize);
+ }
+
+ /* True: ABS_USIZE >= ABS_VSIZE. */
+
+ /* If not space for w (and possible carry), increase space. */
+ wsize = abs_usize + 1;
+ if (w->_mp_alloc < wsize)
+ _mpz_realloc (w, wsize);
+
+ /* These must be after realloc (u or v may be the same as w). */
+ up = u->_mp_d;
+ vp = v->_mp_d;
+ wp = w->_mp_d;
+
+ if ((usize ^ vsize) < 0)
+ {
+ /* U and V have different sign. Need to compare them to determine
+ which operand to subtract from which. */
+
+ /* This test is right since ABS_USIZE >= ABS_VSIZE. */
+ if (abs_usize != abs_vsize)
+ {
+ mpn_sub (wp, up, abs_usize, vp, abs_vsize);
+ wsize = abs_usize;
+ MPN_NORMALIZE (wp, wsize);
+ if (usize < 0)
+ wsize = -wsize;
+ }
+ else if (mpn_cmp (up, vp, abs_usize) < 0)
+ {
+ mpn_sub_n (wp, vp, up, abs_usize);
+ wsize = abs_usize;
+ MPN_NORMALIZE (wp, wsize);
+ if (usize >= 0)
+ wsize = -wsize;
+ }
+ else
+ {
+ mpn_sub_n (wp, up, vp, abs_usize);
+ wsize = abs_usize;
+ MPN_NORMALIZE (wp, wsize);
+ if (usize < 0)
+ wsize = -wsize;
+ }
+ }
+ else
+ {
+ /* U and V have same sign. Add them. */
+ mp_limb_t cy_limb = mpn_add (wp, up, abs_usize, vp, abs_vsize);
+ wp[abs_usize] = cy_limb;
+ wsize = abs_usize + cy_limb;
+ if (usize < 0)
+ wsize = -wsize;
+ }
+
+ w->_mp_size = wsize;
+}
diff --git a/rts/gmp/mpz/sub_ui.c b/rts/gmp/mpz/sub_ui.c
new file mode 100644
index 0000000000..327add8503
--- /dev/null
+++ b/rts/gmp/mpz/sub_ui.c
@@ -0,0 +1,84 @@
+/* mpz_sub_ui -- Subtract an unsigned one-word integer from an MP_INT.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_sub_ui (mpz_ptr w, mpz_srcptr u, unsigned long int v)
+#else
+mpz_sub_ui (w, u, v)
+ mpz_ptr w;
+ mpz_srcptr u;
+ unsigned long int v;
+#endif
+{
+ mp_srcptr up;
+ mp_ptr wp;
+ mp_size_t usize, wsize;
+ mp_size_t abs_usize;
+
+ usize = u->_mp_size;
+ abs_usize = ABS (usize);
+
+ /* If not space for W (and possible carry), increase space. */
+ wsize = abs_usize + 1;
+ if (w->_mp_alloc < wsize)
+ _mpz_realloc (w, wsize);
+
+ /* These must be after realloc (U may be the same as W). */
+ up = u->_mp_d;
+ wp = w->_mp_d;
+
+ if (abs_usize == 0)
+ {
+ wp[0] = v;
+ w->_mp_size = -(v != 0);
+ return;
+ }
+
+ if (usize < 0)
+ {
+ mp_limb_t cy;
+ cy = mpn_add_1 (wp, up, abs_usize, (mp_limb_t) v);
+ wp[abs_usize] = cy;
+ wsize = -(abs_usize + cy);
+ }
+ else
+ {
+ /* The signs are different. Need exact comparison to determine
+ which operand to subtract from which. */
+ if (abs_usize == 1 && up[0] < v)
+ {
+ wp[0] = v - up[0];
+ wsize = -1;
+ }
+ else
+ {
+ mpn_sub_1 (wp, up, abs_usize, (mp_limb_t) v);
+ /* Size can decrease with at most one limb. */
+ wsize = abs_usize - (wp[abs_usize - 1] == 0);
+ }
+ }
+
+ w->_mp_size = wsize;
+}
diff --git a/rts/gmp/mpz/swap.c b/rts/gmp/mpz/swap.c
new file mode 100644
index 0000000000..0070d6ff24
--- /dev/null
+++ b/rts/gmp/mpz/swap.c
@@ -0,0 +1,52 @@
+/* mpz_swap (dest_integer, src_integer) -- Swap U and V.
+
+Copyright (C) 1997, 1998 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_swap (mpz_ptr u, mpz_ptr v)
+#else
+mpz_swap (u, v)
+ mpz_ptr u;
+ mpz_ptr v;
+#endif
+{
+ mp_ptr up, vp;
+ mp_size_t usize, vsize;
+ mp_size_t ualloc, valloc;
+
+ ualloc = u->_mp_alloc;
+ valloc = v->_mp_alloc;
+ v->_mp_alloc = ualloc;
+ u->_mp_alloc = valloc;
+
+ usize = u->_mp_size;
+ vsize = v->_mp_size;
+ v->_mp_size = usize;
+ u->_mp_size = vsize;
+
+ up = u->_mp_d;
+ vp = v->_mp_d;
+ v->_mp_d = up;
+ u->_mp_d = vp;
+}
diff --git a/rts/gmp/mpz/tdiv_q.c b/rts/gmp/mpz/tdiv_q.c
new file mode 100644
index 0000000000..21db4ab385
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_q.c
@@ -0,0 +1,91 @@
+/* mpz_tdiv_q -- divide two integers and produce a quotient.
+
+Copyright (C) 1991, 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+#if __STDC__
+mpz_tdiv_q (mpz_ptr quot, mpz_srcptr num, mpz_srcptr den)
+#else
+mpz_tdiv_q (quot, num, den)
+ mpz_ptr quot;
+ mpz_srcptr num;
+ mpz_srcptr den;
+#endif
+{
+ mp_size_t ql;
+ mp_size_t ns, ds, nl, dl;
+ mp_ptr np, dp, qp, rp;
+ TMP_DECL (marker);
+
+ ns = SIZ (num);
+ ds = SIZ (den);
+ nl = ABS (ns);
+ dl = ABS (ds);
+ ql = nl - dl + 1;
+
+ if (dl == 0)
+ DIVIDE_BY_ZERO;
+
+ if (ql <= 0)
+ {
+ SIZ (quot) = 0;
+ return;
+ }
+
+ MPZ_REALLOC (quot, ql);
+
+ TMP_MARK (marker);
+ qp = PTR (quot);
+ rp = (mp_ptr) TMP_ALLOC (dl * BYTES_PER_MP_LIMB);
+ np = PTR (num);
+ dp = PTR (den);
+
+ /* FIXME: We should think about how to handle the temporary allocation.
+ Perhaps mpn_tdiv_qr should handle it, since it anyway often needs to
+ allocate temp space. */
+
+ /* Copy denominator to temporary space if it overlaps with the quotient. */
+ if (dp == qp)
+ {
+ mp_ptr tp;
+ tp = (mp_ptr) TMP_ALLOC (dl * BYTES_PER_MP_LIMB);
+ MPN_COPY (tp, dp, dl);
+ dp = tp;
+ }
+ /* Copy numerator to temporary space if it overlaps with the quotient. */
+ if (np == qp)
+ {
+ mp_ptr tp;
+ tp = (mp_ptr) TMP_ALLOC (nl * BYTES_PER_MP_LIMB);
+ MPN_COPY (tp, np, nl);
+ np = tp;
+ }
+
+ mpn_tdiv_qr (qp, rp, 0L, np, nl, dp, dl);
+
+ ql -= qp[ql - 1] == 0;
+
+ SIZ (quot) = (ns ^ ds) >= 0 ? ql : -ql;
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/tdiv_q_2exp.c b/rts/gmp/mpz/tdiv_q_2exp.c
new file mode 100644
index 0000000000..03d1e01f89
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_q_2exp.c
@@ -0,0 +1,68 @@
+/* mpz_tdiv_q_2exp -- Divide an integer by 2**CNT. Round the quotient
+   towards zero.
+
+Copyright (C) 1991, 1993, 1994, 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_tdiv_q_2exp (mpz_ptr w, mpz_srcptr u, unsigned long int cnt)
+#else
+mpz_tdiv_q_2exp (w, u, cnt)
+ mpz_ptr w;
+ mpz_srcptr u;
+ unsigned long int cnt;
+#endif
+{
+ mp_size_t usize, wsize;
+ mp_size_t limb_cnt;
+
+ usize = u->_mp_size;
+ limb_cnt = cnt / BITS_PER_MP_LIMB;
+ wsize = ABS (usize) - limb_cnt;
+ if (wsize <= 0)
+ w->_mp_size = 0;
+ else
+ {
+ mp_ptr wp;
+ mp_srcptr up;
+
+ if (w->_mp_alloc < wsize)
+ _mpz_realloc (w, wsize);
+
+ wp = w->_mp_d;
+ up = u->_mp_d;
+
+ cnt %= BITS_PER_MP_LIMB;
+ if (cnt != 0)
+ {
+ mpn_rshift (wp, up + limb_cnt, wsize, cnt);
+ wsize -= wp[wsize - 1] == 0;
+ }
+ else
+ {
+ MPN_COPY_INCR (wp, up + limb_cnt, wsize);
+ }
+
+ w->_mp_size = usize >= 0 ? wsize : -wsize;
+ }
+}
diff --git a/rts/gmp/mpz/tdiv_q_ui.c b/rts/gmp/mpz/tdiv_q_ui.c
new file mode 100644
index 0000000000..a2e3462b76
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_q_ui.c
@@ -0,0 +1,64 @@
+/* mpz_tdiv_q_ui(quot, dividend, divisor_limb)
+ -- Divide DIVIDEND by DIVISOR_LIMB and store the result in QUOT.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1998 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_tdiv_q_ui (mpz_ptr quot, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_tdiv_q_ui (quot, dividend, divisor)
+ mpz_ptr quot;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_ptr quot_ptr;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ /* No need for temporary allocation and copying if QUOT == DIVIDEND as
+ the divisor is just one limb, and thus no intermediate remainders
+ need to be stored. */
+
+ if (quot->_mp_alloc < size)
+ _mpz_realloc (quot, size);
+
+ quot_ptr = quot->_mp_d;
+
+ remainder_limb
+ = mpn_divmod_1 (quot_ptr, dividend->_mp_d, size, (mp_limb_t) divisor);
+
+ /* The quotient is SIZE limbs, but the most significant might be zero. */
+ size -= size != 0 && quot_ptr[size - 1] == 0;
+ quot->_mp_size = dividend_size >= 0 ? size : -size;
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/tdiv_qr.c b/rts/gmp/mpz/tdiv_qr.c
new file mode 100644
index 0000000000..d66f57d9e5
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_qr.c
@@ -0,0 +1,130 @@
+/* mpz_tdiv_qr(quot,rem,dividend,divisor) -- Set QUOT to DIVIDEND/DIVISOR,
+ and REM to DIVIDEND mod DIVISOR.
+
+Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+
+
+#ifndef BERKELEY_MP
+
+void
+#if __STDC__
+mpz_tdiv_qr (mpz_ptr quot, mpz_ptr rem, mpz_srcptr num, mpz_srcptr den)
+#else
+mpz_tdiv_qr (quot, rem, num, den)
+ mpz_ptr quot;
+ mpz_ptr rem;
+ mpz_srcptr num;
+ mpz_srcptr den;
+#endif
+
+#else /* BERKELEY_MP */
+
+void
+#if __STDC__
+mdiv (mpz_srcptr num, mpz_srcptr den, mpz_ptr quot, mpz_ptr rem)
+#else
+mdiv (num, den, quot, rem)
+ mpz_srcptr num;
+ mpz_srcptr den;
+ mpz_ptr quot;
+ mpz_ptr rem;
+#endif
+
+#endif /* BERKELEY_MP */
+{
+ mp_size_t ql;
+ mp_size_t ns, ds, nl, dl;
+ mp_ptr np, dp, qp, rp;
+ TMP_DECL (marker);
+
+ ns = SIZ (num);
+ ds = SIZ (den);
+ nl = ABS (ns);
+ dl = ABS (ds);
+ ql = nl - dl + 1;
+
+ if (dl == 0)
+ DIVIDE_BY_ZERO;
+
+ MPZ_REALLOC (rem, dl);
+
+ if (ql <= 0)
+ {
+ if (num != rem)
+ {
+ mp_ptr np, rp;
+ np = PTR (num);
+ rp = PTR (rem);
+ MPN_COPY (rp, np, nl);
+ SIZ (rem) = SIZ (num);
+ }
+ /* This needs to follow the assignment to rem, in case the
+ numerator and quotient are the same. */
+ SIZ (quot) = 0;
+ return;
+ }
+
+ MPZ_REALLOC (quot, ql);
+
+ TMP_MARK (marker);
+ qp = PTR (quot);
+ rp = PTR (rem);
+ np = PTR (num);
+ dp = PTR (den);
+
+ /* FIXME: We should think about how to handle the temporary allocation.
+ Perhaps mpn_tdiv_qr should handle it, since it anyway often needs to
+ allocate temp space. */
+
+ /* Copy denominator to temporary space if it overlaps with the quotient
+ or remainder. */
+ if (dp == rp || dp == qp)
+ {
+ mp_ptr tp;
+ tp = (mp_ptr) TMP_ALLOC (dl * BYTES_PER_MP_LIMB);
+ MPN_COPY (tp, dp, dl);
+ dp = tp;
+ }
+ /* Copy numerator to temporary space if it overlaps with the quotient or
+ remainder. */
+ if (np == rp || np == qp)
+ {
+ mp_ptr tp;
+ tp = (mp_ptr) TMP_ALLOC (nl * BYTES_PER_MP_LIMB);
+ MPN_COPY (tp, np, nl);
+ np = tp;
+ }
+
+ mpn_tdiv_qr (qp, rp, 0L, np, nl, dp, dl);
+
+ ql -= qp[ql - 1] == 0;
+ MPN_NORMALIZE (rp, dl);
+
+ SIZ (quot) = (ns ^ ds) >= 0 ? ql : -ql;
+ SIZ (rem) = ns >= 0 ? dl : -dl;
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/tdiv_qr_ui.c b/rts/gmp/mpz/tdiv_qr_ui.c
new file mode 100644
index 0000000000..10368cd340
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_qr_ui.c
@@ -0,0 +1,76 @@
+/* mpz_tdiv_qr_ui(quot,rem,dividend,short_divisor) --
+ Set QUOT to DIVIDEND / SHORT_DIVISOR
+ and REM to DIVIDEND mod SHORT_DIVISOR.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1998 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_tdiv_qr_ui (mpz_ptr quot, mpz_ptr rem, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_tdiv_qr_ui (quot, rem, dividend, divisor)
+ mpz_ptr quot;
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_ptr quot_ptr;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ /* No need for temporary allocation and copying if QUOT == DIVIDEND as
+ the divisor is just one limb, and thus no intermediate remainders
+ need to be stored. */
+
+ if (quot->_mp_alloc < size)
+ _mpz_realloc (quot, size);
+
+ quot_ptr = quot->_mp_d;
+
+ remainder_limb = mpn_divmod_1 (quot_ptr, dividend->_mp_d, size,
+ (mp_limb_t) divisor);
+
+ if (remainder_limb == 0)
+ rem->_mp_size = 0;
+ else
+ {
+ /* Store the single-limb remainder. We don't check if there's space
+ for just one limb, since no function ever makes zero space. */
+ rem->_mp_size = dividend_size >= 0 ? 1 : -1;
+ rem->_mp_d[0] = remainder_limb;
+ }
+
+ /* The quotient is SIZE limbs, but the most significant might be zero. */
+ size -= size != 0 && quot_ptr[size - 1] == 0;
+ quot->_mp_size = dividend_size >= 0 ? size : -size;
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/tdiv_r.c b/rts/gmp/mpz/tdiv_r.c
new file mode 100644
index 0000000000..9eb87dfabf
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_r.c
@@ -0,0 +1,98 @@
+/* mpz_tdiv_r(rem, dividend, divisor) -- Set REM to DIVIDEND mod DIVISOR.
+
+Copyright (C) 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+#if __STDC__
+mpz_tdiv_r (mpz_ptr rem, mpz_srcptr num, mpz_srcptr den)
+#else
+mpz_tdiv_r (rem, num, den)
+ mpz_ptr rem;
+ mpz_srcptr num;
+ mpz_srcptr den;
+#endif
+{
+ mp_size_t ql;
+ mp_size_t ns, ds, nl, dl;
+ mp_ptr np, dp, qp, rp;
+ TMP_DECL (marker);
+
+ ns = SIZ (num);
+ ds = SIZ (den);
+ nl = ABS (ns);
+ dl = ABS (ds);
+ ql = nl - dl + 1;
+
+ if (dl == 0)
+ DIVIDE_BY_ZERO;
+
+ MPZ_REALLOC (rem, dl);
+
+ if (ql <= 0)
+ {
+ if (num != rem)
+ {
+ mp_ptr np, rp;
+ np = PTR (num);
+ rp = PTR (rem);
+ MPN_COPY (rp, np, nl);
+ SIZ (rem) = SIZ (num);
+ }
+ return;
+ }
+
+ TMP_MARK (marker);
+ qp = (mp_ptr) TMP_ALLOC (ql * BYTES_PER_MP_LIMB);
+ rp = PTR (rem);
+ np = PTR (num);
+ dp = PTR (den);
+
+ /* FIXME: We should think about how to handle the temporary allocation.
+ Perhaps mpn_tdiv_qr should handle it, since it anyway often needs to
+ allocate temp space. */
+
+ /* Copy denominator to temporary space if it overlaps with the remainder. */
+ if (dp == rp)
+ {
+ mp_ptr tp;
+ tp = (mp_ptr) TMP_ALLOC (dl * BYTES_PER_MP_LIMB);
+ MPN_COPY (tp, dp, dl);
+ dp = tp;
+ }
+ /* Copy numerator to temporary space if it overlaps with the remainder. */
+ if (np == rp)
+ {
+ mp_ptr tp;
+ tp = (mp_ptr) TMP_ALLOC (nl * BYTES_PER_MP_LIMB);
+ MPN_COPY (tp, np, nl);
+ np = tp;
+ }
+
+ mpn_tdiv_qr (qp, rp, 0L, np, nl, dp, dl);
+
+ MPN_NORMALIZE (rp, dl);
+
+ SIZ (rem) = ns >= 0 ? dl : -dl;
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/tdiv_r_2exp.c b/rts/gmp/mpz/tdiv_r_2exp.c
new file mode 100644
index 0000000000..91de170f5c
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_r_2exp.c
@@ -0,0 +1,79 @@
+/* mpz_tdiv_r_2exp -- Divide an integer by 2**CNT and produce a remainder.
+
+Copyright (C) 1991, 1993, 1994, 1995 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_tdiv_r_2exp (mpz_ptr res, mpz_srcptr in, unsigned long int cnt)
+#else
+mpz_tdiv_r_2exp (res, in, cnt)
+ mpz_ptr res;
+ mpz_srcptr in;
+ unsigned long int cnt;
+#endif
+{
+ mp_size_t in_size = ABS (in->_mp_size);
+ mp_size_t res_size;
+ mp_size_t limb_cnt = cnt / BITS_PER_MP_LIMB;
+ mp_srcptr in_ptr = in->_mp_d;
+
+ if (in_size > limb_cnt)
+ {
+ /* The input operand is (probably) greater than 2**CNT. */
+ mp_limb_t x;
+
+ x = in_ptr[limb_cnt] & (((mp_limb_t) 1 << cnt % BITS_PER_MP_LIMB) - 1);
+ if (x != 0)
+ {
+ res_size = limb_cnt + 1;
+ if (res->_mp_alloc < res_size)
+ _mpz_realloc (res, res_size);
+
+ res->_mp_d[limb_cnt] = x;
+ }
+ else
+ {
+ res_size = limb_cnt;
+ MPN_NORMALIZE (in_ptr, res_size);
+
+ if (res->_mp_alloc < res_size)
+ _mpz_realloc (res, res_size);
+
+ limb_cnt = res_size;
+ }
+ }
+ else
+ {
+ /* The input operand is smaller than 2**CNT. We perform a no-op,
+ apart from that we might need to copy IN to RES. */
+ res_size = in_size;
+ if (res->_mp_alloc < res_size)
+ _mpz_realloc (res, res_size);
+
+ limb_cnt = res_size;
+ }
+
+ if (res != in)
+ MPN_COPY (res->_mp_d, in->_mp_d, limb_cnt);
+ res->_mp_size = in->_mp_size >= 0 ? res_size : -res_size;
+}
diff --git a/rts/gmp/mpz/tdiv_r_ui.c b/rts/gmp/mpz/tdiv_r_ui.c
new file mode 100644
index 0000000000..2ea411fda1
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_r_ui.c
@@ -0,0 +1,63 @@
+/* mpz_tdiv_r_ui(rem, dividend, divisor_limb)
+   -- Set REM to DIVIDEND mod DIVISOR_LIMB.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1998 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_tdiv_r_ui (mpz_ptr rem, mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_tdiv_r_ui (rem, dividend, divisor)
+ mpz_ptr rem;
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ /* No need for temporary allocation and copying if QUOT == DIVIDEND as
+ the divisor is just one limb, and thus no intermediate remainders
+ need to be stored. */
+
+ remainder_limb = mpn_mod_1 (dividend->_mp_d, size, (mp_limb_t) divisor);
+
+ if (remainder_limb == 0)
+ rem->_mp_size = 0;
+ else
+ {
+ /* Store the single-limb remainder. We don't check if there's space
+ for just one limb, since no function ever makes zero space. */
+ rem->_mp_size = dividend_size >= 0 ? 1 : -1;
+ rem->_mp_d[0] = remainder_limb;
+ }
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/tdiv_ui.c b/rts/gmp/mpz/tdiv_ui.c
new file mode 100644
index 0000000000..7a40a6a7f7
--- /dev/null
+++ b/rts/gmp/mpz/tdiv_ui.c
@@ -0,0 +1,53 @@
+/* mpz_tdiv_ui(dividend, divisor_limb)
+   -- Return DIVIDEND mod DIVISOR_LIMB.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 1998 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+unsigned long int
+#if __STDC__
+mpz_tdiv_ui (mpz_srcptr dividend, unsigned long int divisor)
+#else
+mpz_tdiv_ui (dividend, divisor)
+ mpz_srcptr dividend;
+ unsigned long int divisor;
+#endif
+{
+ mp_size_t dividend_size;
+ mp_size_t size;
+ mp_limb_t remainder_limb;
+
+ if (divisor == 0)
+ DIVIDE_BY_ZERO;
+
+ dividend_size = dividend->_mp_size;
+ size = ABS (dividend_size);
+
+ /* No need for temporary allocation and copying if QUOT == DIVIDEND as
+ the divisor is just one limb, and thus no intermediate remainders
+ need to be stored. */
+
+ remainder_limb = mpn_mod_1 (dividend->_mp_d, size, (mp_limb_t) divisor);
+
+ return remainder_limb;
+}
diff --git a/rts/gmp/mpz/tstbit.c b/rts/gmp/mpz/tstbit.c
new file mode 100644
index 0000000000..b0a8b0b31a
--- /dev/null
+++ b/rts/gmp/mpz/tstbit.c
@@ -0,0 +1,70 @@
+/* mpz_tstbit -- test a specified bit. Simulate 2's complement representation.
+
+Copyright (C) 1997 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+int
+#if __STDC__
+mpz_tstbit (mpz_srcptr d, unsigned long int bit_index)
+#else
+mpz_tstbit (d, bit_index)
+ mpz_srcptr d;
+ unsigned long int bit_index;
+#endif
+{
+ mp_size_t dsize = d->_mp_size;
+ mp_ptr dp = d->_mp_d;
+ mp_size_t limb_index;
+
+ limb_index = bit_index / BITS_PER_MP_LIMB;
+ if (dsize >= 0)
+ {
+ if (limb_index < dsize)
+ return (dp[limb_index] >> (bit_index % BITS_PER_MP_LIMB)) & 1;
+ else
+ /* Testing a bit outside of a positive number. */
+ return 0;
+ }
+ else
+ {
+ mp_size_t zero_bound;
+
+ dsize = -dsize;
+
+ /* Locate the least significant non-zero limb. */
+ for (zero_bound = 0; dp[zero_bound] == 0; zero_bound++)
+ ;
+
+ if (limb_index > zero_bound)
+ {
+ if (limb_index < dsize)
+ return (~dp[limb_index] >> (bit_index % BITS_PER_MP_LIMB)) & 1;
+ else
+ /* Testing a bit outside of a negative number. */
+ return 1;
+ }
+ else if (limb_index == zero_bound)
+ return (-dp[limb_index] >> (bit_index % BITS_PER_MP_LIMB)) & 1;
+ else
+ return 0;
+ }
+}
diff --git a/rts/gmp/mpz/ui_pow_ui.c b/rts/gmp/mpz/ui_pow_ui.c
new file mode 100644
index 0000000000..edd2dee625
--- /dev/null
+++ b/rts/gmp/mpz/ui_pow_ui.c
@@ -0,0 +1,139 @@
+/* mpz_ui_pow_ui(res, base, exp) -- Set RES to BASE**EXP.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+static void mpz_pow2 _PROTO ((mpz_ptr r, mp_limb_t blimb, unsigned long int e, mp_limb_t rl));
+
+void
+#if __STDC__
+mpz_ui_pow_ui (mpz_ptr r, unsigned long int b, unsigned long int e)
+#else
+mpz_ui_pow_ui (r, b, e)
+ mpz_ptr r;
+ unsigned long int b;
+ unsigned long int e;
+#endif
+{
+ mp_limb_t blimb = b;
+ mp_limb_t rl;
+
+ if (e == 0)
+ {
+ /* For x^0 we return 1, even if x is 0. */
+ r->_mp_d[0] = 1;
+ r->_mp_size = 1;
+ return;
+ }
+
+ /* Compute b^e as (b^n)^(e div n) * b^(e mod n), where n is chosen such that
+ the latter factor is the largest number small enough to fit in a limb. */
+
+ rl = 1;
+ while (e != 0 && blimb < ((mp_limb_t) 1 << BITS_PER_MP_LIMB/2))
+ {
+ if ((e & 1) != 0)
+ rl = rl * blimb;
+ blimb = blimb * blimb;
+ e = e >> 1;
+ }
+
+ /* rl is now b^(e mod n). (I.e., the latter factor above.) */
+
+ if (e == 0)
+ {
+ r->_mp_d[0] = rl;
+ r->_mp_size = rl != 0;
+ return;
+ }
+
+ mpz_pow2 (r, blimb, e, rl);
+}
+
+/* Multi-precision part of exponentiation code. */
+static void
+#if __STDC__
+mpz_pow2 (mpz_ptr r, mp_limb_t blimb, unsigned long int e, mp_limb_t rl)
+#else
+mpz_pow2 (r, blimb, e, rl)
+ mpz_ptr r;
+ mp_limb_t blimb;
+ unsigned long int e;
+ mp_limb_t rl;
+#endif
+{
+ mp_ptr rp, tp;
+ mp_size_t ralloc, rsize;
+ int cnt, i;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* Over-estimate temporary space requirements somewhat. */
+ count_leading_zeros (cnt, blimb);
+ ralloc = e - cnt * e / BITS_PER_MP_LIMB + 1;
+
+ /* The two areas are used to alternatingly hold the input and receive the
+ product for mpn_mul. (Needed since mpn_mul_n requires that the product
+ is distinct from either input operand.) */
+ rp = (mp_ptr) TMP_ALLOC (ralloc * BYTES_PER_MP_LIMB);
+ tp = (mp_ptr) TMP_ALLOC (ralloc * BYTES_PER_MP_LIMB);
+
+ rp[0] = blimb;
+ rsize = 1;
+
+ count_leading_zeros (cnt, e);
+ for (i = BITS_PER_MP_LIMB - cnt - 2; i >= 0; i--)
+ {
+ mpn_mul_n (tp, rp, rp, rsize);
+ rsize = 2 * rsize;
+ rsize -= tp[rsize - 1] == 0;
+ MP_PTR_SWAP (rp, tp);
+
+ if ((e & ((mp_limb_t) 1 << i)) != 0)
+ {
+ mp_limb_t cy;
+ cy = mpn_mul_1 (rp, rp, rsize, blimb);
+ rp[rsize] = cy;
+ rsize += cy != 0;
+ }
+ }
+
+ /* We will need rsize or rsize+1 limbs for the result. */
+ if (r->_mp_alloc <= rsize)
+ _mpz_realloc (r, rsize + 1);
+
+ /* Multiply the two factors (in rp,rsize and rl) and put the final result
+ in place. */
+ {
+ mp_limb_t cy;
+ cy = mpn_mul_1 (r->_mp_d, rp, rsize, rl);
+ (r->_mp_d)[rsize] = cy;
+ rsize += cy != 0;
+ }
+
+ r->_mp_size = rsize;
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/urandomb.c b/rts/gmp/mpz/urandomb.c
new file mode 100644
index 0000000000..caca086e05
--- /dev/null
+++ b/rts/gmp/mpz/urandomb.c
@@ -0,0 +1,49 @@
+/* mpz_urandomb (rop, state, n) -- Generate a uniform pseudorandom
+ integer in the range 0 to 2^N - 1, inclusive, using STATE as the
+ random state previously initialized by a call to gmp_randinit().
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_urandomb (mpz_t rop, gmp_randstate_t rstate, unsigned long int nbits)
+#else
+mpz_urandomb (rop, rstate, nbits)
+ mpz_t rop;
+ gmp_randstate_t rstate;
+ unsigned long int nbits;
+#endif
+{
+ mp_ptr rp;
+ mp_size_t size;
+
+ size = (nbits + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB;
+ if (ALLOC (rop) < size)
+ _mpz_realloc (rop, size);
+
+ rp = PTR (rop);
+
+ _gmp_rand (rp, rstate, nbits);
+ MPN_NORMALIZE (rp, size);
+ SIZ (rop) = size;
+}
diff --git a/rts/gmp/mpz/urandomm.c b/rts/gmp/mpz/urandomm.c
new file mode 100644
index 0000000000..69e1bae78a
--- /dev/null
+++ b/rts/gmp/mpz/urandomm.c
@@ -0,0 +1,78 @@
+/* mpz_urandomm (rop, state, n) -- Generate a uniform pseudorandom
+ integer in the range 0 to N-1, using STATE as the random state
+ previously initialized by a call to gmp_randinit().
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+#if __STDC__
+mpz_urandomm (mpz_t rop, gmp_randstate_t rstate, mpz_t n)
+#else
+mpz_urandomm (rop, rstate, n)
+ mpz_t rop;
+ gmp_randstate_t rstate;
+ mpz_t n;
+#endif
+{
+ mpz_t t, p, m;
+ mp_ptr tp;
+ mp_size_t nbits, size;
+ int count;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+
+ /* FIXME: Should check for n == 0 and report error */
+
+ size = SIZ (n);
+ count_leading_zeros (count, PTR (n)[size - 1]);
+ nbits = size * BITS_PER_MP_LIMB - count;
+
+ /* Allocate enough for any mpz function called since a realloc of
+ these will fail. */
+ MPZ_TMP_INIT (t, size);
+ MPZ_TMP_INIT (m, size + 1);
+ MPZ_TMP_INIT (p, size + 1);
+
+ /* Let m = highest possible random number plus 1. */
+ mpz_set_ui (m, 0);
+ mpz_setbit (m, nbits);
+
+ /* Let p = floor(m / n) * n. */
+ mpz_fdiv_q (p, m, n);
+ mpz_mul (p, p, n);
+
+ tp = PTR (t);
+ do
+ {
+ _gmp_rand (tp, rstate, nbits);
+ MPN_NORMALIZE (tp, size); /* FIXME: Really necessary? */
+ SIZ (t) = size;
+ }
+ while (mpz_cmp (t, p) >= 0);
+
+ mpz_mod (rop, t, n);
+
+ TMP_FREE (marker);
+}
diff --git a/rts/gmp/mpz/xor.c b/rts/gmp/mpz/xor.c
new file mode 100644
index 0000000000..69898d1791
--- /dev/null
+++ b/rts/gmp/mpz/xor.c
@@ -0,0 +1,217 @@
+/* mpz_xor -- Logical xor.
+
+Copyright (C) 1991, 1993, 1994, 1996, 1997, 2000 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+mpz_xor (mpz_ptr res, mpz_srcptr op1, mpz_srcptr op2)
+#else
+mpz_xor (res, op1, op2)
+ mpz_ptr res;
+ mpz_srcptr op1;
+ mpz_srcptr op2;
+#endif
+{
+ mp_srcptr op1_ptr, op2_ptr;
+ mp_size_t op1_size, op2_size;
+ mp_ptr res_ptr;
+ mp_size_t res_size, res_alloc;
+ mp_size_t i;
+ TMP_DECL (marker);
+
+ TMP_MARK (marker);
+ op1_size = op1->_mp_size;
+ op2_size = op2->_mp_size;
+
+ op1_ptr = op1->_mp_d;
+ op2_ptr = op2->_mp_d;
+ res_ptr = res->_mp_d;
+
+ if (op1_size >= 0)
+ {
+ if (op2_size >= 0)
+ {
+ if (op1_size >= op2_size)
+ {
+ if (res->_mp_alloc < op1_size)
+ {
+ _mpz_realloc (res, op1_size);
+ op1_ptr = op1->_mp_d;
+ op2_ptr = op2->_mp_d;
+ res_ptr = res->_mp_d;
+ }
+
+ if (res_ptr != op1_ptr)
+ MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size,
+ op1_size - op2_size);
+ for (i = op2_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] ^ op2_ptr[i];
+ res_size = op1_size;
+ }
+ else
+ {
+ if (res->_mp_alloc < op2_size)
+ {
+ _mpz_realloc (res, op2_size);
+ op1_ptr = op1->_mp_d;
+ op2_ptr = op2->_mp_d;
+ res_ptr = res->_mp_d;
+ }
+
+ if (res_ptr != op2_ptr)
+ MPN_COPY (res_ptr + op1_size, op2_ptr + op1_size,
+ op2_size - op1_size);
+ for (i = op1_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] ^ op2_ptr[i];
+ res_size = op2_size;
+ }
+
+ MPN_NORMALIZE (res_ptr, res_size);
+ res->_mp_size = res_size;
+ return;
+ }
+ else /* op2_size < 0 */
+ {
+ /* Fall through to the code at the end of the function. */
+ }
+ }
+ else
+ {
+ if (op2_size < 0)
+ {
+ mp_ptr opx;
+ mp_limb_t cy;
+
+ /* Both operands are negative, the result will be positive.
+ (-OP1) ^ (-OP2) =
+ = ~(OP1 - 1) ^ ~(OP2 - 1) =
+ = (OP1 - 1) ^ (OP2 - 1) */
+
+ op1_size = -op1_size;
+ op2_size = -op2_size;
+
+ /* Possible optimization: Decrease mpn_sub precision,
+ as we won't use the entire res of both. */
+ opx = (mp_ptr) TMP_ALLOC (op1_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op1_ptr, op1_size, (mp_limb_t) 1);
+ op1_ptr = opx;
+
+ opx = (mp_ptr) TMP_ALLOC (op2_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op2_ptr, op2_size, (mp_limb_t) 1);
+ op2_ptr = opx;
+
+ res_alloc = MAX (op1_size, op2_size);
+ if (res->_mp_alloc < res_alloc)
+ {
+ _mpz_realloc (res, res_alloc);
+ res_ptr = res->_mp_d;
+ /* Don't re-read OP1_PTR and OP2_PTR. They point to
+ temporary space--never to the space RES->_mp_d used
+ to point to before reallocation. */
+ }
+
+ if (op1_size > op2_size)
+ {
+ MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size,
+ op1_size - op2_size);
+ for (i = op2_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] ^ op2_ptr[i];
+ res_size = op1_size;
+ }
+ else
+ {
+ MPN_COPY (res_ptr + op1_size, op2_ptr + op1_size,
+ op2_size - op1_size);
+ for (i = op1_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] ^ op2_ptr[i];
+ res_size = op2_size;
+ }
+
+ MPN_NORMALIZE (res_ptr, res_size);
+ res->_mp_size = res_size;
+ TMP_FREE (marker);
+ return;
+ }
+ else
+ {
+ /* We should compute -OP1 ^ OP2. Swap OP1 and OP2 and fall
+ through to the code that handles OP1 ^ -OP2. */
+ MPZ_SRCPTR_SWAP (op1, op2);
+ MPN_SRCPTR_SWAP (op1_ptr,op1_size, op2_ptr,op2_size);
+ }
+ }
+
+ {
+ mp_ptr opx;
+ mp_limb_t cy;
+ mp_size_t count;
+
+ /* Operand 2 negative, so will be the result.
+ -(OP1 ^ (-OP2)) = -(OP1 ^ ~(OP2 - 1)) =
+ = ~(OP1 ^ ~(OP2 - 1)) + 1 =
+ = (OP1 ^ (OP2 - 1)) + 1 */
+
+ op2_size = -op2_size;
+
+ opx = (mp_ptr) TMP_ALLOC (op2_size * BYTES_PER_MP_LIMB);
+ mpn_sub_1 (opx, op2_ptr, op2_size, (mp_limb_t) 1);
+ op2_ptr = opx;
+
+ res_alloc = MAX (op1_size, op2_size) + 1;
+ if (res->_mp_alloc < res_alloc)
+ {
+ _mpz_realloc (res, res_alloc);
+ op1_ptr = op1->_mp_d;
+ res_ptr = res->_mp_d;
+ /* Don't re-read OP2_PTR. It points to temporary space--never
+ to the space RES->_mp_d used to point to before reallocation. */
+ }
+
+ if (op1_size > op2_size)
+ {
+ MPN_COPY (res_ptr + op2_size, op1_ptr + op2_size, op1_size - op2_size);
+ for (i = op2_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] ^ op2_ptr[i];
+ res_size = op1_size;
+ }
+ else
+ {
+ MPN_COPY (res_ptr + op1_size, op2_ptr + op1_size, op2_size - op1_size);
+ for (i = op1_size - 1; i >= 0; i--)
+ res_ptr[i] = op1_ptr[i] ^ op2_ptr[i];
+ res_size = op2_size;
+ }
+
+ cy = mpn_add_1 (res_ptr, res_ptr, res_size, (mp_limb_t) 1);
+ if (cy)
+ {
+ res_ptr[res_size] = cy;
+ res_size++;
+ }
+
+ MPN_NORMALIZE (res_ptr, res_size);
+ res->_mp_size = -res_size;
+ TMP_FREE (marker);
+ }
+}
diff --git a/rts/gmp/rand.c b/rts/gmp/rand.c
new file mode 100644
index 0000000000..d1f9354511
--- /dev/null
+++ b/rts/gmp/rand.c
@@ -0,0 +1,171 @@
+/* gmp_randinit (state, algorithm, ...) -- Initialize a random state.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include <stdio.h> /* for NULL */
+#if __STDC__
+# include <stdarg.h>
+#else
+# include <varargs.h>
+#endif
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Array of CL-schemes, ordered in increasing order of the first
+ member (the 'm2exp' value). The end of the array is indicated with
+ an entry containing all zeros. */
+
+/* All multipliers are in the range 0.01*m and 0.99*m, and are
+congruent to 5 (mod 8).
+They all pass the spectral test with Vt >= 2^(30/t) and merit >= 1.
+(Up to and including 196 bits, merit is >= 3.) */
+
+struct __gmp_rand_lc_scheme_struct
+{
+ unsigned long int m2exp; /* Modulus is 2 ^ m2exp. */
+ char *astr; /* Multiplier in string form. */
+ unsigned long int c; /* Adder. */
+};
+
+struct __gmp_rand_lc_scheme_struct __gmp_rand_lc_scheme[] =
+{
+ {32, "43840821", 1},
+ {33, "85943917", 1},
+ {34, "171799469", 1},
+ {35, "343825285", 1},
+ {36, "687285701", 1},
+ {37, "1374564613", 1},
+ {38, "2749193437", 1},
+ {39, "5497652029", 1},
+ {40, "10995212661", 1},
+ {56, "47988680294711517", 1},
+ {64, "13469374875402548381", 1},
+ {100, "203786806069096950756900463357", 1},
+ {128, "96573135900076068624591706046897650309", 1},
+ {156, "43051576988660538262511726153887323360449035333", 1},
+ {196, "1611627857640767981443524165616850972435303571524033586421", 1},
+ {200, "491824250216153841876046962368396460896019632211283945747141", 1},
+ {256, "79336254595106925775099152154558630917988041692672147726148065355845551082677", 1},
+ {0, NULL, 0} /* End of array. */
+};
+
+void
+#if __STDC__
+gmp_randinit (gmp_randstate_t rstate,
+ gmp_randalg_t alg,
+ ...)
+#else
+gmp_randinit (va_alist)
+ va_dcl
+#endif
+{
+ va_list ap;
+#if __STDC__
+#else
+ __gmp_randstate_struct *rstate;
+ gmp_randalg_t alg;
+#endif
+
+#if __STDC__
+ va_start (ap, alg);
+#else
+ va_start (ap);
+
+ rstate = va_arg (ap, __gmp_randstate_struct *);
+ alg = va_arg (ap, gmp_randalg_t);
+#endif
+
+ switch (alg)
+ {
+ case GMP_RAND_ALG_LC: /* Linear congruential. */
+ {
+ unsigned long int size;
+ struct __gmp_rand_lc_scheme_struct *sp;
+ mpz_t a;
+
+ size = va_arg (ap, unsigned long int);
+
+ /* Pick a scheme. */
+ for (sp = __gmp_rand_lc_scheme; sp->m2exp != 0; sp++)
+ if (sp->m2exp / 2 >= size)
+ break;
+
+ if (sp->m2exp == 0) /* Nothing big enough found. */
+ {
+ gmp_errno |= GMP_ERROR_INVALID_ARGUMENT;
+ return;
+ }
+
+ /* Install scheme. */
+ mpz_init_set_str (a, sp->astr, 0);
+ gmp_randinit_lc_2exp (rstate, a, sp->c, sp->m2exp);
+ mpz_clear (a);
+ break;
+ }
+
+#if 0
+ case GMP_RAND_ALG_BBS: /* Blum, Blum, and Shub. */
+ {
+ mpz_t p, q;
+ mpz_t ztmp;
+
+ /* FIXME: Generate p and q. They must be ``large'' primes,
+ congruent to 3 mod 4. Should we ensure that they meet some
+	 of the criteria for being ``hard primes''? */
+
+ /* These are around 128 bits. */
+ mpz_init_set_str (p, "148028650191182616877187862194899201391", 10);
+ mpz_init_set_str (q, "315270837425234199477225845240496832591", 10);
+
+ /* Allocate algorithm specific data. */
+ rstate->data.bbs = (__gmp_rand_data_bbs *)
+ (*_mp_allocate_func) (sizeof (__gmp_rand_data_bbs));
+
+ mpz_init (rstate->data.bbs->bi); /* The Blum integer. */
+ mpz_mul (rstate->data.bbs->bi, p, q);
+
+ /* Find a seed, x, with gcd (x, bi) == 1. */
+ mpz_init (ztmp);
+ while (1)
+ {
+ mpz_gcd (ztmp, seed, rstate->data.bbs->bi);
+ if (!mpz_cmp_ui (ztmp, 1))
+ break;
+ mpz_add_ui (seed, seed, 1);
+ }
+
+ rstate->alg = alg;
+ rstate->size = size; /* FIXME: Remove. */
+ mpz_set (rstate->seed, seed);
+
+ mpz_clear (p);
+ mpz_clear (q);
+ mpz_clear (ztmp);
+ break;
+ }
+#endif /* 0 */
+
+ default: /* Bad choice. */
+ gmp_errno |= GMP_ERROR_UNSUPPORTED_ARGUMENT;
+ }
+
+ va_end (ap);
+}
diff --git a/rts/gmp/randclr.c b/rts/gmp/randclr.c
new file mode 100644
index 0000000000..5cb0291165
--- /dev/null
+++ b/rts/gmp/randclr.c
@@ -0,0 +1,54 @@
+/* gmp_randclear (state) -- Clear and deallocate random state STATE.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+gmp_randclear (gmp_randstate_t rstate)
+#else
+gmp_randclear (rstate)
+ gmp_randstate_t rstate;
+#endif
+{
+ mpz_clear (rstate->seed);
+
+ switch (rstate->alg)
+ {
+ case GMP_RAND_ALG_LC:
+ mpz_clear (rstate->algdata.lc->a);
+ if (rstate->algdata.lc->m2exp == 0)
+ mpz_clear (rstate->algdata.lc->m);
+ (*_mp_free_func) (rstate->algdata.lc, sizeof (*rstate->algdata.lc));
+ break;
+
+#if 0
+ case GMP_RAND_ALG_BBS:
+ mpz_clear (rstate->algdata.bbs->bi);
+ (*_mp_free_func) (rstate->algdata.bbs, sizeof (*rstate->algdata.bbs));
+ break;
+#endif /* 0 */
+
+ default:
+ gmp_errno |= GMP_ERROR_UNSUPPORTED_ARGUMENT;
+ }
+}
diff --git a/rts/gmp/randlc.c b/rts/gmp/randlc.c
new file mode 100644
index 0000000000..7079db827e
--- /dev/null
+++ b/rts/gmp/randlc.c
@@ -0,0 +1,56 @@
+/* gmp_randinit_lc (state, a, c, m) -- Initialize a random state for a
+ linear congruential generator with multiplier A, adder C, and
+ modulus M.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+gmp_randinit_lc (gmp_randstate_t rstate,
+ mpz_t a,
+ unsigned long int c,
+ mpz_t m)
+#else
+gmp_randinit_lc (rstate, a, c, m)
+ gmp_randstate_t rstate;
+ mpz_t a;
+ unsigned long int c;
+ mpz_t m;
+#endif
+{
+ /* FIXME: Not finished. We don't handle this in _gmp_rand() yet. */
+ abort ();
+
+ mpz_init_set_ui (rstate->seed, 1);
+ _mpz_realloc (rstate->seed, ABSIZ (m));
+
+ /* Allocate algorithm specific data. */
+ rstate->algdata.lc = (__gmp_randata_lc *)
+ (*_mp_allocate_func) (sizeof (__gmp_randata_lc));
+
+ mpz_init_set (rstate->algdata.lc->a, a);
+ rstate->algdata.lc->c = c;
+ mpz_init_set (rstate->algdata.lc->m, m);
+
+ rstate->alg = GMP_RAND_ALG_LC;
+}
diff --git a/rts/gmp/randlc2x.c b/rts/gmp/randlc2x.c
new file mode 100644
index 0000000000..dbd5f041ee
--- /dev/null
+++ b/rts/gmp/randlc2x.c
@@ -0,0 +1,59 @@
+/* gmp_randinit_lc_2exp (state, a, c, m2exp) -- Initialize random
+ state STATE for a linear congruential generator with multiplier A,
+ adder C, and modulus 2 ^ M2EXP.
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+gmp_randinit_lc_2exp (gmp_randstate_t rstate,
+ mpz_t a,
+ unsigned long int c,
+ unsigned long int m2exp)
+#else
+gmp_randinit_lc_2exp (rstate, a, c, m2exp)
+ gmp_randstate_t rstate;
+ mpz_t a;
+ unsigned long int c;
+ unsigned long int m2exp;
+#endif
+{
+ mpz_init_set_ui (rstate->seed, 1);
+ _mpz_realloc (rstate->seed, m2exp / BITS_PER_MP_LIMB
+ + (m2exp % BITS_PER_MP_LIMB != 0));
+
+ /* Allocate algorithm specific data. */
+ rstate->algdata.lc = (__gmp_randata_lc *)
+ (*_mp_allocate_func) (sizeof (__gmp_randata_lc));
+
+ mpz_init_set (rstate->algdata.lc->a, a);
+ rstate->algdata.lc->c = c;
+
+ /* Cover weird case where m2exp is 0, which means that m is used
+ instead of m2exp. */
+ if (m2exp == 0)
+ mpz_init_set_ui (rstate->algdata.lc->m, 0);
+ rstate->algdata.lc->m2exp = m2exp;
+
+ rstate->alg = GMP_RAND_ALG_LC;
+}
diff --git a/rts/gmp/randraw.c b/rts/gmp/randraw.c
new file mode 100644
index 0000000000..c0c3889d33
--- /dev/null
+++ b/rts/gmp/randraw.c
@@ -0,0 +1,360 @@
+/* _gmp_rand (rp, state, nbits) -- Generate a random bitstream of
+ length NBITS in RP. RP must have enough space allocated to hold
+ NBITS.
+
+Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* For linear congruential (LC), we use one of algorithms (1) or (2).
+ (gmp-3.0 uses algorithm (1) with 'm' as a power of 2.)
+
+LC algorithm (1).
+
+ X = (aX + c) mod m
+
+[D. Knuth, "The Art of Computer Programming: Volume 2, Seminumerical Algorithms",
+Third Edition, Addison Wesley, 1998, pp. 184-185.]
+
+ X is the seed and the result
+ a is chosen so that
+ a mod 8 = 5 [3.2.1.2] and [3.2.1.3]
+ .01m < a < .99m
+ its binary or decimal digits is not a simple, regular pattern
+ it has no large quotients when Euclid's algorithm is used to find
+ gcd(a, m) [3.3.3]
+ it passes the spectral test [3.3.4]
+ it passes several tests of [3.3.2]
+ c has no factor in common with m (c=1 or c=a can be good)
+ m is large (2^30)
+ is a power of 2 [3.2.1.1]
+
+The least significant digits of the generated number are not very
+random. It should be regarded as a random fraction X/m. To get a
+random integer between 0 and n-1, multiply X/m by n and truncate.
+(Don't use X/n [ex 3.4.1-3])
+
+The ``accuracy'' in t dimensions is one part in ``the t'th root of m'' [3.3.4].
+
+Don't generate more than about m/1000 numbers without changing a, c, or m.
+
+The sequence length depends on chosen a,c,m.
+
+
+LC algorithm (2).
+
+ X = a * (X mod q) - r * (long) (X/q)
+ if X<0 then X+=m
+
+[Knuth, pp. 185-186.]
+
+ X is the seed and the result
+ as a seed is nonzero and less than m
+ a is a primitive root of m (which means that a^2 <= m)
+ q is (long) m / a
+ r is m mod a
+ m is a prime number near the largest easily computed integer
+
+which gives
+
+ X = a * (X % ((long) m / a)) -
+ (M % a) * ((long) (X / ((long) m / a)))
+
+Since m is prime, the least-significant bits of X are just as random as
+the most-significant bits. */
+
+/* Blum, Blum, and Shub.
+
+ [Bruce Schneier, "Applied Cryptography", Second Edition, John Wiley
+ & Sons, Inc., 1996, pp. 417-418.]
+
+ "Find two large prime numbers, p and q, which are congruent to 3
+ modulo 4. The product of those numbers, n, is a blum integer.
+ Choose another random integer, x, which is relatively prime to n.
+ Compute
+ x[0] = x^2 mod n
+ That's the seed for the generator."
+
+ To generate a random bit, compute
+ x[i] = x[i-1]^2 mod n
+ The least significant bit of x[i] is the one we want.
+
+ We can use more than one bit from x[i], namely the
+ log2(bitlength of x[i])
+ least significant bits of x[i].
+
+ So, for a 32-bit seed we get 5 bits per computation.
+
+ The non-predictability of this generator is based on the difficulty
+ of factoring n.
+ */
+
+/* -------------------------------------------------- */
+
+/* lc (rp, state) -- Generate next number in LC sequence. Return the
+ number of valid bits in the result. NOTE: If 'm' is a power of 2
+ (m2exp != 0), discard the lower half of the result. */
+
+static
+unsigned long int
+#if __STDC__
+lc (mp_ptr rp, gmp_randstate_t rstate)
+#else
+lc (rp, rstate)
+ mp_ptr rp;
+ gmp_randstate_t rstate;
+#endif
+{
+ mp_ptr tp, seedp, ap;
+ mp_size_t ta;
+ mp_size_t tn, seedn, an;
+ mp_size_t retval;
+ int shiftcount = 0;
+ unsigned long int m2exp;
+ mp_limb_t c;
+ TMP_DECL (mark);
+
+ m2exp = rstate->algdata.lc->m2exp;
+ c = (mp_limb_t) rstate->algdata.lc->c;
+
+ seedp = PTR (rstate->seed);
+ seedn = SIZ (rstate->seed);
+
+ if (seedn == 0)
+ {
+ /* Seed is 0. Result is C % M. */
+ *rp = c;
+
+ if (m2exp != 0)
+ {
+ /* M is a power of 2. */
+ if (m2exp < BITS_PER_MP_LIMB)
+ {
+ /* Only necessary when M may be smaller than C. */
+ *rp &= (((mp_limb_t) 1 << m2exp) - 1);
+ }
+ }
+ else
+ {
+ /* M is not a power of 2. */
+ abort (); /* FIXME. */
+ }
+
+ /* Save result as next seed. */
+ *seedp = *rp;
+ SIZ (rstate->seed) = 1;
+ return BITS_PER_MP_LIMB;
+ }
+
+ ap = PTR (rstate->algdata.lc->a);
+ an = SIZ (rstate->algdata.lc->a);
+
+ /* Allocate temporary storage. Let there be room for calculation of
+ (A * seed + C) % M, or M if bigger than that. */
+
+ ASSERT_ALWAYS (m2exp != 0); /* FIXME. */
+
+ TMP_MARK (mark);
+ ta = an + seedn + 1;
+ tp = (mp_ptr) TMP_ALLOC (ta * BYTES_PER_MP_LIMB);
+ MPN_ZERO (tp, ta);
+
+ /* t = a * seed */
+ if (seedn >= an)
+ mpn_mul_basecase (tp, seedp, seedn, ap, an);
+ else
+ mpn_mul_basecase (tp, ap, an, seedp, seedn);
+ tn = an + seedn;
+
+ /* t = t + c */
+ mpn_incr_u (tp, c);
+
+ /* t = t % m */
+ if (m2exp != 0)
+ {
+ /* M is a power of 2. The mod operation is trivial. */
+
+ tp[m2exp / BITS_PER_MP_LIMB] &= ((mp_limb_t) 1 << m2exp % BITS_PER_MP_LIMB) - 1;
+ tn = (m2exp + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB;
+ }
+ else
+ {
+ abort (); /* FIXME. */
+ }
+
+ /* Save result as next seed. */
+ MPN_COPY (PTR (rstate->seed), tp, tn);
+ SIZ (rstate->seed) = tn;
+
+ if (m2exp != 0)
+ {
+ /* Discard the lower half of the result. */
+ unsigned long int discardb = m2exp / 2;
+ mp_size_t discardl = discardb / BITS_PER_MP_LIMB;
+
+ tn -= discardl;
+ if (tn > 0)
+ {
+ if (discardb % BITS_PER_MP_LIMB != 0)
+ {
+ mpn_rshift (tp, tp + discardl, tn, discardb % BITS_PER_MP_LIMB);
+ MPN_COPY (rp, tp, (discardb + BITS_PER_MP_LIMB -1) / BITS_PER_MP_LIMB);
+ }
+ else /* Even limb boundary. */
+ MPN_COPY_INCR (rp, tp + discardl, tn);
+ }
+ }
+ else
+ {
+ MPN_COPY (rp, tp, tn);
+ }
+
+ TMP_FREE (mark);
+
+ /* Return number of valid bits in the result. */
+ if (m2exp != 0)
+ retval = (m2exp + 1) / 2;
+ else
+ retval = SIZ (rstate->algdata.lc->m) * BITS_PER_MP_LIMB - shiftcount;
+ return retval;
+}
+
+#ifdef RAWRANDEBUG
+/* Set even bits to EVENBITS and odd bits to ! EVENBITS in RP.
+ Number of bits is m2exp in state. */
+/* FIXME: Remove. */
+unsigned long int
+lc_test (mp_ptr rp, gmp_randstate_t s, const int evenbits)
+{
+ unsigned long int rn, nbits;
+ int f;
+
+ nbits = s->algdata.lc->m2exp / 2;
+ rn = nbits / BITS_PER_MP_LIMB + (nbits % BITS_PER_MP_LIMB != 0);
+ MPN_ZERO (rp, rn);
+
+ for (f = 0; f < nbits; f++)
+ {
+ mpn_lshift (rp, rp, rn, 1);
+ if (f % 2 == ! evenbits)
+ rp[0] += 1;
+ }
+
+ return nbits;
+}
+#endif /* RAWRANDEBUG */
+
+void
+#if __STDC__
+_gmp_rand (mp_ptr rp, gmp_randstate_t rstate, unsigned long int nbits)
+#else
+_gmp_rand (rp, rstate, nbits)
+ mp_ptr rp;
+ gmp_randstate_t rstate;
+ unsigned long int nbits;
+#endif
+{
+ mp_size_t rn; /* Size of R. */
+
+ rn = (nbits + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB;
+
+ switch (rstate->alg)
+ {
+ case GMP_RAND_ALG_LC:
+ {
+ unsigned long int rbitpos;
+ int chunk_nbits;
+ mp_ptr tp;
+ mp_size_t tn;
+ TMP_DECL (lcmark);
+
+ TMP_MARK (lcmark);
+
+ chunk_nbits = rstate->algdata.lc->m2exp / 2;
+ tn = (chunk_nbits + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB;
+
+ tp = (mp_ptr) TMP_ALLOC (tn * BYTES_PER_MP_LIMB);
+
+ rbitpos = 0;
+ while (rbitpos + chunk_nbits <= nbits)
+ {
+ mp_ptr r2p = rp + rbitpos / BITS_PER_MP_LIMB;
+
+ if (rbitpos % BITS_PER_MP_LIMB != 0)
+ {
+ mp_limb_t savelimb, rcy;
+		/* Target of new chunk is not bit aligned.  Use temp space
+ and align things by shifting it up. */
+ lc (tp, rstate);
+ savelimb = r2p[0];
+ rcy = mpn_lshift (r2p, tp, tn, rbitpos % BITS_PER_MP_LIMB);
+ r2p[0] |= savelimb;
+/* bogus */ if ((chunk_nbits % BITS_PER_MP_LIMB + rbitpos % BITS_PER_MP_LIMB)
+ > BITS_PER_MP_LIMB)
+ r2p[tn] = rcy;
+ }
+ else
+ {
+		/* Target of new chunk is bit aligned.  Let `lc' put bits
+ directly into our target variable. */
+ lc (r2p, rstate);
+ }
+ rbitpos += chunk_nbits;
+ }
+
+ /* Handle last [0..chunk_nbits) bits. */
+ if (rbitpos != nbits)
+ {
+ mp_ptr r2p = rp + rbitpos / BITS_PER_MP_LIMB;
+ int last_nbits = nbits - rbitpos;
+ tn = (last_nbits + BITS_PER_MP_LIMB - 1) / BITS_PER_MP_LIMB;
+ lc (tp, rstate);
+ if (rbitpos % BITS_PER_MP_LIMB != 0)
+ {
+ mp_limb_t savelimb, rcy;
+	      /* Target of new chunk is not bit aligned.  Use temp space
+ and align things by shifting it up. */
+ savelimb = r2p[0];
+ rcy = mpn_lshift (r2p, tp, tn, rbitpos % BITS_PER_MP_LIMB);
+ r2p[0] |= savelimb;
+ if (rbitpos + tn * BITS_PER_MP_LIMB - rbitpos % BITS_PER_MP_LIMB < nbits)
+ r2p[tn] = rcy;
+ }
+ else
+ {
+ MPN_COPY (r2p, tp, tn);
+ }
+ /* Mask off top bits if needed. */
+ if (nbits % BITS_PER_MP_LIMB != 0)
+ rp[nbits / BITS_PER_MP_LIMB]
+ &= ~ ((~(mp_limb_t) 0) << nbits % BITS_PER_MP_LIMB);
+ }
+
+ TMP_FREE (lcmark);
+ break;
+ }
+
+ default:
+ gmp_errno |= GMP_ERROR_UNSUPPORTED_ARGUMENT;
+ break;
+ }
+}
diff --git a/rts/gmp/randsd.c b/rts/gmp/randsd.c
new file mode 100644
index 0000000000..3bed14b578
--- /dev/null
+++ b/rts/gmp/randsd.c
@@ -0,0 +1,37 @@
+/* gmp_randseed (state, seed) -- Set initial seed SEED in random state
+ STATE.
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+gmp_randseed (gmp_randstate_t rstate,
+ mpz_t seed)
+#else
+gmp_randseed (rstate, seed)
+ gmp_randstate_t rstate;
+ mpz_t seed;
+#endif
+{
+ mpz_set (rstate->seed, seed);
+}
diff --git a/rts/gmp/randsdui.c b/rts/gmp/randsdui.c
new file mode 100644
index 0000000000..92f412f3ea
--- /dev/null
+++ b/rts/gmp/randsdui.c
@@ -0,0 +1,37 @@
+/* gmp_randseed_ui (state, seed) -- Set initial seed SEED in random
+ state STATE.
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+void
+#if __STDC__
+gmp_randseed_ui (gmp_randstate_t rstate,
+ unsigned long int seed)
+#else
+gmp_randseed_ui (rstate, seed)
+ gmp_randstate_t rstate;
+     unsigned long int seed;
+#endif
+{
+ mpz_set_ui (rstate->seed, seed);
+}
diff --git a/rts/gmp/stack-alloc.c b/rts/gmp/stack-alloc.c
new file mode 100644
index 0000000000..9ab98fe5f9
--- /dev/null
+++ b/rts/gmp/stack-alloc.c
@@ -0,0 +1,136 @@
+/* Stack allocation routines. This is intended for machines without support
+ for the `alloca' function.
+
+Copyright (C) 1996, 1997, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "stack-alloc.h"
+
+#define __need_size_t
+#include <stddef.h>
+#undef __need_size_t
+
+/* gmp-impl.h and stack-alloc.h conflict when not USE_STACK_ALLOC, so these
+ declarations are copied here */
+#if __STDC__
+extern void * (*__gmp_allocate_func) (size_t);
+extern void (*__gmp_free_func) (void *, size_t);
+#else
+extern void * (*__gmp_allocate_func) ();
+extern void (*__gmp_free_func) ();
+#endif
+
+typedef struct tmp_stack tmp_stack;
+
+static unsigned long max_total_allocation = 0;
+static unsigned long current_total_allocation = 0;
+
+static tmp_stack xxx = {&xxx, &xxx, 0};
+static tmp_stack *current = &xxx;
+
+/* The rounded size of the header of each allocation block. */
+#define HSIZ ((sizeof (tmp_stack) + __TMP_ALIGN - 1) & -__TMP_ALIGN)
+
+/* Allocate a block of exactly <size> bytes. This should only be called
+ through the TMP_ALLOC macro, which takes care of rounding/alignment. */
+void *
+#if __STDC__
+__gmp_tmp_alloc (unsigned long size)
+#else
+__gmp_tmp_alloc (size)
+ unsigned long size;
+#endif
+{
+ void *that;
+
+ if (size > (char *) current->end - (char *) current->alloc_point)
+ {
+ void *chunk;
+ tmp_stack *header;
+ unsigned long chunk_size;
+ unsigned long now;
+
+ /* Allocate a chunk that makes the total current allocation somewhat
+ larger than the maximum allocation ever. If size is very large, we
+ allocate that much. */
+
+ now = current_total_allocation + size;
+ if (now > max_total_allocation)
+ {
+ /* We need more temporary memory than ever before. Increase
+ for future needs. */
+ now = now * 3 / 2;
+ chunk_size = now - current_total_allocation + HSIZ;
+ current_total_allocation = now;
+ max_total_allocation = current_total_allocation;
+ }
+ else
+ {
+ chunk_size = max_total_allocation - current_total_allocation + HSIZ;
+ current_total_allocation = max_total_allocation;
+ }
+
+ chunk = (*__gmp_allocate_func) (chunk_size);
+ header = (tmp_stack *) chunk;
+ header->end = (char *) chunk + chunk_size;
+ header->alloc_point = (char *) chunk + HSIZ;
+ header->prev = current;
+ current = header;
+ }
+
+ that = current->alloc_point;
+ current->alloc_point = (char *) that + size;
+ return that;
+}
+
+/* Typically called at function entry. <mark> is assigned so that
+ __gmp_tmp_free can later be used to reclaim all subsequently allocated
+ storage. */
+void
+#if __STDC__
+__gmp_tmp_mark (tmp_marker *mark)
+#else
+__gmp_tmp_mark (mark)
+ tmp_marker *mark;
+#endif
+{
+ mark->which_chunk = current;
+ mark->alloc_point = current->alloc_point;
+}
+
+/* Free everything allocated since <mark> was assigned by __gmp_tmp_mark */
+void
+#if __STDC__
+__gmp_tmp_free (tmp_marker *mark)
+#else
+__gmp_tmp_free (mark)
+ tmp_marker *mark;
+#endif
+{
+ while (mark->which_chunk != current)
+ {
+ tmp_stack *tmp;
+
+ tmp = current;
+ current = tmp->prev;
+ current_total_allocation -= (((char *) (tmp->end) - (char *) tmp) - HSIZ);
+ (*__gmp_free_func) (tmp, (char *) tmp->end - (char *) tmp);
+ }
+ current->alloc_point = mark->alloc_point;
+}
diff --git a/rts/gmp/stack-alloc.h b/rts/gmp/stack-alloc.h
new file mode 100644
index 0000000000..f59beec266
--- /dev/null
+++ b/rts/gmp/stack-alloc.h
@@ -0,0 +1,64 @@
+/* Stack allocation routines. This is intended for machines without support
+ for the `alloca' function.
+
+Copyright (C) 1996, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+struct tmp_stack
+{
+ void *end;
+ void *alloc_point;
+ struct tmp_stack *prev;
+};
+
+struct tmp_marker
+{
+ struct tmp_stack *which_chunk;
+ void *alloc_point;
+};
+
+typedef struct tmp_marker tmp_marker;
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#if __STDC__
+void *__gmp_tmp_alloc (unsigned long);
+void __gmp_tmp_mark (tmp_marker *);
+void __gmp_tmp_free (tmp_marker *);
+#else
+void *__gmp_tmp_alloc ();
+void __gmp_tmp_mark ();
+void __gmp_tmp_free ();
+#endif
+
+#if defined (__cplusplus)
+}
+#endif
+
+#ifndef __TMP_ALIGN
+#define __TMP_ALIGN 8
+#endif
+
+#define TMP_DECL(marker) tmp_marker marker
+#define TMP_ALLOC(size) \
+ __gmp_tmp_alloc (((unsigned long) (size) + __TMP_ALIGN - 1) & -__TMP_ALIGN)
+#define TMP_MARK(marker) __gmp_tmp_mark (&marker)
+#define TMP_FREE(marker) __gmp_tmp_free (&marker)
diff --git a/rts/gmp/stamp-h.in b/rts/gmp/stamp-h.in
new file mode 100644
index 0000000000..9788f70238
--- /dev/null
+++ b/rts/gmp/stamp-h.in
@@ -0,0 +1 @@
+timestamp
diff --git a/rts/gmp/stamp-vti b/rts/gmp/stamp-vti
new file mode 100644
index 0000000000..e3186186b2
--- /dev/null
+++ b/rts/gmp/stamp-vti
@@ -0,0 +1,3 @@
+@set UPDATED 5 October 2000
+@set EDITION 3.1.1
+@set VERSION 3.1.1
diff --git a/rts/gmp/urandom.h b/rts/gmp/urandom.h
new file mode 100644
index 0000000000..313479e8b7
--- /dev/null
+++ b/rts/gmp/urandom.h
@@ -0,0 +1,86 @@
+/* urandom.h -- define urandom returning a full unsigned long random value.
+
+Copyright (C) 1995, 1996, 1997, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#if defined (__hpux) || defined (__svr4__) || defined (__SVR4)
+/* HPUX lacks random(). */
+static inline mp_limb_t
+urandom ()
+{
+ return mrand48 ();
+}
+#define __URANDOM
+#endif
+
+#if defined(_WIN32) && !(defined(__CYGWIN__) || defined(__CYGWIN32__))
+/* MS CRT supplies just the poxy rand(), with an upper bound of 0x7fff */
+static inline unsigned long
+urandom ()
+{
+  return rand () ^ ((unsigned long) rand () << 15) ^ ((unsigned long) rand () << 30);
+}
+#define __URANDOM
+#endif
+
+#if defined (__alpha) && !defined (__URANDOM)
+/* DEC OSF/1 1.2 random() returns a double. */
+long mrand48 ();
+static inline mp_limb_t
+urandom ()
+{
+ return mrand48 () | (mrand48 () << 32);
+}
+#define __URANDOM
+#endif
+
+#if BITS_PER_MP_LIMB == 32 && !defined (__URANDOM)
+#if defined (__cplusplus)
+extern "C" {
+#endif
+long random ();
+#if defined (__cplusplus)
+}
+#endif
+static inline mp_limb_t
+urandom ()
+{
+ /* random() returns 31 bits, we want 32. */
+ return random () ^ (random () << 1);
+}
+#define __URANDOM
+#endif
+
+#if BITS_PER_MP_LIMB == 64 && !defined (__URANDOM)
+#if defined (__cplusplus)
+extern "C" {
+#endif
+long random ();
+#if defined (__cplusplus)
+}
+#endif
+static inline mp_limb_t
+urandom ()
+{
+ /* random() returns 31 bits, we want 64. */
+ return random () ^ ((mp_limb_t) random () << 31) ^ ((mp_limb_t) random () << 62);
+}
+#define __URANDOM
+#endif
+
diff --git a/rts/gmp/version.c b/rts/gmp/version.c
new file mode 100644
index 0000000000..9d544ee1d8
--- /dev/null
+++ b/rts/gmp/version.c
@@ -0,0 +1,26 @@
+/* gmp_version -- version number compiled into the library */
+
+/*
+Copyright (C) 1996, 1999, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library; see the file COPYING.LIB. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+MA 02111-1307, USA. */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+const char *gmp_version = VERSION;
diff --git a/rts/gmp/version.texi b/rts/gmp/version.texi
new file mode 100644
index 0000000000..e3186186b2
--- /dev/null
+++ b/rts/gmp/version.texi
@@ -0,0 +1,3 @@
+@set UPDATED 5 October 2000
+@set EDITION 3.1.1
+@set VERSION 3.1.1
diff --git a/rts/hooks/FlagDefaults.c b/rts/hooks/FlagDefaults.c
new file mode 100644
index 0000000000..393d39bc39
--- /dev/null
+++ b/rts/hooks/FlagDefaults.c
@@ -0,0 +1,20 @@
+/* -----------------------------------------------------------------------------
+ *
+ * User-overridable RTS hooks.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+
+void
+defaultsHook (void)
+{ /* this is called *after* RTSflags has had
+ its defaults set, but *before* we start
+ processing the RTS command-line options.
+
+ This default version does *nothing*.
+ The user may provide a more interesting
+ one.
+ */
+}
+
diff --git a/rts/hooks/InitEachPE.c b/rts/hooks/InitEachPE.c
new file mode 100644
index 0000000000..cc9cdc0dba
--- /dev/null
+++ b/rts/hooks/InitEachPE.c
@@ -0,0 +1,23 @@
+/* -----------------------------------------------------------------------------
+ *
+ * User-overridable RTS hooks.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+
+#ifdef PAR
+void
+InitEachPEHook (void)
+{ /* In a GUM setup this is called on each
+ PE immediately before SynchroniseSystem.
+ It can be used to read in static data
+ to each PE which has to be available to
+ each PE. See GPH-Maple as an example how to
+ use this in combination with foreign language
+ code:
+ http://www.risc.uni-linz.ac.at/software/ghc-maple/
+ -- HWL
+ */
+}
+#endif
diff --git a/rts/hooks/MallocFail.c b/rts/hooks/MallocFail.c
new file mode 100644
index 0000000000..1218d1d8d0
--- /dev/null
+++ b/rts/hooks/MallocFail.c
@@ -0,0 +1,16 @@
+/* -----------------------------------------------------------------------------
+ *
+ * User-overridable RTS hooks.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+
+#include <stdio.h>
+
+void
+MallocFailHook (lnat request_size /* in bytes */, char *msg)
+{
+ fprintf(stderr, "malloc: failed on request for %lu bytes; message: %s\n", request_size, msg);
+}
+
diff --git a/rts/hooks/OnExit.c b/rts/hooks/OnExit.c
new file mode 100644
index 0000000000..dd4c3b4bb0
--- /dev/null
+++ b/rts/hooks/OnExit.c
@@ -0,0 +1,19 @@
+/* -----------------------------------------------------------------------------
+ *
+ * User-overridable RTS hooks.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+
+/* Note: by the time this hook has been called, Haskell land
+ * will have been shut down completely.
+ *
+ * ToDo: feed the hook info on whether we're shutting down as a result
+ * of termination or run-time error ?
+ */
+
+void
+OnExitHook ()
+{
+}
diff --git a/rts/hooks/OutOfHeap.c b/rts/hooks/OutOfHeap.c
new file mode 100644
index 0000000000..98db0d7d49
--- /dev/null
+++ b/rts/hooks/OutOfHeap.c
@@ -0,0 +1,19 @@
+/* -----------------------------------------------------------------------------
+ *
+ * User-overridable RTS hooks.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+#include <stdio.h>
+
+void
+OutOfHeapHook (lnat request_size, lnat heap_size) /* both sizes in bytes */
+{
+ /* fprintf(stderr, "Heap exhausted;\nwhile trying to allocate %lu bytes in a %lu-byte heap;\nuse `+RTS -H<size>' to increase the total heap size.\n", */
+
+ (void)request_size; /* keep gcc -Wall happy */
+ fprintf(stderr, "Heap exhausted;\nCurrent maximum heap size is %lu bytes (%lu Mb);\nuse `+RTS -M<size>' to increase it.\n",
+ heap_size, heap_size / (1024*1024));
+}
+
diff --git a/rts/hooks/RtsOpts.c b/rts/hooks/RtsOpts.c
new file mode 100644
index 0000000000..b934b05f1b
--- /dev/null
+++ b/rts/hooks/RtsOpts.c
@@ -0,0 +1,13 @@
+/* -----------------------------------------------------------------------------
+ *
+ * Default RTS options.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+
+#include <stdlib.h>
+
+// Default RTS options can be given by providing an alternate
+// definition for this variable, pointing to a string of RTS options.
+char *ghc_rts_opts = NULL;
diff --git a/rts/hooks/ShutdownEachPEHook.c b/rts/hooks/ShutdownEachPEHook.c
new file mode 100644
index 0000000000..f5e3ba9344
--- /dev/null
+++ b/rts/hooks/ShutdownEachPEHook.c
@@ -0,0 +1,19 @@
+/* -----------------------------------------------------------------------------
+ *
+ * User-overridable RTS hooks.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+
+#ifdef PAR
+void
+ShutdownEachPEHook (void)
+{ /* In a GUM setup this routine is called at the end of
+ shutdownParallelSystem on each PE. Useful for
+ cleaning up stuff, especially when interfacing
+ with foreign language code.
+ -- HWL
+ */
+}
+#endif
diff --git a/rts/hooks/StackOverflow.c b/rts/hooks/StackOverflow.c
new file mode 100644
index 0000000000..a395a3a1a5
--- /dev/null
+++ b/rts/hooks/StackOverflow.c
@@ -0,0 +1,16 @@
+/* -----------------------------------------------------------------------------
+ *
+ * User-overridable RTS hooks.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+
+#include <stdio.h>
+
+void
+StackOverflowHook (lnat stack_size) /* in bytes */
+{
+    fprintf(stderr, "Stack space overflow: current size %lu bytes.\nUse `+RTS -Ksize' to increase it.\n", stack_size);
+}
+
diff --git a/rts/package.conf.in b/rts/package.conf.in
new file mode 100644
index 0000000000..935b71d6a6
--- /dev/null
+++ b/rts/package.conf.in
@@ -0,0 +1,152 @@
+/* The RTS is just another package! */
+
+#include "ghcconfig.h"
+#include "RtsConfig.h"
+
+name: PACKAGE
+version: 1.0
+license: BSD3
+maintainer: glasgow-haskell-users@haskell.org
+exposed: True
+
+exposed-modules:
+hidden-modules:
+
+import-dirs:
+
+#ifdef INSTALLING
+library-dirs: LIB_DIR
+# ifdef mingw32_HOST_OS
+ , LIB_DIR"/gcc-lib"
+ /* force the dist-provided gcc-lib/ into scope. */
+# endif
+#else /* !INSTALLING */
+library-dirs: FPTOOLS_TOP_ABS"/rts"
+# if !defined(HAVE_LIBGMP) && !defined(HAVE_FRAMEWORK_GMP)
+ , FPTOOLS_TOP_ABS"/rts/gmp"
+# endif
+#endif
+
+hs-libraries: "HSrts"
+
+extra-libraries: "m" /* for ldexp() */
+#ifndef HAVE_FRAMEWORK_GMP
+ , "gmp"
+#ifdef HAVE_LIBDL
+ , "dl"
+#endif
+#endif
+#ifdef HAVE_LIBRT
+ , "rt"
+#endif
+#ifdef mingw32_HOST_OS
+ ,"wsock32" /* for the linker */
+#endif
+#ifdef WANT_DOTNET_SUPPORT
+ , "oleaut32", "ole32", "uuid"
+#endif
+#if defined(DEBUG) && defined(HAVE_LIBBFD)
+ ,"bfd", "iberty" /* for debugging */
+#endif
+#ifdef HAVE_LIBMINGWEX
+# ifndef INSTALLING /* Bundled Mingw is behind */
+ ,"mingwex"
+# endif
+#endif
+
+#ifdef INSTALLING
+include-dirs: INCLUDE_DIR
+# ifdef mingw32_HOST_OS
+ , INCLUDE_DIR"/mingw"
+# endif
+#else /* !INSTALLING */
+include-dirs: FPTOOLS_TOP_ABS"/includes"
+#endif
+
+includes: Stg.h
+depends:
+hugs-options:
+cc-options:
+
+ld-options:
+#ifdef LEADING_UNDERSCORE
+ "-u", "_GHCziBase_Izh_static_info"
+ , "-u", "_GHCziBase_Czh_static_info"
+ , "-u", "_GHCziFloat_Fzh_static_info"
+ , "-u", "_GHCziFloat_Dzh_static_info"
+ , "-u", "_GHCziPtr_Ptr_static_info"
+ , "-u", "_GHCziWord_Wzh_static_info"
+ , "-u", "_GHCziInt_I8zh_static_info"
+ , "-u", "_GHCziInt_I16zh_static_info"
+ , "-u", "_GHCziInt_I32zh_static_info"
+ , "-u", "_GHCziInt_I64zh_static_info"
+ , "-u", "_GHCziWord_W8zh_static_info"
+ , "-u", "_GHCziWord_W16zh_static_info"
+ , "-u", "_GHCziWord_W32zh_static_info"
+ , "-u", "_GHCziWord_W64zh_static_info"
+ , "-u", "_GHCziStable_StablePtr_static_info"
+ , "-u", "_GHCziBase_Izh_con_info"
+ , "-u", "_GHCziBase_Czh_con_info"
+ , "-u", "_GHCziFloat_Fzh_con_info"
+ , "-u", "_GHCziFloat_Dzh_con_info"
+ , "-u", "_GHCziPtr_Ptr_con_info"
+ , "-u", "_GHCziPtr_FunPtr_con_info"
+ , "-u", "_GHCziStable_StablePtr_con_info"
+ , "-u", "_GHCziBase_False_closure"
+ , "-u", "_GHCziBase_True_closure"
+ , "-u", "_GHCziPack_unpackCString_closure"
+ , "-u", "_GHCziIOBase_stackOverflow_closure"
+ , "-u", "_GHCziIOBase_heapOverflow_closure"
+ , "-u", "_GHCziIOBase_NonTermination_closure"
+ , "-u", "_GHCziIOBase_BlockedOnDeadMVar_closure"
+ , "-u", "_GHCziIOBase_BlockedIndefinitely_closure"
+ , "-u", "_GHCziIOBase_Deadlock_closure"
+ , "-u", "_GHCziIOBase_NestedAtomically_closure"
+ , "-u", "_GHCziWeak_runFinalizzerBatch_closure"
+#else
+ "-u", "GHCziBase_Izh_static_info"
+ , "-u", "GHCziBase_Czh_static_info"
+ , "-u", "GHCziFloat_Fzh_static_info"
+ , "-u", "GHCziFloat_Dzh_static_info"
+ , "-u", "GHCziPtr_Ptr_static_info"
+ , "-u", "GHCziWord_Wzh_static_info"
+ , "-u", "GHCziInt_I8zh_static_info"
+ , "-u", "GHCziInt_I16zh_static_info"
+ , "-u", "GHCziInt_I32zh_static_info"
+ , "-u", "GHCziInt_I64zh_static_info"
+ , "-u", "GHCziWord_W8zh_static_info"
+ , "-u", "GHCziWord_W16zh_static_info"
+ , "-u", "GHCziWord_W32zh_static_info"
+ , "-u", "GHCziWord_W64zh_static_info"
+ , "-u", "GHCziStable_StablePtr_static_info"
+ , "-u", "GHCziBase_Izh_con_info"
+ , "-u", "GHCziBase_Czh_con_info"
+ , "-u", "GHCziFloat_Fzh_con_info"
+ , "-u", "GHCziFloat_Dzh_con_info"
+ , "-u", "GHCziPtr_Ptr_con_info"
+ , "-u", "GHCziPtr_FunPtr_con_info"
+ , "-u", "GHCziStable_StablePtr_con_info"
+ , "-u", "GHCziBase_False_closure"
+ , "-u", "GHCziBase_True_closure"
+ , "-u", "GHCziPack_unpackCString_closure"
+ , "-u", "GHCziIOBase_stackOverflow_closure"
+ , "-u", "GHCziIOBase_heapOverflow_closure"
+ , "-u", "GHCziIOBase_NonTermination_closure"
+ , "-u", "GHCziIOBase_BlockedOnDeadMVar_closure"
+ , "-u", "GHCziIOBase_BlockedIndefinitely_closure"
+ , "-u", "GHCziIOBase_Deadlock_closure"
+ , "-u", "GHCziIOBase_NestedAtomically_closure"
+ , "-u", "GHCziWeak_runFinalizzerBatch_closure"
+#endif
+
+framework-dirs:
+
+#ifdef HAVE_FRAMEWORK_GMP
+frameworks: "GMP"
+#else
+frameworks:
+#endif
+
+haddock-interfaces:
+haddock-html:
+
diff --git a/rts/parallel/0Hash.c b/rts/parallel/0Hash.c
new file mode 100644
index 0000000000..a471e30a66
--- /dev/null
+++ b/rts/parallel/0Hash.c
@@ -0,0 +1,320 @@
+/*-----------------------------------------------------------------------------
+ *
+ * (c) The AQUA Project, Glasgow University, 1995-1998
+ * (c) The GHC Team, 1999
+ *
+ * Dynamically expanding linear hash tables, as described in
+ * Per-\AAke Larson, ``Dynamic Hash Tables,'' CACM 31(4), April 1988,
+ * pp. 446 -- 457.
+ * -------------------------------------------------------------------------- */
+
+/*
+ Replaced with ghc/rts/Hash.c in the new RTS
+*/
+
+#if 0
+
+#include "Rts.h"
+#include "Hash.h"
+#include "RtsUtils.h"
+
+#define HSEGSIZE 1024 /* Size of a single hash table segment */
+ /* Also the minimum size of a hash table */
+#define HDIRSIZE 1024 /* Size of the segment directory */
+ /* Maximum hash table size is HSEGSIZE * HDIRSIZE */
+#define HLOAD 5 /* Maximum average load of a single hash bucket */
+
+#define HCHUNK (1024 * sizeof(W_) / sizeof(HashList))
+ /* Number of HashList cells to allocate in one go */
+
+
+/* Linked list of (key, data) pairs for separate chaining */
+struct hashlist {
+ StgWord key;
+ void *data;
+ struct hashlist *next; /* Next cell in bucket chain (same hash value) */
+};
+
+typedef struct hashlist HashList;
+
+struct hashtable {
+ int split; /* Next bucket to split when expanding */
+ int max; /* Max bucket of smaller table */
+ int mask1; /* Mask for doing the mod of h_1 (smaller table) */
+ int mask2; /* Mask for doing the mod of h_2 (larger table) */
+ int kcount; /* Number of keys */
+ int bcount; /* Number of buckets */
+ HashList **dir[HDIRSIZE]; /* Directory of segments */
+};
+
+/* -----------------------------------------------------------------------------
+ * Hash first using the smaller table. If the bucket is less than the
+ * next bucket to be split, re-hash using the larger table.
+ * -------------------------------------------------------------------------- */
+
+static int
+hash(HashTable *table, W_ key)
+{
+ int bucket;
+
+ /* Strip the boring zero bits */
+ key /= sizeof(StgWord);
+
+ /* Mod the size of the hash table (a power of 2) */
+ bucket = key & table->mask1;
+
+ if (bucket < table->split) {
+ /* Mod the size of the expanded hash table (also a power of 2) */
+ bucket = key & table->mask2;
+ }
+ return bucket;
+}
+
+/* -----------------------------------------------------------------------------
+ * Allocate a new segment of the dynamically growing hash table.
+ * -------------------------------------------------------------------------- */
+
+static void
+allocSegment(HashTable *table, int segment)
+{
+ table->dir[segment] = stgMallocBytes(HSEGSIZE * sizeof(HashList *),
+ "allocSegment");
+}
+
+
+/* -----------------------------------------------------------------------------
+ * Expand the larger hash table by one bucket, and split one bucket
+ * from the smaller table into two parts. Only the bucket referenced
+ * by @table->split@ is affected by the expansion.
+ * -------------------------------------------------------------------------- */
+
+static void
+expand(HashTable *table)
+{
+ int oldsegment;
+ int oldindex;
+ int newbucket;
+ int newsegment;
+ int newindex;
+ HashList *hl;
+ HashList *next;
+ HashList *old, *new;
+
+ if (table->split + table->max >= HDIRSIZE * HSEGSIZE)
+ /* Wow! That's big. Too big, so don't expand. */
+ return;
+
+ /* Calculate indices of bucket to split */
+ oldsegment = table->split / HSEGSIZE;
+ oldindex = table->split % HSEGSIZE;
+
+ newbucket = table->max + table->split;
+
+ /* And the indices of the new bucket */
+ newsegment = newbucket / HSEGSIZE;
+ newindex = newbucket % HSEGSIZE;
+
+ if (newindex == 0)
+ allocSegment(table, newsegment);
+
+ if (++table->split == table->max) {
+ table->split = 0;
+ table->max *= 2;
+ table->mask1 = table->mask2;
+ table->mask2 = table->mask2 << 1 | 1;
+ }
+ table->bcount++;
+
+ /* Split the bucket, paying no attention to the original order */
+
+ old = new = NULL;
+ for (hl = table->dir[oldsegment][oldindex]; hl != NULL; hl = next) {
+ next = hl->next;
+ if (hash(table, hl->key) == newbucket) {
+ hl->next = new;
+ new = hl;
+ } else {
+ hl->next = old;
+ old = hl;
+ }
+ }
+ table->dir[oldsegment][oldindex] = old;
+ table->dir[newsegment][newindex] = new;
+
+ return;
+}
+
+void *
+lookupHashTable(HashTable *table, StgWord key)
+{
+ int bucket;
+ int segment;
+ int index;
+ HashList *hl;
+
+ bucket = hash(table, key);
+ segment = bucket / HSEGSIZE;
+ index = bucket % HSEGSIZE;
+
+ for (hl = table->dir[segment][index]; hl != NULL; hl = hl->next)
+ if (hl->key == key)
+ return hl->data;
+
+ /* It's not there */
+ return NULL;
+}
+
+/* -----------------------------------------------------------------------------
+ * We allocate the hashlist cells in large chunks to cut down on malloc
+ * overhead. Although we keep a free list of hashlist cells, we make
+ * no effort to actually return the space to the malloc arena.
+ * -------------------------------------------------------------------------- */
+
+static HashList *freeList = NULL;
+
+static HashList *
+allocHashList(void)
+{
+ HashList *hl, *p;
+
+ if ((hl = freeList) != NULL) {
+ freeList = hl->next;
+ } else {
+ hl = stgMallocBytes(HCHUNK * sizeof(HashList), "allocHashList");
+
+ freeList = hl + 1;
+ for (p = freeList; p < hl + HCHUNK - 1; p++)
+ p->next = p + 1;
+ p->next = NULL;
+ }
+ return hl;
+}
+
+static void
+freeHashList(HashList *hl)
+{
+ hl->next = freeList;
+ freeList = hl;
+}
+
+void
+insertHashTable(HashTable *table, StgWord key, void *data)
+{
+ int bucket;
+ int segment;
+ int index;
+ HashList *hl;
+
+ /* We want no duplicates */
+ ASSERT(lookupHashTable(table, key) == NULL);
+
+ /* When the average load gets too high, we expand the table */
+ if (++table->kcount >= HLOAD * table->bcount)
+ expand(table);
+
+ bucket = hash(table, key);
+ segment = bucket / HSEGSIZE;
+ index = bucket % HSEGSIZE;
+
+ hl = allocHashList();
+
+ hl->key = key;
+ hl->data = data;
+ hl->next = table->dir[segment][index];
+ table->dir[segment][index] = hl;
+
+}
+
+void *
+removeHashTable(HashTable *table, StgWord key, void *data)
+{
+ int bucket;
+ int segment;
+ int index;
+ HashList *hl;
+ HashList *prev = NULL;
+
+ bucket = hash(table, key);
+ segment = bucket / HSEGSIZE;
+ index = bucket % HSEGSIZE;
+
+ for (hl = table->dir[segment][index]; hl != NULL; hl = hl->next) {
+ if (hl->key == key && (data == NULL || hl->data == data)) {
+ if (prev == NULL)
+ table->dir[segment][index] = hl->next;
+ else
+ prev->next = hl->next;
+ table->kcount--;
+ return hl->data;
+ }
+ prev = hl;
+ }
+
+ /* It's not there */
+ ASSERT(data == NULL);
+ return NULL;
+}
+
+/* -----------------------------------------------------------------------------
+ * When we free a hash table, we are also good enough to free the
+ * data part of each (key, data) pair, as long as our caller can tell
+ * us how to do it.
+ * -------------------------------------------------------------------------- */
+
+void
+freeHashTable(HashTable *table, void (*freeDataFun)(void *) )
+{
+ long segment;
+ long index;
+ HashList *hl;
+ HashList *next;
+
+ /* The last bucket with something in it is table->max + table->split - 1 */
+ segment = (table->max + table->split - 1) / HSEGSIZE;
+ index = (table->max + table->split - 1) % HSEGSIZE;
+
+ while (segment >= 0) {
+ while (index >= 0) {
+ for (hl = table->dir[segment][index]; hl != NULL; hl = next) {
+ next = hl->next;
+ if (freeDataFun != NULL)
+ (*freeDataFun)(hl->data);
+ freeHashList(hl);
+ }
+ index--;
+ }
+ free(table->dir[segment]);
+ segment--;
+ index = HSEGSIZE - 1;
+ }
+ free(table);
+}
+
+/* -----------------------------------------------------------------------------
+ * When we initialize a hash table, we set up the first segment as well,
+ * initializing all of the first segment's hash buckets to NULL.
+ * -------------------------------------------------------------------------- */
+
+HashTable *
+allocHashTable(void)
+{
+ HashTable *table;
+ HashList **hb;
+
+ table = stgMallocBytes(sizeof(HashTable),"allocHashTable");
+
+ allocSegment(table, 0);
+
+ for (hb = table->dir[0]; hb < table->dir[0] + HSEGSIZE; hb++)
+ *hb = NULL;
+
+ table->split = 0;
+ table->max = HSEGSIZE;
+ table->mask1 = HSEGSIZE - 1;
+ table->mask2 = 2 * HSEGSIZE - 1;
+ table->kcount = 0;
+ table->bcount = HSEGSIZE;
+
+ return table;
+}
+#endif
diff --git a/rts/parallel/0Parallel.h b/rts/parallel/0Parallel.h
new file mode 100644
index 0000000000..d52bf00fc2
--- /dev/null
+++ b/rts/parallel/0Parallel.h
@@ -0,0 +1,414 @@
+/*
+ Time-stamp: <Mon Oct 04 1999 14:50:28 Stardate: [-30]3692.88 hwloidl>
+
+ Definitions for parallel machines.
+
+This section contains definitions applicable only to programs compiled
+to run on a parallel machine, i.e. on GUM. Some of these definitions
+are also used when simulating parallel execution, i.e. on GranSim.
+ */
+
+/*
+ ToDo: Check the PAR specfic part of this file
+ Move stuff into Closures.h and ClosureMacros.h
+ Clean-up GRAN specific code
+ -- HWL
+ */
+
+#ifndef PARALLEL_H
+#define PARALLEL_H
+
+#if defined(PAR) || defined(GRAN) /* whole file */
+
+#include "Rts.h"
+#include "GranSim.h"
+//#include "ClosureTypes.h"
+
+//@menu
+//* Basic definitions::
+//* Externs and types::
+//* Dummy defs::
+//* Par specific fixed headers::
+//* Parallel only heap objects::
+//* Packing definitions::
+//* End of File::
+//@end menu
+//*/
+
+//@node Basic definitions, Externs and types
+//@section Basic definitions
+
+/* SET_PAR_HDR and SET_STATIC_PAR_HDR now live in ClosureMacros.h */
+
+/* Needed for dumping routines */
+#if defined(PAR)
+# define TIME ullong
+# define CURRENT_TIME msTime()
+# define TIME_ON_PROC(p) msTime()
+# define CURRENT_PROC thisPE
+# define BINARY_STATS RtsFlags.ParFlags.granSimStats_Binary
+#elif defined(GRAN)
+# define TIME rtsTime
+# define CURRENT_TIME CurrentTime[CurrentProc]
+# define TIME_ON_PROC(p) CurrentTime[p]
+# define CURRENT_PROC CurrentProc
+# define BINARY_STATS RtsFlags.GranFlags.granSimStats_Binary
+#endif
+
+#if defined(PAR)
+# define MAX_PES 256 /* Maximum number of processors */
+ /* MAX_PES is enforced by SysMan, which does not
+ allow more than this many "processors".
+ This is important because PackGA [GlobAddr.lc]
+ **assumes** that a PE# can fit in 8+ bits.
+ */
+#endif
+
+//@node Externs and types, Dummy defs, Basic definitions
+//@section Externs and types
+
+#if defined(PAR)
+/* GUM: one spark queue on each PE, and each PE sees only its own spark queue */
+extern rtsSparkQ pending_sparks_hd;
+extern rtsSparkQ pending_sparks_tl;
+#elif defined(GRAN)
+/* GranSim: a globally visible array of spark queues */
+extern rtsSparkQ pending_sparks_hds[];
+extern rtsSparkQ pending_sparks_tls[];
+#endif
+extern unsigned int /* nat */ spark_queue_len(PEs proc);
+
+extern StgInt SparksAvail; /* How many sparks are available */
+
+/* prototypes of spark routines */
+/* ToDo: check whether all have to be visible -- HWL */
+#if defined(GRAN)
+rtsSpark *newSpark(StgClosure *node, StgInt name, StgInt gran_info, StgInt size_info, StgInt par_info, StgInt local);
+void disposeSpark(rtsSpark *spark);
+void disposeSparkQ(rtsSparkQ spark);
+void add_to_spark_queue(rtsSpark *spark);
+void delete_from_spark_queue (rtsSpark *spark);
+#endif
+
+#define STATS_FILENAME_MAXLEN 128
+
+/* Where to write the log file */
+//extern FILE *gr_file;
+extern char gr_filename[STATS_FILENAME_MAXLEN];
+
+#if defined(GRAN)
+int init_gr_simulation(char *rts_argv[], int rts_argc, char *prog_argv[], int prog_argc);
+void end_gr_simulation(void);
+#endif
+
+#if defined(PAR)
+extern I_ do_sp_profile;
+
+extern P_ PendingFetches;
+extern GLOBAL_TASK_ID *PEs;
+
+extern rtsBool IAmMainThread, GlobalStopPending;
+extern rtsBool fishing;
+extern GLOBAL_TASK_ID SysManTask;
+extern int seed; /*pseudo-random-number generator seed:*/
+ /*Initialised in ParInit*/
+extern I_ threadId; /*Number of Threads that have existed on a PE*/
+extern GLOBAL_TASK_ID mytid;
+
+extern int nPEs;
+
+extern rtsBool InGlobalGC; /* Are we in the midst of performing global GC */
+
+extern HashTable *pGAtoGALAtable;
+extern HashTable *LAtoGALAtable;
+extern GALA *freeIndirections;
+extern GALA *liveIndirections;
+extern GALA *freeGALAList;
+extern GALA *liveRemoteGAs;
+extern int thisPE;
+
+void RunParallelSystem (StgPtr program_closure);
+void initParallelSystem();
+void SynchroniseSystem();
+
+void registerTask (GLOBAL_TASK_ID gtid);
+globalAddr *LAGAlookup (P_ addr);
+P_ GALAlookup (globalAddr *ga);
+globalAddr *MakeGlobal (P_ addr, rtsBool preferred);
+globalAddr *setRemoteGA (P_ addr, globalAddr *ga, rtsBool preferred);
+void splitWeight (globalAddr *to, globalAddr *from);
+globalAddr *addWeight (globalAddr *ga);
+void initGAtables();
+W_ taskIDtoPE (GLOBAL_TASK_ID gtid);
+void RebuildLAGAtable();
+
+void *lookupHashTable (HashTable *table, StgWord key);
+void insertHashTable (HashTable *table, StgWord key, void *data);
+void freeHashTable (HashTable *table, void (*freeDataFun) ((void *data)));
+HashTable *allocHashTable();
+void *removeHashTable (HashTable *table, StgWord key, void *data);
+#endif /* PAR */
+
+/* Interface for dumping routines (i.e. writing to log file) */
+void DumpGranEvent(GranEventType name, StgTSO *tso);
+void DumpRawGranEvent(PEs proc, PEs p, GranEventType name,
+ StgTSO *tso, StgClosure *node, StgInt sparkname, StgInt len);
+//void DumpEndEvent(PEs proc, StgTSO *tso, rtsBool mandatory_thread);
+
+//@node Dummy defs, Par specific fixed headers, Externs and types
+//@section Dummy defs
+
+/*
+Get this out of the way. These are all null definitions.
+*/
+
+
+//# define GA_HDR_SIZE 0
+//# define GA(closure) /*nothing */
+
+//# define SET_GA(closure,ga) /* nothing */
+//# define SET_STATIC_GA(closure) /* nothing */
+//# define SET_GRAN_HDR(closure,pe) /* nothing */
+//# define SET_STATIC_PROCS(closure) /* nothing */
+
+//# define SET_TASK_ACTIVITY(act) /* nothing */
+
+#if defined(GRAN)
+
+# define GA_HDR_SIZE 1
+
+# define PROCS_HDR_POSN PAR_HDR_POSN
+# define PROCS_HDR_SIZE 1
+
+/* Accessing components of the field */
+# define PROCS(closure) ((closure)->header.gran.procs)
+/* SET_PROCS is now SET_GRAN_HEADER in ClosureMacros.h. */
+#endif
+
+
+//@node Par specific fixed headers, Parallel only heap objects, Dummy defs
+//@section Par specific fixed headers
+
+/*
+Definitions relating to the entire parallel-only fixed-header field.
+
+On GUM, the global addresses for each local closure are stored in a separate
+hash table, rather then with the closure in the heap. We call @getGA@ to
+look up the global address associated with a local closure (0 is returned
+for local closures that have no global address), and @setGA@ to store a new
+global address for a local closure which did not previously have one.
+*/
+
+#if defined(PAR)
+
+# define GA_HDR_SIZE 0
+
+# define GA(closure) getGA(closure)
+
+# define SET_GA(closure, ga) setGA(closure,ga)
+# define SET_STATIC_GA(closure)
+# define SET_GRAN_HDR(closure,pe)
+# define SET_STATIC_PROCS(closure)
+
+# define MAX_GA_WEIGHT 0 /* Treat as 2^n */
+
+W_ PackGA ((W_, int));
+ /* There was a PACK_GA macro here; but we turned it into the PackGA
+ routine [GlobAddr.lc] (because it needs to do quite a bit of
+ paranoia checking. Phil & Will (95/08)
+ */
+
+/* At the moment, there is no activity profiling for GUM. This may change. */
+# define SET_TASK_ACTIVITY(act) /* nothing */
+#endif
+
+//@node Parallel only heap objects, Packing definitions, Par specific fixed headers
+//@section Parallel only heap objects
+
+// NB: The following definitons are BOTH for GUM and GrAnSim -- HWL
+
+/* All in Closures.h and CLosureMacros.h */
+
+//@node Packing definitions, End of File, Parallel only heap objects
+//@section Packing definitions
+
+//@menu
+//* GUM::
+//* GranSim::
+//@end menu
+//*/
+
+//@node GUM, GranSim, Packing definitions, Packing definitions
+//@subsection GUM
+
+#if defined(PAR)
+/*
+Symbolic constants for the packing code.
+
+This constant defines how many words of data we can pack into a single
+packet in the parallel (GUM) system.
+*/
+
+//@menu
+//* Externs::
+//* Prototypes::
+//* Macros::
+//@end menu
+//*/
+
+//@node Externs, Prototypes, GUM, GUM
+//@subsubsection Externs
+
+extern W_ *PackBuffer; /* size: can be set via option */
+extern long *buffer; /* HWL_ */
+extern W_ *freeBuffer; /* HWL_ */
+extern W_ *packBuffer; /* HWL_ */
+
+extern void InitPackBuffer(STG_NO_ARGS);
+extern void InitMoreBuffers(STG_NO_ARGS);
+extern void InitPendingGABuffer(W_ size);
+extern void AllocClosureQueue(W_ size);
+
+//@node Prototypes, Macros, Externs, GUM
+//@subsubsection Prototypes
+
+void InitPackBuffer();
+P_ PackTSO (P_ tso, W_ *size);
+P_ PackStkO (P_ stko, W_ *size);
+P_ AllocateHeap (W_ size); /* Doesn't belong */
+
+void InitClosureQueue ();
+P_ DeQueueClosure();
+void QueueClosure (P_ closure);
+rtsBool QueueEmpty();
+void PrintPacket (P_ buffer);
+
+P_ get_closure_info (P_ closure, W_ *size, W_ *ptrs, W_ *nonptrs, W_ *vhs, char *type);
+
+rtsBool isOffset (globalAddr *ga),
+ isFixed (globalAddr *ga);
+
+void doGlobalGC();
+
+P_ PackNearbyGraph (P_ closure,W_ *size);
+P_ UnpackGraph (W_ *buffer, globalAddr **gamap, W_ *nGAs);
+
+
+//@node Macros, , Prototypes, GUM
+//@subsubsection Macros
+
+# define PACK_HEAP_REQUIRED \
+ ((RtsFlags.ParFlags.packBufferSize - PACK_HDR_SIZE) / (PACK_GA_SIZE + _FHS) * (SPEC_HS + 2))
+
+# define MAX_GAS (RtsFlags.ParFlags.packBufferSize / PACK_GA_SIZE)
+
+
+# define PACK_GA_SIZE 3 /* Size of a packed GA in words */
+ /* Size of a packed fetch-me in words */
+# define PACK_FETCHME_SIZE (PACK_GA_SIZE + FIXED_HS)
+
+# define PACK_HDR_SIZE 1 /* Words of header in a packet */
+
+# define PACK_PLC_SIZE 2 /* Size of a packed PLC in words */
+
+#endif /* PAR */
+
+//@node GranSim, , GUM, Packing definitions
+//@subsection GranSim
+
+#if defined(GRAN)
+/* ToDo: Check which of the PAR routines are needed in GranSim -- HWL */
+
+//@menu
+//* Types::
+//* Prototypes::
+//* Macros::
+//@end menu
+//*/
+
+//@node Types, Prototypes, GranSim, GranSim
+//@subsubsection Types
+
+typedef struct rtsPackBuffer_ {
+ StgInt /* nat */ size;
+ StgInt /* nat */ unpacked_size;
+ StgTSO *tso;
+ StgClosure **buffer;
+} rtsPackBuffer;
+
+//@node Prototypes, Macros, Types, GranSim
+//@subsubsection Prototypes
+
+
+/* main packing functions */
+/*
+rtsPackBuffer *PackNearbyGraph(StgClosure* closure, StgTSO* tso, nat *packbuffersize);
+rtsPackBuffer *PackOneNode(StgClosure* closure, StgTSO* tso, nat *packbuffersize);
+void PrintPacket(rtsPackBuffer *buffer);
+StgClosure *UnpackGraph(rtsPackBuffer* buffer);
+*/
+/* important auxiliary functions */
+
+//StgInfoTable *get_closure_info(StgClosure* node, nat *size, nat *ptrs, nat *nonptrs, nat *vhs, char *info_hdr_ty);
+int IS_BLACK_HOLE(StgClosure* node);
+StgClosure *IS_INDIRECTION(StgClosure* node);
+int IS_THUNK(StgClosure* closure);
+char *display_info_type(StgClosure* closure, char *str);
+
+/*
+OLD CODE -- HWL
+void InitPackBuffer(void);
+P_ AllocateHeap (W_ size);
+P_ PackNearbyGraph (P_ closure, P_ tso, W_ *packbuffersize);
+P_ PackOneNode (P_ closure, P_ tso, W_ *packbuffersize);
+P_ UnpackGraph (P_ buffer);
+
+void InitClosureQueue (void);
+P_ DeQueueClosure(void);
+void QueueClosure (P_ closure);
+// rtsBool QueueEmpty();
+void PrintPacket (P_ buffer);
+*/
+
+// StgInfoTable *get_closure_info(StgClosure* node, unsigned int /* nat */ *size, unsigned int /* nat */ *ptrs, unsigned int /* nat */ *nonptrs, unsigned int /* nat */ *vhs, char *info_hdr_ty);
+// int /* rtsBool */ IS_BLACK_HOLE(StgClosure* node) ;
+
+//@node Macros, , Prototypes, GranSim
+//@subsubsection Macros
+
+/* These are needed in the packing code to get the size of the packet
+ right. The closures itself are never built in GrAnSim. */
+# define FETCHME_VHS IND_VHS
+# define FETCHME_HS IND_HS
+
+# define FETCHME_GA_LOCN FETCHME_HS
+
+# define FETCHME_CLOSURE_SIZE(closure) IND_CLOSURE_SIZE(closure)
+# define FETCHME_CLOSURE_NoPTRS(closure) 0L
+# define FETCHME_CLOSURE_NoNONPTRS(closure) (IND_CLOSURE_SIZE(closure)-IND_VHS)
+
+# define MAX_GAS (RtsFlags.GranFlags.packBufferSize / PACK_GA_SIZE)
+# define PACK_GA_SIZE 3 /* Size of a packed GA in words */
+ /* Size of a packed fetch-me in words */
+# define PACK_FETCHME_SIZE (PACK_GA_SIZE + FIXED_HS)
+# define PACK_HDR_SIZE 4 /* Words of header in a packet */
+
+# define PACK_HEAP_REQUIRED \
+ (RtsFlags.GranFlags.packBufferSize * sizeofW(StgClosure*) + \
+ 2 * sizeofW(StgInt) + sizeofW(StgTSO*))
+
+# define PACK_FLAG_LOCN 0
+# define PACK_TSO_LOCN 1
+# define PACK_UNPACKED_SIZE_LOCN 2
+# define PACK_SIZE_LOCN 3
+# define MAGIC_PACK_FLAG 0xfabc
+
+#endif /* GRAN */
+
+//@node End of File, , Packing definitions
+//@section End of File
+
+#endif /* defined(PAR) || defined(GRAN) whole file */
+#endif /* Parallel_H */
+
+
diff --git a/rts/parallel/0Unpack.c b/rts/parallel/0Unpack.c
new file mode 100644
index 0000000000..fc4a8e50c3
--- /dev/null
+++ b/rts/parallel/0Unpack.c
@@ -0,0 +1,440 @@
+/*
+ Time-stamp: <Wed Jan 12 2000 13:29:08 Stardate: [-30]4193.85 hwloidl>
+
+ Unpacking closures which have been exported to remote processors
+
+ This module defines routines for unpacking closures in the parallel
+ runtime system (GUM).
+
+ In the case of GrAnSim, this module defines routines for *simulating* the
+ unpacking of closures as it is done in the parallel runtime system.
+*/
+
+/*
+ Code in this file has been merged with Pack.c
+*/
+
+#if 0
+
+//@node Unpacking closures, , ,
+//@section Unpacking closures
+
+//@menu
+//* Includes::
+//* Prototypes::
+//* GUM code::
+//* GranSim Code::
+//* Index::
+//@end menu
+//*/
+
+//@node Includes, Prototypes, Unpacking closures, Unpacking closures
+//@subsection Includes
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "GranSimRts.h"
+#include "ParallelRts.h"
+#include "ParallelDebug.h"
+#include "FetchMe.h"
+#include "Storage.h"
+
+//@node Prototypes, GUM code, Includes, Unpacking closures
+//@subsection Prototypes
+
+void InitPacking(void);
+# if defined(PAR)
+void InitPackBuffer(void);
+# endif
+/* Interface for ADT of closure queues */
+void AllocClosureQueue(nat size);
+void InitClosureQueue(void);
+rtsBool QueueEmpty(void);
+void QueueClosure(StgClosure *closure);
+StgClosure *DeQueueClosure(void);
+
+StgPtr AllocateHeap(nat size);
+
+//@node GUM code, GranSim Code, Prototypes, Unpacking closures
+//@subsection GUM code
+
+#if defined(PAR)
+
+//@node Local Definitions, , GUM code, GUM code
+//@subsubsection Local Definitions
+
+//@cindex PendingGABuffer
+static globalAddr *PendingGABuffer;
+/* is initialised in main; */
+
+//@cindex InitPendingGABuffer
+void
+InitPendingGABuffer(size)
+nat size;
+{
+ PendingGABuffer = (globalAddr *)
+ stgMallocBytes((size-PACK_HDR_SIZE)*2*sizeof(globalAddr),
+ "InitPendingGABuffer");
+}
+
+/*
+ @CommonUp@ commons up two closures which we have discovered to be
+ variants of the same object. One is made an indirection to the other. */
+
+//@cindex CommonUp
+void
+CommonUp(StgClosure *src, StgClosure *dst)
+{
+ StgBlockingQueueElement *bqe;
+
+ ASSERT(src != dst);
+ switch (get_itbl(src)->type) {
+ case BLACKHOLE_BQ:
+ bqe = ((StgBlockingQueue *)src)->blocking_queue;
+ break;
+
+ case FETCH_ME_BQ:
+ bqe = ((StgFetchMeBlockingQueue *)src)->blocking_queue;
+ break;
+
+ case RBH:
+ bqe = ((StgRBH *)src)->blocking_queue;
+ break;
+
+ case BLACKHOLE:
+ case FETCH_ME:
+ bqe = END_BQ_QUEUE;
+ break;
+
+ default:
+ /* Don't common up anything else */
+ return;
+ }
+ /* We do not use UPD_IND because that would awaken the bq, too */
+ // UPD_IND(src, dst);
+ updateWithIndirection(get_itbl(src), src, dst);
+ //ASSERT(!IS_BIG_MOTHER(INFO_PTR(dst)));
+ if (bqe != END_BQ_QUEUE)
+ awaken_blocked_queue(bqe, src);
+}
+
+/*
+ @UnpackGraph@ unpacks the graph contained in a message buffer. It
+ returns a pointer to the new graph. The @gamap@ parameter is set to
+ point to an array of (oldGA,newGA) pairs which were created as a result
+ of unpacking the buffer; @nGAs@ is set to the number of GA pairs which
+ were created.
+
+ The format of graph in the pack buffer is as defined in @Pack.lc@. */
+
+//@cindex UnpackGraph
+StgClosure *
+UnpackGraph(packBuffer, gamap, nGAs)
+rtsPackBuffer *packBuffer;
+globalAddr **gamap;
+nat *nGAs;
+{
+ nat size, ptrs, nonptrs, vhs;
+ StgWord **buffer, **bufptr, **slotptr;
+ globalAddr ga, *gaga;
+ StgClosure *closure, *existing,
+ *graphroot, *graph, *parent;
+ StgInfoTable *ip, *oldip;
+ nat bufsize, i,
+ pptr = 0, pptrs = 0, pvhs;
+ char str[80];
+
+ InitPackBuffer(); /* in case it isn't already init'd */
+ graphroot = (StgClosure *)NULL;
+
+ gaga = PendingGABuffer;
+
+ InitClosureQueue();
+
+ /* Unpack the header */
+ bufsize = packBuffer->size;
+ buffer = packBuffer->buffer;
+ bufptr = buffer;
+
+ /* allocate heap */
+ if (bufsize > 0) {
+ graph = allocate(bufsize);
+ ASSERT(graph != NULL);
+ }
+
+ parent = (StgClosure *)NULL;
+
+ do {
+ /* This is where we will ultimately save the closure's address */
+ slotptr = bufptr;
+
+ /* First, unpack the next GA or PLC */
+ ga.weight = (rtsWeight) *bufptr++;
+
+ if (ga.weight > 0) {
+ ga.payload.gc.gtid = (GlobalTaskId) *bufptr++;
+ ga.payload.gc.slot = (int) *bufptr++;
+ } else
+ ga.payload.plc = (StgPtr) *bufptr++;
+
+ /* Now unpack the closure body, if there is one */
+ if (isFixed(&ga)) {
+ /* No more to unpack; just set closure to local address */
+ IF_PAR_DEBUG(pack,
+ belch("Unpacked PLC at %x", ga.payload.plc));
+ closure = ga.payload.plc;
+ } else if (isOffset(&ga)) {
+ /* No more to unpack; just set closure to cached address */
+ ASSERT(parent != (StgClosure *)NULL);
+ closure = (StgClosure *) buffer[ga.payload.gc.slot];
+ } else {
+ /* Now we have to build something. */
+
+ ASSERT(bufsize > 0);
+
+ /*
+ * Close your eyes. You don't want to see where we're looking. You
+ * can't get closure info until you've unpacked the variable header,
+ * but you don't know how big it is until you've got closure info.
+ * So...we trust that the closure in the buffer is organized the
+ * same way as they will be in the heap...at least up through the
+ * end of the variable header.
+ */
+ ip = get_closure_info(bufptr, &size, &ptrs, &nonptrs, &vhs, str);
+
+ /*
+ Remember, the generic closure layout is as follows:
+ +-------------------------------------------------+
+ | FIXED HEADER | VARIABLE HEADER | PTRS | NON-PRS |
+ +-------------------------------------------------+
+ */
+ /* Fill in the fixed header */
+ for (i = 0; i < FIXED_HS; i++)
+ ((StgPtr)graph)[i] = *bufptr++;
+
+ if (ip->type == FETCH_ME)
+ size = ptrs = nonptrs = vhs = 0;
+
+ /* Fill in the packed variable header */
+ for (i = 0; i < vhs; i++)
+ ((StgPtr)graph)[FIXED_HS + i] = *bufptr++;
+
+ /* Pointers will be filled in later */
+
+ /* Fill in the packed non-pointers */
+ for (i = 0; i < nonptrs; i++)
+ ((StgPtr)graph)[FIXED_HS + i + vhs + ptrs] = *bufptr++;
+
+ /* Indirections are never packed */
+ // ASSERT(INFO_PTR(graph) != (W_) Ind_info_TO_USE);
+
+ /* Add to queue for processing */
+ QueueClosure(graph);
+
+ /*
+ * Common up the new closure with any existing closure having the same
+ * GA
+ */
+
+ if ((existing = GALAlookup(&ga)) == NULL) {
+ globalAddr *newGA;
+ /* Just keep the new object */
+ IF_PAR_DEBUG(pack,
+ belch("Unpacking new (%x, %d, %x)\n",
+ ga.payload.gc.gtid, ga.payload.gc.slot, ga.weight));
+
+ closure = graph;
+ newGA = setRemoteGA(graph, &ga, rtsTrue);
+ if (ip->type == FETCH_ME)
+ // FETCHME_GA(closure) = newGA;
+ ((StgFetchMe *)closure)->ga = newGA;
+ } else {
+ /* Two closures, one global name. Someone loses */
+ oldip = get_itbl(existing);
+
+ if ((oldip->type == FETCH_ME || IS_BLACK_HOLE(existing)) &&
+ ip->type != FETCH_ME) {
+
+ /* What we had wasn't worth keeping */
+ closure = graph;
+ CommonUp(existing, graph);
+ } else {
+
+ /*
+ * Either we already had something worthwhile by this name or
+ * the new thing is just another FetchMe. However, the thing we
+ * just unpacked has to be left as-is, or the child unpacking
+ * code will fail. Remember that the way pointer words are
+ * filled in depends on the info pointers of the parents being
+ * the same as when they were packed.
+ */
+ IF_PAR_DEBUG(pack,
+ belch("Unpacking old (%x, %d, %x), keeping %#lx",
+ ga.payload.gc.gtid, ga.payload.gc.slot, ga.weight,
+ existing));
+
+ closure = existing;
+ }
+ /* Pool the total weight in the stored ga */
+ (void) addWeight(&ga);
+ }
+
+ /* Sort out the global address mapping */
+ if ((ip_THUNK(ip) && !ip_UNPOINTED(ip)) ||
+ (ip_MUTABLE(ip) && ip->type != FETCH_ME)) {
+ /* Make up new GAs for single-copy closures */
+ globalAddr *newGA = makeGlobal(closure, rtsTrue);
+
+ ASSERT(closure == graph);
+
+ /* Create an old GA to new GA mapping */
+ *gaga++ = ga;
+ splitWeight(gaga, newGA);
+ ASSERT(gaga->weight == 1L << (BITS_IN(unsigned) - 1));
+ gaga++;
+ }
+ graph += FIXED_HS + (size < MIN_UPD_SIZE ? MIN_UPD_SIZE : size);
+ }
+
+ /*
+ * Set parent pointer to point to chosen closure. If we're at the top of
+ * the graph (our parent is NULL), then we want to arrange to return the
+ * chosen closure to our caller (possibly in place of the allocated graph
+ * root.)
+ */
+ if (parent == NULL)
+ graphroot = closure;
+ else
+ ((StgPtr)parent)[FIXED_HS + pvhs + pptr] = (StgWord) closure;
+
+ /* Save closure pointer for resolving offsets */
+ *slotptr = (StgWord) closure;
+
+ /* Locate next parent pointer */
+ pptr++;
+ while (pptr + 1 > pptrs) {
+ parent = DeQueueClosure();
+
+ if (parent == NULL)
+ break;
+ else {
+ (void) get_closure_info(parent, &size, &pptrs, &nonptrs,
+ &pvhs, str);
+ pptr = 0;
+ }
+ }
+ } while (parent != NULL);
+
+ ASSERT(bufsize == 0 || graph - 1 <= SAVE_Hp);
+
+ *gamap = PendingGABuffer;
+ *nGAs = (gaga - PendingGABuffer) / 2;
+
+ /* ToDo: are we *certain* graphroot has been set??? WDP 95/07 */
+ ASSERT(graphroot!=NULL);
+ return (graphroot);
+}
+#endif /* PAR */
+
+//@node GranSim Code, Index, GUM code, Unpacking closures
+//@subsection GranSim Code
+
+/*
+ For GrAnSim: In general no actual unpacking should be necessary. We just
+ have to walk over the graph and set the bitmasks appropriately. -- HWL */
+
+//@node Unpacking, , GranSim Code, GranSim Code
+//@subsubsection Unpacking
+
+#if defined(GRAN)
+void
+CommonUp(StgClosure *src, StgClosure *dst)
+{
+  /* In GranSim no data is ever really duplicated between PEs, so there is
+     nothing to common up; reaching this function indicates a bug. */
+  barf("CommonUp: should never be entered in a GranSim setup");
+}
+
+/* This code fakes the unpacking of a somewhat virtual buffer: in GranSim
+   no data actually moves between PEs, so we only walk the closures listed
+   in the buffer and update each closure's bitmask of owning PEs.  The
+   buffer and its payload are freed here; returns the graph root. */
+StgClosure*
+UnpackGraph(buffer)
+rtsPackBuffer* buffer;
+{
+  nat size, ptrs, nonptrs, vhs,
+      bufptr = 0;
+  StgClosure *closure, *graphroot, *graph;
+  StgInfoTable *ip;
+  StgWord bufsize, unpackedsize,
+          pptr = 0, pptrs = 0, pvhs;
+  StgTSO* tso;
+  char str[240], str1[80];
+  int i;
+
+  bufptr = 0;
+  graphroot = buffer->buffer[0];
+
+  tso = buffer->tso;
+
+  /* Unpack the header */
+  unpackedsize = buffer->unpacked_size;
+  bufsize = buffer->size;
+
+  IF_GRAN_DEBUG(pack,
+                belch("<<< Unpacking <<%d>> (buffer @ %p):\n    (root @ %p, PE %d,size=%d), demanded by TSO %d (%p)[PE %d]",
+                      buffer->id, buffer, graphroot, where_is(graphroot),
+                      bufsize, tso->id, tso,
+                      where_is((StgClosure *)tso)));
+
+  do {
+    closure = buffer->buffer[bufptr++]; /* that's all we need for GrAnSim -- HWL */
+
+    /* Actually only ip is needed; rest is useful for TESTING -- HWL */
+    ip = get_closure_info(closure,
+                          &size, &ptrs, &nonptrs, &vhs, str);
+
+    IF_GRAN_DEBUG(pack,
+                  sprintf(str, "** (%p): Changing bitmask[%s]: 0x%x ",
+                          closure, (closure_HNF(closure) ? "NF" : "__"),
+                          PROCS(closure)));
+
+    if (ip->type == RBH) {
+      /* Revertible black hole: the node moves to this PE */
+      closure->header.gran.procs = PE_NUMBER(CurrentProc); /* Move node */
+
+      IF_GRAN_DEBUG(pack,
+                    strcat(str, " (converting RBH) "));
+
+      convertFromRBH(closure);  /* In GUM that's done by convertToFetchMe */
+    } else if (IS_BLACK_HOLE(closure)) {
+      closure->header.gran.procs |= PE_NUMBER(CurrentProc); /* Copy node */
+    } else if ( (closure->header.gran.procs & PE_NUMBER(CurrentProc)) == 0 ) {
+      /* BUG FIX: the original condition lacked parentheses; `==' binds
+         tighter than `&', so it parsed as procs & (PE_NUMBER(..)==0),
+         which is always 0 and the branch was never taken. */
+      if (closure_HNF(closure))
+        closure->header.gran.procs |= PE_NUMBER(CurrentProc); /* Copy node */
+      else
+        closure->header.gran.procs = PE_NUMBER(CurrentProc);  /* Move node */
+    }
+
+    IF_GRAN_DEBUG(pack,
+                  sprintf(str1, "0x%x", PROCS(closure)); strcat(str, str1));
+    IF_GRAN_DEBUG(pack, belch(str));
+
+  } while (bufptr<buffer->size) ;   /* (parent != NULL); */
+
+  /* In GrAnSim we allocate pack buffers dynamically! -- HWL */
+  free(buffer->buffer);
+  free(buffer);
+
+  IF_GRAN_DEBUG(pack,
+                belch("PrintGraph of %p is:", graphroot); PrintGraph(graphroot,0));
+
+  return (graphroot);
+}
+#endif /* GRAN */
+#endif
+
+//@node Index, , GranSim Code, Unpacking closures
+//@subsection Index
+
+//@index
+//* CommonUp:: @cindex\s-+CommonUp
+//* InitPendingGABuffer:: @cindex\s-+InitPendingGABuffer
+//* PendingGABuffer:: @cindex\s-+PendingGABuffer
+//* UnpackGraph:: @cindex\s-+UnpackGraph
+//@end index
diff --git a/rts/parallel/Dist.c b/rts/parallel/Dist.c
new file mode 100644
index 0000000000..eeec780716
--- /dev/null
+++ b/rts/parallel/Dist.c
@@ -0,0 +1,117 @@
+#include "Dist.h"
+
+#ifdef DIST /* whole file */
+
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "ParallelRts.h"
+#include "Parallel.h" // nPEs,allPEs,mytid
+#include "HLC.h" //for sendReval
+#include "LLC.h" //for pvm stuff
+#include "FetchMe.h" // for BLOCKED_FETCH_info
+#include "Storage.h" // for recordMutable
+
+/* Return the number of PEs in the system (hopefully the result > 0). */
+StgWord32 cGetPECount(void)
+{ return nPEs;
+}
+
+/* return taskID, n is 1..count, n=1 is always the mainPE.
+   NB: no bounds check -- the caller must supply a valid n. */
+StgPEId cGetPEId(StgWord32 n)
+{ return allPEs[n-1];
+}
+
+/* return the taskID of this PE */
+StgPEId cGetMyPEId(void)
+{ return mytid;
+}
+
+/* return the taskID of the owning PE of an MVar/TSO:
+   - MVAR/TSOs get converted to REMOTE_REFs when shipped, and
+     there is no mechanism for using these REMOTE_REFs
+     apart from this code.
+   Barfs on any other closure type.
+*/
+
+StgPEId cGetCertainOwner(StgClosure *mv)
+{ globalAddr *ga;
+  switch(get_itbl(mv)->type)
+  { case TSO:
+    case MVAR:
+      return mytid; // must be local
+    case REMOTE_REF:
+      ga = LAGAlookup(mv);
+      ASSERT(ga);
+      return ga->payload.gc.gtid; // I know its global address
+  }
+  barf("Dist.c:cGetCertainOwner() wrong closure type %s",info_type(mv));
+}
+
+/* for some additional fun, lets look up a certain host...
+   Returns the taskID of the (single) PE running on host h, or 0 if the
+   host is unknown or runs no PE.  Queries PVM for the host/task lists. */
+StgPEId cGetHostOwner(StgByteArray h) //okay h is a C string
+{ int nArch,nHost,nTask,i;
+  StgPEId dtid;
+  struct pvmhostinfo *host;
+  struct pvmtaskinfo *task;
+
+  dtid=0;
+  pvm_config(&nHost,&nArch,&host);
+  for(i=0;i<nHost;i++)
+    if(strcmp(host[i].hi_name,h)==0)
+    { dtid=host[i].hi_tid;
+      break;
+    }
+  if(dtid==0) return 0; // no host of that name
+
+  for(i=0;i<nPEs;i++)
+  { pvm_tasks(allPEs[i],&nTask,&task);
+    ASSERT(nTask==1); //cause we lookup a single task
+    if(task[0].ti_host==dtid)
+      return allPEs[i];
+  }
+  return 0; // know host, but no PE on it
+}
+
+/* Ship the thunk held in MVar @job@ to PE @p@ for remote evaluation.
+   The thunk is packed (becoming an RBH locally) and sent via sendReval;
+   the local copy is then turned into a FETCH_ME_BQ so that any thread
+   entering it blocks until the result arrives.  Note: no ACK message is
+   expected, hence the ga field is left empty. */
+void cRevalIO(StgClosure *job,StgPEId p)
+{ nat size;
+  rtsPackBuffer *buffer=NULL;
+
+  ASSERT(get_itbl(job)->type==MVAR);
+  job=((StgMVar*)job)->value; // extract the job from the MVar
+
+  ASSERT(closure_THUNK(job)); // must be a closure!!!!!
+  ASSERT(p!=mytid);
+
+  buffer = PackNearbyGraph(job, END_TSO_QUEUE, &size,p);
+  ASSERT(buffer != (rtsPackBuffer *)NULL);
+  ASSERT(get_itbl(job)->type==RBH); // packing turned the thunk into an RBH
+
+  IF_PAR_DEBUG(verbose,
+               belch("@;~) %x doing revalIO to %x\n",
+                     mytid,p));
+
+  sendReval(p,size,buffer);
+
+  if (RtsFlags.ParFlags.ParStats.Global &&
+      RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+    globalParStats.tot_reval_mess++;
+  }
+
+  /*
+    We turn job into a FETCHME_BQ so that the thread will block
+    when it enters it.
+
+    Note: it will not receive an ACK, thus no GA.
+  */
+
+  ASSERT(get_itbl(job)->type==RBH);
+
+  /* put closure on mutables list, while it is still a RBH */
+  recordMutable((StgMutClosure *)job);
+
+  /* actually turn it into a FETCH_ME_BQ */
+  SET_INFO(job, &FETCH_ME_BQ_info);
+  ((StgFetchMe *)job)->ga = 0; //hope this won't make anyone barf!!!
+  ((StgBlockingQueue*)job)->blocking_queue=END_BQ_QUEUE;
+}
+
+#endif
diff --git a/rts/parallel/Dist.h b/rts/parallel/Dist.h
new file mode 100644
index 0000000000..c67cce2748
--- /dev/null
+++ b/rts/parallel/Dist.h
@@ -0,0 +1,20 @@
+#ifndef __DIST_H
+#define __DIST_H
+
+#ifdef DIST
+
+#include "Rts.h"
+
+/* A PE is named by its task id (cf. cGetMyPEId in Dist.c) */
+typedef StgWord32 StgPEId;
+
+// interface functions for Haskell Language calls
+StgWord32 cGetPECount(void);              // number of PEs in the system
+StgPEId cGetPEId(StgWord32 n);            // taskID of PE n (1..count; 1 = mainPE)
+StgPEId cGetMyPEId(void);                 // taskID of this PE
+StgPEId cGetCertainOwner(StgClosure *mv); // owning PE of an MVar/TSO/REMOTE_REF
+void cRevalIO(StgClosure *job,StgPEId p); // ship a thunk to PE p for evaluation
+StgPEId cGetHostOwner(StgByteArray h);    // taskID of the PE on host h (0 if none)
+
+#endif /* DIST */
+
+#endif /* __DIST_H */
diff --git a/rts/parallel/FetchMe.h b/rts/parallel/FetchMe.h
new file mode 100644
index 0000000000..be5cbf6b54
--- /dev/null
+++ b/rts/parallel/FetchMe.h
@@ -0,0 +1,24 @@
+/* -----------------------------------------------------------------------------
+ *
+ * Closure types for the parallel system.
+ *
+ * Info-table (EI_) and entry-code (EF_) declarations for closures that
+ * only exist in a parallel setup; definitions live in FetchMe.hc.
+ *
+ * ---------------------------------------------------------------------------*/
+
+/* remote pointer; entering it blocks and triggers a fetch (see FetchMe.hc) */
+EI_(stg_FETCH_ME_info);
+EF_(stg_FETCH_ME_entry);
+
+/* FETCH_ME with a blocking queue of threads awaiting the data */
+EI_(stg_FETCH_ME_BQ_info);
+EF_(stg_FETCH_ME_BQ_entry);
+
+/* queue member recording that a remote TSO awaits this closure's value */
+EI_(stg_BLOCKED_FETCH_info);
+EF_(stg_BLOCKED_FETCH_entry);
+
+/* reference to a sticky object on another PE; never enterable */
+EI_(stg_REMOTE_REF_info);
+EF_(stg_REMOTE_REF_entry);
+
+/* NOTE(review): RBH_Save_* presumably hold payload words displaced when a
+   closure is converted to a revertible black hole -- confirm elsewhere */
+EI_(stg_RBH_Save_0_info);
+EF_(stg_RBH_Save_0_entry);
+EI_(stg_RBH_Save_1_info);
+EF_(stg_RBH_Save_1_entry);
+EI_(stg_RBH_Save_2_info);
+EF_(stg_RBH_Save_2_entry);
diff --git a/rts/parallel/FetchMe.hc b/rts/parallel/FetchMe.hc
new file mode 100644
index 0000000000..f142e9e514
--- /dev/null
+++ b/rts/parallel/FetchMe.hc
@@ -0,0 +1,180 @@
+/* ----------------------------------------------------------------------------
+ Time-stamp: <Tue Mar 06 2001 17:01:46 Stardate: [-30]6288.54 hwloidl>
+
+ Entry code for a FETCH_ME closure
+
+ This module defines routines for handling remote pointers (@FetchMe@s)
+ in GUM. It is threaded (@.hc@) because @FetchMe_entry@ will be
+ called during evaluation.
+
+ * --------------------------------------------------------------------------*/
+
+#ifdef PAR /* all of it */
+
+//@menu
+//* Includes::
+//* Info tables::
+//* Index::
+//@end menu
+
+//@node Includes, Info tables
+//@subsection Includes
+
+#include "Stg.h"
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Storage.h"
+#include "GranSim.h"
+#include "GranSimRts.h"
+#include "Parallel.h"
+#include "ParallelRts.h"
+#include "FetchMe.h"
+#include "HLC.h"
+#include "StgRun.h" /* for StgReturn and register saving */
+
+/* --------------------------------------------------------------------------
+ FETCH_ME closures.
+
+ A FETCH_ME closure represents data that currently resides on
+ another PE. We issue a fetch message, and wait for the data to be
+ retrieved.
+
+ A word on the ptr/nonptr fields in the macros: they are unused at the
+ moment; all closures defined here have constant size (ie. no payload
+ that varies from closure to closure). Therefore, all routines that
+ need to know the size of these closures have to do a sizeofW(StgFetchMe)
+ etc to get the closure size. See get_closure_info(), evacuate() and
+ checkClosure() (using the same fcts for determining the size of the
+ closures would be a good idea; at least it would be a nice step towards
+ making this code bug free).
+ ------------------------------------------------------------------------ */
+
+//@node Info tables, Index, Includes
+//@subsection Info tables
+
+//@cindex FETCH_ME_info
+INFO_TABLE(stg_FETCH_ME_info, stg_FETCH_ME_entry, 0,2, FETCH_ME,, EF_,"FETCH_ME","FETCH_ME");
+//@cindex FETCH_ME_entry
+/* Entry code for a FETCH_ME: the data lives on another PE.  Convert the
+   closure to a FETCH_ME_BQ, enqueue the current TSO on it and block; the
+   actual fetch message is sent later from blockThread (scheduler). */
+STGFUN(stg_FETCH_ME_entry)
+{
+  FB_
+  TICK_ENT_BH();
+
+  /* the GA must be remote, otherwise we would not have a FETCH_ME */
+  ASSERT(((StgFetchMe *)R1.p)->ga->payload.gc.gtid != mytid);
+
+  /* Turn the FETCH_ME into a FETCH_ME_BQ, and place the current thread
+   * on the blocking queue.
+   */
+  // ((StgFetchMeBlockingQueue *)R1.cl)->header.info = &FETCH_ME_BQ_info; // does the same as SET_INFO
+  SET_INFO((StgClosure *)R1.cl, &stg_FETCH_ME_BQ_info);
+
+  /* Remember GA as a global var (used in blockThread); NB: not thread safe! */
+  ASSERT(theGlobalFromGA.payload.gc.gtid == (GlobalTaskId)0);
+  theGlobalFromGA = *((StgFetchMe *)R1.p)->ga;
+
+  /* Put ourselves on the blocking queue for this black hole */
+  ASSERT(looks_like_ga(((StgFetchMe *)R1.p)->ga));
+  CurrentTSO->link = END_BQ_QUEUE;
+  ((StgFetchMeBlockingQueue *)R1.cl)->blocking_queue = (StgBlockingQueueElement *)CurrentTSO;
+
+  /* jot down why and on what closure we are blocked */
+  CurrentTSO->why_blocked = BlockedOnGA;
+  CurrentTSO->block_info.closure = R1.cl;
+  /* closure is mutable since something has just been added to its BQ */
+  //recordMutable((StgMutClosure *)R1.cl);
+
+  /* sendFetch etc is now done in blockThread, which is called from the
+     scheduler -- HWL */
+
+  BLOCK_NP(1);
+  FE_
+}
+
+/* ---------------------------------------------------------------------------
+ FETCH_ME_BQ
+
+ On the first entry of a FETCH_ME closure, we turn the closure into
+ a FETCH_ME_BQ, which behaves just like a BLACKHOLE_BQ. Any thread
+ entering the FETCH_ME_BQ will be placed in the blocking queue.
+ When the data arrives from the remote PE, all waiting threads are
+ woken up and the FETCH_ME_BQ is overwritten with the fetched data.
+
+ FETCH_ME_BQ_entry is almost identical to BLACKHOLE_BQ_entry -- HWL
+ ------------------------------------------------------------------------ */
+
+INFO_TABLE(stg_FETCH_ME_BQ_info, stg_FETCH_ME_BQ_entry,0,2,FETCH_ME_BQ,,EF_,"FETCH_ME_BQ","FETCH_ME_BQ");
+//@cindex FETCH_ME_BQ_info
+/* Entry code for a FETCH_ME_BQ: a fetch for this closure is already in
+   flight, so just prepend the current TSO to the blocking queue; note
+   BlockedOnGA_NoSend -- no second fetch message is sent. */
+STGFUN(stg_FETCH_ME_BQ_entry)
+{
+  FB_
+  TICK_ENT_BH();
+
+  /* Put ourselves on the blocking queue for this node */
+  CurrentTSO->link = (StgTSO*)((StgBlockingQueue *)R1.p)->blocking_queue;
+  ((StgBlockingQueue *)R1.p)->blocking_queue = (StgBlockingQueueElement *)CurrentTSO;
+
+  /* jot down why and on what closure we are blocked */
+  CurrentTSO->why_blocked = BlockedOnGA_NoSend;
+  CurrentTSO->block_info.closure = R1.cl;
+
+  /* stg_gen_block is too heavyweight, use a specialised one */
+  BLOCK_NP(1);
+  FE_
+}
+
+/* ---------------------------------------------------------------------------
+ BLOCKED_FETCH_BQ
+
+ A BLOCKED_FETCH closure only ever exists in the blocking queue of a
+ globally visible closure i.e. one with a GA. A BLOCKED_FETCH closure
+ indicates that a TSO on another PE is waiting for the result of this
+ computation. Thus, when updating the closure, the result has to be sent
+ to that PE. The relevant routines handling that are awakenBlockedQueue
+ and blockFetch (for putting BLOCKED_FETCH closure into a BQ).
+ ------------------------------------------------------------------------ */
+
+//@cindex BLOCKED_FETCH_info
+INFO_TABLE(stg_BLOCKED_FETCH_info, stg_BLOCKED_FETCH_entry,0,2,BLOCKED_FETCH,,EF_,"BLOCKED_FETCH","BLOCKED_FETCH");
+//@cindex BLOCKED_FETCH_entry
+/* A BLOCKED_FETCH only ever lives inside a blocking queue; entering one
+   is a bug, so report and shut down. */
+STGFUN(stg_BLOCKED_FETCH_entry)
+{
+  FB_
+  /* see NON_ENTERABLE_ENTRY_CODE in StgMiscClosures.hc */
+  STGCALL2(fprintf,stderr,"BLOCKED_FETCH object entered!\n");
+  STGCALL1(shutdownHaskellAndExit, EXIT_FAILURE);
+  FE_
+}
+
+
+/* ---------------------------------------------------------------------------
+ REMOTE_REF
+
+ A REMOTE_REF closure is generated whenever we wish to refer to a sticky
+ object on another PE.
+ ------------------------------------------------------------------------ */
+
+//@cindex REMOTE_REF_info
+INFO_TABLE(stg_REMOTE_REF_info, stg_REMOTE_REF_entry,0,2,REMOTE_REF,,EF_,"REMOTE_REF","REMOTE_REF");
+//@cindex REMOTE_REF_entry
+/* REMOTE_REFs name sticky objects on other PEs and must never be
+   entered; report and shut down. */
+STGFUN(stg_REMOTE_REF_entry)
+{
+  FB_
+  /* see NON_ENTERABLE_ENTRY_CODE in StgMiscClosures.hc */
+  STGCALL2(fprintf,stderr,"REMOTE REF object entered!\n");
+  STGCALL1(shutdownHaskellAndExit, EXIT_FAILURE);
+  FE_
+}
+
+#endif /* PAR */
+
+//@node Index, , Info tables
+//@subsection Index
+
+//@index
+//* BLOCKED_FETCH_entry:: @cindex\s-+BLOCKED_FETCH_entry
+//* BLOCKED_FETCH_info:: @cindex\s-+BLOCKED_FETCH_info
+//* FETCH_ME_BQ_info:: @cindex\s-+FETCH_ME_BQ_info
+//* FETCH_ME_entry:: @cindex\s-+FETCH_ME_entry
+//* FETCH_ME_info:: @cindex\s-+FETCH_ME_info
+//@end index
diff --git a/rts/parallel/Global.c b/rts/parallel/Global.c
new file mode 100644
index 0000000000..b2541357e1
--- /dev/null
+++ b/rts/parallel/Global.c
@@ -0,0 +1,1090 @@
+/* ---------------------------------------------------------------------------
+ Time-stamp: <Wed Mar 21 2001 16:32:23 Stardate: [-30]6363.44 hwloidl>
+
+ (c) The AQUA/Parade Projects, Glasgow University, 1995
+ The GdH/APART 624 Projects, Heriot-Watt University, Edinburgh, 1999
+
+ Global Address Manipulation.
+
+ The GALA and LAGA tables for mapping global addresses to local addresses
+ (i.e. heap pointers) are defined here. We use the generic hash tables
+ defined in Hash.c.
+ ------------------------------------------------------------------------- */
+
+#ifdef PAR /* whole file */
+
+//@menu
+//* Includes::
+//* Global tables and lists::
+//* Fcts on GALA tables::
+//* Interface to taskId-PE table::
+//* Interface to LAGA table::
+//* Interface to GALA table::
+//* GC functions for GALA tables::
+//* Index::
+//@end menu
+//*/
+
+//@node Includes, Global tables and lists, Global Address Manipulation, Global Address Manipulation
+//@subsection Includes
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Storage.h"
+#include "Hash.h"
+#include "HLC.h"
+#include "ParallelRts.h"
+#if defined(DEBUG)
+# include "Sanity.h"
+#include "ParallelDebug.h"
+#endif
+#if defined(DIST)
+# include "Dist.h"
+#endif
+
+/*
+ @globalAddr@ structures are allocated in chunks to reduce malloc overhead.
+*/
+
+//@node Global tables and lists, Fcts on GALA tables, Includes, Global Address Manipulation
+//@subsection Global tables and lists
+
+//@cindex thisPE
+nat thisPE;
+
+//@menu
+//* Free lists::
+//* Hash tables::
+//@end menu
+
+//@node Free lists, Hash tables, Global tables and lists, Global tables and lists
+//@subsubsection Free lists
+
+/* Free list of GALA entries */
+GALA *freeGALAList = NULL;
+
+/* Number of globalAddr cells to allocate in one go */
+#define GCHUNK (1024 * sizeof(StgWord) / sizeof(GALA))
+
+/* Free list of indirections */
+
+//@cindex nextIndirection
+static StgInt nextIndirection = 0;
+//@cindex freeIndirections
+GALA *freeIndirections = NULL;
+
+/* The list of live indirections has to be marked for GC (see makeGlobal) */
+//@cindex liveIndirections
+GALA *liveIndirections = NULL;
+
+/* The list of remote indirections has to be marked for GC (see setRemoteGA) */
+//@cindex liveRemoteGAs
+GALA *liveRemoteGAs = NULL;
+
+//@node Hash tables, , Free lists, Global tables and lists
+//@subsubsection Hash tables
+
+/* Mapping global task ids PEs */
+//@cindex taskIDtoPEtable
+HashTable *taskIDtoPEtable = NULL;
+
+static int nextPE = 0;
+
+/* LAGA table: StgClosure* -> globalAddr*
+ (Remember: globalAddr = (GlobalTaskId, Slot, Weight))
+ Mapping local to global addresses (see interface below)
+*/
+
+//@cindex LAtoGALAtable
+HashTable *LAtoGALAtable = NULL;
+
+/* GALA table: globalAddr* -> StgClosure*
+ (Remember: globalAddr = (GlobalTaskId, Slot, Weight))
+ Mapping global to local addresses (see interface below)
+*/
+
+//@cindex pGAtoGALAtable
+HashTable *pGAtoGALAtable = NULL;
+
+//@node Fcts on GALA tables, Interface to taskId-PE table, Global tables and lists, Global Address Manipulation
+//@subsection Fcts on GALA tables
+
+//@cindex allocGALA
+/* Allocate one GALA entry: reuse the head of freeGALAList when possible,
+   otherwise malloc a chunk of GCHUNK entries, hand out the first and
+   thread the rest onto the free list.  The 0xdead... poison values are
+   only checked/installed in a sanity-checking build. */
+static GALA *
+allocGALA(void)
+{
+  GALA *gl, *p;
+
+  if ((gl = freeGALAList) != NULL) {
+    IF_DEBUG(sanity,
+             ASSERT(gl->ga.weight==0xdead0add);
+             ASSERT(gl->la==(StgPtr)0xdead00aa));
+    freeGALAList = gl->next;
+  } else {
+    gl = (GALA *) stgMallocBytes(GCHUNK * sizeof(GALA), "allocGALA");
+
+    freeGALAList = gl + 1;
+    for (p = freeGALAList; p < gl + GCHUNK - 1; p++) {
+      p->next = p + 1;
+      IF_DEBUG(sanity,
+               p->ga.weight=0xdead0add;
+               p->la=(StgPtr)0xdead00aa);
+    }
+    /* last elem in the new block has NULL pointer in link field */
+    p->next = NULL;
+    IF_DEBUG(sanity,
+             p->ga.weight=0xdead0add;
+             p->la=(StgPtr)0xdead00aa);
+  }
+  IF_DEBUG(sanity,
+           gl->ga.weight=0xdead0add;
+           gl->la=(StgPtr)0xdead00aa);
+  return gl;
+}
+
+//@node Interface to taskId-PE table, Interface to LAGA table, Fcts on GALA tables, Global Address Manipulation
+//@subsection Interface to taskId-PE table
+
+/*
+ We don't really like GLOBAL_TASK_ID, so we keep a table of TASK_ID to
+ PE mappings. The idea is that a PE identifier will fit in 16 bits, whereas
+ a TASK_ID may not.
+*/
+
+//@cindex taskIDtoPE
+/* Map a global task id onto its small PE number via the hash table. */
+PEs
+taskIDtoPE(GlobalTaskId gtid)
+{
+  return ((PEs) lookupHashTable(taskIDtoPEtable, gtid));
+}
+
+//@cindex registerTask
+/* Record the PE number for a task id; PE numbers are handed out in
+   registration order, starting from 1. */
+void
+registerTask(GlobalTaskId gtid) {
+  nextPE++; //start counting from 1
+  if (gtid == mytid)
+    thisPE = nextPE;
+
+  insertHashTable(taskIDtoPEtable, gtid, (void *) (StgWord) nextPE);
+}
+
+//@node Interface to LAGA table, Interface to GALA table, Interface to taskId-PE table, Global Address Manipulation
+//@subsection Interface to LAGA table
+
+/*
+ The local address to global address mapping returns a globalAddr structure
+ (pe task id, slot, weight) for any closure in the local heap which has a
+ global identity. Such closures may be copies of normal form objects with
+ a remote `master' location, @FetchMe@ nodes referencing remote objects, or
+ globally visible objects in the local heap (for which we are the master).
+*/
+
+//@cindex LAGAlookup
+/* Local address -> global address: return the GA under which @addr@ is
+   known, or NULL if it has no global identity. */
+globalAddr *
+LAGAlookup(addr)
+StgClosure *addr;
+{
+  GALA *gala;
+
+  /* We never look for GA's on indirections. -- unknown hacker
+     Well, in fact at the moment we do in the new RTS. -- HWL
+     ToDo: unwind INDs when entering them into the hash table
+
+  ASSERT(IS_INDIRECTION(addr) == NULL);
+  */
+  if ((gala = lookupHashTable(LAtoGALAtable, (StgWord) addr)) == NULL)
+    return NULL;
+  else
+    return &(gala->ga);
+}
+
+//@node Interface to GALA table, GC functions for GALA tables, Interface to LAGA table, Global Address Manipulation
+//@subsection Interface to GALA table
+
+/*
+ We also manage a mapping of global addresses to local addresses, so that
+ we can ``common up'' multiple references to the same object as they arrive
+ in data packets from remote PEs.
+
+ The global address to local address mapping is actually managed via a
+ ``packed global address'' to GALA hash table. The packed global
+ address takes the interesting part of the @globalAddr@ structure
+ (i.e. the pe and slot fields) and packs them into a single word
+ suitable for hashing.
+*/
+
+//@cindex GALAlookup
+/* Global address -> local closure: look the packed GA up in the GALA
+   table; returns NULL if unknown.  Indirections are bypassed on the
+   returned pointer but the table entry itself is left untouched. */
+StgClosure *
+GALAlookup(ga)
+globalAddr *ga;
+{
+  StgWord pga = PackGA(taskIDtoPE(ga->payload.gc.gtid), ga->payload.gc.slot);
+  GALA *gala;
+
+  if ((gala = (GALA *) lookupHashTable(pGAtoGALAtable, pga)) == NULL)
+    return NULL;
+  else {
+    /*
+     * Bypass any indirections when returning a local closure to
+     * the caller.  Note that we do not short-circuit the entry in
+     * the GALA tables right now, because we would have to do a
+     * hash table delete and insert in the LAtoGALAtable to keep
+     * that table up-to-date for preferred GALA pairs.  That's
+     * probably a bit expensive.
+     */
+    return UNWIND_IND((StgClosure *)(gala->la));
+  }
+}
+
+/* ga becomes non-preferred (e.g. due to CommonUp); the GALA entry must
+   exist and must currently be the preferred GA for its LA. */
+void
+GALAdeprecate(ga)
+globalAddr *ga;
+{
+  StgWord pga = PackGA(taskIDtoPE(ga->payload.gc.gtid), ga->payload.gc.slot);
+  GALA *gala;
+
+  gala = (GALA *) lookupHashTable(pGAtoGALAtable, pga);
+  ASSERT(gala!=NULL);
+  ASSERT(gala->preferred==rtsTrue);
+  gala->preferred = rtsFalse;
+}
+
+/*
+ External references to our globally-visible closures are managed through an
+ indirection table. The idea is that the closure may move about as the result
+ of local garbage collections, but its global identity is determined by its
+ slot in the indirection table, which never changes.
+
+ The indirection table is maintained implicitly as part of the global
+ address to local address table. We need only keep track of the
+ highest numbered indirection index allocated so far, along with a free
+ list of lower numbered indices no longer in use.
+*/
+
+/*
+ Allocate an indirection slot for the closure currently at address @addr@.
+*/
+
+//@cindex allocIndirection
+/* Allocate a GALA entry naming @closure@ via a fresh indirection slot:
+   reuse an entry from freeIndirections when possible, otherwise take a
+   fresh GALA cell and assign it the next unused slot number.  The
+   returned entry carries the full weight (MAX_GA_WEIGHT). */
+static GALA *
+allocIndirection(StgClosure *closure)
+{
+  GALA *gala;
+
+  if ((gala = freeIndirections) != NULL) {
+    /* take a recycled indirection; poison values confirm it was free */
+    IF_DEBUG(sanity,
+             ASSERT(gala->ga.weight==0xdead0add);
+             ASSERT(gala->la==(StgPtr)0xdead00aa));
+    freeIndirections = gala->next;
+  } else {
+    gala = allocGALA();
+    IF_DEBUG(sanity,
+             ASSERT(gala->ga.weight==0xdead0add);
+             ASSERT(gala->la==(StgPtr)0xdead00aa));
+    gala->ga.payload.gc.gtid = mytid;
+    gala->ga.payload.gc.slot = nextIndirection++;
+    IF_DEBUG(sanity,
+             if (nextIndirection>=MAX_SLOTS)
+               /* BUG FIX: the %d conversion previously had no matching
+                  argument (undefined behavior in the varargs call) */
+               barf("Cannot handle more than %d slots for GA in a sanity-checking setup (this is no error)", MAX_SLOTS));
+  }
+  gala->ga.weight = MAX_GA_WEIGHT;
+  gala->la = (StgPtr)closure;
+  IF_DEBUG(sanity,
+           gala->next=(struct gala *)0xcccccccc);
+  return gala;
+}
+
+/*
+  This is only used for sanity checking (see LOOKS_LIKE_SLOT):
+  returns one past the highest indirection slot allocated so far.
+*/
+StgInt
+highest_slot (void) { return nextIndirection; }
+
+/*
+ Make a local closure globally visible.
+
+ Called from: GlobaliseAndPackGA
+ Args:
+ closure ... closure to be made visible
+     preferred ... should the new GA become the preferred one (normally true)
+
+ Allocate a GALA structure and add it to the (logical) Indirections table,
+ by inserting it into the LAtoGALAtable hash table and putting it onto the
+ liveIndirections list (only if it is preferred).
+
+ We have to allocate an indirection slot for it, and update both the local
+ address to global address and global address to local address maps.
+*/
+
+//@cindex makeGlobal
+/* Make a local closure globally visible: allocate an indirection GALA,
+   enter it into both hash tables and onto liveIndirections, and return
+   its GA (full details in the comment preceding this function). */
+globalAddr *
+makeGlobal(closure, preferred)
+StgClosure *closure;
+rtsBool preferred;
+{
+  /* check whether we already have a GA for this local closure */
+  GALA *oldGALA = lookupHashTable(LAtoGALAtable, (StgWord) closure);
+  /* create an entry in the LAGA table */
+  GALA *newGALA = allocIndirection(closure);
+  StgWord pga = PackGA(thisPE, newGALA->ga.payload.gc.slot);
+
+  IF_DEBUG(sanity,
+           ASSERT(newGALA->next==(struct gala *)0xcccccccc););
+  // ASSERT(HEAP_ALLOCED(closure)); // check that closure might point into the heap; might be static, though
+  ASSERT(GALAlookup(&(newGALA->ga)) == NULL);
+
+  /* global statistics gathering */
+  if (RtsFlags.ParFlags.ParStats.Global &&
+      RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+    globalParStats.local_alloc_GA++;
+  }
+
+  newGALA->la = (StgPtr)closure;
+  newGALA->preferred = preferred;
+
+  if (preferred) {
+    /* The new GA is now the preferred GA for the LA */
+    if (oldGALA != NULL) {
+      oldGALA->preferred = rtsFalse;
+      (void) removeHashTable(LAtoGALAtable, (StgWord) closure, (void *) oldGALA);
+    }
+    insertHashTable(LAtoGALAtable, (StgWord) closure, (void *) newGALA);
+  }
+
+  ASSERT(!isOnLiveIndTable(&(newGALA->ga)));
+  /* put the new GALA entry on the list of live indirections */
+  newGALA->next = liveIndirections;
+  liveIndirections = newGALA;
+
+  insertHashTable(pGAtoGALAtable, pga, (void *) newGALA);
+
+  return &(newGALA->ga);
+}
+
+/*
+ Assign an existing remote global address to an existing closure.
+
+ Called from: Unpack in Pack.c
+ Args:
+ local_closure ... a closure that has just been unpacked
+ remote_ga ... the GA that came with it, ie. the name under which the
+ closure is known while being transferred
+     preferred ... should the new GA become the preferred one (normally true)
+
+ Allocate a GALA structure and add it to the (logical) RemoteGA table,
+ by inserting it into the LAtoGALAtable hash table and putting it onto the
+ liveRemoteGAs list (only if it is preferred).
+
+ We do not retain the @globalAddr@ structure that's passed in as an argument,
+ so it can be a static in the calling routine.
+*/
+
+//@cindex setRemoteGA
+/* Assign an existing remote GA to a freshly unpacked closure (details in
+   the comment preceding this function).  The incoming GA's weight is
+   transferred to the new GALA entry and zeroed in *remote_ga. */
+globalAddr *
+setRemoteGA(local_closure, remote_ga, preferred)
+StgClosure *local_closure;
+globalAddr *remote_ga;
+rtsBool preferred;
+{
+  /* old entry ie the one with the GA generated when sending off the closure */
+  GALA *oldGALA = lookupHashTable(LAtoGALAtable, (StgWord) local_closure);
+  /* alloc new entry and fill it with contents of the newly arrived GA */
+  GALA *newGALA = allocGALA();
+  StgWord pga = PackGA(taskIDtoPE(remote_ga->payload.gc.gtid),
+                       remote_ga->payload.gc.slot);
+
+  ASSERT(remote_ga->payload.gc.gtid != mytid);
+  ASSERT(remote_ga->weight > 0);
+  ASSERT(GALAlookup(remote_ga) == NULL);
+
+  newGALA->ga = *remote_ga;
+  newGALA->la = (StgPtr)local_closure;
+  newGALA->preferred = preferred;
+
+  if (preferred) {
+    /* The new GA is now the preferred GA for the LA */
+    if (oldGALA != NULL) {
+      oldGALA->preferred = rtsFalse;
+      (void) removeHashTable(LAtoGALAtable, (StgWord) local_closure, (void *) oldGALA);
+    }
+    insertHashTable(LAtoGALAtable, (StgWord) local_closure, (void *) newGALA);
+  }
+
+  ASSERT(!isOnRemoteGATable(&(newGALA->ga)));
+  /* add new entry to the (logical) RemoteGA table */
+  newGALA->next = liveRemoteGAs;
+  liveRemoteGAs = newGALA;
+
+  insertHashTable(pGAtoGALAtable, pga, (void *) newGALA);
+
+  /*
+    The weight carried by the incoming closure is transferred to the newGALA
+    entry (via the structure assign above).  Therefore, we have to give back
+    the weight to the GA on the other processor, because that indirection is
+    no longer needed.
+  */
+  remote_ga->weight = 0;
+  return &(newGALA->ga);
+}
+
+/*
+ Give me a bit of weight to give away on a new reference to a particular
+ global address. If we run down to nothing, we have to assign a new GA.
+*/
+
+//@cindex splitWeight
+#if 0
+/* Old version (disabled): split off half of @from@'s weight into @to@;
+   if @from@ is nearly out of weight, give it a fresh GA first. */
+void
+splitWeight(to, from)
+globalAddr *to, *from;
+{
+  /* Make sure we have enough weight to split */
+  if (from->weight!=MAX_GA_WEIGHT && from->weight<=3) // fixed by UK in Eden implementation
+    from = makeGlobal(GALAlookup(from), rtsTrue);
+
+  to->payload = from->payload;
+
+  if (from->weight == MAX_GA_WEIGHT)
+    to->weight = 1L << (BITS_IN(unsigned) - 1);
+  else
+    to->weight = from->weight / 2;
+
+  from->weight -= to->weight;
+}
+#else
+/* Hand out weight for a new reference to @from@'s GA: copy the payload
+   into @to@ and split the remaining weight between the two. */
+void
+splitWeight(to, from)
+globalAddr *to, *from;
+{
+  /* Make sure we have enough weight to split */
+  /* Splitting at 2 needed, as weight 1 is not legal in packets (UK+KH) */
+
+  if (from->weight / 2 <= 2) /* old: weight== 1 (UK) */
+    from = makeGlobal(GALAlookup(from), rtsTrue);
+
+  to->payload = from->payload;
+
+  if (from->weight <= 1) /* old == 0 (UK) */
+    to->weight = 1L << (BITS_IN(unsigned) - 1);
+  else
+    to->weight = from->weight / 2;
+
+  from->weight -= to->weight;
+}
+#endif
+/*
+ Here, I am returning a bit of weight that a remote PE no longer needs.
+*/
+
+//@cindex addWeight
+/* A remote PE returns weight it no longer needs: fold it back into our
+   GALA entry for this GA and zero the incoming weight. */
+globalAddr *
+addWeight(ga)
+globalAddr *ga;
+{
+  StgWord pga;
+  GALA *gala;
+
+  ASSERT(LOOKS_LIKE_GA(ga));
+
+  pga = PackGA(taskIDtoPE(ga->payload.gc.gtid), ga->payload.gc.slot);
+  gala = (GALA *) lookupHashTable(pGAtoGALAtable, pga);
+
+  IF_PAR_DEBUG(weight,
+               fprintf(stderr, "@* Adding weight %x to ", ga->weight);
+               printGA(&(gala->ga));
+               fputc('\n', stderr));
+
+  gala->ga.weight += ga->weight;
+  ga->weight = 0;
+
+  return &(gala->ga);
+}
+
+/*
+ Initialize all of the global address structures: the task ID to PE id
+ map, the local address to global address map, the global address to
+ local address map, and the indirection table.
+*/
+
+//@cindex initGAtables
+/* Allocate the three hash tables: taskID->PE, LA->GALA and pGA->GALA. */
+void
+initGAtables(void)
+{
+  taskIDtoPEtable = allocHashTable();
+  LAtoGALAtable = allocHashTable();
+  pGAtoGALAtable = allocHashTable();
+}
+
+//@cindex PackGA
+/* Pack (pe, slot) into a single StgWord suitable for hashing: the top
+   quarter of the word holds the PE number, the remainder the slot.
+   Exits if the slot does not fit (see explanatory comment below). */
+StgWord
+PackGA (pe, slot)
+StgWord pe;
+int slot;
+{
+  int pe_shift = (BITS_IN(StgWord)*3)/4;
+  int pe_bits = BITS_IN(StgWord) - pe_shift;
+
+  if ( pe_bits < 8 || slot >= (1L << pe_shift) ) { /* big trouble */
+    fflush(stdout);
+    fprintf(stderr, "PackGA: slot# too big (%d) or not enough pe_bits (%d)\n",
+            slot,pe_bits);
+    stg_exit(EXIT_FAILURE);
+  }
+
+  return((((StgWord)(pe)) << pe_shift) | ((StgWord)(slot)));
+
+  /* the idea is to use 3/4 of the bits (e.g., 24) for indirection-
+     table "slot", and 1/4 for the pe# (e.g., 8).
+
+     We check for too many bits in "slot", and double-check (at
+     compile-time?) that we have enough bits for "pe".  We *don't*
+     check for too many bits in "pe", because SysMan enforces a
+     MAX_PEs limit at the very very beginning.
+
+     Phil & Will 95/08
+  */
+}
+
+//@node GC functions for GALA tables, Debugging routines, Interface to GALA table, Global Address Manipulation
+//@subsection GC functions for GALA tables
+
+/*
+ When we do a copying collection, we want to evacuate all of the local
+ entries in the GALA table for which there are outstanding remote
+ pointers (i.e. for which the weight is not MAX_GA_WEIGHT.)
+ This routine has to be run BEFORE doing the GC proper (it's a
+ ``mark roots'' thing).
+*/
+//@cindex markLocalGAs
+/* GC root-marking pass over the liveIndirections list; must run BEFORE
+   the GC proper (see comment above).  For each locally-owned GALA entry:
+     - if remote references exist (weight != MAX_GA_WEIGHT), evacuate the
+       local closure via MarkRoot and fix up the LA->GALA hash table;
+     - if the closure is static (a CAF), just mark it;
+     - otherwise all weight is back home, so the GALA slot is recycled
+       onto freeIndirections.
+   Side effect: the liveIndirections list is reversed by the traversal. */
+void
+markLocalGAs(rtsBool full)
+{
+  GALA *gala, *next, *prev = NULL;
+  StgPtr old_la, new_la;
+  nat n=0, m=0; // debugging only
+  double start_time_GA; // stats only; NOTE(review): never used in this function
+
+  IF_PAR_DEBUG(tables,
+ belch("@@%%%% markLocalGAs (full=%d): Marking LIVE INDIRECTIONS in GALA table starting with GALA at %p\n",
+ full, liveIndirections);
+ printLAGAtable());
+
+  PAR_TICKY_MARK_LOCAL_GAS_START();
+
+  for (gala = liveIndirections, m=0; gala != NULL; gala = next, m++) {
+    IF_PAR_DEBUG(tables,
+ fputs("@@ ",stderr);
+ printGA(&(gala->ga));
+ fprintf(stderr, ";@ %d: LA: %p (%s) ",
+ m, (void*)gala->la, info_type((StgClosure*)gala->la)));
+    next = gala->next;
+    old_la = gala->la;
+    ASSERT(gala->ga.payload.gc.gtid == mytid);   /* it's supposed to be local */
+    if (gala->ga.weight != MAX_GA_WEIGHT) {
+      /* Remote references exist, so we must evacuate the local closure */
+      if (get_itbl((StgClosure *)old_la)->type == EVACUATED) {
+	/* somebody else already evacuated this closure */
+	new_la = (StgPtr)((StgEvacuated *)old_la)->evacuee;
+	IF_PAR_DEBUG(tables,
+		     belch(" already evacuated to %p", new_la));
+      } else {
+#if 1
+	/* unwind any indirections we find */
+	StgClosure *foo = UNWIND_IND((StgClosure *)old_la) ; // debugging only
+	//ASSERT(HEAP_ALLOCED(foo));
+	n++;
+
+	new_la = (StgPtr) MarkRoot(foo);
+	IF_PAR_DEBUG(tables,
+		     belch(" evacuated %p to %p", foo, new_la));
+	/* ToDo: is this the right assertion to check that new_la is in to-space?
+	ASSERT(!HEAP_ALLOCED(new_la) || Bdescr(new_la)->evacuated);
+	*/
+#else
+	new_la = MarkRoot(old_la); // or just evacuate(old_ga)
+	IF_PAR_DEBUG(tables,
+		     belch(" evacuated %p to %p", old_la, new_la));
+#endif
+      }
+
+      gala->la = new_la;
+      /* remove old LA and replace with new LA */
+      if (/* !full && */ gala->preferred && new_la != old_la) {
+	GALA *q;
+	ASSERT(lookupHashTable(LAtoGALAtable, (StgWord)old_la));
+	(void) removeHashTable(LAtoGALAtable, (StgWord) old_la, (void *) gala);
+	if ((q = lookupHashTable(LAtoGALAtable, (StgWord) new_la))!=NULL) {
+	  /* the new address is already hashed: keep only one preferred GA
+	     per local address by deprecating the existing entry */
+	  if (q->preferred && gala->preferred) {
+	    q->preferred = rtsFalse;
+	    IF_PAR_DEBUG(tables,
+			 fprintf(stderr, "@@## found hash entry for closure %p (%s): deprecated GA ",
+				 new_la, info_type((StgClosure*)new_la));
+			 printGA(&(q->ga));
+			 fputc('\n', stderr));
+	  }
+	} else {
+	  insertHashTable(LAtoGALAtable, (StgWord) new_la, (void *) gala);
+	}
+	IF_PAR_DEBUG(tables,
+		     belch("__## Hash table update (%p --> %p): ",
+			   old_la, new_la));
+      }
+
+      /* push onto the (reversed) live list */
+      gala->next = prev;
+      prev = gala;
+    } else if(LOOKS_LIKE_STATIC_CLOSURE(gala->la)) {
+      /* to handle the CAFs, is this all?*/
+      MarkRoot(gala->la);
+      IF_PAR_DEBUG(tables,
+		   belch(" processed static closure"));
+      n++;
+      gala->next = prev;
+      prev = gala;
+    } else {
+      /* Since we have all of the weight, this GA is no longer needed */
+      StgWord pga = PackGA(thisPE, gala->ga.payload.gc.slot);
+
+      IF_PAR_DEBUG(free,
+		   belch("@@!! Freeing slot %d",
+			 gala->ga.payload.gc.slot));
+      /* put gala on free indirections list */
+      gala->next = freeIndirections;
+      freeIndirections = gala;
+      (void) removeHashTable(pGAtoGALAtable, pga, (void *) gala);
+      if (/* !full && */ gala->preferred)
+	(void) removeHashTable(LAtoGALAtable, (W_) gala->la, (void *) gala);
+
+      /* poison the freed entry so checkFreeIndirectionsList can verify it */
+      IF_DEBUG(sanity,
+	       gala->ga.weight = 0xdead0add;
+	       gala->la = (StgPtr) 0xdead00aa);
+    }
+  } /* for gala ... */
+  liveIndirections = prev;  /* list has been reversed during the marking */
+
+
+  PAR_TICKY_MARK_LOCAL_GAS_END(n);
+
+  IF_PAR_DEBUG(tables,
+	       belch("@@%%%% markLocalGAs: %d of %d GALAs marked on PE %x",
+		     n, m, mytid));
+}
+
+/*
+ Traverse the GALA table: for every live remote GA check whether it has been
+ touched during GC; if not it is not needed locally and we can free the
+ closure (i.e. let go of its heap space and send a free message to the
+ PE holding its GA).
+ This routine has to be run AFTER doing the GC proper.
+*/
+/* Post-GC pass over the liveRemoteGAs list (see comment above): any
+   remote GA whose closure was not evacuated is dead, so its GALA entry
+   is recycled and a FREE message (returning our weight) is queued for
+   the owning PE.  Survivors get their local address updated to the
+   to-space copy.  Finally the LA->GA table is rebuilt from scratch.
+   Side effect: the liveRemoteGAs list is reversed by the traversal. */
+void
+rebuildGAtables(rtsBool full)
+{
+  GALA *gala, *next, *prev;
+  StgClosure *closure;
+  nat n = 0, size_GA = 0; // stats only (no. of GAs, and their heap size in bytes)
+
+  IF_PAR_DEBUG(tables,
+	       belch("@@%%%% rebuildGAtables (full=%d): rebuilding LIVE REMOTE GAs in GALA table starting with GALA at %p\n",
+		     full, liveRemoteGAs));
+
+  PAR_TICKY_REBUILD_GA_TABLES_START();
+
+  /* set up per-PE buffers so FREE messages can be batched below */
+  prepareFreeMsgBuffers();
+
+  for (gala = liveRemoteGAs, prev = NULL; gala != NULL; gala = next) {
+    IF_PAR_DEBUG(tables,
+		 printGA(&(gala->ga)));
+    next = gala->next;
+    ASSERT(gala->ga.payload.gc.gtid != mytid); /* it's supposed to be remote */
+
+    closure = (StgClosure *) (gala->la);
+    IF_PAR_DEBUG(tables,
+		 fprintf(stderr, " %p (%s) ",
+			 (StgClosure *)closure, info_type(closure)));
+
+    if (/* !full && */ gala->preferred)
+      (void) removeHashTable(LAtoGALAtable, (StgWord) gala->la, (void *) gala);
+
+    /* Follow indirection chains to the end, just in case */
+    // should conform with unwinding in markLocalGAs
+    closure = UNWIND_IND(closure);
+
+    /*
+       If closure has been evacuated it is live; otherwise it's dead and we
+       can nuke the GA attached to it in the LAGA table.
+       This approach also drops global aliases for PLCs.
+    */
+
+    //ASSERT(!HEAP_ALLOCED(closure) || !(Bdescr((StgPtr)closure)->evacuated));
+    if (get_itbl(closure)->type == EVACUATED) {
+      closure = ((StgEvacuated *)closure)->evacuee;
+      IF_PAR_DEBUG(tables,
+		   fprintf(stderr, " EVAC %p (%s)\n",
+			   closure, info_type(closure)));
+    } else {
+      /* closure is not alive any more, thus remove GA and send free msg */
+      int pe = taskIDtoPE(gala->ga.payload.gc.gtid);
+      StgWord pga = PackGA(pe, gala->ga.payload.gc.slot);
+
+      /* check that the block containing this closure is not in to-space */
+      IF_PAR_DEBUG(tables,
+		   fprintf(stderr, " !EVAC %p (%s); sending free to PE %d\n",
+			   closure, info_type(closure), pe));
+
+      (void) removeHashTable(pGAtoGALAtable, pga, (void *) gala);
+      freeRemoteGA(pe-1, &(gala->ga)); //-1 cause ids start at 1... not 0
+      gala->next = freeGALAList;
+      freeGALAList = gala;
+      /* poison the freed entry so checkFreeGALAList can verify it */
+      IF_DEBUG(sanity,
+	       gala->ga.weight = 0xdead0add;
+	       gala->la = (StgPtr)0xdead00aa);
+      continue;
+    }
+    gala->la = (StgPtr)closure;
+    if (/* !full && */ gala->preferred) {
+      GALA *q;
+      if ((q = lookupHashTable(LAtoGALAtable, (StgWord) gala->la))!=NULL) {
+	/* keep only one preferred GA per local address (cf. markLocalGAs) */
+	if (q->preferred && gala->preferred) {
+	  q->preferred = rtsFalse;
+	  IF_PAR_DEBUG(tables,
+		       fprintf(stderr, "@@## found hash entry for closure %p (%s): deprecated GA ",
+			       gala->la, info_type((StgClosure*)gala->la));
+		       printGA(&(q->ga));
+		       fputc('\n', stderr));
+	}
+      } else {
+	insertHashTable(LAtoGALAtable, (StgWord) gala->la, (void *) gala);
+      }
+    }
+    gala->next = prev;
+    prev = gala;
+    /* Global statistics: count GAs and total size
+    if (RtsFlags.ParFlags.ParStats.Global &&
+	RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+      StgInfoTable *info;
+      nat size, ptrs, nonptrs, vhs, i;
+      char str[80];
+
+      info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+
+      size_GA += size ;
+      n++; // stats: count number of GAs we add to the new table
+    }
+    */
+  }
+  liveRemoteGAs = prev; /* list is reversed during marking */
+
+  /* If we have any remaining FREE messages to send off, do so now */
+  sendFreeMessages();
+
+  PAR_TICKY_CNT_FREE_GA();
+
+  IF_DEBUG(sanity,
+	   checkFreeGALAList();
+	   checkFreeIndirectionsList());
+
+  rebuildLAGAtable();
+
+#if defined(PAR_TICKY)
+  getLAGAtableSize(&n, &size_GA);              // determine no of GAs and global heap
+  PAR_TICKY_REBUILD_GA_TABLES_END(n, size_GA); // record these values
+#endif
+
+  IF_PAR_DEBUG(tables,
+	       belch("@#%%%% rebuildGAtables: After ReBuilding GALA table starting with GALA at %p",
+		     liveRemoteGAs);
+	       printLAGAtable());
+}
+
+/*
+ Rebuild the LA->GA table, assuming that the addresses in the GALAs are
+ correct.
+ A word on the lookupHashTable check in both loops:
+ After GC we may end up with 2 preferred GAs for the same LA! For example,
+ if we received a closure whose GA already exists on this PE we CommonUp
+ both closures, making one an indirection to the other. Before GC everything
+ is fine: one preferred GA refers to the IND, the other preferred GA refers
+ to the closure it points to. After GC, however, we have short cutted the
+ IND and suddenly we have 2 preferred GAs for the same closure. We detect
+ this case in the loop below and deprecate one GA, so that we always just
+ have one preferred GA per LA.
+*/
+
+//@cindex rebuildLAGAtable
+/* Throw away the old LA->GA hash table and repopulate it from the
+   liveIndirections and liveRemoteGAs lists, deprecating duplicate
+   preferred GAs for the same local address (see comment above for why
+   duplicates can arise after GC short-cuts indirections). */
+void
+rebuildLAGAtable(void)
+{
+  GALA *gala;
+  nat n=0, m=0; // debugging
+
+  /* The old LA->GA table is worthless */
+  freeHashTable(LAtoGALAtable, NULL);
+  LAtoGALAtable = allocHashTable();
+
+  IF_PAR_DEBUG(tables,
+	       belch("@@%%%% rebuildLAGAtable: new LAGA table at %p",
+		     LAtoGALAtable));
+
+  /* first pass: locally-owned GAs */
+  for (gala = liveIndirections; gala != NULL; gala = gala->next) {
+    n++;
+    if (gala->preferred) {
+      GALA *q;
+      if ((q = lookupHashTable(LAtoGALAtable, (StgWord) gala->la))!=NULL) {
+	if (q->preferred && gala->preferred) {
+	  /* this deprecates q (see also GALAdeprecate) */
+	  q->preferred = rtsFalse;
+	  (void) removeHashTable(LAtoGALAtable, (StgWord) gala->la, (void *)q);
+	  IF_PAR_DEBUG(tables,
+		       fprintf(stderr, "@@## found hash entry for closure %p (%s): deprecated GA ",
+			       gala->la, info_type((StgClosure*)gala->la));
+		       printGA(&(q->ga));
+		       fputc('\n', stderr));
+	}
+      }
+      insertHashTable(LAtoGALAtable, (StgWord) gala->la, (void *) gala);
+    }
+  }
+
+  /* second pass: remotely-owned GAs, same deprecation logic */
+  for (gala = liveRemoteGAs; gala != NULL; gala = gala->next) {
+    m++;
+    if (gala->preferred) {
+      GALA *q;
+      if ((q = lookupHashTable(LAtoGALAtable, (StgWord) gala->la))!=NULL) {
+	if (q->preferred && gala->preferred) {
+	  /* this deprecates q (see also GALAdeprecate) */
+	  q->preferred = rtsFalse;
+	  (void) removeHashTable(LAtoGALAtable, (StgWord) gala->la, (void *)q);
+	  IF_PAR_DEBUG(tables,
+		       fprintf(stderr, "@@## found hash entry for closure %p (%s): deprecated GA ",
+			       (StgClosure*)gala->la, info_type((StgClosure*)gala->la));
+		       printGA(&(q->ga));
+		       fputc('\n', stderr));
+	}
+      }
+      insertHashTable(LAtoGALAtable, (StgWord) gala->la, (void *) gala);
+    }
+  }
+
+  IF_PAR_DEBUG(tables,
+	       belch("@@%%%% rebuildLAGAtable: inserted %d entries from liveIndirections and %d entries from liveRemoteGAs",
+		     n,m));
+}
+
+/*
+ Determine the size of the LAGA and GALA tables.
+ Has to be done after rebuilding the tables.
+ Only used for global statistics gathering.
+*/
+
+//@cindex getLAGAtableSize
+/* Count the globalised closures (*nP) and their total heap size in
+   words/bytes as reported by get_closure_info (*sizeP), walking both
+   GALA lists.  A scratch hash table de-duplicates closures reachable
+   from more than one GALA entry.  Stats gathering only; must run after
+   the tables have been rebuilt. */
+void
+getLAGAtableSize(nat *nP, nat *sizeP)
+{
+  GALA *gala;
+  // nat n=0, tot_size=0;
+  StgClosure *closure;
+  StgInfoTable *info;  // NOTE(review): result of get_closure_info is ignored
+  nat size, ptrs, nonptrs, vhs, i;  // NOTE(review): i is unused
+  char str[80];
+  /* IN order to avoid counting closures twice we maintain a hash table
+     of all closures seen so far.
+     ToDo: collect this data while rebuilding the GALA table and make use
+           of the existing hash tables;
+  */
+  HashTable *closureTable;  // hash table for closures encountered already
+
+  closureTable = allocHashTable();
+
+  (*nP) = (*sizeP) = 0;
+  for (gala = liveIndirections; gala != NULL; gala = gala->next) {
+    closure = (StgClosure*) gala->la;
+    if (lookupHashTable(closureTable, (StgWord)closure)==NULL) { // not seen yet
+      insertHashTable(closureTable, (StgWord)closure, (void *)1);
+      info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+      (*sizeP) += size ;   // stats: measure total heap size of global closures
+      (*nP)++;             // stats: count number of GAs
+    }
+  }
+
+  for (gala = liveRemoteGAs; gala != NULL; gala = gala->next) {
+    closure = (StgClosure*) gala->la;
+    if (lookupHashTable(closureTable, (StgWord)closure)==NULL) { // not seen yet
+      insertHashTable(closureTable, (StgWord)closure, (void *)1);
+      info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+      (*sizeP) += size ;   // stats: measure total heap size of global closures
+      (*nP)++;             // stats: count number of GAs
+    }
+  }
+
+  freeHashTable(closureTable, NULL);
+}
+
+//@node Debugging routines, Index, GC functions for GALA tables, Global Address Manipulation
+//@subsection Debugging routines
+
+//@cindex printGA
+/* Print a global address as ((gtid, slot, weight)) to stderr.
+   Debugging only. */
+void
+printGA (globalAddr *ga)
+{
+  fprintf(stderr, "((%x, %d, %x))",
+	  ga->payload.gc.gtid,
+	  ga->payload.gc.slot,
+	  ga->weight);
+}
+
+//@cindex printGALA
+/* Print one GALA entry: its GA, the local address with the closure's
+   info type, and whether it is the preferred GA.  Debugging only. */
+void
+printGALA (GALA *gala)
+{
+  printGA(&(gala->ga));
+  fprintf(stderr, " -> %p (%s)",
+	  (StgClosure*)gala->la, info_type((StgClosure*)gala->la));
+  fprintf(stderr, " %s",
+	  (gala->preferred) ? "PREF" : "____");
+}
+
+/*
+ Printing the LA->GA table.
+*/
+
+//@cindex printLiveIndTable
+/* Dump the liveIndirections list, cross-checking each entry against the
+   LA->GA hash table ("====" found and identical, "####" found but a
+   different entry, "...." not hashed).  Debugging only. */
+void
+printLiveIndTable(void)
+{
+  GALA *gala, *q;
+  nat n=0; // debugging
+
+  belch("@@%%%%:: logical LiveIndTable (%p) (liveIndirections=%p):",
+	LAtoGALAtable, liveIndirections);
+
+  for (gala = liveIndirections; gala != NULL; gala = gala->next) {
+    n++;
+    printGALA(gala);
+    /* check whether this gala->la is hashed into the LAGA table */
+    q = lookupHashTable(LAtoGALAtable, (StgWord)(gala->la));
+    fprintf(stderr, "\t%s\n", (q==NULL) ? "...." : (q==gala) ? "====" : "####");
+    //ASSERT(lookupHashTable(LAtoGALAtable, (StgWord)(gala->la)));
+  }
+  belch("@@%%%%:: %d live indirections",
+	n);
+}
+
+/* Dump the liveRemoteGAs list, with the same LAGA-table cross-check
+   markers as printLiveIndTable.  Debugging only. */
+void
+printRemoteGATable(void)
+{
+  GALA *gala, *q;
+  nat m=0; // debugging
+
+  belch("@@%%%%:: logical RemoteGATable (%p) (liveRemoteGAs=%p):",
+	LAtoGALAtable, liveRemoteGAs);
+
+  for (gala = liveRemoteGAs; gala != NULL; gala = gala->next) {
+    m++;
+    printGALA(gala);
+    /* check whether this gala->la is hashed into the LAGA table */
+    q = lookupHashTable(LAtoGALAtable, (StgWord)(gala->la));
+    fprintf(stderr, "\t%s\n", (q==NULL) ? "...." : (q==gala) ? "====" : "####");
+    // ASSERT(lookupHashTable(LAtoGALAtable, (StgWord)(gala->la)));
+  }
+  belch("@@%%%%:: %d remote GAs",
+	m);
+}
+
+//@cindex printLAGAtable
+/* Print both halves of the logical LAGA table (local indirections and
+   remote GAs).  Debugging only. */
+void
+printLAGAtable(void)
+{
+  belch("@@%%: LAGAtable (%p) with liveIndirections=%p, liveRemoteGAs=%p:",
+	LAtoGALAtable, liveIndirections, liveRemoteGAs);
+
+  printLiveIndTable();
+  printRemoteGATable();
+}
+
+/*
+ Check whether a GA is already in a list.
+*/
+/* Linear search of liveIndirections for an entry whose GA matches *ga
+   on all three fields (weight, slot, gtid).  Debug/sanity use only. */
+rtsBool
+isOnLiveIndTable(globalAddr *ga)
+{
+  GALA *gala;
+
+  for (gala = liveIndirections; gala != NULL; gala = gala->next)
+    if (gala->ga.weight==ga->weight &&
+	gala->ga.payload.gc.slot==ga->payload.gc.slot &&
+	gala->ga.payload.gc.gtid==ga->payload.gc.gtid)
+      return rtsTrue;
+
+  return rtsFalse;
+}
+
+/* Linear search of liveRemoteGAs for an entry whose GA matches *ga on
+   all three fields (weight, slot, gtid).  Debug/sanity use only. */
+rtsBool
+isOnRemoteGATable(globalAddr *ga)
+{
+  GALA *gala;
+
+  for (gala = liveRemoteGAs; gala != NULL; gala = gala->next)
+    if (gala->ga.weight==ga->weight &&
+	gala->ga.payload.gc.slot==ga->payload.gc.slot &&
+	gala->ga.payload.gc.gtid==ga->payload.gc.gtid)
+      return rtsTrue;
+
+  return rtsFalse;
+}
+
+/*
+ Sanity check for free lists.
+*/
+/* Sanity check: every entry on freeGALAList must carry the poison
+   values (0xdead0add / 0xdead00aa) written when it was freed in
+   rebuildGAtables. */
+void
+checkFreeGALAList(void) {
+  GALA *gl;
+
+  for (gl=freeGALAList; gl != NULL; gl=gl->next) {
+    ASSERT(gl->ga.weight==0xdead0add);
+    ASSERT(gl->la==(StgPtr)0xdead00aa);
+  }
+}
+
+/* Sanity check: every entry on freeIndirections must carry the poison
+   values (0xdead0add / 0xdead00aa) written when it was freed in
+   markLocalGAs. */
+void
+checkFreeIndirectionsList(void) {
+  GALA *gl;
+
+  for (gl=freeIndirections; gl != NULL; gl=gl->next) {
+    ASSERT(gl->ga.weight==0xdead0add);
+    ASSERT(gl->la==(StgPtr)0xdead00aa);
+  }
+}
+#endif /* PAR -- whole file */
+
+//@node Index, , Debugging routines, Global Address Manipulation
+//@subsection Index
+
+//@index
+//* DebugPrintLAGAtable:: @cindex\s-+DebugPrintLAGAtable
+//* GALAlookup:: @cindex\s-+GALAlookup
+//* LAGAlookup:: @cindex\s-+LAGAlookup
+//* LAtoGALAtable:: @cindex\s-+LAtoGALAtable
+//* PackGA:: @cindex\s-+PackGA
+//* addWeight:: @cindex\s-+addWeight
+//* allocGALA:: @cindex\s-+allocGALA
+//* allocIndirection:: @cindex\s-+allocIndirection
+//* freeIndirections:: @cindex\s-+freeIndirections
+//* initGAtables:: @cindex\s-+initGAtables
+//* liveIndirections:: @cindex\s-+liveIndirections
+//* liveRemoteGAs:: @cindex\s-+liveRemoteGAs
+//* makeGlobal:: @cindex\s-+makeGlobal
+//* markLocalGAs:: @cindex\s-+markLocalGAs
+//* nextIndirection:: @cindex\s-+nextIndirection
+//* pGAtoGALAtable:: @cindex\s-+pGAtoGALAtable
+//* printGA:: @cindex\s-+printGA
+//* printGALA:: @cindex\s-+printGALA
+//* rebuildLAGAtable:: @cindex\s-+rebuildLAGAtable
+//* registerTask:: @cindex\s-+registerTask
+//* setRemoteGA:: @cindex\s-+setRemoteGA
+//* splitWeight:: @cindex\s-+splitWeight
+//* taskIDtoPE:: @cindex\s-+taskIDtoPE
+//* taskIDtoPEtable:: @cindex\s-+taskIDtoPEtable
+//* thisPE:: @cindex\s-+thisPE
+//@end index
diff --git a/rts/parallel/GranSim.c b/rts/parallel/GranSim.c
new file mode 100644
index 0000000000..b1cc0962be
--- /dev/null
+++ b/rts/parallel/GranSim.c
@@ -0,0 +1,3015 @@
+/*
+ Time-stamp: <Tue Mar 06 2001 00:17:42 Stardate: [-30]6285.06 hwloidl>
+
+ Variables and functions specific to GranSim the parallelism simulator
+ for GPH.
+*/
+
+//@node GranSim specific code, , ,
+//@section GranSim specific code
+
+/*
+ Macros for dealing with the new and improved GA field for simulating
+ parallel execution. Based on @CONCURRENT@ package. The GA field now
+ contains a mask, where the n-th bit stands for the n-th processor, where
+ this data can be found. In case of multiple copies, several bits are
+ set. The total number of processors is bounded by @MAX_PROC@, which
+ should be <= the length of a word in bits. -- HWL
+*/
+
+//@menu
+//* Includes::
+//* Prototypes and externs::
+//* Constants and Variables::
+//* Initialisation::
+//* Global Address Operations::
+//* Global Event Queue::
+//* Spark queue functions::
+//* Scheduling functions::
+//* Thread Queue routines::
+//* GranSim functions::
+//* GranSimLight routines::
+//* Code for Fetching Nodes::
+//* Idle PEs::
+//* Routines directly called from Haskell world::
+//* Emiting profiling info for GrAnSim::
+//* Dumping routines::
+//* Index::
+//@end menu
+
+//@node Includes, Prototypes and externs, GranSim specific code, GranSim specific code
+//@subsection Includes
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "StgMiscClosures.h"
+#include "StgTypes.h"
+#include "Schedule.h"
+#include "SchedAPI.h" // for pushClosure
+#include "GranSimRts.h"
+#include "GranSim.h"
+#include "ParallelRts.h"
+#include "ParallelDebug.h"
+#include "Sparks.h"
+#include "Storage.h" // for recordMutable
+
+
+//@node Prototypes and externs, Constants and Variables, Includes, GranSim specific code
+//@subsection Prototypes and externs
+
+#if defined(GRAN)
+
+/* Prototypes */
+static inline PEs ga_to_proc(StgWord);
+static inline rtsBool any_idle(void);
+static inline nat idlers(void);
+ PEs where_is(StgClosure *node);
+
+static rtsBool stealSomething(PEs proc, rtsBool steal_spark, rtsBool steal_thread);
+static rtsBool stealSpark(PEs proc);
+static rtsBool stealThread(PEs proc);
+static rtsBool stealSparkMagic(PEs proc);
+static rtsBool stealThreadMagic(PEs proc);
+/* subsumed by stealSomething
+static void stealThread(PEs proc);
+static void stealSpark(PEs proc);
+*/
+static rtsTime sparkStealTime(void);
+static nat natRandom(nat from, nat to);
+static PEs findRandomPE(PEs proc);
+static void sortPEsByTime (PEs proc, PEs *pes_by_time,
+ nat *firstp, nat *np);
+
+void GetRoots(void);
+
+#endif /* GRAN */
+
+//@node Constants and Variables, Initialisation, Prototypes and externs, GranSim specific code
+//@subsection Constants and Variables
+
+#if defined(GRAN) || defined(PAR)
+/* See GranSim.h for the definition of the enum gran_event_types */
+char *gran_event_names[] = {
+ "START", "START(Q)",
+ "STEALING", "STOLEN", "STOLEN(Q)",
+ "FETCH", "REPLY", "BLOCK", "RESUME", "RESUME(Q)",
+ "SCHEDULE", "DESCHEDULE",
+ "END",
+ "SPARK", "SPARKAT", "USED", "PRUNED", "EXPORTED", "ACQUIRED",
+ "ALLOC",
+ "TERMINATE",
+ "SYSTEM_START", "SYSTEM_END", /* only for debugging */
+ "??"
+};
+#endif
+
+#if defined(GRAN) /* whole file */
+char *proc_status_names[] = {
+ "Idle", "Sparking", "Starting", "Fetching", "Fishing", "Busy",
+ "UnknownProcStatus"
+};
+
+/* For internal use (event statistics) only */
+char *event_names[] =
+ { "ContinueThread", "StartThread", "ResumeThread",
+ "MoveSpark", "MoveThread", "FindWork",
+ "FetchNode", "FetchReply",
+ "GlobalBlock", "UnblockThread"
+ };
+
+//@cindex CurrentProc
+PEs CurrentProc = 0;
+
+/*
+ ToDo: Create a structure for the processor status and put all the
+ arrays below into it.
+ -- HWL */
+
+//@cindex CurrentTime
+/* One clock for each PE */
+rtsTime CurrentTime[MAX_PROC];
+
+/* Useful to restrict communication; cf fishing model in GUM */
+nat OutstandingFetches[MAX_PROC], OutstandingFishes[MAX_PROC];
+
+/* Status of each PE (new since but independent of GranSim Light) */
+rtsProcStatus procStatus[MAX_PROC];
+
+# if defined(GRAN) && defined(GRAN_CHECK)
+/* To check if the RTS ever tries to run a thread that should be blocked
+ because of fetching remote data */
+StgTSO *BlockedOnFetch[MAX_PROC];
+# define FETCH_MASK_TSO 0x08000000 /* only bits 0, 1, 2 should be used */
+# endif
+
+nat SparksAvail = 0; /* How many sparks are available */
+nat SurplusThreads = 0; /* How many excess threads are there */
+
+/* Do we need to reschedule following a fetch? */
+rtsBool NeedToReSchedule = rtsFalse, IgnoreEvents = rtsFalse, IgnoreYields = rtsFalse;
+rtsTime TimeOfNextEvent, TimeOfLastEvent, EndOfTimeSlice; /* checked from the threaded world! */
+
+//@cindex spark queue
+/* GranSim: a globally visible array of spark queues */
+rtsSparkQ pending_sparks_hds[MAX_PROC];
+rtsSparkQ pending_sparks_tls[MAX_PROC];
+
+nat sparksIgnored = 0, sparksCreated = 0;
+
+GlobalGranStats globalGranStats;
+
+nat gran_arith_cost, gran_branch_cost, gran_load_cost,
+ gran_store_cost, gran_float_cost;
+
+/*
+Old comment from 0.29. ToDo: Check and update -- HWL
+
+The following variables control the behaviour of GrAnSim. In general, there
+is one RTS option for enabling each of these features. In getting the
+desired setup of GranSim the following questions have to be answered:
+\begin{itemize}
+\item {\em Which scheduling algorithm} to use (@RtsFlags.GranFlags.DoFairSchedule@)?
+ Currently only unfair scheduling is supported.
+\item What to do when remote data is fetched (@RtsFlags.GranFlags.DoAsyncFetch@)?
+ Either block and wait for the
+ data or reschedule and do some other work.
+ Thus, if this variable is true, asynchronous communication is
+ modelled. Block on fetch mainly makes sense for incremental fetching.
+
+ There is also a simplified fetch variant available
+ (@RtsFlags.GranFlags.SimplifiedFetch@). This variant does not use events to model
+ communication. It is faster but the results will be less accurate.
+\item How aggressive to be in getting work after a reschedule on fetch
+ (@RtsFlags.GranFlags.FetchStrategy@)?
+ This is determined by the so-called {\em fetching
+ strategy\/}. Currently, there are four possibilities:
+ \begin{enumerate}
+ \item Only run a runnable thread.
+ \item Turn a spark into a thread, if necessary.
+ \item Steal a remote spark, if necessary.
+ \item Steal a runnable thread from another processor, if necessary.
+ \end{itemize}
+ The variable @RtsFlags.GranFlags.FetchStrategy@ determines how far to go in this list
+ when rescheduling on a fetch.
+\item Should sparks or threads be stolen first when looking for work
+ (@RtsFlags.GranFlags.DoStealThreadsFirst@)?
+ The default is to steal sparks first (much cheaper).
+\item Should the RTS use a lazy thread creation scheme
+ (@RtsFlags.GranFlags.DoAlwaysCreateThreads@)? By default yes i.e.\ sparks are only
+ turned into threads when work is needed. Also note, that sparks
+ can be discarded by the RTS (this is done in the case of an overflow
+ of the spark pool). Setting @RtsFlags.GranFlags.DoAlwaysCreateThreads@ to @True@ forces
+ the creation of threads at the next possibility (i.e.\ when new work
+ is demanded the next time).
+\item Should data be fetched closure-by-closure or in packets
+ (@RtsFlags.GranFlags.DoBulkFetching@)? The default strategy is a GRIP-like incremental
+ (i.e.\ closure-by-closure) strategy. This makes sense in a
+ low-latency setting but is bad in a high-latency system. Setting
+ @RtsFlags.GranFlags.DoBulkFetching@ to @True@ enables bulk (packet) fetching. Other
+ parameters determine the size of the packets (@pack_buffer_size@) and the number of
+ thunks that should be put into one packet (@RtsFlags.GranFlags.ThunksToPack@).
+\item If there is no other possibility to find work, should runnable threads
+ be moved to an idle processor (@RtsFlags.GranFlags.DoThreadMigration@)? In any case, the
+ RTS tried to get sparks (either local or remote ones) first. Thread
+ migration is very expensive, since a whole TSO has to be transferred
+ and probably data locality becomes worse in the process. Note, that
+ the closure, which will be evaluated next by that TSO is not
+ transferred together with the TSO (that might block another thread).
+\item Should the RTS distinguish between sparks created by local nodes and
+ stolen sparks (@RtsFlags.GranFlags.PreferSparksOfLocalNodes@)? The idea is to improve
+ data locality by preferring sparks of local nodes (it is more likely
+ that the data for those sparks is already on the local processor).
+ However, such a distinction also imposes an overhead on the spark
+ queue management, and typically a large number of sparks are
+ generated during execution. By default this variable is set to @False@.
+\item Should the RTS use granularity control mechanisms? The idea of a
+ granularity control mechanism is to make use of granularity
+ information provided via annotation of the @par@ construct in order
+ to prefer bigger threads when either turning a spark into a thread or
+ when choosing the next thread to schedule. Currently, three such
+ mechanisms are implemented:
+ \begin{itemize}
+ \item Cut-off: The granularity information is interpreted as a
+ priority. If a threshold priority is given to the RTS, then
+ only those sparks with a higher priority than the threshold
+ are actually created. Other sparks are immediately discarded.
+ This is similar to a usual cut-off mechanism often used in
+ parallel programs, where parallelism is only created if the
+ input data is large enough. With this option, the choice is
+ hidden in the RTS and only the threshold value has to be
+ provided as a parameter to the runtime system.
+ \item Priority Sparking: This mechanism keeps priorities for sparks
+ and chooses the spark with the highest priority when turning
+ a spark into a thread. After that the priority information is
+ discarded. The overhead of this mechanism comes from
+ maintaining a sorted spark queue.
+ \item Priority Scheduling: This mechanism keeps the granularity
+ information for threads, too. Thus, on each reschedule the
+ largest thread is chosen. This mechanism has a higher
+ overhead, as the thread queue is sorted, too.
+ \end{itemize}
+\end{itemize}
+*/
+
+//@node Initialisation, Global Address Operations, Constants and Variables, GranSim specific code
+//@subsection Initialisation
+
+/* Zero the global GranSim statistics record.  The memset covers every
+   field; the #if 0 block below documents the individual fields it
+   replaces. */
+void
+init_gr_stats (void) {
+  memset(&globalGranStats, '\0', sizeof(GlobalGranStats));
+#if 0
+  /* event stats */
+  globalGranStats.noOfEvents = 0;
+  for (i=0; i<MAX_EVENT; i++) globalGranStats.event_counts[i]=0;
+
+  /* communication stats */
+  globalGranStats.fetch_misses = 0;
+  globalGranStats.tot_low_pri_sparks = 0;
+
+  /* obscure stats */
+  globalGranStats.rs_sp_count = 0;
+  globalGranStats.rs_t_count = 0;
+  globalGranStats.ntimes_total = 0,
+  globalGranStats.fl_total = 0;
+  globalGranStats.no_of_steals = 0;
+
+  /* spark queue stats */
+  globalGranStats.tot_sq_len = 0,
+  globalGranStats.tot_sq_probes = 0;
+  globalGranStats.tot_sparks = 0;
+  globalGranStats.withered_sparks = 0;
+  globalGranStats.tot_add_threads = 0;
+  globalGranStats.tot_tq_len = 0;
+  globalGranStats.non_end_add_threads = 0;
+
+  /* thread stats */
+  globalGranStats.tot_threads_created = 0;
+  for (i=0; i<MAX_PROC; i++) globalGranStats.threads_created_on_PE[i]=0;
+#endif /* 0 */
+}
+
+//@node Global Address Operations, Global Event Queue, Initialisation, GranSim specific code
+//@subsection Global Address Operations
+/*
+ ----------------------------------------------------------------------
+ Global Address Operations
+
+ These functions perform operations on the global-address (ga) part of a
+ closure. The ga is the only new field (1 word) in a closure introduced by
+ GrAnSim. It serves as a bitmask, indicating on which processor the
+ closure is residing. Since threads are described by Thread State Object
+ (TSO), which is nothing but another kind of closure, this scheme allows
+ gives placement information about threads.
+
+ A ga is just a bitmask, so the operations on them are mainly bitmask
+ manipulating functions. Note, that there are important macros like PROCS,
+ IS_LOCAL_TO etc. They are defined in @GrAnSim.lh@.
+
+ NOTE: In GrAnSim-light we don't maintain placement information. This
+ allows to simulate an arbitrary number of processors. The price we have
+ to pay is the lack of costing any communication properly. In short,
+ GrAnSim-light is meant to reveal the maximal parallelism in a program.
+ From an implementation point of view the important thing is: {\em
+ GrAnSim-light does not maintain global-addresses}. */
+
+/* ga_to_proc returns the first processor marked in the bitmask ga.
+ Normally only one bit in ga should be set. But for PLCs all bits
+ are set. That shouldn't hurt since we only need IS_LOCAL_TO for PLCs */
+
+//@cindex ga_to_proc
+
+/* Return the index of the first PE whose bit is set in the bitmask ga
+   (see comment above).  Asserts that at least one bit within the
+   configured number of PEs is set. */
+static inline PEs
+ga_to_proc(StgWord ga)
+{
+  PEs i;
+  for (i = 0; i < RtsFlags.GranFlags.proc && !IS_LOCAL_TO(ga, i); i++);
+  ASSERT(i<RtsFlags.GranFlags.proc);  // ga must mark at least one valid PE
+  return (i);
+}
+
+/* NB: This takes a *node* rather than just a ga as input */
+//@cindex where_is
+/* Return the (first) PE on which the given closure resides, by reading
+   its PROCS bitmask.  NB: takes a *node*, not a ga. */
+PEs
+where_is(StgClosure *node)
+{ return (ga_to_proc(PROCS(node))); }
+
+// debugging only
+//@cindex is_unique
+/* rtsTrue iff exactly one PE bit is set in the closure's PROCS mask.
+   Returns rtsFalse early on finding a second instance; the trailing
+   ASSERT fires if *no* instance was found at all.
+   NB: the braceless else binds to the inner "if (unique)", which is
+   what the indentation (and the logic) intends. */
+rtsBool
+is_unique(StgClosure *node)
+{
+  PEs i;
+  rtsBool unique = rtsFalse;
+
+  for (i = 0; i < RtsFlags.GranFlags.proc ; i++)
+    if (IS_LOCAL_TO(PROCS(node), i))
+      if (unique)          // exactly 1 instance found so far
+	return rtsFalse;   // found a 2nd instance => not unique
+      else
+	unique = rtsTrue;  // found 1st instance
+  ASSERT(unique);          // otherwise returned from within loop
+  return (unique);
+}
+
+//@cindex any_idle
+/* rtsTrue iff at least one PE is currently Idle.
+   BUG FIX: the original version computed the flag but fell off the end
+   of this non-void function without returning it, which is undefined
+   behaviour when the caller uses the result (C99 6.9.1p12). */
+static inline rtsBool
+any_idle(void) { /* any (map (\ i -> procStatus[i] == Idle)) [0,..,MAX_PROC] */
+  PEs i;
+  rtsBool any_idle;
+  for(i=0, any_idle=rtsFalse;
+      !any_idle && i<RtsFlags.GranFlags.proc;
+      any_idle = any_idle || procStatus[i] == Idle, i++)
+    {} ;
+  return any_idle;  /* was missing: result was never returned */
+}
+
+//@cindex idlers
+/* Count how many PEs (of the configured RtsFlags.GranFlags.proc) are
+   currently in the Idle state. */
+static inline nat
+idlers(void) { /* number of idle PEs */
+  PEs i, j;
+  for(i=0, j=0;
+      i<RtsFlags.GranFlags.proc;
+      j += (procStatus[i] == Idle) ? 1 : 0, i++)
+    {} ;
+  return j;
+}
+
+//@node Global Event Queue, Spark queue functions, Global Address Operations, GranSim specific code
+//@subsection Global Event Queue
+/*
+The following routines implement an ADT of an event-queue (FIFO).
+ToDo: Put that in an own file(?)
+*/
+
+/* Pointer to the global event queue; events are currently malloc'ed */
+rtsEventQ EventHd = NULL;
+
+//@cindex get_next_event
+/* Pop and return the head of the global event queue.  Barfs if the
+   queue is empty (deadlock / circular dependency in the simulated
+   program).
+   Ownership note: the returned event is only valid until the *next*
+   call -- the previously returned event (kept in the static 'entry')
+   is free()d lazily here, so callers must not retain events. */
+rtsEvent *
+get_next_event(void)
+{
+  static rtsEventQ entry = NULL;
+
+  if (EventHd == NULL) {
+    barf("No next event. This may be caused by a circular data dependency in the program.");
+  }
+
+  /* free the event handed out on the previous call */
+  if (entry != NULL)
+    free((char *)entry);
+
+  if (RtsFlags.GranFlags.GranSimStats.Global) {     /* count events */
+    globalGranStats.noOfEvents++;
+    globalGranStats.event_counts[EventHd->evttype]++;
+  }
+
+  entry = EventHd;
+
+  IF_GRAN_DEBUG(event_trace,
+	   print_event(entry));
+
+  EventHd = EventHd->next;
+  return(entry);
+}
+
+/* When getting the time of the next event we ignore CONTINUETHREAD events:
+ we don't want to be interrupted before the end of the current time slice
+ unless there is something important to handle.
+*/
+//@cindex get_time_of_next_event
+/* Time stamp of the next non-ContinueThread event, or 0 if there is
+   none (see comment above for why ContinueThread events are skipped). */
+rtsTime
+get_time_of_next_event(void)
+{
+  rtsEventQ event = EventHd;
+
+  while (event != NULL && event->evttype==ContinueThread) {
+    event = event->next;
+  }
+  if(event == NULL)
+      return ((rtsTime) 0);
+  else
+      return (event->time);
+}
+
+/* ToDo: replace malloc/free with a free list */
+//@cindex insert_event
+/* Insert 'newentry' into the global event queue, keeping it sorted by
+   time with type-dependent tie-breaking (see the long comments below).
+   Within the loop: 'continue' means keep scanning past this event;
+   'break' out of the switch drops into the insertion code, which links
+   newentry in *before* the current event and then leaves the loop. */
+void
+insert_event(newentry)
+rtsEvent *newentry;
+{
+  rtsEventType evttype = newentry->evttype;
+  rtsEvent *event, **prev;
+
+  /* if(evttype >= CONTINUETHREAD1) evttype = CONTINUETHREAD; */
+
+  /* Search the queue and insert at the right point:
+     FINDWORK before everything, CONTINUETHREAD after everything.
+
+     This ensures that we find any available work after all threads have
+     executed the current cycle.  This level of detail would normally be
+     irrelevant, but matters for ridiculously low latencies...
+  */
+
+  /* Changed the ordering: Now FINDWORK comes after everything but
+     CONTINUETHREAD. This makes sure that a MOVESPARK comes before a
+     FINDWORK. This is important when a GranSimSparkAt happens and
+     DoAlwaysCreateThreads is turned on. Also important if a GC occurs
+     when trying to build a new thread (see much_spark) -- HWL 02/96 */
+
+  if(EventHd == NULL)
+    EventHd = newentry;
+  else {
+    for (event = EventHd, prev=(rtsEvent**)&EventHd;
+	 event != NULL;
+	 prev = (rtsEvent**)&(event->next), event = event->next) {
+      switch (evttype) {
+	case FindWork: if ( event->time < newentry->time ||
+	        	    ( (event->time == newentry->time) &&
+			      (event->evttype != ContinueThread) ) )
+	 	         continue;
+		       else
+		         break;
+	case ContinueThread: if ( event->time <= newentry->time )
+			       continue;
+			     else
+			       break;
+	default: if ( event->time < newentry->time ||
+		      ((event->time == newentry->time) &&
+		       (event->evttype == newentry->evttype)) )
+		   continue;
+		 else
+		   break;
+      }
+      /* Insert newentry here (i.e. before event) */
+      *prev = newentry;
+      newentry->next = event;
+      break;
+    }
+    /* scanned off the end: newentry belongs at the tail */
+    if (event == NULL)
+      *prev = newentry;
+  }
+}
+
+//@cindex new_event
+/* Allocate a fresh event, fill in every field from the arguments and
+   hand it to insert_event, which places it at the correct queue
+   position.  (K&R header kept for consistency with the rest of file.) */
+void
+new_event(proc,creator,time,evttype,tso,node,spark)
+PEs proc, creator;
+rtsTime time;
+rtsEventType evttype;
+StgTSO *tso;
+StgClosure *node;
+rtsSpark *spark;
+{
+  rtsEvent *ev;
+
+  ev = (rtsEvent *) stgMallocBytes(sizeof(rtsEvent), "new_event");
+
+  ev->proc    = proc;
+  ev->creator = creator;
+  ev->time    = time;
+  ev->evttype = evttype;
+  ev->tso     = tso;
+  ev->node    = node;
+  ev->spark   = spark;
+  ev->gc_info = 0;
+  ev->next    = NULL;
+
+  insert_event(ev);
+
+  IF_DEBUG(gran,
+	   fprintf(stderr, "GRAN: new_event: \n");
+	   print_event(ev));
+}
+
+//@cindex prepend_event
+/* Push 'event' onto the front of the global event queue; undone by
+   grab_event.  Only used around garbage collection. */
+void
+prepend_event(event)
+rtsEvent *event;
+{
+  rtsEvent *old_head = EventHd;
+
+  event->next = old_head;
+  EventHd = event;
+}
+
+//@cindex grab_event
+/* Pop and return the event at the head of the queue WITHOUT freeing
+   anything; this is the inverse of prepend_event. */
+rtsEventQ
+grab_event(void)
+{
+  rtsEventQ head = EventHd;
+
+  if (head == NULL) {
+    barf("No next event (in grab_event). This may be caused by a circular data dependency in the program.");
+  }
+
+  EventHd = head->next;
+  return head;
+}
+
+//@cindex traverse_eventq_for_gc
+/* Prepare the event queue for GC when bulk (GUM-style) fetching is on:
+   every in-flight FetchReply (whose 'node' is really a pack buffer) is
+   unwound back into a FetchNode for the original closure, so the graph
+   is simply re-fetched after GC instead of having the buffer traced. */
+void
+traverse_eventq_for_gc(void)
+{
+  rtsEventQ event = EventHd;
+  StgWord bufsize;
+  StgClosure *closurep;
+  StgTSO *tsop;
+  StgPtr buffer, bufptr;
+  PEs proc, creator;
+
+  /* Traverse eventq and replace every FETCHREPLY by a FETCHNODE for the
+     orig closure (root of packed graph). This means that a graph, which is
+     between processors at the time of GC is fetched again at the time when
+     it would have arrived, had there been no GC. Slightly inaccurate but
+     safe for GC.
+     This is only needed for GUM style fetching. -- HWL */
+  if (!RtsFlags.GranFlags.DoBulkFetching)
+    return;
+
+  for(event = EventHd; event!=NULL; event=event->next) {
+    if (event->evttype==FetchReply) {
+      buffer = stgCast(StgPtr,event->node);
+      ASSERT(buffer[PACK_FLAG_LOCN]==MAGIC_PACK_FLAG);  /* It's a pack buffer */
+      bufsize = buffer[PACK_SIZE_LOCN];
+      closurep = stgCast(StgClosure*,buffer[PACK_HDR_SIZE]);
+      tsop = stgCast(StgTSO*,buffer[PACK_TSO_LOCN]);
+      proc = event->proc;
+      creator = event->creator;                 /* similar to unpacking */
+      /* revert any RBHs inside the pack buffer before dropping it */
+      for (bufptr=buffer+PACK_HDR_SIZE;
+	   bufptr<(buffer+bufsize);
+	   bufptr++) {
+	  // if ( (INFO_TYPE(INFO_PTR(*bufptr)) == INFO_SPEC_RBH_TYPE) ||
+	  //      (INFO_TYPE(INFO_PTR(*bufptr)) == INFO_GEN_RBH_TYPE) ) {
+	  if ( GET_INFO(stgCast(StgClosure*,bufptr)) ) {
+	      convertFromRBH(stgCast(StgClosure *,bufptr));
+	  }
+      }
+      free(buffer);
+      /* turn the reply back into a request, with proc/creator swapped */
+      event->evttype = FetchNode;
+      event->proc = creator;
+      event->creator = proc;
+      event->node = closurep;
+      event->tso = tsop;
+      event->gc_info = 0;
+    }
+  }
+}
+
+/* Register the closure fields of every queued event (tso, node,
+   spark->node -- depending on the event type) as GC roots, updating the
+   fields with the possibly-moved pointers MarkRoot returns. */
+void
+markEventQueue(void)
+{
+  StgClosure *MarkRoot(StgClosure *root); // prototype
+
+  rtsEventQ event = EventHd;
+  nat len;
+
+  /* iterate over eventq and register relevant fields in event as roots */
+  for(event = EventHd, len =  0; event!=NULL; event=event->next, len++) {
+    switch (event->evttype) {
+      case ContinueThread:
+      case MoveThread:
+	/* only the TSO is live */
+	event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+	break;
+      case StartThread:
+      case ResumeThread:
+      case FetchNode:
+      case GlobalBlock:
+      case UnblockThread:
+	/* both TSO and node are live (bodies were identical; merged) */
+	event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+	event->node = (StgClosure *)MarkRoot((StgClosure *)event->node);
+	break;
+      case MoveSpark:
+	event->spark->node = (StgClosure *)MarkRoot((StgClosure *)event->spark->node);
+	break;
+      case FindWork:
+	/* nothing to mark */
+	break;
+      case FetchReply:
+	event->tso = (StgTSO *)MarkRoot((StgClosure *)event->tso);
+	if (RtsFlags.GranFlags.DoBulkFetching)
+	  // ToDo: traverse_eventq_for_gc if GUM-Fetching!!! HWL
+	  belch("ghuH: packets in BulkFetching not marked as roots; may be fatal");
+	else
+	  event->node = (StgClosure *)MarkRoot((StgClosure *)event->node);
+	break;
+      default:
+	barf("markEventQueue: trying to mark unknown event @ %p", event);
+    }
+  }
+  IF_DEBUG(gc,
+	   belch("GC: markEventQueue: %d events in queue", len));
+}
+
+/*
+  Prune all ContinueThread events related to tso or node in the eventq.
+  Currently used if a thread leaves STG land with ThreadBlocked status,
+  i.e. it blocked on a closure and has been put on its blocking queue.  It
+  will be reawakened via a call to awakenBlockedQueue. Until then no
+  event affecting this tso should appear in the eventq.  A bit of a hack,
+  because ideally we shouldn't generate such spurious ContinueThread events
+  in the first place.
+*/
+//@cindex prune_eventq
+/* Remove (and free) every ContinueThread event for 'tso' from the global
+   event queue.  'node' is currently unused and must be NULL. */
+void
+prune_eventq(tso, node)
+StgTSO *tso;
+StgClosure *node;
+{ rtsEventQ prev = (rtsEventQ)NULL, event = EventHd;
+
+  /* node unused for now */
+  ASSERT(node==NULL);
+  /* tso must be valid, then */
+  ASSERT(tso!=END_TSO_QUEUE);
+  /* classic singly-linked-list deletion with a trailing 'prev' pointer */
+  while (event != NULL) {
+    if (event->evttype==ContinueThread &&
+	(event->tso==tso)) {
+      IF_GRAN_DEBUG(event_trace, // ToDo: use another debug flag
+		    belch("prune_eventq: pruning ContinueThread event for TSO %d (%p) on PE %d @ %lx (%p)",
+			  event->tso->id, event->tso, event->proc, event->time, event));
+      if (prev==(rtsEventQ)NULL) { // beginning of eventq
+	EventHd = event->next;
+	free(event);
+	event = EventHd;
+      } else {
+	prev->next = event->next;
+	free(event);
+	event = prev->next;
+      }
+    } else { // no pruning necessary; go to next event
+      prev = event;
+      event = event->next;
+    }
+  }
+}
+
+//@cindex print_event
+/* Pretty-print a single event to stderr (debugging aid). */
+void
+print_event(event)
+rtsEvent *event;
+{
+  char str_tso[24], str_node[24]; /* widened from 16: "%p" can emit up to
+                                     18 chars on 64-bit platforms */
+  StgThreadID tso_id;
+
+  /* BUGFIX: test for NULL *before* dereferencing; the old code read
+     event->tso first, so its NULL check could never fire in time. */
+  if (event==NULL) {
+    fprintf(stderr,"Evt: NIL\n");
+    return;
+  }
+
+  if (event->tso==END_TSO_QUEUE) {
+    strcpy(str_tso, "______");
+    tso_id = 0;
+  } else {
+    sprintf(str_tso, "%p", event->tso);
+    tso_id = (event->tso==NULL) ? 0 : event->tso->id;
+  }
+  if (event->node==(StgClosure*)NULL) {
+    strcpy(str_node, "______");
+  } else {
+    sprintf(str_node, "%p", event->node);
+  }
+
+  fprintf(stderr, "Evt: %s (%u), PE %u [%u], Time %lu, TSO %d (%s), Node %s\n",
+	      event_names[event->evttype], event->evttype,
+              event->proc, event->creator, event->time,
+	      tso_id, str_tso, str_node
+	      /*, event->spark, event->next */ );
+}
+
+//@cindex print_eventq
+/* Dump every event in the queue rooted at 'hd' to stderr. */
+void
+print_eventq(hd)
+rtsEvent *hd;
+{
+  rtsEvent *cursor;
+
+  fprintf(stderr,"Event Queue with root at %p:\n", hd);
+  for (cursor = hd; cursor != NULL; cursor = cursor->next) {
+    print_event(cursor);
+  }
+}
+
+/*
+ Spark queue functions are now all in Sparks.c!!
+*/
+//@node Scheduling functions, Thread Queue routines, Spark queue functions, GranSim specific code
+//@subsection Scheduling functions
+
+/*
+ These functions are variants of thread initialisation and therefore
+ related to initThread and friends in Schedule.c. However, they are
+ specific to a GranSim setup in storing more info in the TSO's statistics
+ buffer and sorting the thread queues etc.
+*/
+
+/*
+ A large portion of startThread deals with maintaining a sorted thread
+ queue, which is needed for the Priority Sparking option. Without that
+ complication the code boils down to FIFO handling.
+*/
+//@cindex insertThread
+/* Add 'tso' to the run queue of processor 'proc' (which must be the
+   current one).  Without priority scheduling this is plain FIFO append;
+   with -bZ (DoPriorityScheduling) the queue is kept sorted by descending
+   gran.pri, never displacing the currently-running head element.
+   Charges threadqueuetime plus, for priority insertion, a per-step
+   pri_sched_overhead cost. */
+void
+insertThread(tso, proc)
+StgTSO* tso;
+PEs proc;
+{
+  StgTSO *prev = NULL, *next = NULL;
+  nat count = 0;
+  rtsBool found = rtsFalse;
+
+  ASSERT(CurrentProc==proc);
+  ASSERT(!is_on_queue(tso,proc));
+  /* Idle proc: put the thread on the run queue
+     same for pri spark and basic version */
+  if (run_queue_hds[proc] == END_TSO_QUEUE)
+    {
+      /* too strong!
+      ASSERT((CurrentProc==MainProc &&
+	      CurrentTime[MainProc]==0 &&
+	      procStatus[MainProc]==Idle) ||
+	     procStatus[proc]==Starting);
+      */
+      run_queue_hds[proc] = run_queue_tls[proc] = tso;
+
+      CurrentTime[proc] += RtsFlags.GranFlags.Costs.threadqueuetime;
+
+      /* new_event of ContinueThread has been moved to do_the_startthread */
+
+      /* too strong!
+      ASSERT(procStatus[proc]==Idle ||
+	     procStatus[proc]==Fishing ||
+	     procStatus[proc]==Starting);
+      procStatus[proc] = Busy;
+      */
+      return;
+    }
+
+  if (RtsFlags.GranFlags.Light)
+    GranSimLight_insertThread(tso, proc);
+
+  /* Only for Pri Scheduling: find place where to insert tso into queue */
+  if (RtsFlags.GranFlags.DoPriorityScheduling && tso->gran.pri!=0)
+    /* {add_to_spark_queue}vo' jInIHta'; Qu' wa'DIch yIleghQo'
+       (Klingon: lifted from add_to_spark_queue; skip the first elem) */
+    for (prev = run_queue_hds[proc], next = run_queue_hds[proc]->link, count=0;
+	 (next != END_TSO_QUEUE) &&
+	 !(found = tso->gran.pri >= next->gran.pri);
+	 prev = next, next = next->link, count++)
+      {
+	ASSERT((prev!=(StgTSO*)NULL || next==run_queue_hds[proc]) &&
+	       (prev==(StgTSO*)NULL || prev->link==next));
+      }
+
+  ASSERT(!found || next != END_TSO_QUEUE);
+  ASSERT(procStatus[proc]!=Idle);
+
+  if (found) {
+    /* found can only be rtsTrue if pri scheduling enabled */
+    ASSERT(RtsFlags.GranFlags.DoPriorityScheduling);
+    if (RtsFlags.GranFlags.GranSimStats.Global)
+      globalGranStats.non_end_add_threads++;
+    /* Add tso to ThreadQueue between prev and next */
+    tso->link = next;
+    if ( next == (StgTSO*)END_TSO_QUEUE ) {
+      /* NB: run_queue_tl presumably aliases run_queue_tls[CurrentProc]
+         (CurrentProc==proc is asserted above) -- confirm */
+      run_queue_tl = tso;
+    } else {
+      /* no back link for TSO chain */
+    }
+
+    if ( prev == (StgTSO*)END_TSO_QUEUE ) {
+      /* Never add TSO as first elem of thread queue; the first */
+      /* element should be the one that is currently running -- HWL */
+      IF_DEBUG(gran,
+	       belch("GRAN: Qagh: NewThread (w/ PriorityScheduling): Trying to add TSO %p (PRI=%d) as first elem of threadQ (%p) on proc %u (@ %u)\n",
+		     tso, tso->gran.pri, run_queue_hd, proc,
+		     CurrentTime[proc]));
+    } else {
+      prev->link = tso;
+    }
+  } else { /* !found */ /* or not pri sparking! */
+    /* Add TSO to the end of the thread queue on that processor */
+    run_queue_tls[proc]->link = tso;
+    run_queue_tls[proc] = tso;
+  }
+  ASSERT(RtsFlags.GranFlags.DoPriorityScheduling || count==0);
+  /* count search steps only ever non-zero under priority scheduling */
+  CurrentTime[proc] += count * RtsFlags.GranFlags.Costs.pri_sched_overhead +
+                       RtsFlags.GranFlags.Costs.threadqueuetime;
+
+  /* ToDo: check if this is still needed -- HWL
+  if (RtsFlags.GranFlags.DoThreadMigration)
+    ++SurplusThreads;
+
+  if (RtsFlags.GranFlags.GranSimStats.Full &&
+      !(( event_type == GR_START || event_type == GR_STARTQ) &&
+	RtsFlags.GranFlags.labelling) )
+    DumpRawGranEvent(proc, creator, event_type+1, tso, node,
+		     tso->gran.sparkname, spark_queue_len(proc));
+  */
+
+# if defined(GRAN_CHECK)
+  /* Check if thread queue is sorted. Only for testing, really!  HWL */
+  if ( RtsFlags.GranFlags.DoPriorityScheduling &&
+       (RtsFlags.GranFlags.Debug.sortedQ) ) {
+    rtsBool sorted = rtsTrue;
+    StgTSO *prev, *next;
+
+    if (run_queue_hds[proc]==END_TSO_QUEUE ||
+	run_queue_hds[proc]->link==END_TSO_QUEUE) {
+      /* just 1 elem => ok */
+    } else {
+      /* Qu' wa'DIch yIleghQo' (ignore first elem)! */
+      for (prev = run_queue_hds[proc]->link, next = prev->link;
+	   (next != END_TSO_QUEUE) ;
+	   prev = next, next = prev->link) {
+	ASSERT((prev!=(StgTSO*)NULL || next==run_queue_hds[proc]) &&
+	       (prev==(StgTSO*)NULL || prev->link==next));
+	sorted = sorted &&
+	         (prev->gran.pri >= next->gran.pri);
+      }
+    }
+    if (!sorted) {
+      fprintf(stderr,"Qagh: THREADQ on PE %d is not sorted:\n",
+	      CurrentProc);
+      G_THREADQ(run_queue_hd,0x1);
+    }
+  }
+# endif
+}
+
+/*
+  GranSimLight_insertThread, which is only used for GranSim Light, is
+  similar to insertThread above in that it adds a TSO to a thread queue.
+  However, it assumes that the thread queue is sorted by local clocks and
+  it inserts the TSO at the right place in the queue. Don't create any
+  event, just insert.
+*/
+//@cindex GranSimLight_insertThread
+/* GranSim-Light only: insert 'tso' into the one-and-only thread queue,
+   which is kept sorted by the TSOs' local clocks (gran.clock).  Returns
+   rtsTrue.  If the insertion produces a new queue head, a ContinueThread
+   event is scheduled for it. */
+rtsBool
+GranSimLight_insertThread(tso, proc)
+StgTSO* tso;
+PEs proc;
+{
+  StgTSO *prev, *next;
+  nat count = 0;
+  rtsBool found = rtsFalse;
+
+  ASSERT(RtsFlags.GranFlags.Light);
+
+  /* In GrAnSim-Light we always have an idle `virtual' proc.
+     The semantics of the one-and-only thread queue is different here:
+     all threads in the queue are running (each on its own virtual processor);
+     the queue is only needed internally in the simulator to interleave the
+     reductions of the different processors.
+     The one-and-only thread queue is sorted by the local clocks of the TSOs.
+  */
+  ASSERT(run_queue_hds[proc] != END_TSO_QUEUE);
+  ASSERT(tso->link == END_TSO_QUEUE);
+
+  /* If only one thread in queue so far we emit DESCHEDULE in debug mode */
+  if (RtsFlags.GranFlags.GranSimStats.Full &&
+      (RtsFlags.GranFlags.Debug.checkLight) &&
+      (run_queue_hd->link == END_TSO_QUEUE)) {
+    DumpRawGranEvent(proc, proc, GR_DESCHEDULE,
+		     run_queue_hds[proc], (StgClosure*)NULL,
+		     tso->gran.sparkname, spark_queue_len(proc)); // ToDo: check spar_queue_len
+    // resched = rtsTrue;
+  }
+
+  /* this routine should only be used in a GrAnSim Light setup */
+  /* && CurrentProc must be 0 in GrAnSim Light setup */
+  ASSERT(RtsFlags.GranFlags.Light && CurrentProc==0);
+
+  /* Idle proc; same for pri spark and basic version */
+  if (run_queue_hd==END_TSO_QUEUE)
+    {
+      run_queue_hd = run_queue_tl = tso;
+      /* MAKE_BUSY(CurrentProc); */
+      return rtsTrue;
+    }
+
+  /* walk the queue until a TSO with a later local clock is found */
+  for (prev = run_queue_hds[proc], next = run_queue_hds[proc]->link, count = 0;
+       (next != END_TSO_QUEUE) &&
+       !(found = (tso->gran.clock < next->gran.clock));
+       prev = next, next = next->link, count++)
+    {
+      ASSERT((prev!=(StgTSO*)NULL || next==run_queue_hds[proc]) &&
+	     (prev==(StgTSO*)NULL || prev->link==next));
+    }
+
+  /* found can only be rtsTrue if pri sparking enabled */
+  if (found) {
+    /* Add tso to ThreadQueue between prev and next */
+    tso->link = next;
+    if ( next == END_TSO_QUEUE ) {
+      run_queue_tls[proc] = tso;
+    } else {
+      /* no back link for TSO chain */
+    }
+
+    if ( prev == END_TSO_QUEUE ) {
+      run_queue_hds[proc] = tso;
+    } else {
+      prev->link = tso;
+    }
+  } else { /* !found */ /* or not pri sparking! */
+    /* Add TSO to the end of the thread queue on that processor */
+    run_queue_tls[proc]->link = tso;
+    run_queue_tls[proc] = tso;
+  }
+
+  /* NOTE(review): prev==END_TSO_QUEUE can't hold here, since prev starts
+     at the (non-empty) queue head above -- this event seems unreachable;
+     confirm before relying on it */
+  if ( prev == END_TSO_QUEUE ) { /* new head of queue */
+    new_event(proc, proc, CurrentTime[proc],
+	      ContinueThread,
+	      tso, (StgClosure*)NULL, (rtsSpark*)NULL);
+  }
+  /*
+  if (RtsFlags.GranFlags.GranSimStats.Full &&
+      !(( event_type == GR_START || event_type == GR_STARTQ) &&
+	RtsFlags.GranFlags.labelling) )
+    DumpRawGranEvent(proc, creator, gr_evttype, tso, node,
+		     tso->gran.sparkname, spark_queue_len(proc));
+  */
+  return rtsTrue;
+}
+
+/*
+  endThread is responsible for general clean-up after the thread tso has
+  finished.  This includes emitting statistics into the profile etc.
+*/
+void
+endThread(StgTSO *tso, PEs proc)
+{
+  ASSERT(procStatus[proc]==Busy);        // coming straight out of STG land
+  ASSERT(tso->what_next==ThreadComplete);
+  // ToDo: prune ContinueThreads for this TSO from event queue
+  DumpEndEvent(proc, tso, rtsFalse /* not mandatory */);
+
+  /* if this was the last thread on this PE then make it Idle */
+  if (run_queue_hds[proc]==END_TSO_QUEUE) {
+    /* CONSISTENCY FIX: the emptiness test is on 'proc', so set the status
+       of 'proc' too; the old code wrote procStatus[CurrentProc], silently
+       assuming CurrentProc==proc. */
+    procStatus[proc] = Idle;
+  }
+}
+
+//@node Thread Queue routines, GranSim functions, Scheduling functions, GranSim specific code
+//@subsection Thread Queue routines
+
+/*
+ Check whether given tso resides on the run queue of the current processor.
+ Only used for debugging.
+*/
+
+//@cindex is_on_queue
+/* Debugging only: does 'tso' occur in the run queue of processor 'proc'? */
+rtsBool
+is_on_queue (StgTSO *tso, PEs proc)
+{
+  StgTSO *t;
+
+  for (t = run_queue_hds[proc]; t != END_TSO_QUEUE; t = t->link) {
+    if (t == tso)
+      return rtsTrue;
+  }
+  return rtsFalse;
+}
+
+/* This routine is only used for keeping a statistics of thread queue
+ lengths to evaluate the impact of priority scheduling. -- HWL
+ {spark_queue_len}vo' jInIHta'
+*/
+//@cindex thread_queue_len
+/* Length of the thread queue on processor 'proc'.  Only used to gather
+   statistics on queue lengths under priority scheduling. -- HWL */
+nat
+thread_queue_len(PEs proc)
+{
+  StgTSO *next;
+  nat len;
+
+  /* simplified: the old loop also carried a 'prev' pointer that was only
+     ever used to step to prev->link, i.e. next->link */
+  for (len = 0, next = run_queue_hds[proc];
+       next != END_TSO_QUEUE;
+       len++, next = next->link)
+    {}
+
+  return (len);
+}
+
+//@node GranSim functions, GranSimLight routines, Thread Queue routines, GranSim specific code
+//@subsection GranSim functions
+
+/* ----------------------------------------------------------------- */
+/* The main event handling functions; called from Schedule.c (schedule) */
+/* ----------------------------------------------------------------- */
+
+//@cindex do_the_globalblock
+
+/* Handle a GlobalBlock event (GUM-style bulk fetching only): the TSO
+   blocked on a remote node; delegate to blockFetch, which may discover
+   the node has become local in the meantime. */
+void
+do_the_globalblock(rtsEvent* event)
+{
+  PEs proc          = event->proc;    /* proc that requested node */
+  StgTSO *tso       = event->tso;     /* tso that requested node */
+  StgClosure *node  = event->node;    /* requested, remote node */
+
+  IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the GlobalBlock\n"));
+  /* There should be no GLOBALBLOCKs in GrAnSim Light setup */
+  ASSERT(!RtsFlags.GranFlags.Light);
+  /* GlobalBlock events only valid with GUM fetching */
+  ASSERT(RtsFlags.GranFlags.DoBulkFetching);
+
+  IF_GRAN_DEBUG(bq, // globalBlock,
+		if (IS_LOCAL_TO(PROCS(node),proc)) {
+		  belch("## Qagh: GlobalBlock: Blocking TSO %d (%p) on LOCAL node %p (PE %d).\n",
+			tso->id, tso, node, proc);
+		});
+
+  /* CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.munpacktime; */
+  if ( blockFetch(tso,proc,node) != 0 )
+    return;                     /* node has become local by now */
+
+#if 0
+ ToDo: check whether anything has to be done at all after blockFetch -- HWL
+
+  if (!RtsFlags.GranFlags.DoAsyncFetch) { /* head of queue is next thread */
+    StgTSO* tso = run_queue_hds[proc];       /* awaken next thread */
+    if (tso != (StgTSO*)NULL) {
+      new_event(proc, proc, CurrentTime[proc],
+		ContinueThread,
+		tso, (StgClosure*)NULL, (rtsSpark*)NULL);
+      CurrentTime[proc] += RtsFlags.GranFlags.Costs.threadcontextswitchtime;
+      if (RtsFlags.GranFlags.GranSimStats.Full)
+        DumpRawGranEvent(proc, CurrentProc, GR_SCHEDULE, tso,
+			 (StgClosure*)NULL, tso->gran.sparkname, spark_queue_len(CurrentProc));  // ToDo: check sparkname and spar_queue_len
+      procStatus[proc] = Busy;                  /* might have been fetching */
+    } else {
+      procStatus[proc] = Idle;                     /* no work on proc now */
+    }
+  } else {  /* RtsFlags.GranFlags.DoAsyncFetch i.e. block-on-fetch */
+	      /* other thread is already running */
+	      /* 'oH 'utbe' 'e' vIHar ; I think that's not needed -- HWL
+	      new_event(proc,proc,CurrentTime[proc],
+		       CONTINUETHREAD,EVENT_TSO(event),
+		       (RtsFlags.GranFlags.DoBulkFetching ? closure :
+		       EVENT_NODE(event)),NULL);
+	      */
+  }
+#endif
+}
+
+//@cindex do_the_unblock
+
+/* Handle an UnblockThread event: a FetchReply arrived or a blocking
+   queue was awakened.  Charges block time (block-on-fetch mode) and
+   always schedules a ResumeThread for the TSO after threadqueuetime. */
+void
+do_the_unblock(rtsEvent* event)
+{
+  PEs proc = event->proc,       /* proc that requested node */
+    creator = event->creator;   /* proc that requested node */
+  StgTSO* tso = event->tso;     /* tso that requested node */
+  StgClosure* node = event->node;  /* requested, remote node */
+
+  IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the UnBlock\n"))
+  /* There should be no UNBLOCKs in GrAnSim Light setup */
+  ASSERT(!RtsFlags.GranFlags.Light);
+  /* UnblockThread means either FetchReply has arrived or
+     a blocking queue has been awakened;
+     ToDo: check with assertions
+  ASSERT(procStatus[proc]==Fetching || IS_BLACK_HOLE(event->node));
+  */
+  if (!RtsFlags.GranFlags.DoAsyncFetch) { /* block-on-fetch */
+    /* We count block-on-fetch as normal block time */
+    tso->gran.blocktime += CurrentTime[proc] - tso->gran.blockedat;
+    /* Dumping now done when processing the event
+       No costs for contextswitch or thread queueing in this case
+       if (RtsFlags.GranFlags.GranSimStats.Full)
+         DumpRawGranEvent(proc, CurrentProc, GR_RESUME, tso,
+                          (StgClosure*)NULL, tso->gran.sparkname, spark_queue_len(CurrentProc));
+    */
+    /* Maybe do this in FetchReply already
+    if (procStatus[proc]==Fetching)
+      procStatus[proc] = Busy;
+    */
+    /*
+    new_event(proc, proc, CurrentTime[proc],
+	      ContinueThread,
+	      tso, node, (rtsSpark*)NULL);
+    */
+  } else {
+    /* Asynchr comm causes additional costs here: */
+    /* Bring the TSO from the blocked queue into the threadq */
+  }
+  /* In all cases, the UnblockThread causes a ResumeThread to be scheduled */
+  new_event(proc, proc,
+	    CurrentTime[proc]+RtsFlags.GranFlags.Costs.threadqueuetime,
+	    ResumeThread,
+	    tso, node, (rtsSpark*)NULL);
+}
+
+//@cindex do_the_fetchnode
+
+/* Handle a FetchNode event: 'creator' asked 'proc' (the current PE) for
+   'node'.  Retries handleFetchRequest after a GC when it runs out of
+   heap -- note that the GC retry path currently barf()s first (marked
+   ToDo below), so it is effectively disabled. */
+void
+do_the_fetchnode(rtsEvent* event)
+{
+  PEs proc = event->proc,       /* proc that holds the requested node */
+    creator = event->creator;   /* proc that requested node */
+  StgTSO* tso = event->tso;
+  StgClosure* node = event->node;  /* requested, remote node */
+  rtsFetchReturnCode rc;
+
+  ASSERT(CurrentProc==proc);
+  /* There should be no FETCHNODEs in GrAnSim Light setup */
+  ASSERT(!RtsFlags.GranFlags.Light);
+
+  IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the FetchNode\n"));
+
+  CurrentTime[proc] += RtsFlags.GranFlags.Costs.munpacktime;
+
+  /* ToDo: check whether this is the right place for dumping the event */
+  if (RtsFlags.GranFlags.GranSimStats.Full)
+    DumpRawGranEvent(creator, proc, GR_FETCH, tso, node, (StgInt)0, 0);
+
+  do {
+    rc = handleFetchRequest(node, proc, creator, tso);
+    if (rc == OutOfHeap) {                                   /* trigger GC */
+# if defined(GRAN_CHECK)  && defined(GRAN)
+      if (RtsFlags.GcFlags.giveStats)
+	fprintf(RtsFlags.GcFlags.statsFile,"***** veQ boSwI'  PackNearbyGraph(node %p, tso %p (%d))\n",
+		node, tso, tso->id);
+# endif
+      /* NOTE(review): this barf makes the GC/retry code below unreachable */
+      barf("//// do_the_fetchnode: out of heap after handleFetchRequest; ToDo: call GarbageCollect()");
+      prepend_event(event);
+      GarbageCollect(GetRoots, rtsFalse);
+      // HWL: ToDo: check whether a ContinueThread has to be issued
+      // HWL old: ReallyPerformThreadGC(PACK_HEAP_REQUIRED, rtsFalse);
+# if 0 && defined(GRAN_CHECK)  && defined(GRAN)
+      if (RtsFlags.GcFlags.giveStats) {
+	fprintf(RtsFlags.GcFlags.statsFile,"***** SAVE_Hp=%p, SAVE_HpLim=%p, PACK_HEAP_REQUIRED=%d\n",
+		Hp, HpLim, 0) ; // PACK_HEAP_REQUIRED);  ???
+	fprintf(stderr,"***** No. of packets so far: %d (total size: %d)\n",
+		globalGranStats.tot_packets, globalGranStats.tot_packet_size);
+      }
+# endif
+      event = grab_event();
+      // Hp -= PACK_HEAP_REQUIRED; // ???
+
+      /* GC knows that events are special and follows the pointer i.e. */
+      /* events are valid even if they moved. An EXIT is triggered */
+      /* if there is not enough heap after GC. */
+    }
+  } while (rc == OutOfHeap);
+}
+
+//@cindex do_the_fetchreply
+/* Handle a FetchReply event: the requested node (or, with bulk fetching,
+   a pack buffer containing a whole subgraph) has arrived at 'proc'.
+   On a fetch miss (incremental fetching only) the node is chased to its
+   new owner via a fresh FetchNode event and the TSO stays asleep;
+   otherwise a ResumeThread is scheduled for the requesting TSO. */
+void
+do_the_fetchreply(rtsEvent* event)
+{
+  PEs proc = event->proc,       /* proc that requested node */
+    creator = event->creator;   /* proc that holds the requested node */
+  StgTSO* tso = event->tso;
+  StgClosure* node = event->node;  /* requested, remote node */
+  StgClosure* closure=(StgClosure*)NULL;
+
+  ASSERT(CurrentProc==proc);
+  ASSERT(RtsFlags.GranFlags.DoAsyncFetch || procStatus[proc]==Fetching);
+
+  IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the FetchReply\n"));
+  /* There should be no FETCHREPLYs in GrAnSim Light setup */
+  ASSERT(!RtsFlags.GranFlags.Light);
+
+  /* assign message unpack costs *before* dumping the event */
+  CurrentTime[proc] += RtsFlags.GranFlags.Costs.munpacktime;
+
+  /* ToDo: check whether this is the right place for dumping the event */
+  if (RtsFlags.GranFlags.GranSimStats.Full)
+    DumpRawGranEvent(proc, creator, GR_REPLY, tso, node,
+		      tso->gran.sparkname, spark_queue_len(proc));
+
+  /* THIS SHOULD NEVER HAPPEN
+     If tso is in the BQ of node this means that it actually entered the
+     remote closure, due to a missing GranSimFetch at the beginning of the
+     entry code; therefore, this is actually a faked fetch, triggered from
+     within GranSimBlock;
+     since tso is both in the EVQ and the BQ for node, we have to take it out
+     of the BQ first before we can handle the FetchReply;
+     ToDo: special cases in awakenBlockedQueue, since the BQ magically moved.
+  */
+  if (tso->block_info.closure!=(StgClosure*)NULL) {
+    IF_GRAN_DEBUG(bq,
+		  belch("## ghuH: TSO %d (%p) in FetchReply is blocked on node %p (shouldn't happen AFAIK)",
+			tso->id, tso, node));
+    // unlink_from_bq(tso, node);
+  }
+
+  if (RtsFlags.GranFlags.DoBulkFetching) {      /* bulk (packet) fetching */
+    rtsPackBuffer *buffer = (rtsPackBuffer*)node;
+    nat size = buffer->size;
+
+    /* NB: Fetch misses can't occur with GUM fetching, as */
+    /* updatable closure are turned into RBHs and therefore locked */
+    /* for other processors that try to grab them. */
+
+    closure = UnpackGraph(buffer);
+    CurrentTime[proc] += size * RtsFlags.GranFlags.Costs.munpacktime;
+  } else  // incremental fetching
+  /* Copy or  move node to CurrentProc */
+  if (fetchNode(node, creator, proc)) {
+    /* Fetch has failed i.e. node has been grabbed by another PE */
+    PEs p = where_is(node);
+    rtsTime fetchtime;
+
+    if (RtsFlags.GranFlags.GranSimStats.Global)
+      globalGranStats.fetch_misses++;
+
+    IF_GRAN_DEBUG(thunkStealing,
+		 belch("== Qu'vatlh! fetch miss @ %u: node %p is at proc %u (rather than proc %u)\n",
+		       CurrentTime[proc],node,p,creator));
+
+    CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.mpacktime;
+
+    /* Count fetch again !? */
+    ++(tso->gran.fetchcount);
+    tso->gran.fetchtime += RtsFlags.GranFlags.Costs.fetchtime;
+
+    fetchtime = stg_max(CurrentTime[CurrentProc],CurrentTime[p]) +
+                RtsFlags.GranFlags.Costs.latency;
+
+    /* Chase the grabbed node */
+    new_event(p, proc, fetchtime,
+	      FetchNode,
+	      tso, node, (rtsSpark*)NULL);
+
+# if 0 && defined(GRAN_CHECK) && defined(GRAN) /* Just for testing */
+       IF_GRAN_DEBUG(blockOnFetch,
+		     BlockedOnFetch[CurrentProc] = tso;) /*-rtsTrue;-*/
+
+       IF_GRAN_DEBUG(blockOnFetch_sanity,
+		     tso->type |=  FETCH_MASK_TSO;)
+# endif
+
+    CurrentTime[proc] += RtsFlags.GranFlags.Costs.mtidytime;
+
+    return; /* NB: no REPLy has been processed; tso still sleeping */
+  }
+
+  /* -- Qapla'! Fetch has been successful; node is here, now  */
+  ++(event->tso->gran.fetchcount);
+  event->tso->gran.fetchtime += RtsFlags.GranFlags.Costs.fetchtime;
+
+  /* this is now done at the beginning of this routine
+  if (RtsFlags.GranFlags.GranSimStats.Full)
+    DumpRawGranEvent(proc,event->creator, GR_REPLY, event->tso,
+		      (RtsFlags.GranFlags.DoBulkFetching ?
+		       closure :
+		       event->node),
+		      tso->gran.sparkname, spark_queue_len(proc));
+  */
+
+  ASSERT(OutstandingFetches[proc] > 0);
+  --OutstandingFetches[proc];
+  new_event(proc, proc, CurrentTime[proc],
+	    ResumeThread,
+	    event->tso, (RtsFlags.GranFlags.DoBulkFetching ?
+			 closure :
+			 event->node),
+	    (rtsSpark*)NULL);
+}
+
+//@cindex do_the_movethread
+
+/* Handle a MoveThread event (thread migration, -bM only): a stolen TSO
+   arrives at 'proc'; charge unpack costs, re-home its bitmask, insert it
+   into the local run queue and settle the outstanding fish. */
+void
+do_the_movethread(rtsEvent* event) {
+  PEs proc = event->proc,       /* proc that requested node */
+    creator = event->creator;   /* proc that holds the requested node */
+  StgTSO* tso = event->tso;
+
+ IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the MoveThread\n"));
+
+ ASSERT(CurrentProc==proc);
+ /* There should be no MOVETHREADs in GrAnSim Light setup */
+ ASSERT(!RtsFlags.GranFlags.Light);
+ /* MOVETHREAD events should never occur without -bM */
+ ASSERT(RtsFlags.GranFlags.DoThreadMigration);
+ /* Bitmask of moved thread should be 0 */
+ ASSERT(PROCS(tso)==0);
+ ASSERT(procStatus[proc] == Fishing ||
+	RtsFlags.GranFlags.DoAsyncFetch);
+ ASSERT(OutstandingFishes[proc]>0);
+
+ /* ToDo: exact costs for unpacking the whole TSO  */
+ CurrentTime[proc] += 5l * RtsFlags.GranFlags.Costs.munpacktime;
+
+ /* ToDo: check whether this is the right place for dumping the event */
+ if (RtsFlags.GranFlags.GranSimStats.Full)
+   DumpRawGranEvent(proc, creator,
+		    GR_STOLEN, tso, (StgClosure*)NULL, (StgInt)0, 0);
+
+ // ToDo: check cost functions
+ --OutstandingFishes[proc];
+ SET_GRAN_HDR(tso, ThisPE);         // adjust the bitmask for the TSO
+ insertThread(tso, proc);
+
+ if (procStatus[proc]==Fishing)
+   procStatus[proc] = Idle;
+
+ if (RtsFlags.GranFlags.GranSimStats.Global)
+   globalGranStats.tot_TSOs_migrated++;
+}
+
+//@cindex do_the_movespark
+
+/* Handle a MoveSpark event: a stolen spark arrives at 'proc'; charge
+   unpack costs, record profiling/statistics, add the spark to the local
+   spark queue and settle the outstanding fish.  The PE goes from Fishing
+   to Idle so that the next handleIdlePEs turns the spark into a thread. */
+void
+do_the_movespark(rtsEvent* event) {
+ PEs proc = event->proc,       /* proc that requested spark */
+   creator = event->creator;   /* proc that holds the requested spark */
+ StgTSO* tso = event->tso;
+ rtsSparkQ spark = event->spark;
+
+ IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the MoveSpark\n"))
+
+ ASSERT(CurrentProc==proc);
+ ASSERT(spark!=NULL);
+ ASSERT(procStatus[proc] == Fishing ||
+	RtsFlags.GranFlags.DoAsyncFetch);
+ ASSERT(OutstandingFishes[proc]>0);
+
+ CurrentTime[proc] += RtsFlags.GranFlags.Costs.munpacktime;
+
+ /* record movement of spark only if spark profiling is turned on */
+ if (RtsFlags.GranFlags.GranSimStats.Sparks)
+    DumpRawGranEvent(proc, creator,
+		     SP_ACQUIRED,
+		     tso, spark->node, spark->name, spark_queue_len(proc));
+
+ /* global statistics */
+ if ( RtsFlags.GranFlags.GranSimStats.Global &&
+      !closure_SHOULD_SPARK(spark->node))
+   globalGranStats.withered_sparks++;
+   /* Not adding the spark to the spark queue would be the right */
+   /* thing here, but it also would be cheating, as this info can't be */
+   /* available in a real system. -- HWL */
+
+ --OutstandingFishes[proc];
+
+ add_to_spark_queue(spark);
+
+ IF_GRAN_DEBUG(randomSteal, // ToDo: spark-distribution flag
+	       print_sparkq_stats());
+
+ /* Should we treat stolen sparks specially? Currently, we don't. */
+
+ if (procStatus[proc]==Fishing)
+   procStatus[proc] = Idle;
+
+ /* add_to_spark_queue will increase the time of the current proc. */
+ /*
+   If proc was fishing, it is Idle now with the new spark in its spark
+   pool. This means that the next time handleIdlePEs is called, a local
+   FindWork will be created on this PE to turn the spark into a thread. Of
+   course another PE might steal the spark in the meantime (that's why we
+   are using events rather than inlining all the operations in the first
+   place). */
+}
+
+/*
+  In the Constellation class version of GranSim the semantics of StartThread
+  events has changed. Now, StartThread has to perform 3 basic operations:
+   - create a new thread (previously this was done in ActivateSpark);
+   - insert the thread into the run queue of the current processor
+   - generate a new event for actually running the new thread
+  Note that the insertThread is called via createThread.
+*/
+
+//@cindex do_the_startthread
+
+void
+do_the_startthread(rtsEvent *event)
+{
+  /* Handle a StartThread or ResumeThread event on the current proc:
+     create (StartThread) or re-insert (ResumeThread) the TSO, record a
+     profiling event, and schedule a ContinueThread so it actually runs. */
+  PEs proc          = event->proc;  /* proc that requested node */
+  StgTSO *tso       = event->tso;   /* tso that requested node; END_TSO_QUEUE for StartThread */
+  StgClosure  *node = event->node;  /* requested, remote node */
+  rtsSpark *spark   = event->spark; /* only dereferenced for StartThread events */
+  GranEventType gr_evttype;         /* event type written to the .gr profile */
+
+  ASSERT(CurrentProc==proc);
+  ASSERT(!RtsFlags.GranFlags.Light || CurrentProc==0);
+  ASSERT(event->evttype == ResumeThread || event->evttype == StartThread);
+  /* if this was called via StartThread: */
+  ASSERT(event->evttype!=StartThread || tso == END_TSO_QUEUE); // not yet created
+  // ToDo: check: ASSERT(event->evttype!=StartThread || procStatus[proc]==Starting);
+  /* if this was called via ResumeThread: */
+  ASSERT(event->evttype!=ResumeThread ||
+         RtsFlags.GranFlags.DoAsyncFetch || !is_on_queue(tso,proc));
+
+  if ( event->evttype == StartThread ) {
+    /* Queued variant if the run queue is non-empty.  FIX: assign the
+       function-scope gr_evttype instead of shadowing it with a
+       block-local redeclaration as the old code did (the outer variable
+       was dead). */
+    gr_evttype = (run_queue_hds[proc]==END_TSO_QUEUE) ?
+                 GR_START : GR_STARTQ;
+
+    tso = createThread(BLOCK_SIZE_W, spark->gran_info);// implicit insertThread!
+    pushClosure(tso, node);
+
+    // ToDo: fwd info on local/global spark to thread -- HWL
+    // tso->gran.exported = spark->exported;
+    // tso->gran.locked = !spark->global;
+    tso->gran.sparkname = spark->name;
+
+    ASSERT(CurrentProc==proc);
+    if (RtsFlags.GranFlags.GranSimStats.Full)
+      DumpGranEvent(gr_evttype,tso);
+
+    CurrentTime[proc] += RtsFlags.GranFlags.Costs.threadcreatetime;
+  } else { // event->evttype == ResumeThread
+    gr_evttype = (run_queue_hds[proc]==END_TSO_QUEUE) ?
+                 GR_RESUME : GR_RESUMEQ;
+
+    insertThread(tso, proc);
+
+    ASSERT(CurrentProc==proc);
+    if (RtsFlags.GranFlags.GranSimStats.Full)
+      DumpGranEvent(gr_evttype,tso);
+  }
+
+  ASSERT(run_queue_hds[proc]!=END_TSO_QUEUE); // non-empty run queue
+  procStatus[proc] = Busy;
+  /* make sure that this thread is actually run */
+  new_event(proc, proc,
+            CurrentTime[proc],
+            ContinueThread,
+            tso, node, (rtsSpark*)NULL);
+
+  /* A wee bit of statistics gathering */
+  if (RtsFlags.GranFlags.GranSimStats.Global) {
+    globalGranStats.tot_add_threads++;
+    globalGranStats.tot_tq_len += thread_queue_len(CurrentProc);
+  }
+
+}
+
+//@cindex do_the_findwork
+void
+do_the_findwork(rtsEvent* event)
+{
+  /* Search for work on behalf of a PE: either a local FindWork
+     (creator==proc) or, with GUM style fishing, a fish sent by creator.
+     NB: the old function-scope copy of event->spark was never read (it
+     was immediately shadowed below) and has been removed, together with
+     the unused `prev' local. */
+  PEs proc = event->proc,       /* proc to search for work */
+      creator = event->creator; /* proc that requested work */
+  /* ToDo: check that this size is safe -- HWL */
+#if 0
+  ToDo: check available heap
+
+  nat req_heap = sizeofW(StgTSO) + MIN_STACK_WORDS;
+  // add this? -- HWL:RtsFlags.ConcFlags.stkChunkSize;
+#endif
+
+  IF_DEBUG(gran, fprintf(stderr, "GRAN: doing the Findwork\n"));
+
+  /* If GUM style fishing is enabled, the contents of the spark field says
+     what to steal (spark(1) or thread(2)); */
+  ASSERT(!(RtsFlags.GranFlags.Fishing && event->spark==(rtsSpark*)0));
+
+  /* Make sure that we have enough heap for creating a new
+     thread. This is a conservative estimate of the required heap.
+     This eliminates special checks for GC around NewThread within
+     ActivateSpark. */
+
+#if 0
+  ToDo: check available heap
+
+  if (Hp + req_heap > HpLim ) {
+    IF_DEBUG(gc,
+             belch("GC: Doing GC from within Findwork handling (that's bloody dangerous if you ask me)");)
+    GarbageCollect(GetRoots);
+    // ReallyPerformThreadGC(req_heap, rtsFalse); old -- HWL
+    Hp -= req_heap;
+    if (procStatus[CurrentProc]==Sparking)
+      procStatus[CurrentProc]=Idle;
+    return;
+  }
+#endif
+
+  if ( RtsFlags.GranFlags.DoAlwaysCreateThreads ||
+       RtsFlags.GranFlags.Fishing ||
+       ((procStatus[proc]==Idle || procStatus[proc]==Sparking) &&
+        (RtsFlags.GranFlags.FetchStrategy >= 2 ||
+         OutstandingFetches[proc] == 0)) )
+   {
+    rtsBool found;
+    rtsSparkQ spark;            /* spark found by findLocalSpark (if any) */
+
+    /* ToDo: check */
+    ASSERT(procStatus[proc]==Sparking ||
+           RtsFlags.GranFlags.DoAlwaysCreateThreads ||
+           RtsFlags.GranFlags.Fishing);
+
+    /* SImmoHwI' yInej! Search spark queue! */
+    /* gimme_spark (event, &found, &spark); */
+    findLocalSpark(event, &found, &spark);
+
+    if (!found) { /* pagh vumwI' */
+      /*
+        If no spark has been found this can mean 2 things:
+         1/ The FindWork was a fish (i.e. a message sent by another PE) and
+            the spark pool of the receiver is empty
+            --> the fish has to be forwarded to another PE
+         2/ The FindWork was local to this PE (i.e. no communication; in this
+            case creator==proc) and the spark pool of the PE is not empty
+            contains only sparks of closures that should not be sparked
+            (note: if the spark pool were empty, handleIdlePEs wouldn't have
+            generated a FindWork in the first place)
+            --> the PE has to be made idle to trigger stealing sparks the next
+                time handleIdlePEs is performed
+      */
+
+      ASSERT(pending_sparks_hds[proc]==(rtsSpark*)NULL);
+      if (creator==proc) {
+        /* local FindWork */
+        if (procStatus[proc]==Busy) {
+          belch("ghuH: PE %d in Busy state while processing local FindWork (spark pool is empty!) @ %lx",
+                proc, CurrentTime[proc]);
+          procStatus[proc] = Idle;
+        }
+      } else {
+        /* global FindWork i.e. a Fish */
+        ASSERT(RtsFlags.GranFlags.Fishing);
+        /* actually this generates another request from the originating PE */
+        ASSERT(OutstandingFishes[creator]>0);
+        OutstandingFishes[creator]--;
+        /* ToDo: assign costs for sending fish to proc not to creator */
+        stealSpark(creator); /* might steal from same PE; ToDo: fix */
+        ASSERT(RtsFlags.GranFlags.maxFishes!=1 || procStatus[creator] == Fishing);
+        /* any assertions on state of proc possible here? */
+      }
+    } else {
+      /* DaH chu' Qu' yIchen! Now create new work! */
+      IF_GRAN_DEBUG(findWork,
+                    belch("+- munching spark %p; creating thread for node %p",
+                          spark, spark->node));
+      activateSpark (event, spark);
+      ASSERT(spark != (rtsSpark*)NULL);
+      spark = delete_from_sparkq (spark, proc, rtsTrue);
+    }
+
+    IF_GRAN_DEBUG(findWork,
+                  belch("+- Contents of spark queues at the end of FindWork @ %lx",
+                        CurrentTime[proc]);
+                  print_sparkq_stats());
+
+    /* ToDo: check ; not valid if GC occurs in ActivateSpark */
+    ASSERT(!found ||
+            /* forward fish or */
+            (proc!=creator ||
+            /* local spark or */
+            (proc==creator && procStatus[proc]==Starting)) ||
+           //(!found && procStatus[proc]==Idle) ||
+           RtsFlags.GranFlags.DoAlwaysCreateThreads);
+  } else {
+    IF_GRAN_DEBUG(findWork,
+                  belch("+- RTS refuses to findWork on PE %d @ %lx",
+                        proc, CurrentTime[proc]);
+                  belch("  procStatus[%d]=%s, fetch strategy=%d, outstanding fetches[%d]=%d",
+                        proc, proc_status_names[procStatus[proc]],
+                        RtsFlags.GranFlags.FetchStrategy,
+                        proc, OutstandingFetches[proc]));
+  }
+}
+
+//@node GranSimLight routines, Code for Fetching Nodes, GranSim functions, GranSim specific code
+//@subsection GranSimLight routines
+
+/*
+ This code is called from the central scheduler after having rgabbed a
+ new event and is only needed for GranSim-Light. It mainly adjusts the
+ ActiveTSO so that all costs that have to be assigned from within the
+ scheduler are assigned to the right TSO. The choice of ActiveTSO depends
+ on the type of event that has been found.
+*/
+
+void
+GranSimLight_enter_system(event, ActiveTSOp)
+rtsEvent *event;
+StgTSO **ActiveTSOp;
+{
+  /* GranSim-Light only: on entering the "system area" pick the TSO whose
+     virtual clock should absorb scheduler costs, based on the event type,
+     and switch CurrentTime to that TSO's clock. */
+  StgTSO *ActiveTSO = *ActiveTSOp;
+
+  ASSERT (RtsFlags.GranFlags.Light);
+
+  /* Restore local clock of the virtual processor attached to CurrentTSO.
+     All costs will be associated to the `virt. proc' on which the tso
+     is living. */
+  if (ActiveTSO != NULL) { /* already in system area */
+    ActiveTSO->gran.clock = CurrentTime[CurrentProc];
+    if (RtsFlags.GranFlags.DoFairSchedule)
+      {
+        if (RtsFlags.GranFlags.GranSimStats.Full &&
+            RtsFlags.GranFlags.Debug.checkLight)
+          DumpGranEvent(GR_SYSTEM_END,ActiveTSO);
+      }
+  }
+  switch (event->evttype)
+    {
+    case ContinueThread:
+    case FindWork: /* inaccurate this way */
+      ActiveTSO = run_queue_hd;
+      break;
+    case ResumeThread:
+    case StartThread:
+    case MoveSpark: /* has tso of virt proc in tso field of event */
+      ActiveTSO = event->tso;
+      break;
+    default: barf("Illegal event type %s (%d) in GrAnSim Light setup\n",
+                  event_names[event->evttype],event->evttype);
+    }
+  CurrentTime[CurrentProc] = ActiveTSO->gran.clock;
+  if (RtsFlags.GranFlags.DoFairSchedule) {
+    if (RtsFlags.GranFlags.GranSimStats.Full &&
+        RtsFlags.GranFlags.Debug.checkLight)
+      DumpGranEvent(GR_SYSTEM_START,ActiveTSO);
+  }
+  /* FIX: propagate the newly chosen ActiveTSO back through the
+     out-parameter; previously it was never written, so the caller's
+     notion of the active TSO went stale. */
+  *ActiveTSOp = ActiveTSO;
+}
+
+void
+GranSimLight_leave_system(event, ActiveTSOp)
+rtsEvent *event;
+StgTSO **ActiveTSOp;
+{
+  /* GranSim-Light only: on leaving the "system area" save the active
+     TSO's clock, clear the active TSO, and restore CurrentTime from the
+     clock of CurrentTSO. */
+  StgTSO *ActiveTSO = *ActiveTSOp;
+
+  ASSERT(RtsFlags.GranFlags.Light);
+
+  /* Save time of `virt. proc' which was active since last getevent and
+     restore time of `virt. proc' where CurrentTSO is living on. */
+  if(RtsFlags.GranFlags.DoFairSchedule) {
+    if (RtsFlags.GranFlags.GranSimStats.Full &&
+        RtsFlags.GranFlags.Debug.checkLight) // ToDo: clean up flags
+      DumpGranEvent(GR_SYSTEM_END,ActiveTSO);
+  }
+  ActiveTSO->gran.clock = CurrentTime[CurrentProc];
+  ActiveTSO = (StgTSO*)NULL;
+  CurrentTime[CurrentProc] = CurrentTSO->gran.clock;
+  if (RtsFlags.GranFlags.DoFairSchedule /* && resched */ ) {
+    // resched = rtsFalse;
+    if (RtsFlags.GranFlags.GranSimStats.Full &&
+        RtsFlags.GranFlags.Debug.checkLight)
+      DumpGranEvent(GR_SCHEDULE,run_queue_hd);
+  }
+  /* FIX: write the cleared ActiveTSO (NULL) back through the
+     out-parameter; previously the local assignment was lost, leaving the
+     caller with a dangling ActiveTSO after leaving the system area. */
+  *ActiveTSOp = ActiveTSO;
+  /*
+  if (TSO_LINK(ThreadQueueHd)!=PrelBase_Z91Z93_closure &&
+      (TimeOfNextEvent == 0 ||
+       TSO_CLOCK(TSO_LINK(ThreadQueueHd))+1000<TimeOfNextEvent)) {
+    new_event(CurrentProc,CurrentProc,TSO_CLOCK(TSO_LINK(ThreadQueueHd))+1000,
+              CONTINUETHREAD,TSO_LINK(ThreadQueueHd),PrelBase_Z91Z93_closure,NULL);
+    TimeOfNextEvent = get_time_of_next_event();
+  }
+  */
+}
+
+//@node Code for Fetching Nodes, Idle PEs, GranSimLight routines, GranSim specific code
+//@subsection Code for Fetching Nodes
+
+/*
+ The following GrAnSim routines simulate the fetching of nodes from a
+ remote processor. We use a 1 word bitmask to indicate on which processor
+ a node is lying. Thus, moving or copying a node from one processor to
+ another just requires an appropriate change in this bitmask (using
+ @SET_GA@). Additionally, the clocks have to be updated.
+
+ A special case arises when the node that is needed by processor A has
+ been moved from a processor B to a processor C between sending out a
+ @FETCH@ (from A) and its arrival at B. In that case the @FETCH@ has to
+ be forwarded to C. This is simulated by issuing another FetchNode event
+ on processor C with A as creator.
+*/
+
+/* ngoqvam che' {GrAnSim}! */
+
+/* Fetch node "node" to processor "p" */
+
+//@cindex fetchNode
+
+rtsFetchReturnCode
+fetchNode(node,from,to)
+StgClosure* node;
+PEs from, to;
+{
+  /* Fetch "node" to processor "to" (the current proc) by updating the
+     node's processor bitmask: HNF nodes are copied (bit OR-ed in),
+     thunks are moved (mask overwritten).  Returns NodeHasMoved when the
+     node is local to neither "from" nor "to", Ok otherwise.
+     NB: the unused local `closure' of the old version has been removed. */
+
+  /* In case of RtsFlags.GranFlags.DoBulkFetching this fct should never be
+     entered! Instead, UnpackGraph is used in ReSchedule */
+  ASSERT(to==CurrentProc);
+  /* Should never be entered in GrAnSim Light setup */
+  ASSERT(!RtsFlags.GranFlags.Light);
+  /* fetchNode should never be entered with DoBulkFetching */
+  ASSERT(!RtsFlags.GranFlags.DoBulkFetching);
+
+  /* Now fetch the node */
+  if (!IS_LOCAL_TO(PROCS(node),from) &&   /* node was moved away from both */
+      !IS_LOCAL_TO(PROCS(node),to) )      /* PEs in the meantime => forward */
+    return NodeHasMoved;
+
+  if (closure_HNF(node)) /* node already in head normal form? */
+    node->header.gran.procs |= PE_NUMBER(to); /* Copy node */
+  else
+    node->header.gran.procs = PE_NUMBER(to); /* Move node */
+
+  return Ok;
+}
+
+/*
+ Process a fetch request.
+
+ Cost of sending a packet of size n = C + P*n
+ where C = packet construction constant,
+ P = cost of packing one word into a packet
+ [Should also account for multiple packets].
+*/
+
+//@cindex handleFetchRequest
+
+/*
+  Serve a fetch request for "node", sent by PE "from" to PE "to", on
+  behalf of "tso".  Three outcomes, mirroring the three branches below:
+   - node already local to the demander  => immediately schedule a
+     FetchReply (NodeIsLocal), or fake an FMBQ if the node is a BH under
+     bulk fetching (NodeIsBH);
+   - node still here                     => pack it (bulk) or send the
+     single closure back via FetchReply (Ok), or OutOfHeap if packing
+     fails;
+   - node moved to a third PE            => forward the FetchNode there
+     (NodeHasMoved).
+  Cost model: mpacktime per (word of) packed data, latency for the
+  message, mtidytime for cleanup -- all charged to CurrentTime[to].
+*/
+rtsFetchReturnCode
+handleFetchRequest(node,to,from,tso)
+StgClosure* node;   // the node which is requested
+PEs to, from;       // fetch request: from -> to
+StgTSO* tso;        // the tso which needs the node
+{
+  ASSERT(!RtsFlags.GranFlags.Light);
+  /* ToDo: check assertion */
+  ASSERT(OutstandingFetches[from]>0);
+
+  /* probably wrong place; */
+  ASSERT(CurrentProc==to);
+
+  if (IS_LOCAL_TO(PROCS(node), from)) /* Somebody else moved node already => */
+    { /* start tso */
+      IF_GRAN_DEBUG(thunkStealing,
+		    fprintf(stderr,"ghuH: handleFetchRequest entered with local node %p (%s) (PE %d)\n",
+			    node, info_type(node), from));
+
+      if (RtsFlags.GranFlags.DoBulkFetching) {
+	nat size;
+	rtsPackBuffer *graph;
+
+	/* Create a 1-node-buffer and schedule a FETCHREPLY now */
+	graph = PackOneNode(node, tso, &size);
+	new_event(from, to, CurrentTime[to],
+		  FetchReply,
+		  tso, (StgClosure *)graph, (rtsSpark*)NULL);
+      } else {
+	new_event(from, to, CurrentTime[to],
+		  FetchReply,
+		  tso, node, (rtsSpark*)NULL);
+      }
+      IF_GRAN_DEBUG(thunkStealing,
+		    belch("== majQa'! closure %p is local on PE %d already (this is a good thing)", node, from));
+      return (NodeIsLocal);
+    }
+  else if (IS_LOCAL_TO(PROCS(node), to) ) /* Is node still here? */
+    {
+      if (RtsFlags.GranFlags.DoBulkFetching) { /* {GUM}vo' ngoqvam vInIHta' */
+	nat size; /* (code from GUM) */
+	StgClosure* graph;
+
+	if (IS_BLACK_HOLE(node)) { /* block on BH or RBH */
+	  new_event(from, to, CurrentTime[to],
+		    GlobalBlock,
+		    tso, node, (rtsSpark*)NULL);
+	  /* Note: blockFetch is done when handling GLOBALBLOCK event;
+	     make sure the TSO stays out of the run queue */
+	  /* When this thread is reawoken it does the usual: it tries to
+	     enter the updated node and issues a fetch if it's remote.
+	     It has forgotten that it has sent a fetch already (i.e. a
+	     FETCHNODE is swallowed by a BH, leaving the thread in a BQ) */
+	  --OutstandingFetches[from];
+
+	  IF_GRAN_DEBUG(thunkStealing,
+			belch("== majQa'! closure %p on PE %d is a BH (demander=PE %d); faking a FMBQ",
+			      node, to, from));
+	  if (RtsFlags.GranFlags.GranSimStats.Global) {
+	    globalGranStats.tot_FMBQs++;
+	  }
+	  return (NodeIsBH);
+	}
+
+	/* The tso requesting the node is blocked and cannot be on a run queue */
+	ASSERT(!is_on_queue(tso, from));
+
+	// ToDo: check whether graph is ever used as an rtsPackBuffer!!
+	if ((graph = (StgClosure *)PackNearbyGraph(node, tso, &size, 0)) == NULL)
+	  return (OutOfHeap); /* out of heap */
+
+	/* Actual moving/copying of node is done on arrival; see FETCHREPLY */
+	/* Send a reply to the originator */
+	/* ToDo: Replace that by software costs for doing graph packing! */
+	CurrentTime[to] += size * RtsFlags.GranFlags.Costs.mpacktime;
+
+	new_event(from, to,
+		  CurrentTime[to]+RtsFlags.GranFlags.Costs.latency,
+		  FetchReply,
+		  tso, (StgClosure *)graph, (rtsSpark*)NULL);
+
+	CurrentTime[to] += RtsFlags.GranFlags.Costs.mtidytime;
+	return (Ok);
+      } else { /* incremental (single closure) fetching */
+	/* Actual moving/copying of node is done on arrival; see FETCHREPLY */
+	/* Send a reply to the originator */
+	CurrentTime[to] += RtsFlags.GranFlags.Costs.mpacktime;
+
+	new_event(from, to,
+		  CurrentTime[to]+RtsFlags.GranFlags.Costs.latency,
+		  FetchReply,
+		  tso, node, (rtsSpark*)NULL);
+
+	CurrentTime[to] += RtsFlags.GranFlags.Costs.mtidytime;
+	return (Ok);
+      }
+    }
+  else /* Qu'vatlh! node has been grabbed by another proc => forward */
+    {
+      PEs node_loc = where_is(node);
+      rtsTime fetchtime;
+
+      IF_GRAN_DEBUG(thunkStealing,
+		    belch("== Qu'vatlh! node %p has been grabbed by PE %d from PE %d (demander=%d) @ %d\n",
+			  node,node_loc,to,from,CurrentTime[to]));
+      if (RtsFlags.GranFlags.GranSimStats.Global) {
+	globalGranStats.fetch_misses++;
+      }
+
+      /* Prepare FORWARD message to proc p_new */
+      CurrentTime[to] += RtsFlags.GranFlags.Costs.mpacktime;
+
+      /* forwarded fetch cannot arrive before both PEs' clocks plus latency */
+      fetchtime = stg_max(CurrentTime[to], CurrentTime[node_loc]) +
+	RtsFlags.GranFlags.Costs.latency;
+
+      new_event(node_loc, from, fetchtime,
+		FetchNode,
+		tso, node, (rtsSpark*)NULL);
+
+      CurrentTime[to] += RtsFlags.GranFlags.Costs.mtidytime;
+
+      return (NodeHasMoved);
+    }
+}
+
+/*
+ blockFetch blocks a BlockedFetch node on some kind of black hole.
+
+ Taken from gum/HLComms.lc. [find a better place for that ?] -- HWL
+
+ {\bf Note:} In GranSim we don't have @FETCHME@ nodes and therefore don't
+ create @FMBQ@'s (FetchMe blocking queues) to cope with global
+ blocking. Instead, non-local TSO are put into the BQ in the same way as
+ local TSOs. However, we have to check if a TSO is local or global in
+ order to account for the latencies involved and for keeping track of the
+ number of fetches that are really going on.
+*/
+
+//@cindex blockFetch
+
+/*
+  Block "tso" (which was running on PE "proc") on closure "bh".
+  If bh is no longer a black hole the tso is immediately re-awoken via an
+  UnblockThread event (NodeIsNoBH); otherwise the tso is linked onto bh's
+  blocking queue, turning a plain BLACKHOLE into a BLACKHOLE_BQ first
+  (returns Ok).  A FETCH_ME_BQ here is impossible in GranSim => barf.
+*/
+rtsFetchReturnCode
+blockFetch(tso, proc, bh)
+StgTSO* tso; /* TSO which gets blocked */
+PEs proc; /* PE where that tso was running */
+StgClosure* bh; /* closure to block on (BH, RBH, BQ) */
+{
+  StgInfoTable *info;
+
+  IF_GRAN_DEBUG(bq,
+		fprintf(stderr,"## blockFetch: blocking TSO %p (%d)[PE %d] on node %p (%s) [PE %d]. No graph is packed!\n",
+			tso, tso->id, proc, bh, info_type(bh), where_is(bh)));
+
+  if (!IS_BLACK_HOLE(bh)) { /* catches BHs and RBHs */
+    IF_GRAN_DEBUG(bq,
+		  fprintf(stderr,"## blockFetch: node %p (%s) is not a BH => awakening TSO %p (%d) [PE %u]\n",
+			  bh, info_type(bh), tso, tso->id, proc));
+
+    /* No BH anymore => immediately unblock tso */
+    new_event(proc, proc, CurrentTime[proc],
+	      UnblockThread,
+	      tso, bh, (rtsSpark*)NULL);
+
+    /* Is this always a REPLY to a FETCH in the profile ? */
+    if (RtsFlags.GranFlags.GranSimStats.Full)
+      DumpRawGranEvent(proc, proc, GR_REPLY, tso, bh, (StgInt)0, 0);
+    return (NodeIsNoBH);
+  }
+
+  /* DaH {BQ}Daq Qu' Suq 'e' wISov!
+     Now we know that we have to put the tso into the BQ.
+     2 cases: If block-on-fetch, tso is at head of threadq =>
+     => take it out of threadq and into BQ
+     If reschedule-on-fetch, tso is only pointed to be event
+     => just put it into BQ
+
+     ngoq ngo'!!
+     if (!RtsFlags.GranFlags.DoAsyncFetch) {
+     GranSimBlock(tso, proc, bh);
+     } else {
+     if (RtsFlags.GranFlags.GranSimStats.Full)
+     DumpRawGranEvent(proc, where_is(bh), GR_BLOCK, tso, bh, (StgInt)0, 0);
+     ++(tso->gran.blockcount);
+     tso->gran.blockedat = CurrentTime[proc];
+     }
+  */
+
+  /* after scheduling the GlobalBlock event the TSO is not put into the
+     run queue again; it is only pointed to via the event we are
+     processing now; in GranSim 4.xx there is no difference between
+     synchr and asynchr comm here */
+  ASSERT(!is_on_queue(tso, proc));
+  ASSERT(tso->link == END_TSO_QUEUE);
+
+  GranSimBlock(tso, proc, bh); /* GranSim statistics gathering */
+
+  /* Now, put tso into BQ (similar to blocking entry codes) */
+  info = get_itbl(bh);
+  switch (info -> type) {
+    case RBH:
+    case BLACKHOLE:
+    case CAF_BLACKHOLE: // ToDo: check whether this is a possibly ITBL here
+    case SE_BLACKHOLE: // ToDo: check whether this is a possibly ITBL here
+    case SE_CAF_BLACKHOLE:// ToDo: check whether this is a possibly ITBL here
+      /* basically an inlined version of BLACKHOLE_entry -- HWL */
+      /* Change the BLACKHOLE into a BLACKHOLE_BQ */
+      ((StgBlockingQueue *)bh)->header.info = &BLACKHOLE_BQ_info;
+      /* Put ourselves on the blocking queue for this black hole */
+      // tso->link=END_TSO_QUEUE; not necessary; see assertion above
+      ((StgBlockingQueue *)bh)->blocking_queue = (StgBlockingQueueElement *)tso;
+      tso->block_info.closure = bh;
+      recordMutable((StgMutClosure *)bh);
+      break;
+
+    case BLACKHOLE_BQ:
+      /* basically an inlined version of BLACKHOLE_BQ_entry -- HWL */
+      /* prepend tso to the existing blocking queue */
+      tso->link = (StgTSO *) (((StgBlockingQueue*)bh)->blocking_queue);
+      ((StgBlockingQueue*)bh)->blocking_queue = (StgBlockingQueueElement *)tso;
+      recordMutable((StgMutClosure *)bh);
+
+# if 0 && defined(GC_MUT_REQUIRED)
+      ToDo: check whether recordMutable is necessary -- HWL
+      /*
+       * If we modify a black hole in the old generation, we have to make
+       * sure it goes on the mutables list
+       */
+
+      if (bh <= StorageMgrInfo.OldLim) {
+	MUT_LINK(bh) = (W_) StorageMgrInfo.OldMutables;
+	StorageMgrInfo.OldMutables = bh;
+      } else
+	MUT_LINK(bh) = MUT_NOT_LINKED;
+# endif
+      break;
+
+    case FETCH_ME_BQ:
+      barf("Qagh: FMBQ closure (%p) found in GrAnSim (TSO=%p (%d))\n",
+	   bh, tso, tso->id);
+
+    default:
+      {
+	G_PRINT_NODE(bh);
+	barf("Qagh: thought %p was a black hole (IP %p (%s))",
+	     bh, info, info_type(bh));
+      }
+  }
+  return (Ok);
+}
+
+
+//@node Idle PEs, Routines directly called from Haskell world, Code for Fetching Nodes, GranSim specific code
+//@subsection Idle PEs
+
+/*
+ Export work to idle PEs. This function is called from @ReSchedule@
+ before dispatching on the current event. @HandleIdlePEs@ iterates over
+ all PEs, trying to get work for idle PEs. Note, that this is a
+ simplification compared to GUM's fishing model. We try to compensate for
+ that by making the cost for stealing work dependent on the number of
+ idle processors and thereby on the probability with which a randomly
+ sent fish would find work.
+*/
+
+//@cindex handleIdlePEs
+
+/*
+  Give every Idle PE a chance to get work: first via a local FindWork
+  event if its own spark pool is non-empty, otherwise by stealing a
+  spark and/or thread from another PE (bounded by maxFishes).
+*/
+void
+handleIdlePEs(void)
+{
+  PEs p;
+
+  IF_DEBUG(gran, fprintf(stderr, "GRAN: handling Idle PEs\n"))
+
+  /* Should never be entered in GrAnSim Light setup */
+  ASSERT(!RtsFlags.GranFlags.Light);
+
+  /* Could check whether there are idle PEs if it's a cheap check */
+  for (p = 0; p < RtsFlags.GranFlags.proc; p++)
+    if (procStatus[p]==Idle) /* && IS_SPARKING(p) && IS_STARTING(p) */
+      /* First look for local work i.e. examine local spark pool! */
+      if (pending_sparks_hds[p]!=(rtsSpark *)NULL) {
+	new_event(p, p, CurrentTime[p],
+		  FindWork,
+		  (StgTSO*)NULL, (StgClosure*)NULL, (rtsSpark*)NULL);
+	procStatus[p] = Sparking;
+      } else if ((RtsFlags.GranFlags.maxFishes==0 ||
+		  OutstandingFishes[p]<RtsFlags.GranFlags.maxFishes) ) {
+
+	/* If no local work then try to get remote work!
+	   Qu' Hopbe' pagh tu'lu'pu'chugh Qu' Hop yISuq ! */
+	if (RtsFlags.GranFlags.DoStealThreadsFirst &&
+	    (RtsFlags.GranFlags.FetchStrategy >= 4 || OutstandingFetches[p] == 0))
+	  {
+	    if (SurplusThreads > 0l) /* Steal a thread */
+	      stealThread(p);
+
+	    /* NOTE(review): this `break' leaves the *whole* PE loop as
+	       soon as one PE stops being Idle, so later idle PEs get no
+	       chance in this round; `continue' looks like the intended
+	       behaviour -- confirm before changing. */
+	    if (procStatus[p]!=Idle)
+	      break;
+	  }
+
+	if (SparksAvail > 0 &&
+	    (RtsFlags.GranFlags.FetchStrategy >= 3 || OutstandingFetches[p] == 0)) /* Steal a spark */
+	  stealSpark(p);
+
+	if (SurplusThreads > 0 &&
+	    (RtsFlags.GranFlags.FetchStrategy >= 4 || OutstandingFetches[p] == 0)) /* Steal a thread */
+	  stealThread(p);
+      }
+}
+
+/*
+ Steal a spark and schedule moving it to proc. We want to look at PEs in
+ clock order -- most retarded first. Currently sparks are only stolen
+ from the @ADVISORY_POOL@ never from the @REQUIRED_POOL@. Eventually,
+ this should be changed to first steal from the former then from the
+ latter.
+
+ We model a sort of fishing mechanism by counting the number of sparks
+ and threads we are currently stealing. */
+
+/*
+ Return a random nat value in the intervall [from, to)
+*/
+static nat
+natRandom(from, to)
+nat from, to;
+{
+  /* Uniformly pick a nat in [from, to); degenerate case from==to
+     yields from.  Uses random() scaled into the requested range. */
+  nat range, result;
+
+  ASSERT(from<=to);
+  range = to - from;
+  /* random() returns a value in [0, RAND_MAX]; scale it into the range */
+  result = (nat) ((float)from + ((float)random()*(float)range)/(float)RAND_MAX);
+  if (result==to)           /* clamp the (rare) top-of-range rounding case */
+    result = from;
+  ASSERT(from<=result && (result<to || from==to));
+  return result;
+}
+
+/*
+ Find any PE other than proc. Used for GUM style fishing only.
+*/
+static PEs
+findRandomPE (proc)
+PEs proc;
+{
+  /* Pick a PE to fish from: a random one when RandomSteal is on,
+     otherwise always PE 0.  NB: despite the header comment above, the
+     result may coincide with proc (cf. the ToDo at the call site). */
+  nat chosen;
+
+  ASSERT(RtsFlags.GranFlags.Fishing);
+  chosen = RtsFlags.GranFlags.RandomSteal
+             ? natRandom(0,RtsFlags.GranFlags.proc)  /* full range of PEs */
+             : 0;
+  IF_GRAN_DEBUG(randomSteal,
+		belch("^^ RANDOM_STEAL (fishing): stealing from PE %d (current proc is %d)",
+		      chosen, proc));
+
+  return (PEs)chosen;
+}
+
+/*
+ Magic code for stealing sparks/threads makes use of global knowledge on
+ spark queues.
+*/
+static void
+sortPEsByTime (proc, pes_by_time, firstp, np)
+PEs proc;
+PEs *pes_by_time;
+nat *firstp, *np;
+{
+  /* Fill pes_by_time with the PEs (other than proc) that have a
+     non-empty spark pool and a clock <= the current proc's, sorted by
+     ascending clock.  *np is the number of entries; *firstp is the index
+     of the first entry whose clock is later than proc's.
+     FIX: the swap temporary was declared rtsTime although it holds a PE
+     id; it is now a PEs.  The unused locals (temp/upb/r/q and the
+     dead #if 0 block) have been removed. */
+  PEs p, n, i, j;
+  nat first;
+
+  ASSERT(!RtsFlags.GranFlags.Fishing);
+
+  /* pes_by_time shall contain processors from which we may steal sparks */
+  for(n=0, p=0; p < RtsFlags.GranFlags.proc; ++p)
+    if ((proc != p) &&                                 // not the current proc
+        (pending_sparks_hds[p] != (rtsSpark *)NULL) && // non-empty spark pool
+        (CurrentTime[p] <= CurrentTime[CurrentProc]))
+      pes_by_time[n++] = p;
+
+  /* sort pes_by_time (simple selection-style pairwise sort; n is tiny) */
+  for(i=0; i < n; ++i)
+    for(j=i+1; j < n; ++j)
+      if (CurrentTime[pes_by_time[i]] > CurrentTime[pes_by_time[j]]) {
+        PEs temp = pes_by_time[i];
+        pes_by_time[i] = pes_by_time[j];
+        pes_by_time[j] = temp;
+      }
+
+  /* Choose random processor to steal spark from; first look at processors */
+  /* that are earlier than the current one (i.e. proc) */
+  for(first=0;
+      (first < n) && (CurrentTime[pes_by_time[first]] <= CurrentTime[proc]);
+      ++first)
+    /* nothing */ ;
+
+  /* if the assertion below is true we can get rid of first */
+  /* ASSERT(first==n); */
+  /* ToDo: check if first is really needed; find cleaner solution */
+
+  *firstp = first;
+  *np = n;
+}
+
+/*
+ Steal a spark (piece of work) from any processor and bring it to proc.
+*/
+//@cindex stealSpark
+static rtsBool
+/* Steal a spark from any PE and bring it to proc; returns whether the
+   steal (or fish) succeeded.  FIX: the result of stealSomething was
+   previously dropped, falling off the end of a non-void function. */
+stealSpark(PEs proc) { return stealSomething(proc, rtsTrue, rtsFalse); }
+
+/*
+ Steal a thread from any processor and bring it to proc i.e. thread migration
+*/
+//@cindex stealThread
+static rtsBool
+/* Steal a thread from any PE and migrate it to proc; returns whether the
+   steal (or fish) succeeded.  FIX: the result of stealSomething was
+   previously dropped, falling off the end of a non-void function. */
+stealThread(PEs proc) { return stealSomething(proc, rtsFalse, rtsTrue); }
+
+/*
+ Steal a spark or a thread and schedule moving it to proc.
+*/
+//@cindex stealSomething
+static rtsBool
+stealSomething(proc, steal_spark, steal_thread)
+PEs proc;                          // PE that needs work (stealer)
+rtsBool steal_spark, steal_thread; // should a spark and/or thread be stolen
+{
+  /* Steal a spark and/or thread for proc.  Without GUM style fishing this
+     delegates to the *Magic routines (global-knowledge stealing); with
+     fishing it merely sends a FindWork "fish" to a random PE and always
+     reports success (the fish itself may come back empty-handed).
+     NB: the unused locals (spark/prev/next/stolen) of the old version
+     have been removed. */
+  PEs p;
+  rtsTime fish_arrival_time;
+
+  ASSERT(steal_spark || steal_thread);
+
+  /* Should never be entered in GrAnSim Light setup */
+  ASSERT(!RtsFlags.GranFlags.Light);
+  ASSERT(!steal_thread || RtsFlags.GranFlags.DoThreadMigration);
+
+  if (!RtsFlags.GranFlags.Fishing) {
+    // ToDo: check if stealing threads is prefered over stealing sparks
+    if (steal_spark) {
+      if (stealSparkMagic(proc))
+	return rtsTrue;
+      else // no spark found
+	if (steal_thread)
+	  return stealThreadMagic(proc);
+	else // no thread found
+	  return rtsFalse;
+    } else { // ASSERT(steal_thread);
+      return stealThreadMagic(proc);
+    }
+    barf("stealSomething: never reached");
+  }
+
+  /* The rest of this function does GUM style fishing */
+
+  p = findRandomPE(proc); /* find a random PE other than proc */
+
+  /* Message packing costs for sending a Fish; qeq jabbI'ID */
+  CurrentTime[proc] += RtsFlags.GranFlags.Costs.mpacktime;
+
+  /* use another GranEvent for requesting a thread? */
+  if (steal_spark && RtsFlags.GranFlags.GranSimStats.Sparks)
+    DumpRawGranEvent(p, proc, SP_REQUESTED,
+		     (StgTSO*)NULL, (StgClosure *)NULL, (StgInt)0, 0);
+
+  /* time of the fish arrival on the remote PE */
+  fish_arrival_time = CurrentTime[proc] + RtsFlags.GranFlags.Costs.latency;
+
+  /* Phps use an own Fish event for that? */
+  /* The contents of the spark component is a HACK:
+      1 means give me a spark;
+      2 means give me a thread
+      0 means give me nothing (this should never happen)
+  */
+  new_event(p, proc, fish_arrival_time,
+	    FindWork,
+	    (StgTSO*)NULL, (StgClosure*)NULL,
+	    (steal_spark ? (rtsSpark*)1 : steal_thread ? (rtsSpark*)2 : (rtsSpark*)0));
+
+  ++OutstandingFishes[proc];
+  /* only with Async fetching? */
+  if (procStatus[proc]==Idle)
+    procStatus[proc]=Fishing;
+
+  /* time needed to clean up buffers etc after sending a message */
+  CurrentTime[proc] += RtsFlags.GranFlags.Costs.mtidytime;
+
+  /* If GUM style fishing stealing always succeeds because it only consists
+     of sending out a fish; of course, when the fish may return
+     empty-handed! */
+  return rtsTrue;
+}
+
+/*
+ This version of stealing a spark makes use of the global info on all
+ spark pools etc which is not available in a real parallel system.
+ This could be extended to test e.g. the impact of perfect load information.
+*/
+//@cindex stealSparkMagic
+static rtsBool
+stealSparkMagic(proc)
+PEs proc;
+{
+  /* Global-knowledge spark stealing (no fishing): examine candidate PEs
+     in clock order (earliest first, randomised within the eligible
+     prefix) and steal the first spark that SHOULD_SPARK; sparks that
+     should not be sparked are pruned on the way.  Returns whether a
+     spark was stolen (a MoveSpark event has then been scheduled).
+     FIX: the "nothing stolen" debug belch had four %d conversions but
+     only three arguments (varargs UB with this debug flag on); the
+     format string now matches its arguments. */
+  PEs p=0, i=0, j=0, n=0, first, upb;
+  rtsSpark *spark=NULL, *next;
+  PEs pes_by_time[MAX_PROC];
+  rtsBool stolen = rtsFalse;
+  rtsTime stealtime;
+
+  /* Should never be entered in GrAnSim Light setup */
+  ASSERT(!RtsFlags.GranFlags.Light);
+
+  sortPEsByTime(proc, pes_by_time, &first, &n);
+
+  while (!stolen && n>0) {
+    upb = (first==0) ? n : first;
+    i = natRandom(0,upb); /* choose a random eligible PE */
+    p = pes_by_time[i];
+
+    IF_GRAN_DEBUG(randomSteal,
+		  belch("^^ stealSparkMagic (random_steal, not fishing): stealing spark from PE %d (current proc is %d)",
+			p, proc));
+
+    ASSERT(pending_sparks_hds[p]!=(rtsSpark *)NULL); /* non-empty spark pool */
+
+    /* Now go through rtsSparkQ and steal the first eligible spark */
+
+    spark = pending_sparks_hds[p];
+    while (!stolen && spark != (rtsSpark*)NULL)
+      {
+	/* NB: no prev pointer is needed here because all sparks that are not
+	   chosen are pruned
+	*/
+	if ((procStatus[p]==Idle || procStatus[p]==Sparking || procStatus[p] == Fishing) &&
+	    spark->next==(rtsSpark*)NULL)
+	  {
+	    /* Be social! Don't steal the only spark of an idle processor
+	       not {spark} neH yInIH !! */
+	    break; /* next PE */
+	  }
+	else if (closure_SHOULD_SPARK(spark->node))
+	  {
+	    /* Don't Steal local sparks;
+	       ToDo: optionally prefer local over global sparks
+	    if (!spark->global) {
+	      prev=spark;
+	      continue; next spark
+	    }
+	    */
+	    /* found a spark! */
+
+	    /* Prepare message for sending spark */
+	    CurrentTime[p] += RtsFlags.GranFlags.Costs.mpacktime;
+
+	    if (RtsFlags.GranFlags.GranSimStats.Sparks)
+	      DumpRawGranEvent(p, (PEs)0, SP_EXPORTED,
+			       (StgTSO*)NULL, spark->node,
+			       spark->name, spark_queue_len(p));
+
+	    /* steal arrives no earlier than both clocks plus the steal cost */
+	    stealtime = (CurrentTime[p] > CurrentTime[proc] ?
+			   CurrentTime[p] :
+			   CurrentTime[proc])
+	                + sparkStealTime();
+
+	    new_event(proc, p /* CurrentProc */, stealtime,
+		      MoveSpark,
+		      (StgTSO*)NULL, spark->node, spark);
+
+	    stolen = rtsTrue;
+	    ++OutstandingFishes[proc]; /* no. of sparks currently on the fly */
+	    if (procStatus[proc]==Idle)
+	      procStatus[proc] = Fishing;
+	    ++(spark->global); /* record that this is a global spark */
+	    ASSERT(SparksAvail>0);
+	    --SparksAvail; /* on-the-fly sparks are not available */
+	    next = delete_from_sparkq(spark, p, rtsFalse); // don't dispose!
+	    CurrentTime[p] += RtsFlags.GranFlags.Costs.mtidytime;
+	  }
+	else /* !(closure_SHOULD_SPARK(SPARK_NODE(spark))) */
+	  {
+	    IF_GRAN_DEBUG(checkSparkQ,
+			  belch("^^ pruning spark %p (node %p) in stealSparkMagic",
+				spark, spark->node));
+
+	    /* if the spark points to a node that should not be sparked,
+	       prune the spark queue at this point */
+	    if (RtsFlags.GranFlags.GranSimStats.Sparks)
+	      DumpRawGranEvent(p, (PEs)0, SP_PRUNED,
+			       (StgTSO*)NULL, spark->node,
+			       spark->name, spark_queue_len(p));
+	    if (RtsFlags.GranFlags.GranSimStats.Global)
+	      globalGranStats.pruned_sparks++;
+
+	    ASSERT(SparksAvail>0);
+	    --SparksAvail;
+	    spark = delete_from_sparkq(spark, p, rtsTrue);
+	  }
+	/* unlink spark (may have been freed!) from sparkq;
+	   if (prev == NULL) // spark was head of spark queue
+	     pending_sparks_hds[p] = spark->next;
+	   else
+	     prev->next = spark->next;
+	   if (spark->next == NULL)
+	     pending_sparks_tls[p] = prev;
+	   else
+	     next->prev = prev;
+	*/
+      } /* while ... iterating over sparkq */
+
+    /* ToDo: assert that PE p still has work left after stealing the spark */
+
+    if (!stolen && (n>0)) { /* nothing stealable from proc p :( */
+      ASSERT(pes_by_time[i]==p);
+
+      /* remove p from the list (at pos i) */
+      for (j=i; j+1<n; j++)
+	pes_by_time[j] = pes_by_time[j+1];
+      n--;
+
+      /* update index to first proc which is later (or equal) than proc */
+      for ( ;
+	    (first>0) &&
+	      (CurrentTime[pes_by_time[first-1]]>CurrentTime[proc]);
+	    first--)
+	/* nothing */ ;
+    }
+  } /* while ... iterating over PEs in pes_by_time */
+
+  IF_GRAN_DEBUG(randomSteal,
+		if (stolen)
+		  belch("^^ stealSparkMagic: spark %p (node=%p) stolen by PE %d from PE %d (SparksAvail=%d; idlers=%d)",
+			spark, spark->node, proc, p,
+			SparksAvail, idlers());
+		else
+		  belch("^^ stealSparkMagic: nothing stolen by PE %d (SparksAvail=%d; idlers=%d)",
+			proc, SparksAvail, idlers()));
+
+  if (RtsFlags.GranFlags.GranSimStats.Global &&
+      stolen && (i!=0)) { /* only for statistics */
+    globalGranStats.rs_sp_count++;
+    globalGranStats.ntimes_total += n;
+    globalGranStats.fl_total += first;
+    globalGranStats.no_of_steals++;
+  }
+
+  return stolen;
+}
+
+/*
+ The old stealThread code, which makes use of global info and does not
+ send out fishes.
+ NB: most of this is the same as in stealSparkMagic;
+ only the pieces specific to processing thread queues are different;
+ long live polymorphism!
+*/
+
+//@cindex stealThreadMagic
+static rtsBool
+stealThreadMagic(proc)
+PEs proc;
+{
+ PEs p=0, i=0, j=0, n=0, first, upb;
+ StgTSO *tso=END_TSO_QUEUE;
+ PEs pes_by_time[MAX_PROC];
+ rtsBool stolen = rtsFalse;
+ rtsTime stealtime;
+
+ /* Should never be entered in GrAnSim Light setup */
+ ASSERT(!RtsFlags.GranFlags.Light);
+
+ sortPEsByTime(proc, pes_by_time, &first, &n);
+
+ while (!stolen && n>0) {
+ upb = (first==0) ? n : first;
+ i = natRandom(0,upb); /* choose a random eligible PE */
+ p = pes_by_time[i];
+
+ IF_GRAN_DEBUG(randomSteal,
+ belch("^^ stealThreadMagic (random_steal, not fishing): stealing thread from PE %d (current proc is %d)",
+ p, proc));
+
+ /* Steal the first exportable thread in the runnable queue but
+ never steal the first in the queue for social reasons;
+ not Qu' wa'DIch yInIH !!
+ */
+ /* Would be better to search through queue and have options which of
+ the threads to pick when stealing */
+ if (run_queue_hds[p] == END_TSO_QUEUE) {
+ IF_GRAN_DEBUG(randomSteal,
+ belch("^^ stealThreadMagic: No thread to steal from PE %d (stealer=PE %d)",
+ p, proc));
+ } else {
+ tso = run_queue_hds[p]->link; /* tso is *2nd* thread in thread queue */
+ /* Found one */
+ stolen = rtsTrue;
+
+ /* update links in queue */
+ run_queue_hds[p]->link = tso->link;
+ if (run_queue_tls[p] == tso)
+ run_queue_tls[p] = run_queue_hds[p];
+
+ /* ToDo: Turn magic constants into params */
+
+ CurrentTime[p] += 5l * RtsFlags.GranFlags.Costs.mpacktime;
+
+ stealtime = (CurrentTime[p] > CurrentTime[proc] ?
+ CurrentTime[p] :
+ CurrentTime[proc])
+ + sparkStealTime()
+ + 4l * RtsFlags.GranFlags.Costs.additional_latency
+ + 5l * RtsFlags.GranFlags.Costs.munpacktime;
+
+ /* Move the thread; set bitmask to 0 while TSO is `on-the-fly' */
+ SET_GRAN_HDR(tso,Nowhere /* PE_NUMBER(proc) */);
+
+ /* Move from one queue to another */
+ new_event(proc, p, stealtime,
+ MoveThread,
+ tso, (StgClosure*)NULL, (rtsSpark*)NULL);
+
+ /* MAKE_BUSY(proc); not yet; only when thread is in threadq */
+ ++OutstandingFishes[proc];
+ if (procStatus[proc])
+ procStatus[proc] = Fishing;
+ --SurplusThreads;
+
+ if(RtsFlags.GranFlags.GranSimStats.Full)
+ DumpRawGranEvent(p, proc,
+ GR_STEALING,
+ tso, (StgClosure*)NULL, (StgInt)0, 0);
+
+ /* costs for tidying up buffer after having sent it */
+ CurrentTime[p] += 5l * RtsFlags.GranFlags.Costs.mtidytime;
+ }
+
+ /* ToDo: assert that PE p still has work left after stealing the spark */
+
+ if (!stolen && (n>0)) { /* nothing stealable from proc p :( */
+ ASSERT(pes_by_time[i]==p);
+
+ /* remove p from the list (at pos i) */
+ for (j=i; j+1<n; j++)
+ pes_by_time[j] = pes_by_time[j+1];
+ n--;
+
+ /* update index to first proc which is later (or equal) than proc */
+ for ( ;
+ (first>0) &&
+ (CurrentTime[pes_by_time[first-1]]>CurrentTime[proc]);
+ first--)
+ /* nothing */ ;
+ }
+ } /* while ... iterating over PEs in pes_by_time */
+
+ IF_GRAN_DEBUG(randomSteal,
+ if (stolen)
+ belch("^^ stealThreadMagic: stolen TSO %d (%p) by PE %d from PE %d (SparksAvail=%d; idlers=%d)",
+ tso->id, tso, proc, p,
+ SparksAvail, idlers());
+ else
+ belch("stealThreadMagic: nothing stolen by PE %d (SparksAvail=%d; idlers=%d)",
+ proc, SparksAvail, idlers()));
+
+ if (RtsFlags.GranFlags.GranSimStats.Global &&
+ stolen && (i!=0)) { /* only for statistics */
+ /* ToDo: more statistics on avg thread queue length etc */
+ globalGranStats.rs_t_count++;
+ globalGranStats.no_of_migrates++;
+ }
+
+ return stolen;
+}
+
+//@cindex sparkStealTime
+static rtsTime
+sparkStealTime(void)
+{
+ double fishdelay, sparkdelay, latencydelay;
+ fishdelay = (double)RtsFlags.GranFlags.proc/2;
+ sparkdelay = fishdelay -
+ ((fishdelay-1.0)/(double)(RtsFlags.GranFlags.proc-1))*((double)idlers());
+ latencydelay = sparkdelay*((double)RtsFlags.GranFlags.Costs.latency);
+
+ return((rtsTime)latencydelay);
+}
+
+//@node Routines directly called from Haskell world, Emiting profiling info for GrAnSim, Idle PEs, GranSim specific code
+//@subsection Routines directly called from Haskell world
+/*
+The @GranSim...@ routines in here are directly called via macros from the
+threaded world.
+
+First some auxiliary routines.
+*/
+
+/* Take the current thread off the thread queue and thereby activate the
+ next thread. It's assumed that the next ReSchedule after this uses
+ NEW_THREAD as param.
+ This fct is called from GranSimBlock and GranSimFetch
+*/
+
+//@cindex ActivateNextThread
+
+void
+ActivateNextThread (proc)
+PEs proc;
+{
+ StgTSO *t;
+ /*
+ This routine is entered either via GranSimFetch or via GranSimBlock.
+ It has to prepare the CurrentTSO for being blocked and update the
+ run queue and other statistics on PE proc. The actual enqueuing to the
+ blocking queue (if coming from GranSimBlock) is done in the entry code
+ of the BLACKHOLE and BLACKHOLE_BQ closures (see StgMiscClosures.hc).
+ */
+ /* ToDo: add assertions here!! */
+ //ASSERT(run_queue_hds[proc]!=END_TSO_QUEUE);
+
+ // Only necessary if the running thread is at front of the queue
+ // run_queue_hds[proc] = run_queue_hds[proc]->link;
+ ASSERT(CurrentProc==proc);
+ ASSERT(!is_on_queue(CurrentTSO,proc));
+ if (run_queue_hds[proc]==END_TSO_QUEUE) {
+ /* NB: this routine is only entered with asynchr comm (see assertion) */
+ procStatus[proc] = Idle;
+ } else {
+ /* ToDo: check cost assignment */
+ CurrentTime[proc] += RtsFlags.GranFlags.Costs.threadcontextswitchtime;
+ if (RtsFlags.GranFlags.GranSimStats.Full &&
+ (!RtsFlags.GranFlags.Light || RtsFlags.GranFlags.Debug.checkLight))
+ /* right flag !?? ^^^ */
+ DumpRawGranEvent(proc, 0, GR_SCHEDULE, run_queue_hds[proc],
+ (StgClosure*)NULL, (StgInt)0, 0);
+ }
+}
+
+/*
+ The following GranSim fcts are stg-called from the threaded world.
+*/
+
+/* Called from HP_CHK and friends (see StgMacros.h) */
+//@cindex GranSimAllocate
+void
+GranSimAllocate(n)
+StgInt n;
+{
+ CurrentTSO->gran.allocs += n;
+ ++(CurrentTSO->gran.basicblocks);
+
+ if (RtsFlags.GranFlags.GranSimStats.Heap) {
+ DumpRawGranEvent(CurrentProc, 0, GR_ALLOC, CurrentTSO,
+ (StgClosure*)NULL, (StgInt)0, n);
+ }
+
+ CurrentTSO->gran.exectime += RtsFlags.GranFlags.Costs.heapalloc_cost;
+ CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.heapalloc_cost;
+}
+
+/*
+ Subtract the values added above, if a heap check fails and
+ so has to be redone.
+*/
+//@cindex GranSimUnallocate
+void
+GranSimUnallocate(n)
+StgInt n;
+{
+ CurrentTSO->gran.allocs -= n;
+ --(CurrentTSO->gran.basicblocks);
+
+ CurrentTSO->gran.exectime -= RtsFlags.GranFlags.Costs.heapalloc_cost;
+ CurrentTime[CurrentProc] -= RtsFlags.GranFlags.Costs.heapalloc_cost;
+}
+
+/* NB: We now inline this code via GRAN_EXEC rather than calling this fct */
+//@cindex GranSimExec
+void
+GranSimExec(ariths,branches,loads,stores,floats)
+StgWord ariths,branches,loads,stores,floats;
+{
+ StgWord cost = RtsFlags.GranFlags.Costs.arith_cost*ariths +
+ RtsFlags.GranFlags.Costs.branch_cost*branches +
+ RtsFlags.GranFlags.Costs.load_cost * loads +
+ RtsFlags.GranFlags.Costs.store_cost*stores +
+ RtsFlags.GranFlags.Costs.float_cost*floats;
+
+ CurrentTSO->gran.exectime += cost;
+ CurrentTime[CurrentProc] += cost;
+}
+
+/*
+ Fetch the node if it isn't local
+ -- result indicates whether fetch has been done.
+
+ This is GRIP-style single item fetching.
+*/
+
+//@cindex GranSimFetch
+StgInt
+GranSimFetch(node /* , liveness_mask */ )
+StgClosure *node;
+/* StgInt liveness_mask; */
+{
+ /* reset the return value (to be checked within STG land) */
+ NeedToReSchedule = rtsFalse;
+
+ if (RtsFlags.GranFlags.Light) {
+ /* Always reschedule in GrAnSim-Light to prevent one TSO from
+ running off too far
+ new_event(CurrentProc,CurrentProc,CurrentTime[CurrentProc],
+ ContinueThread,CurrentTSO,node,NULL);
+ */
+ return(0);
+ }
+
+ /* Faking an RBH closure:
+ If the bitmask of the closure is 0 then this node is a fake RBH;
+ */
+ if (node->header.gran.procs == Nowhere) {
+ IF_GRAN_DEBUG(bq,
+ belch("## Found fake RBH (node %p); delaying TSO %d (%p)",
+ node, CurrentTSO->id, CurrentTSO));
+
+ new_event(CurrentProc, CurrentProc, CurrentTime[CurrentProc]+10000,
+ ContinueThread, CurrentTSO, node, (rtsSpark*)NULL);
+
+ /* Rescheduling (GranSim internal) is necessary */
+ NeedToReSchedule = rtsTrue;
+
+ return(1);
+ }
+
+ /* Note: once a node has been fetched, this test will be passed */
+ if (!IS_LOCAL_TO(PROCS(node),CurrentProc))
+ {
+ PEs p = where_is(node);
+ rtsTime fetchtime;
+
+ IF_GRAN_DEBUG(thunkStealing,
+ if (p==CurrentProc)
+ belch("GranSimFetch: Trying to fetch from own processor%u\n", p););
+
+ CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.mpacktime;
+ /* NB: Fetch is counted on arrival (FetchReply) */
+
+ fetchtime = stg_max(CurrentTime[CurrentProc],CurrentTime[p]) +
+ RtsFlags.GranFlags.Costs.latency;
+
+ new_event(p, CurrentProc, fetchtime,
+ FetchNode, CurrentTSO, node, (rtsSpark*)NULL);
+
+ if (fetchtime<TimeOfNextEvent)
+ TimeOfNextEvent = fetchtime;
+
+ /* About to block */
+ CurrentTSO->gran.blockedat = CurrentTime[CurrentProc];
+
+ ++OutstandingFetches[CurrentProc];
+
+ if (RtsFlags.GranFlags.DoAsyncFetch)
+ /* if asynchr comm is turned on, activate the next thread in the q */
+ ActivateNextThread(CurrentProc);
+ else
+ procStatus[CurrentProc] = Fetching;
+
+#if 0
+ /* ToDo: nuke the entire if (anything special for fair schedule?) */
+ if (RtsFlags.GranFlags.DoAsyncFetch)
+ {
+ /* Remove CurrentTSO from the queue -- assumes head of queue == CurrentTSO */
+ if(!RtsFlags.GranFlags.DoFairSchedule)
+ {
+ /* now done in do_the_fetchnode
+ if (RtsFlags.GranFlags.GranSimStats.Full)
+ DumpRawGranEvent(CurrentProc, p, GR_FETCH, CurrentTSO,
+ node, (StgInt)0, 0);
+ */
+ ActivateNextThread(CurrentProc);
+
+# if 0 && defined(GRAN_CHECK)
+ if (RtsFlags.GranFlags.Debug.blockOnFetch_sanity) {
+ if (TSO_TYPE(CurrentTSO) & FETCH_MASK_TSO) {
+ fprintf(stderr,"FetchNode: TSO 0x%x has fetch-mask set @ %d\n",
+ CurrentTSO,CurrentTime[CurrentProc]);
+ stg_exit(EXIT_FAILURE);
+ } else {
+ TSO_TYPE(CurrentTSO) |= FETCH_MASK_TSO;
+ }
+ }
+# endif
+ CurrentTSO->link = END_TSO_QUEUE;
+ /* CurrentTSO = END_TSO_QUEUE; */
+
+ /* CurrentTSO is pointed to by the FetchNode event; it is
+ on no run queue any more */
+ } else { /* fair scheduling currently not supported -- HWL */
+ barf("Asynchr communication is not yet compatible with fair scheduling\n");
+ }
+ } else { /* !RtsFlags.GranFlags.DoAsyncFetch */
+ procStatus[CurrentProc] = Fetching; // ToDo: BlockedOnFetch;
+ /* now done in do_the_fetchnode
+ if (RtsFlags.GranFlags.GranSimStats.Full)
+ DumpRawGranEvent(CurrentProc, p,
+ GR_FETCH, CurrentTSO, node, (StgInt)0, 0);
+ */
+ IF_GRAN_DEBUG(blockOnFetch,
+ BlockedOnFetch[CurrentProc] = CurrentTSO;); /*- rtsTrue; -*/
+ }
+#endif /* 0 */
+
+ CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.mtidytime;
+
+ /* Rescheduling (GranSim internal) is necessary */
+ NeedToReSchedule = rtsTrue;
+
+ return(1);
+ }
+ return(0);
+}
+
+//@cindex GranSimSpark
+void
+GranSimSpark(local,node)
+StgInt local;
+StgClosure *node;
+{
+ /* ++SparksAvail; Nope; do that in add_to_spark_queue */
+ if (RtsFlags.GranFlags.GranSimStats.Sparks)
+ DumpRawGranEvent(CurrentProc, (PEs)0, SP_SPARK,
+ END_TSO_QUEUE, node, (StgInt)0, spark_queue_len(CurrentProc)-1);
+
+ /* Force the PE to take notice of the spark */
+ if(RtsFlags.GranFlags.DoAlwaysCreateThreads) {
+ new_event(CurrentProc,CurrentProc,CurrentTime[CurrentProc],
+ FindWork,
+ END_TSO_QUEUE, (StgClosure*)NULL, (rtsSpark*)NULL);
+ if (CurrentTime[CurrentProc]<TimeOfNextEvent)
+ TimeOfNextEvent = CurrentTime[CurrentProc];
+ }
+
+ if(local)
+ ++CurrentTSO->gran.localsparks;
+ else
+ ++CurrentTSO->gran.globalsparks;
+}
+
+//@cindex GranSimSparkAt
+void
+GranSimSparkAt(spark,where,identifier)
+rtsSpark *spark;
+StgClosure *where; /* This should be a node; alternatively could be a GA */
+StgInt identifier;
+{
+ PEs p = where_is(where);
+ GranSimSparkAtAbs(spark,p,identifier);
+}
+
+//@cindex GranSimSparkAtAbs
+void
+GranSimSparkAtAbs(spark,proc,identifier)
+rtsSpark *spark;
+PEs proc;
+StgInt identifier;
+{
+ rtsTime exporttime;
+
+ if (spark == (rtsSpark *)NULL) /* Note: Granularity control might have */
+ return; /* turned a spark into a NULL. */
+
+ /* ++SparksAvail; Nope; do that in add_to_spark_queue */
+ if(RtsFlags.GranFlags.GranSimStats.Sparks)
+ DumpRawGranEvent(proc,0,SP_SPARKAT,
+ END_TSO_QUEUE, spark->node, (StgInt)0, spark_queue_len(proc));
+
+ if (proc!=CurrentProc) {
+ CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.mpacktime;
+ exporttime = (CurrentTime[proc] > CurrentTime[CurrentProc]?
+ CurrentTime[proc]: CurrentTime[CurrentProc])
+ + RtsFlags.GranFlags.Costs.latency;
+ } else {
+ exporttime = CurrentTime[CurrentProc];
+ }
+
+ if ( RtsFlags.GranFlags.Light )
+ /* Need CurrentTSO in event field to associate costs with creating
+ spark even in a GrAnSim Light setup */
+ new_event(proc, CurrentProc, exporttime,
+ MoveSpark,
+ CurrentTSO, spark->node, spark);
+ else
+ new_event(proc, CurrentProc, exporttime,
+ MoveSpark, (StgTSO*)NULL, spark->node, spark);
+ /* Bit of a hack to treat placed sparks the same as stolen sparks */
+ ++OutstandingFishes[proc];
+
+ /* Force the PE to take notice of the spark (FINDWORK is put after a
+ MoveSpark into the sparkq!) */
+ if (RtsFlags.GranFlags.DoAlwaysCreateThreads) {
+ new_event(CurrentProc,CurrentProc,exporttime+1,
+ FindWork,
+ (StgTSO*)NULL, (StgClosure*)NULL, (rtsSpark*)NULL);
+ }
+
+ if (exporttime<TimeOfNextEvent)
+ TimeOfNextEvent = exporttime;
+
+ if (proc!=CurrentProc) {
+ CurrentTime[CurrentProc] += RtsFlags.GranFlags.Costs.mtidytime;
+ ++CurrentTSO->gran.globalsparks;
+ } else {
+ ++CurrentTSO->gran.localsparks;
+ }
+}
+
+/*
+ This function handles local and global blocking. It's called either
+ from threaded code (RBH_entry, BH_entry etc) or from blockFetch when
+ trying to fetch a BH or RBH
+*/
+
+//@cindex GranSimBlock
+void
+GranSimBlock(tso, proc, node)
+StgTSO *tso;
+PEs proc;
+StgClosure *node;
+{
+ PEs node_proc = where_is(node),
+ tso_proc = where_is((StgClosure *)tso);
+
+ ASSERT(tso_proc==CurrentProc);
+ // ASSERT(node_proc==CurrentProc);
+ IF_GRAN_DEBUG(bq,
+ if (node_proc!=CurrentProc)
+ belch("## ghuH: TSO %d (%lx) [PE %d] blocks on non-local node %p [PE %d] (no simulation of FETCHMEs)",
+ tso->id, tso, tso_proc, node, node_proc));
+ ASSERT(tso->link==END_TSO_QUEUE);
+ ASSERT(!is_on_queue(tso,proc)); // tso must not be on run queue already!
+ //ASSERT(tso==run_queue_hds[proc]);
+
+ IF_DEBUG(gran,
+ belch("GRAN: TSO %d (%p) [PE %d] blocks on closure %p @ %lx",
+ tso->id, tso, proc, node, CurrentTime[proc]));
+
+
+ /* THIS SHOULD NEVER HAPPEN!
+ If tso tries to block on a remote node (i.e. node_proc!=CurrentProc)
+ we have missed a GranSimFetch before entering this closure;
+ we hack around it for now, faking a FetchNode;
+ because GranSimBlock is entered via a BLACKHOLE(_BQ) closure,
+ tso will be blocked on this closure until the FetchReply occurs.
+
+ ngoq Dogh!
+
+ if (node_proc!=CurrentProc) {
+ StgInt ret;
+ ret = GranSimFetch(node);
+ IF_GRAN_DEBUG(bq,
+ if (ret)
+ belch(".. GranSimBlock: faking a FetchNode of node %p from %d to %d",
+ node, node_proc, CurrentProc););
+ return;
+ }
+ */
+
+ if (RtsFlags.GranFlags.GranSimStats.Full)
+ DumpRawGranEvent(proc,node_proc,GR_BLOCK,tso,node,(StgInt)0,0);
+
+ ++(tso->gran.blockcount);
+ /* Distinction between local and global block is made in blockFetch */
+ tso->gran.blockedat = CurrentTime[proc];
+
+ CurrentTime[proc] += RtsFlags.GranFlags.Costs.threadqueuetime;
+ ActivateNextThread(proc);
+ /* tso->link = END_TSO_QUEUE; not really necessary; only for testing */
+}
+
+#endif /* GRAN */
+
+//@node Index, , Dumping routines, GranSim specific code
+//@subsection Index
+
+//@index
+//* ActivateNextThread:: @cindex\s-+ActivateNextThread
+//* CurrentProc:: @cindex\s-+CurrentProc
+//* CurrentTime:: @cindex\s-+CurrentTime
+//* GranSimAllocate:: @cindex\s-+GranSimAllocate
+//* GranSimBlock:: @cindex\s-+GranSimBlock
+//* GranSimExec:: @cindex\s-+GranSimExec
+//* GranSimFetch:: @cindex\s-+GranSimFetch
+//* GranSimLight_insertThread:: @cindex\s-+GranSimLight_insertThread
+//* GranSimSpark:: @cindex\s-+GranSimSpark
+//* GranSimSparkAt:: @cindex\s-+GranSimSparkAt
+//* GranSimSparkAtAbs:: @cindex\s-+GranSimSparkAtAbs
+//* GranSimUnallocate:: @cindex\s-+GranSimUnallocate
+//* any_idle:: @cindex\s-+any_idle
+//* blockFetch:: @cindex\s-+blockFetch
+//* do_the_fetchnode:: @cindex\s-+do_the_fetchnode
+//* do_the_fetchreply:: @cindex\s-+do_the_fetchreply
+//* do_the_findwork:: @cindex\s-+do_the_findwork
+//* do_the_globalblock:: @cindex\s-+do_the_globalblock
+//* do_the_movespark:: @cindex\s-+do_the_movespark
+//* do_the_movethread:: @cindex\s-+do_the_movethread
+//* do_the_startthread:: @cindex\s-+do_the_startthread
+//* do_the_unblock:: @cindex\s-+do_the_unblock
+//* fetchNode:: @cindex\s-+fetchNode
+//* ga_to_proc:: @cindex\s-+ga_to_proc
+//* get_next_event:: @cindex\s-+get_next_event
+//* get_time_of_next_event:: @cindex\s-+get_time_of_next_event
+//* grab_event:: @cindex\s-+grab_event
+//* handleFetchRequest:: @cindex\s-+handleFetchRequest
+//* handleIdlePEs:: @cindex\s-+handleIdlePEs
+//* idlers:: @cindex\s-+idlers
+//* insertThread:: @cindex\s-+insertThread
+//* insert_event:: @cindex\s-+insert_event
+//* is_on_queue:: @cindex\s-+is_on_queue
+//* is_unique:: @cindex\s-+is_unique
+//* new_event:: @cindex\s-+new_event
+//* prepend_event:: @cindex\s-+prepend_event
+//* print_event:: @cindex\s-+print_event
+//* print_eventq:: @cindex\s-+print_eventq
+//* prune_eventq :: @cindex\s-+prune_eventq
+//* spark queue:: @cindex\s-+spark queue
+//* sparkStealTime:: @cindex\s-+sparkStealTime
+//* stealSomething:: @cindex\s-+stealSomething
+//* stealSpark:: @cindex\s-+stealSpark
+//* stealSparkMagic:: @cindex\s-+stealSparkMagic
+//* stealThread:: @cindex\s-+stealThread
+//* stealThreadMagic:: @cindex\s-+stealThreadMagic
+//* thread_queue_len:: @cindex\s-+thread_queue_len
+//* traverse_eventq_for_gc:: @cindex\s-+traverse_eventq_for_gc
+//* where_is:: @cindex\s-+where_is
+//@end index
diff --git a/rts/parallel/GranSimRts.h b/rts/parallel/GranSimRts.h
new file mode 100644
index 0000000000..fc31a1f0a6
--- /dev/null
+++ b/rts/parallel/GranSimRts.h
@@ -0,0 +1,268 @@
+/* --------------------------------------------------------------------------
+ Time-stamp: <Tue Mar 06 2001 00:18:30 Stardate: [-30]6285.06 hwloidl>
+
+ Variables and functions specific to GranSim.
+ ----------------------------------------------------------------------- */
+
+#ifndef GRANSIM_RTS_H
+#define GRANSIM_RTS_H
+
+//@node Headers for GranSim objs used only in the RTS internally, , ,
+//@section Headers for GranSim objs used only in the RTS internally
+
+//@menu
+//* Event queue::
+//* Spark handling routines::
+//* Processor related stuff::
+//* Local types::
+//* Statistics gathering::
+//* Prototypes::
+//@end menu
+//*/ fool highlight
+
+//@node Event queue, Spark handling routines, Headers for GranSim objs used only in the RTS internally, Headers for GranSim objs used only in the RTS internally
+//@subsection Event queue
+
+#if defined(GRAN) || defined(PAR)
+/* Granularity event types for output (see DumpGranEvent) */
+typedef enum GranEventType_ {
+ GR_START = 0, GR_STARTQ,
+ GR_STEALING, GR_STOLEN, GR_STOLENQ,
+ GR_FETCH, GR_REPLY, GR_BLOCK, GR_RESUME, GR_RESUMEQ,
+ GR_SCHEDULE, GR_DESCHEDULE,
+ GR_END,
+ SP_SPARK, SP_SPARKAT, SP_USED, SP_PRUNED, SP_EXPORTED, SP_ACQUIRED, SP_REQUESTED,
+ GR_ALLOC,
+ GR_TERMINATE,
+ GR_SYSTEM_START, GR_SYSTEM_END, /* only for debugging */
+ GR_EVENT_MAX
+} GranEventType;
+
+extern char *gran_event_names[];
+#endif
+
+#if defined(GRAN) /* whole file */
+
+/* Event Types (internal use only) */
+typedef enum rtsEventType_ {
+ ContinueThread = 0, /* Continue running the first thread in the queue */
+ StartThread, /* Start a newly created thread */
+ ResumeThread, /* Resume a previously running thread */
+ MoveSpark, /* Move a spark from one PE to another */
+ MoveThread, /* Move a thread from one PE to another */
+ FindWork, /* Search for work */
+ FetchNode, /* Fetch a node */
+ FetchReply, /* Receive a node */
+ GlobalBlock, /* Block a TSO on a remote node */
+ UnblockThread /* Make a TSO runnable */
+} rtsEventType;
+
+/* Number of last event type */
+#define MAX_EVENT 9
+
+typedef struct rtsEvent_ {
+ PEs proc; /* Processor id */
+ PEs creator; /* Processor id of PE that created the event */
+ rtsEventType evttype; /* rtsEvent type */
+ rtsTime time; /* Time at which event happened */
+ StgTSO *tso; /* Associated TSO, if relevant */
+ StgClosure *node; /* Associated node, if relevant */
+ rtsSpark *spark; /* Associated SPARK, if relevant */
+ StgInt gc_info; /* Counter of heap objects to mark (used in GC only)*/
+ struct rtsEvent_ *next;
+ } rtsEvent;
+
+typedef rtsEvent *rtsEventQ;
+
+extern rtsEventQ EventHd;
+
+/* Interface for ADT of Event Queue */
+rtsEvent *get_next_event(void);
+rtsTime get_time_of_next_event(void);
+void insert_event(rtsEvent *newentry);
+void new_event(PEs proc, PEs creator, rtsTime time,
+ rtsEventType evttype, StgTSO *tso,
+ StgClosure *node, rtsSpark *spark);
+void print_event(rtsEvent *event);
+void print_eventq(rtsEvent *hd);
+void prepend_event(rtsEvent *event);
+rtsEventQ grab_event(void);
+void prune_eventq(StgTSO *tso, StgClosure *node);
+
+void traverse_eventq_for_gc(void);
+void markEventQueue(void);
+
+//@node Spark handling routines, Processor related stuff, Event queue, Headers for GranSim objs used only in the RTS internally
+//@subsection Spark handling routines
+
+/* These functions are only used in the RTS internally; see GranSim.h for rest */
+void disposeSpark(rtsSpark *spark);
+void disposeSparkQ(rtsSparkQ spark);
+void print_spark(rtsSpark *spark);
+void print_sparkq(PEs proc);
+void print_sparkq_stats(void);
+nat spark_queue_len(PEs proc);
+rtsSpark *delete_from_sparkq (rtsSpark *spark, PEs p, rtsBool dispose_too);
+void markSparkQueue(void);
+
+//@node Processor related stuff, Local types, Spark handling routines, Headers for GranSim objs used only in the RTS internally
+//@subsection Processor related stuff
+
+typedef enum rtsProcStatus_ {
+ Idle = 0, /* empty threadq */
+ Sparking, /* non-empty sparkq; FINDWORK has been issued */
+ Starting, /* STARTTHREAD has been issued */
+ Fetching, /* waiting for remote data (only if block-on-fetch) */
+ Fishing, /* waiting for remote spark/thread */
+ Busy /* non-empty threadq, with head of queue active */
+} rtsProcStatus;
+
+/*
+#define IS_IDLE(proc) (procStatus[proc] == Idle)
+#define IS_SPARKING(proc) (procStatus[proc] == Sparking)
+#define IS_STARTING(proc) (procStatus[proc] == Starting)
+#define IS_FETCHING(proc) (procStatus[proc] == Fetching)
+#define IS_FISHING(proc) (procStatus[proc] == Fishing)
+#define IS_BUSY(proc) (procStatus[proc] == Busy)
+#define ANY_IDLE (any_idle())
+#define MAKE_IDLE(proc) procStatus[proc] = Idle
+#define MAKE_SPARKING(proc) procStatus[proc] = Sparking
+#define MAKE_STARTING(proc) procStatus[proc] = Starting
+#define MAKE_FETCHING(proc) procStatus[proc] = Fetching
+#define MAKE_FISHING(proc) procStatus[proc] = Fishing
+#define MAKE_BUSY(proc) procStatus[proc] = Busy
+*/
+
+//@node Local types, Statistics gathering, Processor related stuff, Headers for GranSim objs used only in the RTS internally
+//@subsection Local types
+
+/* Return codes of HandleFetchRequest:
+ 0 ... ok (FETCHREPLY event with a buffer containing addresses of the
+ nearby graph has been scheduled)
+ 1 ... node is already local (fetched by somebody else; no event is
+ scheduled in here)
+ 2 ... fetch request has been forwarded to the PE that now contains the
+ node
+ 3 ... node is a black hole (BH, BQ or RBH); no event is scheduled, and
+ the current TSO is put into the blocking queue of that node
+ 4 ... out of heap in PackNearbyGraph; GC should be triggered in calling
+ function to guarantee that the tso and node inputs are valid
+ (they may be moved during GC).
+ Return codes of blockFetch:
+ 0 ... ok; tso is now at beginning of BQ attached to the bh closure
+ 1 ... the bh closure is no longer a BH; tso is immediately unblocked
+*/
+
+typedef enum rtsFetchReturnCode_ {
+ Ok = 0,
+ NodeIsLocal,
+ NodeHasMoved,
+ NodeIsBH,
+ NodeIsNoBH,
+ OutOfHeap,
+} rtsFetchReturnCode;
+
+//@node Statistics gathering, Prototypes, Local types, Headers for GranSim objs used only in the RTS internally
+//@subsection Statistics gathering
+
+extern unsigned int /* nat */ OutstandingFetches[], OutstandingFishes[];
+extern rtsProcStatus procStatus[];
+extern StgTSO *BlockedOnFetch[];
+
+/* global structure for collecting statistics */
+typedef struct GlobalGranStats_ {
+ /* event stats */
+ nat noOfEvents;
+ nat event_counts[MAX_EVENT];
+
+ /* communication stats */
+ nat fetch_misses;
+ nat tot_fake_fetches; // GranSim internal; faked Fetches are a kludge!!
+ nat tot_low_pri_sparks;
+
+ /* load distribution statistics */
+ nat rs_sp_count, rs_t_count, ntimes_total, fl_total,
+ no_of_steals, no_of_migrates;
+
+ /* spark queue stats */
+ nat tot_sq_len, tot_sq_probes, tot_sparks;
+ nat tot_add_threads, tot_tq_len, non_end_add_threads;
+
+ /* packet statistics */
+ nat tot_packets, tot_packet_size, tot_cuts, tot_thunks;
+
+ /* thread stats */
+ nat tot_threads_created, threads_created_on_PE[MAX_PROC],
+ tot_TSOs_migrated;
+
+ /* spark stats */
+ nat pruned_sparks, withered_sparks;
+ nat tot_sparks_created, sparks_created_on_PE[MAX_PROC];
+
+ /* scheduling stats */
+ nat tot_yields, tot_stackover, tot_heapover;
+
+ /* blocking queue statistics */
+ rtsTime tot_bq_processing_time;
+ nat tot_bq_len, tot_bq_len_local, tot_awbq, tot_FMBQs;
+} GlobalGranStats;
+
+extern GlobalGranStats globalGranStats;
+
+//@node Prototypes, , Statistics gathering, Headers for GranSim objs used only in the RTS internally
+//@subsection Prototypes
+
+/* Generally useful fcts */
+PEs where_is(StgClosure *node);
+rtsBool is_unique(StgClosure *node);
+
+/* Prototypes of event handling functions; needed in Schedule.c:ReSchedule() */
+void do_the_globalblock (rtsEvent* event);
+void do_the_unblock (rtsEvent* event);
+void do_the_fetchnode (rtsEvent* event);
+void do_the_fetchreply (rtsEvent* event);
+void do_the_movethread (rtsEvent* event);
+void do_the_movespark (rtsEvent* event);
+void do_the_startthread(rtsEvent *event);
+void do_the_findwork(rtsEvent* event);
+void gimme_spark (rtsEvent *event, rtsBool *found_res, rtsSparkQ *spark_res);
+rtsBool munch_spark (rtsEvent *event, rtsSparkQ spark);
+
+/* GranSimLight routines */
+void GranSimLight_enter_system(rtsEvent *event, StgTSO **ActiveTSOp);
+void GranSimLight_leave_system(rtsEvent *event, StgTSO **ActiveTSOp);
+
+/* Communication related routines */
+rtsFetchReturnCode fetchNode(StgClosure* node, PEs from, PEs to);
+rtsFetchReturnCode handleFetchRequest(StgClosure* node, PEs curr_proc, PEs p, StgTSO* tso);
+void handleIdlePEs(void);
+
+long int random(void); /* used in stealSpark() and stealThread() in GranSim.c */
+
+/* Scheduling fcts defined in GranSim.c */
+void insertThread(StgTSO *tso, PEs proc);
+void endThread(StgTSO *tso, PEs proc);
+rtsBool GranSimLight_insertThread(StgTSO *tso, PEs proc);
+nat thread_queue_len(PEs proc);
+
+/* For debugging */
+rtsBool is_on_queue (StgTSO *tso, PEs proc);
+#endif
+
+#if defined(GRAN) || defined(PAR)
+/*
+ Interface for dumping routines (i.e. writing to log file).
+ These routines are shared with GUM (and could also be used for SMP).
+*/
+void DumpGranEvent(GranEventType name, StgTSO *tso);
+void DumpEndEvent(PEs proc, StgTSO *tso, rtsBool mandatory_thread);
+void DumpTSO(StgTSO *tso);
+void DumpRawGranEvent(PEs proc, PEs p, GranEventType name,
+ StgTSO *tso, StgClosure *node,
+ StgInt sparkname, StgInt len);
+void DumpVeryRawGranEvent(rtsTime time, PEs proc, PEs p, GranEventType name,
+ StgTSO *tso, StgClosure *node,
+ StgInt sparkname, StgInt len);
+#endif
+
+#endif /* GRANSIM_RTS_H */
diff --git a/rts/parallel/HLC.h b/rts/parallel/HLC.h
new file mode 100644
index 0000000000..793ac840f9
--- /dev/null
+++ b/rts/parallel/HLC.h
@@ -0,0 +1,63 @@
+/* --------------------------------------------------------------------------
+ Time-stamp: <Sun Mar 18 2001 20:16:14 Stardate: [-30]6349.22 hwloidl>
+
+ High Level Communications Header (HLC.h)
+
+ Contains the high-level definitions (i.e. communication
+ subsystem independent) used by GUM
+ Phil Trinder, Glasgow University, 12 December 1994
+ H-W. Loidl, Heriot-Watt, November 1999
+ ----------------------------------------------------------------------- */
+
+#ifndef __HLC_H
+#define __HLC_H
+
+#ifdef PAR
+
+#include "LLC.h"
+
+#define NEW_FISH_AGE 0
+#define NEW_FISH_HISTORY 0
+#define NEW_FISH_HUNGER 0
+#define FISH_LIFE_EXPECTANCY 10
+
+
+//@node GUM Message Sending and Unpacking Functions
+//@subsection GUM Message Sending and Unpacking Functions
+
+rtsBool initMoreBuffers(void);
+
+void sendFetch (globalAddr *ga, globalAddr *bqga, int load);
+void sendResume(globalAddr *rga, int nelem, rtsPackBuffer *packBuffer);
+void sendAck (GlobalTaskId task, int ngas, globalAddr *gagamap);
+void sendFish (GlobalTaskId destPE, GlobalTaskId origPE, int age, int history, int hunger);
+void sendFree (GlobalTaskId destPE, int nelem, P_ data);
+void sendSchedule(GlobalTaskId origPE, int nelem, rtsPackBuffer *packBuffer);
+void sendReval(GlobalTaskId origPE, int nelem, rtsPackBuffer *data);
+
+//@node Message-Processing Functions
+//@subsection Message-Processing Functions
+
+rtsBool processMessages(void);
+void processFetches(void);
+void processTheRealFetches(void);
+
+//@node Miscellaneous Functions
+//@subsection Miscellaneous Functions
+
+void prepareFreeMsgBuffers(void);
+void freeRemoteGA (int pe, globalAddr *ga);
+void sendFreeMessages(void);
+
+GlobalTaskId choosePE(void);
+StgClosure *createBlockedFetch (globalAddr ga, globalAddr rga);
+void waitForTermination(void);
+
+/* Message bouncing (startup and shutdown, mainly) */
+void bounceFish(void);
+void bounceReval(void);
+
+void DebugPrintGAGAMap (globalAddr *gagamap, int nGAs);
+
+#endif /* PAR */
+#endif /* __HLC_H */
diff --git a/rts/parallel/HLComms.c b/rts/parallel/HLComms.c
new file mode 100644
index 0000000000..b0982e441c
--- /dev/null
+++ b/rts/parallel/HLComms.c
@@ -0,0 +1,1810 @@
+/* ----------------------------------------------------------------------------
+ * Time-stamp: <Wed Mar 21 2001 16:34:41 Stardate: [-30]6363.45 hwloidl>
+ *
+ * High Level Communications Routines (HLComms.lc)
+ *
+ * Contains the high-level routines (i.e. communication
+ * subsystem independent) used by GUM
+ *
+ * GUM 0.2x: Phil Trinder, Glasgow University, 12 December 1994
+ * GUM 3.xx: Phil Trinder, Simon Marlow July 1998
+ * GUM 4.xx: H-W. Loidl, Heriot-Watt University, November 1999 -
+ *
+ * ------------------------------------------------------------------------- */
+
+#ifdef PAR /* whole file */
+
+//@node High Level Communications Routines, , ,
+//@section High Level Communications Routines
+
+//@menu
+//* Macros etc::
+//* Includes::
+//* GUM Message Sending and Unpacking Functions::
+//* Message-Processing Functions::
+//* GUM Message Processor::
+//* Miscellaneous Functions::
+//* Index::
+//@end menu
+
+//@node Macros etc, Includes, High Level Communications Routines, High Level Communications Routines
+//@subsection Macros etc
+
+/* Evidently not Posix */
+/* #include "PosixSource.h" */
+
+//@node Includes, GUM Message Sending and Unpacking Functions, Macros etc, High Level Communications Routines
+//@subsection Includes
+
+#include "Rts.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "Storage.h" // for recordMutable
+#include "HLC.h"
+#include "Parallel.h"
+#include "GranSimRts.h"
+#include "ParallelRts.h"
+#include "Sparks.h"
+#include "FetchMe.h" // for BLOCKED_FETCH_info etc
+#if defined(DEBUG)
+# include "ParallelDebug.h"
+#endif
+#include "StgMacros.h" // inlined IS_... fcts
+
+#ifdef DIST
+#include "SchedAPI.h" //for createIOThread
+extern unsigned int context_switch;
+#endif /* DIST */
+
+//@node GUM Message Sending and Unpacking Functions, Message-Processing Functions, Includes, High Level Communications Routines
+//@subsection GUM Message Sending and Unpacking Functions
+
+/*
+ * GUM Message Sending and Unpacking Functions
+ */
+
+/*
+ * Allocate space for message processing
+ */
+
+//@cindex gumPackBuffer
+static rtsPackBuffer *gumPackBuffer;
+
+//@cindex initMoreBuffers
+/* Allocate the file-static gumPackBuffer shared by the message
+   processors below; rtsFalse on allocation failure, rtsTrue otherwise. */
+rtsBool
+initMoreBuffers(void)
+{
+ if ((gumPackBuffer = (rtsPackBuffer *)stgMallocWords(RtsFlags.ParFlags.packBufferSize,
+ "initMoreBuffers")) == NULL)
+ return rtsFalse;
+ return rtsTrue;
+}
+
+/*
+ * SendFetch packs the two global addresses and a load into a message +
+ * sends it.
+
+//@cindex FETCH
+
+ Structure of a FETCH message:
+
+ | GA 1 | GA 2 |
+ +------------------------------------+------+
+ | gtid | slot | weight | gtid | slot | load |
+ +------------------------------------+------+
+ */
+
+//@cindex sendFetch
+/* Send a FETCH for the remote object rga; lga names the local closure
+   to resume into when the data comes back.  The 6 words sent here must
+   match the order consumed by unpackFetch on the receiving PE. */
+void
+sendFetch(globalAddr *rga, globalAddr *lga, int load)
+{
+ ASSERT(rga->weight > 0 && lga->weight > 0);
+ IF_PAR_DEBUG(fetch,
+ belch("~^** Sending Fetch for ((%x, %d, 0)); locally ((%x, %d, %x)), load = %d",
+ rga->payload.gc.gtid, rga->payload.gc.slot,
+ lga->payload.gc.gtid, lga->payload.gc.slot, lga->weight,
+ load));
+
+
+ /* ToDo: Dump event
+ DumpRawGranEvent(CURRENT_PROC, taskIDtoPE(rga->payload.gc.gtid),
+ GR_FETCH, CurrentTSO, (StgClosure *)(lga->payload.gc.slot),
+ 0, spark_queue_len(ADVISORY_POOL));
+ */
+
+ sendOpV(PP_FETCH, rga->payload.gc.gtid, 6,
+ (StgWord) rga->payload.gc.gtid, (StgWord) rga->payload.gc.slot,
+ (StgWord) lga->weight, (StgWord) lga->payload.gc.gtid,
+ (StgWord) lga->payload.gc.slot, (StgWord) load);
+}
+
+/*
+ * unpackFetch unpacks a FETCH message into two Global addresses and a load
+ * figure.
+*/
+
+//@cindex unpackFetch
+/* Inverse of sendFetch: consume 6 words from the incoming message
+   stream.  On the receiving side lga is the object fetched *from us*,
+   so its weight is pinned to 1 here (sender kept the real weight). */
+static void
+unpackFetch(globalAddr *lga, globalAddr *rga, int *load)
+{
+ long buf[6];
+
+ GetArgs(buf, 6);
+
+ IF_PAR_DEBUG(fetch,
+ belch("~^** Unpacking Fetch for ((%x, %d, 0)) to ((%x, %d, %x)), load = %d",
+ (GlobalTaskId) buf[0], (int) buf[1],
+ (GlobalTaskId) buf[3], (int) buf[4], buf[2], buf[5]));
+
+ lga->weight = 1;
+ lga->payload.gc.gtid = (GlobalTaskId) buf[0];
+ lga->payload.gc.slot = (int) buf[1];
+
+ rga->weight = (unsigned) buf[2];
+ rga->payload.gc.gtid = (GlobalTaskId) buf[3];
+ rga->payload.gc.slot = (int) buf[4];
+
+ *load = (int) buf[5];
+
+ ASSERT(rga->weight > 0);
+}
+
+/*
+ * SendResume packs the remote blocking queue's GA and data into a message
+ * and sends it.
+
+//@cindex RESUME
+
+ Structure of a RESUME message:
+
+ -------------------------------
+ | weight | slot | n | data ...
+ -------------------------------
+
+ data is a packed graph represented as an rtsPackBuffer
+ n is the size of the graph (as returned by PackNearbyGraph) + packet hdr size
+ */
+
+//@cindex sendResume
+/* Send a RESUME carrying the packed graph in packBuffer back to the
+   PE that issued the fetch (rga).  nelem must equal the buffer's own
+   size field; header + debug headroom are added to the wire length. */
+void
+sendResume(globalAddr *rga, int nelem, rtsPackBuffer *packBuffer)
+{
+ IF_PAR_DEBUG(fetch,
+ belch("~^[] Sending Resume (packet <<%d>> with %d elems) for ((%x, %d, %x)) to [%x]",
+ packBuffer->id, nelem,
+ rga->payload.gc.gtid, rga->payload.gc.slot, rga->weight,
+ rga->payload.gc.gtid));
+ IF_PAR_DEBUG(packet,
+ PrintPacket(packBuffer));
+
+ ASSERT(nelem==packBuffer->size);
+ /* check for magic end-of-buffer word */
+ IF_DEBUG(sanity, ASSERT(*(packBuffer->buffer+nelem) == END_OF_BUFFER_MARKER));
+
+ sendOpNV(PP_RESUME, rga->payload.gc.gtid,
+ nelem + PACK_BUFFER_HDR_SIZE + DEBUG_HEADROOM, (StgPtr)packBuffer,
+ 2, (rtsWeight) rga->weight, (StgWord) rga->payload.gc.slot);
+}
+
+/*
+ * unpackResume unpacks a Resume message into two Global addresses and
+ * a data array.
+ */
+
+//@cindex unpackResume
+/* Inverse of sendResume: read (weight, slot, total-size) then the flat
+   pack buffer from the message stream.  lga's gtid is always mytid,
+   since a RESUME answers a fetch this PE originated. */
+static void
+unpackResume(globalAddr *lga, int *nelem, rtsPackBuffer *packBuffer)
+{
+ long buf[3];
+
+ GetArgs(buf, 3);
+
+ /*
+ RESUME event is written in awaken_blocked_queue
+ DumpRawGranEvent(CURRENT_PROC, taskIDtoPE(lga->payload.gc.gtid),
+ GR_RESUME, END_TSO_QUEUE, (StgClosure *)NULL, 0, 0);
+ */
+
+ lga->weight = (unsigned) buf[0];
+ lga->payload.gc.gtid = mytid;
+ lga->payload.gc.slot = (int) buf[1];
+
+ /* buf[2] is the total wire size; strip header + headroom to get nelem */
+ *nelem = (int) buf[2] - PACK_BUFFER_HDR_SIZE - DEBUG_HEADROOM;
+ GetArgs(packBuffer, *nelem + PACK_BUFFER_HDR_SIZE + DEBUG_HEADROOM);
+
+ IF_PAR_DEBUG(fetch,
+ belch("~^[] Unpacking Resume (packet <<%d>> with %d elems) for ((%x, %d, %x))",
+ packBuffer->id, *nelem, mytid, (int) buf[1], (unsigned) buf[0]));
+
+ /* check for magic end-of-buffer word */
+ IF_DEBUG(sanity, ASSERT(*(packBuffer->buffer+*nelem) == END_OF_BUFFER_MARKER));
+}
+
+/*
+ * SendAck packs the global address being acknowledged, together with
+ * an array of global addresses for any closures shipped and sends them.
+
+//@cindex ACK
+
+ Structure of an ACK message:
+
+ | GA 1 | GA 2 |
+ +---------------------------------------------+-------
+ | weight | gtid | slot | weight | gtid | slot | ..... ngas times
+ + --------------------------------------------+-------
+
+ */
+
+//@cindex sendAck
+/* Send an ACK containing ngas (oldGA, newGA) pairs, flattened 6 longs
+   per pair into gumPackBuffer (reused as a long array). */
+void
+sendAck(GlobalTaskId task, int ngas, globalAddr *gagamap)
+{
+ static long *buffer;
+ long *p;
+ int i;
+
+ if(ngas==0)
+ return; //don't send unnecessary messages!!
+
+ /* reuse the shared pack buffer as scratch space for the flat pairs */
+ buffer = (long *) gumPackBuffer;
+
+ for(i = 0, p = buffer; i < ngas; i++, p += 6) {
+ /* NOTE(review): asserts the *second* GA of the current pair
+ (gagamap[1]); presumably the new GA must carry weight — confirm */
+ ASSERT(gagamap[1].weight > 0);
+ p[0] = (long) gagamap->weight;
+ p[1] = (long) gagamap->payload.gc.gtid;
+ p[2] = (long) gagamap->payload.gc.slot;
+ gagamap++;
+ p[3] = (long) gagamap->weight;
+ p[4] = (long) gagamap->payload.gc.gtid;
+ p[5] = (long) gagamap->payload.gc.slot;
+ gagamap++;
+ }
+ IF_PAR_DEBUG(schedule,
+ belch("~^,, Sending Ack (%d pairs) to [%x]\n",
+ ngas, task));
+
+ sendOpN(PP_ACK, task, p - buffer, (StgPtr)buffer);
+}
+
+/*
+ * unpackAck unpacks an Acknowledgement message into a Global address,
+ * a count of the number of global addresses following and a map of
+ * Global addresses
+ */
+
+//@cindex unpackAck
+/* Inverse of sendAck: read the flat-array length, derive the pair
+   count, then unpack 6 longs per (oldGA, newGA) pair into gagamap.
+   Caller must supply a gagamap large enough for 2 * *ngas entries. */
+static void
+unpackAck(int *ngas, globalAddr *gagamap)
+{
+ long GAarraysize;
+ long buf[6];
+
+ GetArgs(&GAarraysize, 1);
+
+ *ngas = GAarraysize / 6;
+
+ IF_PAR_DEBUG(schedule,
+ belch("~^,, Unpacking Ack (%d pairs) on [%x]\n",
+ *ngas, mytid));
+
+ while (GAarraysize > 0) {
+ GetArgs(buf, 6);
+ gagamap->weight = (rtsWeight) buf[0];
+ gagamap->payload.gc.gtid = (GlobalTaskId) buf[1];
+ gagamap->payload.gc.slot = (int) buf[2];
+ gagamap++;
+ gagamap->weight = (rtsWeight) buf[3];
+ gagamap->payload.gc.gtid = (GlobalTaskId) buf[4];
+ gagamap->payload.gc.slot = (int) buf[5];
+ ASSERT(gagamap->weight > 0);
+ gagamap++;
+ GAarraysize -= 6;
+ }
+}
+
+/*
+ * SendFish packs the global address being acknowledged, together with
+ * an array of global addresses for any closures shipped and sends them.
+
+//@cindex FISH
+
+ Structure of a FISH message:
+
+ +----------------------------------+
+ | orig PE | age | history | hunger |
+ +----------------------------------+
+ */
+
+//@cindex sendFish
+/* Send a FISH (work-stealing probe) to destPE on behalf of origPE.
+   Only when this PE launches its own fish (origPE == mytid) does the
+   outstanding-fish counter go up; forwarded fish leave it alone. */
+void
+sendFish(GlobalTaskId destPE, GlobalTaskId origPE,
+ int age, int history, int hunger)
+{
+ IF_PAR_DEBUG(fish,
+ belch("~^$$ Sending Fish to [%x] (%d outstanding fishes)",
+ destPE, outstandingFishes));
+
+ sendOpV(PP_FISH, destPE, 4,
+ (StgWord) origPE, (StgWord) age, (StgWord) history, (StgWord) hunger);
+
+ if (origPE == mytid) {
+ //fishing = rtsTrue;
+ outstandingFishes++;
+ }
+}
+
+/*
+ * unpackFish unpacks a FISH message into the global task id of the
+ * originating PE and 3 data fields: the age, history and hunger of the
+ * fish. The history + hunger are not currently used.
+
+ */
+
+//@cindex unpackFish
+/* Inverse of sendFish: consume the 4 fish words (originating PE, age,
+   history, hunger) from the incoming message stream. */
+static void
+unpackFish(GlobalTaskId *origPE, int *age, int *history, int *hunger)
+{
+ long buf[4];
+
+ GetArgs(buf, 4);
+
+ IF_PAR_DEBUG(fish,
+ belch("~^$$ Unpacking Fish from [%x] (age=%d)",
+ (GlobalTaskId) buf[0], (int) buf[1]));
+
+ *origPE = (GlobalTaskId) buf[0];
+ *age = (int) buf[1];
+ *history = (int) buf[2];
+ *hunger = (int) buf[3];
+}
+
+/*
+ * SendFree sends (weight, slot) pairs for GAs that we no longer need
+ * references to.
+
+//@cindex FREE
+
+ Structure of a FREE message:
+
+ +-----------------------------
+ | n | weight_1 | slot_1 | ...
+ +-----------------------------
+ */
+//@cindex sendFree
+/* Send a FREE message returning weight for GAs owned by pe; data holds
+   nelem words as (weight, slot) pairs, hence nelem/2 GAs. */
+void
+sendFree(GlobalTaskId pe, int nelem, StgPtr data)
+{
+ IF_PAR_DEBUG(free,
+ belch("~^!! Sending Free (%d GAs) to [%x]",
+ nelem/2, pe));
+
+ sendOpN(PP_FREE, pe, nelem, data);
+}
+
+/*
+ * unpackFree unpacks a FREE message into the amount of data shipped and
+ * a data block.
+ */
+//@cindex unpackFree
+/* Inverse of sendFree: read the element count, then that many words of
+   (weight, slot) pairs into data (caller-supplied buffer). */
+static void
+unpackFree(int *nelem, StgWord *data)
+{
+ long buf[1];
+
+ GetArgs(buf, 1);
+ *nelem = (int) buf[0];
+
+ IF_PAR_DEBUG(free,
+ belch("~^!! Unpacking Free (%d GAs)",
+ *nelem/2));
+
+ GetArgs(data, *nelem);
+}
+
+/*
+ * SendSchedule sends a closure to be evaluated in response to a Fish
+ * message. The message is directed to the PE that originated the Fish
+ * (origPE), and includes the packed closure (data) along with its size
+ * (nelem).
+
+//@cindex SCHEDULE
+
+ Structure of a SCHEDULE message:
+
+ +------------------------------------
+ | PE | n | pack buffer of a graph ...
+ +------------------------------------
+ */
+//@cindex sendSchedule
+/* Send a SCHEDULE (packed sparkable graph) to origPE, normally in
+   reply to a FISH from that PE.  nelem must match the buffer's own
+   size field; header + debug headroom are added to the wire length. */
+void
+sendSchedule(GlobalTaskId origPE, int nelem, rtsPackBuffer *packBuffer)
+{
+ IF_PAR_DEBUG(schedule,
+ belch("~^-- Sending Schedule (packet <<%d>> with %d elems) to [%x]\n",
+ packBuffer->id, nelem, origPE));
+ IF_PAR_DEBUG(packet,
+ PrintPacket(packBuffer));
+
+ ASSERT(nelem==packBuffer->size);
+ /* check for magic end-of-buffer word */
+ IF_DEBUG(sanity, ASSERT(*(packBuffer->buffer+nelem) == END_OF_BUFFER_MARKER));
+
+ sendOpN(PP_SCHEDULE, origPE,
+ nelem + PACK_BUFFER_HDR_SIZE + DEBUG_HEADROOM, (StgPtr)packBuffer);
+}
+
+/*
+ * unpackSchedule unpacks a SCHEDULE message into the Global address of
+ * the closure shipped, the amount of data shipped (nelem) and the data
+ * block (data).
+ */
+
+//@cindex unpackSchedule
+/* Inverse of sendSchedule (also used for REVAL in the DIST build):
+   read the total size word, then the flat pack buffer. */
+static void
+unpackSchedule(int *nelem, rtsPackBuffer *packBuffer)
+{
+ long buf[1];
+
+ /* first, just unpack 1 word containing the total size (including header) */
+ GetArgs(buf, 1);
+ /* no. of elems, not counting the header of the pack buffer */
+ *nelem = (int) buf[0] - PACK_BUFFER_HDR_SIZE - DEBUG_HEADROOM;
+
+ /* automatic cast of flat pvm-data to rtsPackBuffer */
+ GetArgs(packBuffer, *nelem + PACK_BUFFER_HDR_SIZE + DEBUG_HEADROOM);
+
+ IF_PAR_DEBUG(schedule,
+ belch("~^-- Unpacking Schedule (packet <<%d>> with %d elems) on [%x]\n",
+ packBuffer->id, *nelem, mytid));
+
+ ASSERT(*nelem==packBuffer->size);
+ /* check for magic end-of-buffer word */
+ IF_DEBUG(sanity, ASSERT(*(packBuffer->buffer+*nelem) == END_OF_BUFFER_MARKER));
+}
+
+#ifdef DIST
+/* sendReval is almost identical to the Schedule version, so we can unpack with unpackSchedule */
+/* DIST only: ship a graph to origPE for revaluation (PP_REVAL tag,
+   otherwise the same wire format as SCHEDULE). */
+void
+sendReval(GlobalTaskId origPE, int nelem, rtsPackBuffer *packBuffer)
+{
+ IF_PAR_DEBUG(schedule,
+ belch("~^-- Sending Reval (packet <<%d>> with %d elems) to [%x]\n",
+ packBuffer->id, nelem, origPE));
+ IF_PAR_DEBUG(packet,
+ PrintPacket(packBuffer));
+
+ ASSERT(nelem==packBuffer->size);
+ /* check for magic end-of-buffer word */
+ IF_DEBUG(sanity, ASSERT(*(packBuffer->buffer+nelem) == END_OF_BUFFER_MARKER));
+
+ sendOpN(PP_REVAL, origPE,
+ nelem + PACK_BUFFER_HDR_SIZE + DEBUG_HEADROOM, (StgPtr)packBuffer);
+}
+
+/* DIST only: after TSO t finishes a revaluation, look up the result
+   via the GA recorded in the TSO (revalTid/revalSlot) and ship it back
+   to the requesting PE as a RESUME. */
+void FinishReval(StgTSO *t)
+{ StgClosure *res;
+ globalAddr ga;
+ nat size;
+ rtsPackBuffer *buffer=NULL;
+
+ ga.payload.gc.slot = t->revalSlot;
+ ga.payload.gc.gtid = t->revalTid;
+ ga.weight = 0;
+
+ //find where the reval result is
+ res = GALAlookup(&ga);
+ ASSERT(res);
+
+ IF_PAR_DEBUG(schedule,
+ printGA(&ga);
+ belch(" needs the result %08x\n",res));
+
+ //send off the result
+ buffer = PackNearbyGraph(res, END_TSO_QUEUE, &size,ga.payload.gc.gtid);
+ ASSERT(buffer != (rtsPackBuffer *)NULL);
+ sendResume(&ga, size, buffer);
+
+ IF_PAR_DEBUG(schedule,
+ belch("@;~) Reval Finished"));
+}
+
+#endif /* DIST */
+
+//@node Message-Processing Functions, GUM Message Processor, GUM Message Sending and Unpacking Functions, High Level Communications Routines
+//@subsection Message-Processing Functions
+
+/*
+ * Message-Processing Functions
+ *
+ * The following routines process incoming GUM messages. Often reissuing
+ * messages in response.
+ *
+ * processFish unpacks a fish message, reissuing it if it's our own,
+ * sending work if we have it or sending it onwards otherwise.
+ */
+
+/*
+ * processFetches constructs and sends resume messages for every
+ * BlockedFetch which is ready to be awakened.
+ * awaken_blocked_queue (in Schedule.c) is responsible for moving
+ * BlockedFetches from a blocking queue to the PendingFetches queue.
+ */
+void GetRoots(void);
+extern StgBlockedFetch *PendingFetches;
+
+/* Walk the PendingFetches queue and return its length (debug helper);
+   sanity-checks that every element is a BLOCKED_FETCH closure. */
+nat
+pending_fetches_len(void)
+{
+ StgBlockedFetch *bf;
+ nat n;
+
+ for (n=0, bf=PendingFetches; bf != END_BF_QUEUE; n++, bf = (StgBlockedFetch *)(bf->link)) {
+ ASSERT(get_itbl(bf)->type==BLOCKED_FETCH);
+ }
+ return n;
+}
+
+//@cindex processFetches
+/* Drain the PendingFetches queue: for each BLOCKED_FETCH, either
+   forward the fetch (target is itself a FETCH_ME), re-block on a black
+   hole, or pack the local graph and send a RESUME to the requester.
+   The queue is emptied on exit. */
+void
+processFetches(void) {
+ StgBlockedFetch *bf, *next;
+ StgClosure *closure;
+ StgInfoTable *ip;
+ globalAddr rga;
+ static rtsPackBuffer *packBuffer;
+
+ IF_PAR_DEBUG(verbose,
+ belch("____ processFetches: %d pending fetches (root @ %p)",
+ pending_fetches_len(), PendingFetches));
+
+ for (bf = PendingFetches;
+ bf != END_BF_QUEUE;
+ bf=next) {
+ /* the PendingFetches list contains only BLOCKED_FETCH closures */
+ ASSERT(get_itbl(bf)->type==BLOCKED_FETCH);
+ /* store link (we might overwrite it via blockFetch later on) */
+ next = (StgBlockedFetch *)(bf->link);
+
+ /*
+ * Find the target at the end of the indirection chain, and
+ * process it in much the same fashion as the original target
+ * of the fetch. Though we hope to find graph here, we could
+ * find a black hole (of any flavor) or even a FetchMe.
+ */
+ closure = bf->node;
+ /*
+ We evacuate BQs and update the node fields where necessary in GC.c
+ So, if we find an EVACUATED closure, something has gone Very Wrong
+ (and therefore we let the RTS crash most ungracefully).
+ */
+ ASSERT(get_itbl(closure)->type != EVACUATED);
+ // closure = ((StgEvacuated *)closure)->evacuee;
+
+ closure = UNWIND_IND(closure);
+ //while ((ind = IS_INDIRECTION(closure)) != NULL) { closure = ind; }
+
+ ip = get_itbl(closure);
+ if (ip->type == FETCH_ME) {
+ /* Forward the Fetch to someone else */
+ rga.payload.gc.gtid = bf->ga.payload.gc.gtid;
+ rga.payload.gc.slot = bf->ga.payload.gc.slot;
+ rga.weight = bf->ga.weight;
+
+ sendFetch(((StgFetchMe *)closure)->ga, &rga, 0 /* load */);
+
+ // Global statistics: count no. of fetches
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.tot_fetch_mess++;
+ }
+
+ IF_PAR_DEBUG(fetch,
+ belch("__-> processFetches: Forwarding fetch from %lx to %lx",
+ mytid, rga.payload.gc.gtid));
+
+ } else if (IS_BLACK_HOLE(closure)) {
+ IF_PAR_DEBUG(verbose,
+ belch("__++ processFetches: trying to send a BLACK_HOLE => doing a blockFetch on closure %p (%s)",
+ closure, info_type(closure)));
+ bf->node = closure;
+ blockFetch(bf, closure);
+ } else {
+ /* We now have some local graph to send back */
+ nat size;
+
+ packBuffer = gumPackBuffer;
+ IF_PAR_DEBUG(verbose,
+ belch("__*> processFetches: PackNearbyGraph of closure %p (%s)",
+ closure, info_type(closure)));
+
+ if ((packBuffer = PackNearbyGraph(closure, END_TSO_QUEUE, &size, bf->ga.payload.gc.gtid)) == NULL) {
+ // Put current BF back on list
+ bf->link = (StgBlockingQueueElement *)PendingFetches;
+ PendingFetches = (StgBlockedFetch *)bf;
+ // ToDo: check that nothing more has to be done to prepare for GC!
+ /* NB: barf aborts, so the GC-and-retry code below is currently
+ unreachable — left in as the intended recovery path (ToDo) */
+ barf("processFetches: out of heap while packing graph; ToDo: call GC here");
+ GarbageCollect(GetRoots, rtsFalse);
+ bf = PendingFetches;
+ PendingFetches = (StgBlockedFetch *)(bf->link);
+ closure = bf->node;
+ packBuffer = PackNearbyGraph(closure, END_TSO_QUEUE, &size, bf->ga.payload.gc.gtid);
+ ASSERT(packBuffer != (rtsPackBuffer *)NULL);
+ }
+ rga.payload.gc.gtid = bf->ga.payload.gc.gtid;
+ rga.payload.gc.slot = bf->ga.payload.gc.slot;
+ rga.weight = bf->ga.weight;
+
+ sendResume(&rga, size, packBuffer);
+
+ // Global statistics: count no. of resume messages
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.tot_resume_mess++;
+ }
+ }
+ }
+ PendingFetches = END_BF_QUEUE;
+}
+
+#if 0
+/*
+ Alternatively to sending fetch messages directly from the FETCH_ME_entry
+ code we could just store the data about the remote data in a global
+ variable and send the fetch request from the main scheduling loop (similar
+ to processFetches above). This would save an expensive STGCALL in the entry
+ code because we have to go back to the scheduler anyway.
+*/
+//@cindex processTheRealFetches
+/* (compiled out via #if 0) Alternative fetch path: send the fetch
+   recorded in theGlobalFromGA/theGlobalToGA from the scheduler loop
+   instead of from the FETCH_ME entry code. */
+void
+processTheRealFetches(void) {
+ StgBlockedFetch *bf;
+ StgClosure *closure, *next;
+
+ IF_PAR_DEBUG(verbose,
+ belch("__ processTheRealFetches: ");
+ printGA(&theGlobalFromGA);
+ printGA(&theGlobalToGA));
+
+ ASSERT(theGlobalFromGA.payload.gc.gtid != 0 &&
+ theGlobalToGA.payload.gc.gtid != 0);
+
+ /* the old version did this in the FETCH_ME entry code */
+ sendFetch(&theGlobalFromGA, &theGlobalToGA, 0/*load*/);
+
+}
+#endif
+
+
+/*
+ Way of dealing with unwanted fish.
+ Used during startup/shutdown, or from unknown PEs
+*/
+/* Deal with a FISH we cannot service (startup/shutdown, or from an
+   unknown PE): retire it if it is our own, otherwise send it straight
+   home to die. */
+void
+bounceFish(void) {
+ GlobalTaskId origPE;
+ int age, history, hunger;
+
+ /* IF_PAR_DEBUG(verbose, */
+ /* NB: this belch is deliberately unconditional (debug wrapper
+ commented out above) */
+ belch(".... [%x] Bouncing unwanted FISH",mytid);
+
+ unpackFish(&origPE, &age, &history, &hunger);
+
+ if (origPE == mytid) {
+ //fishing = rtsFalse; // fish has come home
+ outstandingFishes--;
+ last_fish_arrived_at = CURRENT_TIME; // remember time (see schedule fct)
+ return; // that's all
+ }
+
+ /* otherwise, send it home to die */
+ sendFish(origPE, origPE, (age + 1), NEW_FISH_HISTORY, NEW_FISH_HUNGER);
+ // Global statistics: count no. of fish messages
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.tot_fish_mess++;
+ }
+}
+
+/*
+ * processFish unpacks a fish message, reissuing it if it's our own,
+ * sending work if we have it or sending it onwards otherwise.
+ */
+//@cindex processFish
+/* Handle an incoming FISH: retire it if it is our own come home, send
+   a SCHEDULE if we have an exportable spark, otherwise forward the
+   fish (aged) to another PE, or home to die once too old. */
+static void
+processFish(void)
+{
+ GlobalTaskId origPE;
+ int age, history, hunger;
+ rtsSpark spark;
+ static rtsPackBuffer *packBuffer;
+
+ unpackFish(&origPE, &age, &history, &hunger);
+
+ if (origPE == mytid) {
+ //fishing = rtsFalse; // fish has come home
+ outstandingFishes--;
+ last_fish_arrived_at = CURRENT_TIME; // remember time (see schedule fct)
+ return; // that's all
+ }
+
+ ASSERT(origPE != mytid);
+ IF_PAR_DEBUG(fish,
+ belch("$$__ processing fish; %d sparks available",
+ spark_queue_len(&(MainRegTable.rSparks))));
+ while ((spark = findSpark(rtsTrue/*for_export*/)) != NULL) {
+ nat size;
+ // StgClosure *graph;
+
+ packBuffer = gumPackBuffer;
+ ASSERT(closure_SHOULD_SPARK((StgClosure *)spark));
+ if ((packBuffer = PackNearbyGraph(spark, END_TSO_QUEUE, &size,origPE)) == NULL) {
+ IF_PAR_DEBUG(fish,
+ belch("$$ GC while trying to satisfy FISH via PackNearbyGraph of node %p",
+ (StgClosure *)spark));
+ /* NB: barf aborts; the GC-and-retry below is the intended but
+ currently unreachable recovery path (ToDo) */
+ barf("processFish: out of heap while packing graph; ToDo: call GC here");
+ GarbageCollect(GetRoots, rtsFalse);
+ /* Now go back and try again */
+ } else {
+ IF_PAR_DEBUG(verbose,
+ if (RtsFlags.ParFlags.ParStats.Sparks)
+ belch("==== STEALING spark %x; sending to %x", spark, origPE));
+
+ IF_PAR_DEBUG(fish,
+ belch("$$-- Replying to FISH from %x by sending graph @ %p (%s)",
+ origPE,
+ (StgClosure *)spark, info_type((StgClosure *)spark)));
+ sendSchedule(origPE, size, packBuffer);
+ disposeSpark(spark);
+ // Global statistics: count no. of schedule messages
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.tot_schedule_mess++;
+ }
+
+ break;
+ }
+ }
+ if (spark == (rtsSpark)NULL) {
+ IF_PAR_DEBUG(fish,
+ belch("$$^^ No sparks available for FISH from %x",
+ origPE));
+ /* We have no sparks to give */
+ if (age < FISH_LIFE_EXPECTANCY) {
+ /* and the fish is still young, send it to another PE to look for work */
+ sendFish(choosePE(), origPE,
+ (age + 1), NEW_FISH_HISTORY, NEW_FISH_HUNGER);
+
+ // Global statistics: count no. of fish messages
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.tot_fish_mess++;
+ }
+ } else { /* otherwise, send it home to die */
+ sendFish(origPE, origPE, (age + 1), NEW_FISH_HISTORY, NEW_FISH_HUNGER);
+ // Global statistics: count no. of fish messages
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.tot_fish_mess++;
+ }
+ }
+ }
+} /* processFish */
+
+/*
+ * processFetch either returns the requested data (if available)
+ * or blocks the remote blocking queue on a black hole (if not).
+ */
+
+//@cindex processFetch
+/* Handle an incoming FETCH for a local GA: forward it (target is a
+   FETCH_ME), common up a fetch that came back to its own sender, block
+   it on a black hole via a BLOCKED_FETCH, or pack and RESUME the local
+   graph. */
+static void
+processFetch(void)
+{
+ globalAddr ga, rga;
+ int load;
+ StgClosure *closure;
+ StgInfoTable *ip;
+
+ unpackFetch(&ga, &rga, &load);
+ IF_PAR_DEBUG(fetch,
+ belch("%%%%__ Rcvd Fetch for ((%x, %d, 0)), Resume ((%x, %d, %x)) (load %d) from %x",
+ ga.payload.gc.gtid, ga.payload.gc.slot,
+ rga.payload.gc.gtid, rga.payload.gc.slot, rga.weight, load,
+ rga.payload.gc.gtid));
+
+ closure = GALAlookup(&ga);
+ ASSERT(closure != (StgClosure *)NULL);
+ ip = get_itbl(closure);
+ if (ip->type == FETCH_ME) {
+ /* Forward the Fetch to someone else */
+ sendFetch(((StgFetchMe *)closure)->ga, &rga, load);
+
+ // Global statistics: count no. of fetches
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.tot_fetch_mess++;
+ }
+ } else if (rga.payload.gc.gtid == mytid) {
+ /* Our own FETCH forwarded back around to us */
+ StgFetchMeBlockingQueue *fmbq = (StgFetchMeBlockingQueue *)GALAlookup(&rga);
+
+ IF_PAR_DEBUG(fetch,
+ belch("%%%%== Fetch returned to sending PE; closure=%p (%s); receiver=%p (%s)",
+ closure, info_type(closure), fmbq, info_type((StgClosure*)fmbq)));
+ /* We may have already discovered that the fetch target is our own. */
+ if ((StgClosure *)fmbq != closure)
+ CommonUp((StgClosure *)fmbq, closure);
+ (void) addWeight(&rga);
+ } else if (IS_BLACK_HOLE(closure)) {
+ /* This includes RBH's and FMBQ's */
+ StgBlockedFetch *bf;
+
+ /* Can we assert something on the remote GA? */
+ ASSERT(GALAlookup(&rga) == NULL);
+
+ /* If we're hitting a BH or RBH or FMBQ we have to put a BLOCKED_FETCH
+ closure into the BQ in order to denote that when updating this node
+ the result should be sent to the originator of this fetch message. */
+ bf = (StgBlockedFetch *)createBlockedFetch(ga, rga);
+ IF_PAR_DEBUG(fetch,
+ belch("%%++ Blocking Fetch ((%x, %d, %x)) on %p (%s)",
+ rga.payload.gc.gtid, rga.payload.gc.slot, rga.weight,
+ closure, info_type(closure)));
+ blockFetch(bf, closure);
+ } else {
+ /* The target of the FetchMe is some local graph */
+ nat size;
+ // StgClosure *graph;
+ rtsPackBuffer *buffer = (rtsPackBuffer *)NULL;
+
+ if ((buffer = PackNearbyGraph(closure, END_TSO_QUEUE, &size, rga.payload.gc.gtid)) == NULL) {
+ /* NB: barf aborts; the GC-and-retry below is the intended but
+ currently unreachable recovery path (ToDo) */
+ barf("processFetch: out of heap while packing graph; ToDo: call GC here");
+ GarbageCollect(GetRoots, rtsFalse);
+ closure = GALAlookup(&ga);
+ buffer = PackNearbyGraph(closure, END_TSO_QUEUE, &size, rga.payload.gc.gtid);
+ ASSERT(buffer != (rtsPackBuffer *)NULL);
+ }
+ sendResume(&rga, size, buffer);
+
+ // Global statistics: count no. of resume messages
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.tot_resume_mess++;
+ }
+ }
+}
+
+/*
+ The list of pending fetches must be a root-list for GC.
+ This routine is called from GC.c (same as marking GAs etc).
+*/
+/* GC root hook (called from GC.c): treat the head of the
+   PendingFetches queue as a root; the rest of the queue is reached via
+   the scavenged link fields of BLOCKED_FETCH closures. */
+void
+markPendingFetches(rtsBool major_gc) {
+
+ /* No need to traverse the list; this is done via the scavenge code
+ for a BLOCKED_FETCH closure, which evacuates the link field */
+
+ if (PendingFetches != END_BF_QUEUE ) {
+ IF_PAR_DEBUG(tables,
+ fprintf(stderr, "@@@@ PendingFetches is root; evaced from %p to",
+ PendingFetches));
+
+ PendingFetches = MarkRoot((StgClosure*)PendingFetches);
+
+ /* NOTE(review): the closing half of the message above is emitted
+ under a different debug class (verbose vs tables), so the line is
+ truncated unless both are enabled — confirm intent */
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr, " %p\n", PendingFetches));
+
+ } else {
+ IF_PAR_DEBUG(tables,
+ fprintf(stderr, "@@@@ PendingFetches is empty; no need to mark it\n"));
+ }
+}
+
+/*
+ * processFree unpacks a FREE message and adds the weights to our GAs.
+ */
+//@cindex processFree
+/* Handle an incoming FREE: unpack the (weight, slot) pairs into the
+   shared buffer and return each weight to the corresponding local GA. */
+static void
+processFree(void)
+{
+ int nelem;
+ static StgWord *buffer;
+ int i;
+ globalAddr ga;
+
+ buffer = (StgWord *)gumPackBuffer;
+ unpackFree(&nelem, buffer);
+ IF_PAR_DEBUG(free,
+ belch("!!__ Rcvd Free (%d GAs)", nelem / 2));
+
+ ga.payload.gc.gtid = mytid;
+ for (i = 0; i < nelem;) {
+ ga.weight = (rtsWeight) buffer[i++];
+ ga.payload.gc.slot = (int) buffer[i++];
+ IF_PAR_DEBUG(free,
+ fprintf(stderr, "!!-- Processing free ");
+ printGA(&ga);
+ fputc('\n', stderr);
+ );
+ (void) addWeight(&ga);
+ }
+}
+
+/*
+ * processResume unpacks a RESUME message into the graph, filling in
+ * the LA -> GA, and GA -> LA tables. Threads blocked on the original
+ * FetchMe (now a blocking queue) are awakened, and the blocking queue
+ * is converted into an indirection. Finally it sends an ACK in response
+ * which contains any newly allocated GAs.
+ */
+
+//@cindex processResume
+/* Handle an incoming RESUME: unpack the graph into the heap, common it
+   up with the waiting FETCH_ME_BQ, emit profiling REPLY events, and
+   ACK any new GAs back to the sender. */
+static void
+processResume(GlobalTaskId sender)
+{
+ int nelem;
+ nat nGAs;
+ static rtsPackBuffer *packBuffer;
+ StgClosure *newGraph, *old;
+ globalAddr lga;
+ globalAddr *gagamap;
+
+ packBuffer = (rtsPackBuffer *)gumPackBuffer;
+ unpackResume(&lga, &nelem, packBuffer);
+
+ IF_PAR_DEBUG(fetch,
+ fprintf(stderr, "[]__ Rcvd Resume for ");
+ printGA(&lga);
+ fputc('\n', stderr));
+ IF_PAR_DEBUG(packet,
+ PrintPacket((rtsPackBuffer *)packBuffer));
+
+ /*
+ * We always unpack the incoming graph, even if we've received the
+ * requested node in some other data packet (and already awakened
+ * the blocking queue).
+ if (SAVE_Hp + packBuffer[0] >= SAVE_HpLim) {
+ ReallyPerformThreadGC(packBuffer[0], rtsFalse);
+ SAVE_Hp -= packBuffer[0];
+ }
+ */
+
+ // ToDo: Check for GC here !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+ /* Do this *after* GC; we don't want to release the object early! */
+
+ if (lga.weight > 0)
+ (void) addWeight(&lga);
+
+ old = GALAlookup(&lga);
+
+ /* ToDo: The closure that requested this graph must be one of these two?*/
+ ASSERT(get_itbl(old)->type == FETCH_ME_BQ ||
+ get_itbl(old)->type == RBH);
+
+ if (RtsFlags.ParFlags.ParStats.Full) {
+ StgBlockingQueueElement *bqe, *last_bqe;
+
+ IF_PAR_DEBUG(fetch,
+ belch("[]-- Resume is REPLY to closure %lx", old));
+
+ /* Write REPLY events to the log file, indicating that the remote
+ data has arrived
+ NB: we emit a REPLY only for the *last* elem in the queue; this is
+ the one that triggered the fetch message; all other entries
+ have just added themselves to the queue, waiting for the data
+ they know that has been requested (see entry code for FETCH_ME_BQ)
+ */
+ if ((get_itbl(old)->type == FETCH_ME_BQ ||
+ get_itbl(old)->type == RBH)) {
+ for (bqe = ((StgFetchMeBlockingQueue *)old)->blocking_queue,
+ last_bqe = END_BQ_QUEUE;
+ get_itbl(bqe)->type==TSO ||
+ get_itbl(bqe)->type==BLOCKED_FETCH;
+ last_bqe = bqe, bqe = bqe->link) { /* nothing */ }
+
+ ASSERT(last_bqe==END_BQ_QUEUE ||
+ get_itbl((StgClosure *)last_bqe)->type == TSO);
+
+ /* last_bqe now points to the TSO that triggered the FETCH */
+ if (get_itbl((StgClosure *)last_bqe)->type == TSO)
+ DumpRawGranEvent(CURRENT_PROC, taskIDtoPE(sender),
+ GR_REPLY, ((StgTSO *)last_bqe), ((StgTSO *)last_bqe)->block_info.closure,
+ 0, spark_queue_len(&(MainRegTable.rSparks)));
+ }
+ }
+
+ newGraph = UnpackGraph(packBuffer, &gagamap, &nGAs);
+ ASSERT(newGraph != NULL);
+
+ /*
+ * Sometimes, unpacking will common up the resumee with the
+ * incoming graph, but if it hasn't, we'd better do so now.
+ */
+
+ if (get_itbl(old)->type == FETCH_ME_BQ)
+ CommonUp(old, newGraph);
+
+ IF_PAR_DEBUG(fetch,
+ belch("[]-- Ready to resume unpacked graph at %p (%s)",
+ newGraph, info_type(newGraph)));
+
+ IF_PAR_DEBUG(tables,
+ DebugPrintGAGAMap(gagamap, nGAs));
+
+ sendAck(sender, nGAs, gagamap);
+}
+
+/*
+ * processSchedule unpacks a SCHEDULE message into the graph, filling
+ * in the LA -> GA, and GA -> LA tables. The root of the graph is added to
+ * the local spark queue. Finally it sends an ACK in response
+ * which contains any newly allocated GAs.
+ */
+//@cindex processSchedule
+/* Handle an incoming SCHEDULE (reply to one of our FISHes): unpack the
+   graph, add its root to the local spark pool, ACK any new GAs, and
+   retire one outstanding fish. */
+static void
+processSchedule(GlobalTaskId sender)
+{
+ nat nelem, nGAs;
+ rtsBool success;
+ static rtsPackBuffer *packBuffer;
+ StgClosure *newGraph;
+ globalAddr *gagamap;
+
+ packBuffer = gumPackBuffer; /* HWL */
+ unpackSchedule(&nelem, packBuffer);
+
+ IF_PAR_DEBUG(schedule,
+ belch("--__ Rcvd Schedule (%d elems)", nelem));
+ IF_PAR_DEBUG(packet,
+ PrintPacket(packBuffer));
+
+ /*
+ * For now, the graph is a closure to be sparked as an advisory
+ * spark, but in future it may be a complete spark with
+ * required/advisory status, priority etc.
+ */
+
+ /*
+ space_required = packBuffer[0];
+ if (SAVE_Hp + space_required >= SAVE_HpLim) {
+ ReallyPerformThreadGC(space_required, rtsFalse);
+ SAVE_Hp -= space_required;
+ }
+ */
+ // ToDo: check whether GC is necessary !!!!!!!!!!!!!!!!!!!!!
+ newGraph = UnpackGraph(packBuffer, &gagamap, &nGAs);
+ ASSERT(newGraph != NULL);
+ success = add_to_spark_queue(newGraph, &(MainRegTable.rSparks));
+
+ if (RtsFlags.ParFlags.ParStats.Full &&
+ RtsFlags.ParFlags.ParStats.Sparks &&
+ success)
+ DumpRawGranEvent(CURRENT_PROC, CURRENT_PROC,
+ GR_STOLEN, ((StgTSO *)NULL), newGraph,
+ 0, 0 /* spark_queue_len(ADVISORY_POOL) */);
+
+ IF_PAR_DEBUG(schedule,
+ if (success)
+ belch("--^^ added spark to unpacked graph %p (%s); %d sparks available on [%x] (%s)",
+ newGraph, info_type(newGraph), spark_queue_len(&(MainRegTable.rSparks)), mytid);
+ else
+ belch("--^^ received non-sparkable closure %p (%s); nothing added to spark pool; %d sparks available on [%x]",
+ newGraph, info_type(newGraph), spark_queue_len(&(MainRegTable.rSparks)), mytid));
+ IF_PAR_DEBUG(packet,
+ belch("*< Unpacked graph with root at %p (%s):",
+ newGraph, info_type(newGraph));
+ PrintGraph(newGraph, 0));
+
+ IF_PAR_DEBUG(tables,
+ DebugPrintGAGAMap(gagamap, nGAs));
+
+ sendAck(sender, nGAs, gagamap);
+
+ /* this SCHEDULE satisfies one of our outstanding fish */
+ //fishing = rtsFalse;
+ ASSERT(outstandingFishes>0);
+ outstandingFishes--;
+}
+
+/*
+ * processAck unpacks an ACK, and uses the GAGA map to convert RBH's
+ * (which represent shared thunks that have been shipped) into fetch-mes
+ * to remote GAs.  For every (oldGA,newGA) pair in the map we either
+ * create a FetchMe for the remote copy (if we don't know the closure
+ * under newGA yet) or CommonUp the two copies; in both cases the
+ * weight carried by the message is folded back into our tables.
+ */
+//@cindex processAck
+static void
+processAck(void)
+{
+  nat nGAs;                   // number of (oldGA,newGA) pairs in the ACK
+  globalAddr *gaga;           // cursor over the pair array (steps by 2)
+  globalAddr gagamap[256]; // ToDo: elim magic constant!! MAX_GAS * 2];??
+  // NOTE(review): unpackAck fills 2*nGAs entries of this fixed buffer;
+  // nothing here bounds nGAs <= 128 -- presumably the sender guarantees
+  // it, TODO confirm.
+
+  unpackAck(&nGAs, gagamap);
+
+  IF_PAR_DEBUG(tables,
+	       belch(",,,, Rcvd Ack (%d pairs)", nGAs);
+	       DebugPrintGAGAMap(gagamap, nGAs));
+
+  IF_DEBUG(sanity,
+	   checkGAGAMap(gagamap, nGAs));
+
+  /*
+   * For each (oldGA, newGA) pair, set the GA of the corresponding
+   * thunk to the newGA, convert the thunk to a FetchMe, and return
+   * the weight from the oldGA.
+   */
+  for (gaga = gagamap; gaga < gagamap + nGAs * 2; gaga += 2) {
+    StgClosure *old_closure = GALAlookup(gaga);
+    StgClosure *new_closure = GALAlookup(gaga + 1);
+
+    ASSERT(old_closure != NULL);
+    if (new_closure == NULL) {
+      /* We don't have this closure, so we make a fetchme for it */
+      globalAddr *ga = setRemoteGA(old_closure, gaga + 1, rtsTrue);
+
+      /* convertToFetchMe should be done unconditionally here.
+	 Currently, we assign GAs to CONSTRs, too, (a bit of a hack),
+	 so we have to check whether it is an RBH before converting
+
+	 ASSERT(get_itbl(old_closure)==RBH);
+      */
+      if (get_itbl(old_closure)->type==RBH)
+	convertToFetchMe((StgRBH *)old_closure, ga);
+    } else {
+      /*
+       * Oops...we've got this one already; update the RBH to
+       * point to the object we already know about, whatever it
+       * happens to be.
+       */
+      CommonUp(old_closure, new_closure);
+
+      /*
+       * Increase the weight of the object by the amount just
+       * received in the second part of the ACK pair.
+       */
+      (void) addWeight(gaga + 1);
+    }
+    (void) addWeight(gaga);  /* in both branches: reclaim oldGA's weight */
+  }
+
+  /* check the sanity of the LAGA and GALA tables after mincing them */
+  IF_DEBUG(sanity, checkLAGAtable(rtsFalse));
+}
+
+#ifdef DIST
+
+/* Stub: bounce a REVAL back to its sender (needed when a REVAL reaches
+   a PE that does not yet know the sender); currently unimplemented and
+   aborts the task. */
+void
+bounceReval(void) {
+  barf("Task %x: TODO: should send NACK in response to REVAL",mytid);
+}
+
+/* processReval (DIST only): unpack a REVAL message (same wire format as
+   SCHEDULE), ack every GA pair except the head, and spawn a dedicated
+   high-priority thread to revaluate the unpacked graph on behalf of the
+   sender. */
+static void
+processReval(GlobalTaskId sender) //similar to schedule...
+{ nat nelem, space_required, nGAs;
+  // NOTE(review): space_required is only referenced by the commented-out
+  // GC check below, and the local `ga` below is never used.
+  static rtsPackBuffer *packBuffer;
+  StgClosure *newGraph;
+  globalAddr *gagamap;
+  StgTSO* tso;
+  globalAddr *ga;
+
+  packBuffer = gumPackBuffer; /* HWL */
+  unpackSchedule(&nelem, packBuffer); /* okay, since the structure is the same */
+
+  IF_PAR_DEBUG(packet,
+	       belch("@;~) [%x] Rcvd Reval (%d elems)", mytid, nelem);
+	       PrintPacket(packBuffer));
+
+  /*
+  space_required = packBuffer[0];
+  if (SAVE_Hp + space_required >= SAVE_HpLim) {
+    ReallyPerformThreadGC(space_required, rtsFalse);
+    SAVE_Hp -= space_required;
+  }
+  */
+
+  // ToDo: check whether GC is necessary !!!!!!!!!!!!!!!!!!!!!
+  newGraph = UnpackGraph(packBuffer, &gagamap, &nGAs);
+  ASSERT(newGraph != NULL);
+
+  IF_PAR_DEBUG(packet,
+	       belch("@;~) Unpacked graph with root at %p (%s):",
+		     newGraph, info_type(newGraph));
+	       PrintGraph(newGraph, 0));
+
+  IF_PAR_DEBUG(tables,
+	       DebugPrintGAGAMap(gagamap, nGAs));
+
+  IF_PAR_DEBUG(tables,
+	       printLAGAtable();
+	       DebugPrintGAGAMap(gagamap, nGAs));
+
+  //We don't send an Ack to the head!!!!
+  /* skip the first GA pair (the head); ack only the remaining nGAs-1 */
+  ASSERT(nGAs>0);
+  sendAck(sender, nGAs-1, gagamap+2);
+
+  IF_PAR_DEBUG(verbose,
+	       belch("@;~) About to create Reval thread on behalf of %x",
+		     sender));
+
+  /* run the revaluation in its own thread, remembering who asked for it */
+  tso=createGenThread(RtsFlags.GcFlags.initialStkSize,newGraph);
+  tso->priority=RevalPriority;
+  tso->revalSlot=gagamap->payload.gc.slot;//record who sent the reval
+  tso->revalTid =gagamap->payload.gc.gtid;
+  scheduleThread(tso);
+  context_switch = 1; // switch at the earliest opportunity
+}
+#endif
+
+
+//@node GUM Message Processor, Miscellaneous Functions, Message-Processing Functions, High Level Communications Routines
+//@subsection GUM Message Processor
+
+/*
+ * GUM Message Processor
+
+ * processMessages processes any messages that have arrived, calling
+ * appropriate routines depending on the message tag
+ * (opcode). N.B. Unless profiling it assumes that there {\em ARE} messages
+ * present and performs a blocking receive! During profiling it
+ * busy-waits in order to record idle time.
+ */
+
+//@cindex processMessages
+/*
+ * Blocking message-dispatch loop: receives at least one packet (GetPacket
+ * blocks), dispatches on (sender, opcode), and keeps draining while more
+ * packets are waiting.  Returns rtsTrue iff a FINISH was received, which
+ * the caller propagates to shut this PE down.
+ */
+rtsBool
+processMessages(void)
+{
+  rtsPacket packet;
+  OpCode opcode;
+  GlobalTaskId task;
+  rtsBool receivedFinish = rtsFalse;
+
+  do {
+    packet = GetPacket();  /* Get next message; block until one available */
+    getOpcodeAndSender(packet, &opcode, &task);
+
+    if (task==SysManTask) {
+      /* control messages from the system manager */
+      switch (opcode) {
+      case PP_PETIDS:
+	processPEtids();
+	break;
+
+      case PP_FINISH:
+	IF_PAR_DEBUG(verbose,
+		     belch("==== received FINISH [%p]", mytid));
+	/* this boolean value is returned and propagated to the main
+	   scheduling loop, thus shutting-down this PE */
+	receivedFinish = rtsTrue;
+	break;
+
+      default:
+	barf("Task %x: received unknown opcode %x from SysMan",mytid, opcode);
+      }
+    } else if (taskIDtoPE(task)==0) {
+      /* When a new PE joins then potentially FISH & REVAL messages may
+	 reach PEs before they are notified of the new PE's existence.  The
+	 only solution is to bounce/fail these messages back to the sender.
+	 But we will worry about it once we start seeing these race
+	 conditions!  */
+      switch (opcode) {
+      case PP_FISH:
+	bounceFish();
+	break;
+#ifdef DIST
+      case PP_REVAL:
+	bounceReval();
+	break;
+#endif
+      case PP_PETIDS:
+	belch("Task %x: Ignoring PVM session opened by another SysMan %x",mytid,task);
+	break;
+
+      case PP_FINISH:
+	break;
+
+      default:
+	belch("Task %x: Ignoring opcode %x from unknown PE %x",mytid, opcode, task);
+      }
+    } else
+      /* ordinary PE-to-PE traffic */
+      switch (opcode) {
+      case PP_FETCH:
+	processFetch();
+	// Global statistics: count no. of fetches
+	if (RtsFlags.ParFlags.ParStats.Global &&
+	    RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+	  globalParStats.rec_fetch_mess++;
+	}
+	break;
+
+      case PP_RESUME:
+	processResume(task);
+	// Global statistics: count no. of resumes
+	if (RtsFlags.ParFlags.ParStats.Global &&
+	    RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+	  globalParStats.rec_resume_mess++;
+	}
+	break;
+
+      case PP_ACK:
+	processAck();
+	break;
+
+      case PP_FISH:
+	processFish();
+	// Global statistics: count no. of fish
+	if (RtsFlags.ParFlags.ParStats.Global &&
+	    RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+	  globalParStats.rec_fish_mess++;
+	}
+	break;
+
+      case PP_FREE:
+	processFree();
+	break;
+
+      case PP_SCHEDULE:
+	processSchedule(task);
+	// Global statistics: count no. of schedules
+	if (RtsFlags.ParFlags.ParStats.Global &&
+	    RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+	  globalParStats.rec_schedule_mess++;
+	}
+	break;
+
+#ifdef DIST
+      case PP_REVAL:
+	processReval(task);
+	// Global statistics: count no. of revals
+	if (RtsFlags.ParFlags.ParStats.Global &&
+	    RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+	  globalParStats.rec_reval_mess++;
+	}
+	break;
+#endif
+
+      default:
+	/* Anything we're not prepared to deal with. */
+	barf("Task %x: Unexpected opcode %x from %x",
+	     mytid, opcode, task);
+      } /* switch */
+
+  } while (PacketsWaiting());	/* While there are messages: process them */
+  return receivedFinish;
+} /* processMessages */
+
+//@node Miscellaneous Functions, Index, GUM Message Processor, High Level Communications Routines
+//@subsection Miscellaneous Functions
+
+/*
+ * blockFetch blocks a BlockedFetch node on some kind of black hole.
+ * bf is linked onto bh's blocking queue; a plain BLACKHOLE is first
+ * upgraded to a BLACKHOLE_BQ.  In every case bh is recorded as mutable
+ * because its blocking queue now contains a (possibly old-to-new)
+ * pointer.
+ */
+//@cindex blockFetch
+void
+blockFetch(StgBlockedFetch *bf, StgClosure *bh) {
+  bf->node = bh;
+  switch (get_itbl(bh)->type) {
+  case BLACKHOLE:
+    /* no queue yet: turn the plain black hole into a blocking queue
+       with bf as its only element */
+    bf->link = END_BQ_QUEUE;
+    //((StgBlockingQueue *)bh)->header.info = &stg_BLACKHOLE_BQ_info;
+    SET_INFO(bh, &stg_BLACKHOLE_BQ_info);  // turn closure into a blocking queue
+    ((StgBlockingQueue *)bh)->blocking_queue = (StgBlockingQueueElement *)bf;
+
+    // put bh on the mutables list
+    recordMutable((StgMutClosure *)bh);
+    break;
+
+  case BLACKHOLE_BQ:
+    /* enqueue bf on blocking queue of closure bh */
+    bf->link = ((StgBlockingQueue *)bh)->blocking_queue;
+    ((StgBlockingQueue *)bh)->blocking_queue = (StgBlockingQueueElement *)bf;
+
+    // put bh on the mutables list; ToDo: check
+    recordMutable((StgMutClosure *)bh);
+    break;
+
+  case FETCH_ME_BQ:
+    /* enqueue bf on blocking queue of closure bh */
+    bf->link = ((StgFetchMeBlockingQueue *)bh)->blocking_queue;
+    ((StgFetchMeBlockingQueue *)bh)->blocking_queue = (StgBlockingQueueElement *)bf;
+
+    // put bh on the mutables list; ToDo: check
+    recordMutable((StgMutClosure *)bh);
+    break;
+
+  case RBH:
+    /* enqueue bf on blocking queue of closure bh */
+    bf->link = ((StgRBH *)bh)->blocking_queue;
+    ((StgRBH *)bh)->blocking_queue = (StgBlockingQueueElement *)bf;
+
+    // put bh on the mutables list; ToDo: check
+    recordMutable((StgMutClosure *)bh);
+    break;
+
+  default:
+    /* NOTE(review): the second argument is an info-table pointer printed
+       with %#lx -- works on LP64 but is not a portable format match. */
+    barf("blockFetch: thought %p was a black hole (IP %#lx, %s)",
+	 (StgClosure *)bh, get_itbl((StgClosure *)bh),
+	 info_type((StgClosure *)bh));
+  }
+  IF_PAR_DEBUG(bq,
+	       belch("##++ blockFetch: after block the BQ of %p (%s) is:",
+		     bh, info_type(bh));
+	       print_bq(bh));
+}
+
+
+/*
+ @blockThread@ is called from the main scheduler whenever tso returns with
+ a ThreadBlocked return code; tso has already been added to a blocking
+ queue (that's done in the entry code of the closure, because it is a
+ cheap operation we have to do in any case); the main purpose of this
+ routine is to send a Fetch message in case we are blocking on a FETCHME(_BQ)
+ closure, which is indicated by the tso.why_blocked field;
+ we also write an entry into the log file if we are generating one
+
+ Should update exectime etc in the entry code already; but we don't have
+ something like ``system time'' in the log file anyway, so this should
+ even out the inaccuracies.
+*/
+
+//@cindex blockThread
+/*
+ * Called from the scheduler when tso has returned ThreadBlocked; tso is
+ * already on a blocking queue.  Dispatches on tso->why_blocked: for a
+ * first block on a remote GA we send a FETCH message; otherwise we just
+ * record stats/log events.  See the long comment above for details.
+ */
+void
+blockThread(StgTSO *tso)
+{
+  globalAddr *remote_ga=NULL;   // GA of the remote closure we block on
+  globalAddr *local_ga;         // fresh GA for the newly created FMBQ
+  globalAddr fmbq_ga;           // split-off half sent with the FETCH
+
+  // ASSERT(we are on some blocking queue)
+  ASSERT(tso->block_info.closure != (StgClosure *)NULL);
+
+  /*
+    We have to check why this thread has been blocked.
+  */
+  switch (tso->why_blocked) {
+    case BlockedOnGA:
+      /* the closure must be a FETCH_ME_BQ; tso came in here via
+	 FETCH_ME entry code */
+      ASSERT(get_itbl(tso->block_info.closure)->type==FETCH_ME_BQ);
+
+      /* HACK: the link field is used to hold the GA between FETCH_ME_entry
+	 end this point; if something (eg. GC) happens inbetween the whole
+	 thing will blow up
+	 The problem is that the ga field of the FETCH_ME has been overwritten
+	 with the head of the blocking queue (which is tso).
+      */
+      ASSERT(looks_like_ga(&theGlobalFromGA));
+      // ASSERT(tso->link!=END_TSO_QUEUE && tso->link!=NULL);
+      remote_ga = &theGlobalFromGA; //tso->link;
+      tso->link = (StgTSO*)END_BQ_QUEUE;
+      /* it was tso which turned node from FETCH_ME into FETCH_ME_BQ =>
+	 we have to send a Fetch message here! */
+      if (RtsFlags.ParFlags.ParStats.Full) {
+	/* Note that CURRENT_TIME may perform an unsafe call */
+	tso->par.exectime += CURRENT_TIME - tso->par.blockedat;
+	tso->par.fetchcount++;
+	tso->par.blockedat = CURRENT_TIME;
+	/* we are about to send off a FETCH message, so dump a FETCH event */
+	DumpRawGranEvent(CURRENT_PROC,
+			 taskIDtoPE(remote_ga->payload.gc.gtid),
+			 GR_FETCH, tso, tso->block_info.closure, 0, 0);
+      }
+      /* Phil T. claims that this was a workaround for a hard-to-find
+       * bug, hence I'm leaving it out for now --SDM
+       */
+      /* Assign a brand-new global address to the newly created FMBQ */
+      local_ga = makeGlobal(tso->block_info.closure, rtsFalse);
+      splitWeight(&fmbq_ga, local_ga);
+      /* splitWeight halves; a fresh GA's weight is MAX_GA_WEIGHT, so the
+	 split-off half must be exactly the top bit */
+      ASSERT(fmbq_ga.weight == 1U << (BITS_IN(unsigned) - 1));
+
+      sendFetch(remote_ga, &fmbq_ga, 0/*load*/);
+
+      // Global statistics: count no. of fetches
+      if (RtsFlags.ParFlags.ParStats.Global &&
+	  RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+	globalParStats.tot_fetch_mess++;
+      }
+
+      /* poison the stashed GA so a stale reuse trips the sanity check */
+      IF_DEBUG(sanity,
+	       theGlobalFromGA.payload.gc.gtid = (GlobalTaskId)0);
+      break;
+
+    case BlockedOnGA_NoSend:
+      /* the closure must be a FETCH_ME_BQ; tso came in here via
+	 FETCH_ME_BQ entry code */
+      ASSERT(get_itbl(tso->block_info.closure)->type==FETCH_ME_BQ);
+
+      /* Fetch message has been sent already */
+      if (RtsFlags.ParFlags.ParStats.Full) {
+	/* Note that CURRENT_TIME may perform an unsafe call */
+	tso->par.exectime += CURRENT_TIME - tso->par.blockedat;
+	tso->par.blockcount++;
+	tso->par.blockedat = CURRENT_TIME;
+	/* dump a block event, because fetch has been sent already */
+	DumpRawGranEvent(CURRENT_PROC, thisPE,
+			 GR_BLOCK, tso, tso->block_info.closure, 0, 0);
+      }
+      break;
+
+    case BlockedOnMVar:
+    case BlockedOnBlackHole:
+      /* the closure must be a BLACKHOLE_BQ or an RBH; tso came in here via
+	 BLACKHOLE(_BQ) or CAF_BLACKHOLE or RBH entry code */
+      ASSERT(get_itbl(tso->block_info.closure)->type==MVAR ||
+	     get_itbl(tso->block_info.closure)->type==BLACKHOLE_BQ ||
+	     get_itbl(tso->block_info.closure)->type==RBH);
+
+      /* if collecting stats update the execution time etc */
+      if (RtsFlags.ParFlags.ParStats.Full) {
+	/* Note that CURRENT_TIME may perform an unsafe call */
+	tso->par.exectime += CURRENT_TIME - tso->par.blockedat;
+	tso->par.blockcount++;
+	tso->par.blockedat = CURRENT_TIME;
+	DumpRawGranEvent(CURRENT_PROC, thisPE,
+			 GR_BLOCK, tso, tso->block_info.closure, 0, 0);
+      }
+      break;
+
+    case BlockedOnDelay:
+      /* Whats sort of stats shall we collect for an explicit threadDelay? */
+      IF_PAR_DEBUG(verbose,
+		   belch("##++ blockThread: TSO %d blocked on ThreadDelay",
+			 tso->id));
+      break;
+
+    /* Check that the following is impossible to happen, indeed
+    case BlockedOnException:
+    case BlockedOnRead:
+    case BlockedOnWrite:
+    */
+    default:
+      barf("blockThread: impossible why_blocked code %d for TSO %d",
+	   tso->why_blocked, tso->id);
+  }
+
+  IF_PAR_DEBUG(verbose,
+	       belch("##++ blockThread: TSO %d blocked on closure %p (%s); %s",
+		     tso->id, tso->block_info.closure, info_type(tso->block_info.closure),
+		     (tso->why_blocked==BlockedOnGA) ? "Sent FETCH for GA" : ""));
+
+  IF_PAR_DEBUG(bq,
+	       print_bq(tso->block_info.closure));
+}
+
+/*
+ * ChoosePE selects a GlobalTaskId from the array of PEs 'at random'.
+ * Important properties:
+ * - it varies during execution, even if the PE is idle
+ * - it's different for each PE
+ * - we never send a fish to ourselves
+ */
+extern long lrand48 (void);
+
+//@cindex choosePE
+GlobalTaskId
+choosePE(void)
+{
+  /* Pick a pseudo-random peer index; if it happens to be this task,
+     step to the next slot so we never send a FISH to ourselves. */
+  long idx = lrand48() % nPEs;
+
+  if (allPEs[idx] == mytid)     /* never fish from yourself */
+    idx = (idx + 1) % nPEs;
+  return allPEs[idx];
+}
+
+/*
+ * allocate a BLOCKED_FETCH closure and fill it with the relevant fields
+ * of the ga argument; called from processFetch when the local closure is
+ * under evaluation
+ */
+//@cindex createBlockedFetch
+/*
+ * Allocate a BLOCKED_FETCH closure on the heap and initialise it from
+ * the remote GA (rga); ga names the local closure under evaluation.
+ * Returns the new closure; aborts if allocation fails (see NOTE below).
+ */
+StgClosure *
+createBlockedFetch (globalAddr ga, globalAddr rga)
+{
+  StgBlockedFetch *bf;
+  StgClosure *closure;
+
+  closure = GALAlookup(&ga);
+  if ((bf = (StgBlockedFetch *)allocate(_HS + sizeofW(StgBlockedFetch))) == NULL) {
+    /* NOTE(review): barf() does not return, so the GC-and-retry code
+       below is dead; presumably barf was meant to fire only if the
+       second allocate also fails -- TODO confirm and reorder.
+       (Also "BlocekdFetch" in the message is a typo.) */
+    barf("createBlockedFetch: out of heap while allocating heap for a BlocekdFetch; ToDo: call GC here");
+    GarbageCollect(GetRoots, rtsFalse);
+    closure = GALAlookup(&ga);
+    bf = (StgBlockedFetch *)allocate(_HS + sizeofW(StgBlockedFetch));
+    // ToDo: check whether really guaranteed to succeed 2nd time around
+  }
+
+  ASSERT(bf != (StgBlockedFetch *)NULL);
+  SET_INFO((StgClosure *)bf, &stg_BLOCKED_FETCH_info);
+  // ToDo: check whether other header info is needed
+  bf->node = closure;
+  bf->ga.payload.gc.gtid = rga.payload.gc.gtid;
+  bf->ga.payload.gc.slot = rga.payload.gc.slot;
+  bf->ga.weight = rga.weight;
+  // bf->link = NULL;  debugging
+
+  IF_PAR_DEBUG(schedule,
+	       fprintf(stderr, "%%%%// created BF: bf=%p (%s) of closure , GA: ",
+		       bf, info_type((StgClosure*)bf));
+	       printGA(&(bf->ga));
+	       fputc('\n',stderr));
+  return (StgClosure *)bf;
+}
+
+/*
+ * waitForTermination enters a loop ignoring spurious messages while
+ * waiting for the termination sequence to be completed.
+ */
+//@cindex waitForTermination
+/* Loop forever draining messages; never returns normally.  Termination
+   happens inside processUnexpectedMessage, which calls stg_exit on
+   receipt of PP_FINISH. */
+void
+waitForTermination(void)
+{
+  do {
+    rtsPacket p = GetPacket();
+    processUnexpectedMessage(p);
+  } while (rtsTrue);
+}
+
+#ifdef DEBUG
+//@cindex DebugPrintGAGAMap
+/* Dump nGAs (oldGA -> newGA) pairs to stderr, one line per pair.
+   Note: the loop counter is nat (unsigned) compared against int nGAs;
+   fine for nGAs >= 0, which is what callers pass. */
+void
+DebugPrintGAGAMap(globalAddr *gagamap, int nGAs)
+{
+  nat i;
+
+  for (i = 0; i < nGAs; ++i, gagamap += 2)
+    fprintf(stderr, "__ gagamap[%d] = ((%x, %d, %x)) -> ((%x, %d, %x))\n", i,
+	    gagamap[0].payload.gc.gtid, gagamap[0].payload.gc.slot, gagamap[0].weight,
+	    gagamap[1].payload.gc.gtid, gagamap[1].payload.gc.slot, gagamap[1].weight);
+}
+
+//@cindex checkGAGAMap
+/* Sanity-check that both halves of every (oldGA,newGA) pair look like
+   well-formed global addresses. */
+void
+checkGAGAMap(globalAddr *gagamap, int nGAs)
+{
+  nat i;
+
+  for (i = 0; i < (nat)nGAs; ++i, gagamap += 2) {
+    ASSERT(looks_like_ga(gagamap));
+    ASSERT(looks_like_ga(gagamap+1));
+  }
+}
+#endif
+
+//@cindex freeMsgBuffer
+static StgWord **freeMsgBuffer = NULL;
+//@cindex freeMsgIndex
+static nat *freeMsgIndex = NULL;
+
+//@cindex prepareFreeMsgBuffers
+/*
+ * Lazily allocate one FREE-message staging buffer per remote PE (our own
+ * slot stays NULL) on first call, then reset every buffer's fill index
+ * to 0.  Buffers persist for the lifetime of the process.
+ */
+void
+prepareFreeMsgBuffers(void)
+{
+  nat i;
+
+  /* Allocate the freeMsg buffers just once and then hang onto them. */
+  if (freeMsgIndex == NULL) {
+    freeMsgIndex = (nat *) stgMallocBytes(nPEs * sizeof(nat),
+					  "prepareFreeMsgBuffers (Index)");
+    /* NOTE(review): element type is StgWord*, sized here as long* --
+       same size on supported platforms, but sizeof(StgWord *) would be
+       the accurate spelling. */
+    freeMsgBuffer = (StgWord **) stgMallocBytes(nPEs * sizeof(long *),
+					        "prepareFreeMsgBuffers (Buffer)");
+
+    for(i = 0; i < nPEs; i++)
+      if (i != (thisPE-1))
+	freeMsgBuffer[i] = (StgPtr) stgMallocWords(RtsFlags.ParFlags.packBufferSize,
+						   "prepareFreeMsgBuffers (Buffer #i)");
+      else
+	freeMsgBuffer[i] = 0;   /* never send FREEs to ourselves */
+  }
+
+  /* Initialize the freeMsg buffer pointers to point to the start of their
+     buffers */
+  for (i = 0; i < nPEs; i++)
+    freeMsgIndex[i] = 0;
+}
+
+//@cindex freeRemoteGA
+/*
+ * Queue a (weight, slot) pair for GA ga into pe's FREE-message buffer,
+ * flushing the buffer first if the two new words would not fit.  The GA
+ * must already have been removed from the GALA table.  Under sanity
+ * checking the GA is poisoned afterwards to catch stale reuse.
+ */
+void
+freeRemoteGA(int pe, globalAddr *ga)
+{
+  nat i;
+
+  ASSERT(GALAlookup(ga) == NULL);
+
+  if ((i = freeMsgIndex[pe]) + 2 >= RtsFlags.ParFlags.packBufferSize) {
+    IF_PAR_DEBUG(free,
+		 belch("!! Filled a free message buffer (sending remaining messages indivisually)"));
+
+    /* flush what we have so the two words below always fit */
+    sendFree(ga->payload.gc.gtid, i, freeMsgBuffer[pe]);
+    i = 0;
+  }
+  freeMsgBuffer[pe][i++] = (StgWord) ga->weight;
+  freeMsgBuffer[pe][i++] = (StgWord) ga->payload.gc.slot;
+  freeMsgIndex[pe] = i;
+
+  /* poison the freed GA so accidental reuse is visible in sanity builds */
+  IF_DEBUG(sanity,
+	   ga->weight = 0xdead0add;
+	   ga->payload.gc.gtid = 0xbbbbbbbb;
+	   ga->payload.gc.slot = 0xbbbbbbbb;);
+}
+
+//@cindex sendFreeMessages
+/* Flush every non-empty FREE-message buffer to its PE.  Our own slot is
+   always empty (index 0), so the NULL buffer for thisPE is never read. */
+void
+sendFreeMessages(void)
+{
+  nat i;
+
+  for (i = 0; i < nPEs; i++)
+    if (freeMsgIndex[i] > 0)
+      sendFree(allPEs[i], freeMsgIndex[i], freeMsgBuffer[i]);
+}
+
+/* synchronises with the other PEs. Receives and records in a global
+ * variable the task-id of SysMan. If this is the main thread (discovered
+ * in main.lc), identifies itself to SysMan. Finally it receives
+ * from SysMan an array of the Global Task Ids of each PE, which is
+ * returned as the value of the function.
+ */
+
+#if defined(PAR_TICKY)
+/* Has to see freeMsgIndex, so must be defined here not in ParTicky.c */
+//@cindex stats_CntFreeGA
+void
+stats_CntFreeGA (void) { // stats only
+
+ // Global statistics: residency of thread and spark pool
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ nat i, s;
+
+ globalParStats.cnt_free_GA++;
+ for (i = 0, s = 0; i < nPEs; i++)
+ s += globalParStats.tot_free_GA += freeMsgIndex[i]/2;
+
+ if ( s > globalParStats.res_free_GA )
+ globalParStats.res_free_GA = s;
+ }
+}
+#endif /* PAR_TICKY */
+
+#endif /* PAR -- whole file */
+
+//@node Index, , Miscellaneous Functions, High Level Communications Routines
+//@subsection Index
+
+//@index
+//* ACK:: @cindex\s-+ACK
+//* DebugPrintGAGAMap:: @cindex\s-+DebugPrintGAGAMap
+//* FETCH:: @cindex\s-+FETCH
+//* FISH:: @cindex\s-+FISH
+//* FREE:: @cindex\s-+FREE
+//* RESUME:: @cindex\s-+RESUME
+//* SCHEDULE:: @cindex\s-+SCHEDULE
+//* blockFetch:: @cindex\s-+blockFetch
+//* choosePE:: @cindex\s-+choosePE
+//* freeMsgBuffer:: @cindex\s-+freeMsgBuffer
+//* freeMsgIndex:: @cindex\s-+freeMsgIndex
+//* freeRemoteGA:: @cindex\s-+freeRemoteGA
+//* gumPackBuffer:: @cindex\s-+gumPackBuffer
+//* initMoreBuffers:: @cindex\s-+initMoreBuffers
+//* prepareFreeMsgBuffers:: @cindex\s-+prepareFreeMsgBuffers
+//* processAck:: @cindex\s-+processAck
+//* processFetch:: @cindex\s-+processFetch
+//* processFetches:: @cindex\s-+processFetches
+//* processFish:: @cindex\s-+processFish
+//* processFree:: @cindex\s-+processFree
+//* processMessages:: @cindex\s-+processMessages
+//* processResume:: @cindex\s-+processResume
+//* processSchedule:: @cindex\s-+processSchedule
+//* sendAck:: @cindex\s-+sendAck
+//* sendFetch:: @cindex\s-+sendFetch
+//* sendFish:: @cindex\s-+sendFish
+//* sendFree:: @cindex\s-+sendFree
+//* sendFreeMessages:: @cindex\s-+sendFreeMessages
+//* sendResume:: @cindex\s-+sendResume
+//* sendSchedule:: @cindex\s-+sendSchedule
+//* unpackAck:: @cindex\s-+unpackAck
+//* unpackFetch:: @cindex\s-+unpackFetch
+//* unpackFish:: @cindex\s-+unpackFish
+//* unpackFree:: @cindex\s-+unpackFree
+//* unpackResume:: @cindex\s-+unpackResume
+//* unpackSchedule:: @cindex\s-+unpackSchedule
+//* waitForTermination:: @cindex\s-+waitForTermination
+//@end index
diff --git a/rts/parallel/LLC.h b/rts/parallel/LLC.h
new file mode 100644
index 0000000000..536e431bef
--- /dev/null
+++ b/rts/parallel/LLC.h
@@ -0,0 +1,130 @@
+/* --------------------------------------------------------------------------
+ Time-stamp: <Sun Mar 18 2001 21:23:50 Stardate: [-30]6349.45 hwloidl>
+
+ Low Level Communications Header (LLC.h)
+
+ Contains the definitions used by the Low-level Communications
+ module of the GUM Haskell runtime environment.
+ Based on the Graph for PVM implementation.
+
+ Phil Trinder, Glasgow University, 13th Dec 1994
+ Adapted for the 4.xx RTS
+ H-W. Loidl, Heriot-Watt, November 1999
+ ----------------------------------------------------------------------- */
+
+#ifndef __LLC_H
+#define __LLC_H
+
+#ifdef PAR
+
+//@node Low Level Communications Header, , ,
+//@section Low Level Communications Header
+
+//@menu
+//* Includes::
+//* Macros and Constants::
+//* PVM macros::
+//* Externs::
+//@end menu
+
+//@node Includes, Macros and Constants, Low Level Communications Header, Low Level Communications Header
+//@subsection Includes
+
+#include "Rts.h"
+#include "Parallel.h"
+
+#include "PEOpCodes.h"
+#include "pvm3.h"
+
+//@node Macros and Constants, PVM macros, Includes, Low Level Communications Header
+//@subsection Macros and Constants
+
+#define ANY_TASK (-1) /* receive messages from any task */
+#define ANY_GLOBAL_TASK ANY_TASK
+#define ANY_OPCODE (-1) /* receive any opcode */
+#define ALL_GROUP (-1) /* wait for barrier from every group member */
+
+#define PEGROUP "PE"
+
+#define MGRGROUP "MGR"
+#define SYSGROUP "SYS"
+
+
+#define PETASK "PE"
+
+//@node PVM macros, Externs, Macros and Constants, Low Level Communications Header
+//@subsection PVM macros
+
+#define sync(gp,op) do { \
+ broadcast(gp,op); \
+ pvm_barrier(gp,ALL_GROUP); \
+ } while(0)
+
+#define broadcast(gp,op) do { \
+ pvm_initsend(PvmDataDefault); \
+ pvm_bcast(gp,op); \
+ } while(0)
+
+#define checkComms(c,s) do { \
+ if ((c)<0) { \
+ pvm_perror(s); \
+ stg_exit(EXIT_FAILURE); \
+ }} while(0)
+
+#define _my_gtid pvm_mytid()
+#define GetPacket() pvm_recv(ANY_TASK,ANY_OPCODE)
+#define PacketsWaiting() (pvm_probe(ANY_TASK,ANY_OPCODE) != 0)
+
+#define SPARK_THREAD_DESCRIPTOR 1
+#define GLOBAL_THREAD_DESCRIPTOR 2
+
+#define _extract_jump_field(v) (v)
+
+#define MAX_DATA_WORDS_IN_PACKET 1024
+
+/* basic PVM packing */
+/* Each of these packs exactly one long into the current PVM send buffer;
+   the position parameter of PutArgN is ignored, so PutArg1/PutArg2/
+   PutArgN all expand identically (kept distinct for readability at the
+   call sites). */
+#define PutArg1(a) pvm_pklong((long *)&(a),1,1)
+#define PutArg2(a) pvm_pklong((long *)&(a),1,1)
+#define PutArgN(n,a) pvm_pklong((long *)&(a),1,1)
+#define PutArgs(b,n) pvm_pklong((long *)b,n,1)
+
+/* NOTE(review): the literal `?' is only accepted because PutArgN never
+   expands its first parameter; supply a real index if that ever changes. */
+#define PutLit(l) { int a = l; PutArgN(?,a); }
+
+/* basic PVM unpacking (mirror images of the Put* macros above) */
+#define GetArg1(a) pvm_upklong((long *)&(a),1,1)
+#define GetArg2(a) pvm_upklong((long *)&(a),1,1)
+#define GetArgN(n,a) pvm_upklong((long *)&(a),1,1)
+#define GetArgs(b,n) pvm_upklong((long *)b,n,1)
+
+//@node Externs, , PVM macros, Low Level Communications Header
+//@subsection Externs
+
+/* basic message passing routines */
+extern void sendOp (OpCode,GlobalTaskId),
+ sendOp1 (OpCode,GlobalTaskId,StgWord),
+ sendOp2 (OpCode,GlobalTaskId,StgWord,StgWord),
+ sendOpV (OpCode,GlobalTaskId,int,...),
+ sendOpN (OpCode,GlobalTaskId,int,StgPtr),
+ sendOpNV (OpCode,GlobalTaskId,int,StgPtr,int,...);
+
+extern void broadcastOpN(OpCode op, char *group, int n, StgPtr args);
+
+/* extracting data out of a packet */
+OpCode getOpcode (rtsPacket p);
+void getOpcodeAndSender (rtsPacket p, OpCode *popcode,
+ GlobalTaskId *psender_id);
+GlobalTaskId senderTask (rtsPacket p);
+rtsPacket waitForPEOp(OpCode op, GlobalTaskId who, void(*processUnexpected)(rtsPacket) );
+
+/* Init and shutdown routines */
+void startUpPE (void);
+void shutDownPE(void);
+int getExitCode(int nbytes, GlobalTaskId *sender_idp);
+
+/* aux functions */
+char *getOpName (unsigned op); // returns string of opcode
+void processUnexpectedMessage (rtsPacket);
+//void NullException(void);
+
+#endif /*PAR */
+#endif /*defined __LLC_H */
diff --git a/rts/parallel/LLComms.c b/rts/parallel/LLComms.c
new file mode 100644
index 0000000000..baa6dddf0c
--- /dev/null
+++ b/rts/parallel/LLComms.c
@@ -0,0 +1,489 @@
+/* ----------------------------------------------------------------------------
+ * Time-stamp: <Mon Mar 19 2001 22:10:38 Stardate: [-30]6354.62 hwloidl>
+ *
+ * GUM Low-Level Inter-Task Communication
+ *
+ * This module defines PVM Routines for PE-PE communication.
+ *
+ * P. Trinder, December 5th. 1994.
+ * P. Trinder, July 1998
+ * H-W. Loidl, November 1999 -
+ --------------------------------------------------------------------------- */
+
+#ifdef PAR /* whole file */
+
+//@node GUM Low-Level Inter-Task Communication, , ,
+//@section GUM Low-Level Inter-Task Communication
+
+/*
+ *This module defines the routines which communicate between PEs. The
+ *code is based on Kevin Hammond's GRIP RTS. (OpCodes.h defines
+ *PEOp1 etc. in terms of sendOp1 etc.).
+ *
+ *Routine & Arguments
+ * &
+ *sendOp & 0 \\
+ *sendOp1 & 1 \\
+ *sendOp2 & 2 \\
+ *sendOpN & vector \\
+ *sendOpV & variable \\
+ *sendOpNV & variable+ vector \\
+ *
+ *First the standard include files.
+ */
+
+//@menu
+//* Macros etc::
+//* Includes::
+//* Auxiliary functions::
+//* Index::
+//@end menu
+
+//@node Macros etc, Includes, GUM Low-Level Inter-Task Communication, GUM Low-Level Inter-Task Communication
+//@subsection Macros etc
+
+/* Evidently not Posix */
+/* #include "PosixSource.h" */
+
+#define UNUSED /* nothing */
+
+//@node Includes, Auxiliary functions, Macros etc, GUM Low-Level Inter-Task Communication
+//@subsection Includes
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Parallel.h"
+#include "ParallelRts.h"
+#if defined(DEBUG)
+# include "ParallelDebug.h"
+#endif
+#include "LLC.h"
+
+#ifdef __STDC__
+#include <stdarg.h>
+#else
+#include <varargs.h>
+#endif
+
+/* Cannot use std macro when compiling for SysMan */
+/* debugging enabled */
+// #define IF_PAR_DEBUG(c,s) { s; }
+/* debugging disabled */
+#define IF_PAR_DEBUG(c,s) /* nothing */
+
+//@node Auxiliary functions, Index, Includes, GUM Low-Level Inter-Task Communication
+//@subsection Auxiliary functions
+
+/*
+ * heapChkCounter tracks the number of heap checks since the last probe.
+ * Not currently used!  We check for messages when a thread is rescheduled.
+ */
+int heapChkCounter = 0;
+
+/*
+ * Then some miscellaneous functions.
+ * getOpName returns the character-string name of any OpCode.
+ */
+
+char *UserPEOpNames[] = { PEOP_NAMES };
+
+//@cindex getOpName
+/* Map a PE opcode to its printable name; out-of-range codes yield a
+   fixed fallback string. */
+char *
+getOpName(nat op)
+{
+  if (op < MIN_PEOPS || op > MAX_PEOPS)
+    return "Unknown PE OpCode";
+  return UserPEOpNames[op - MIN_PEOPS];
+}
+
+/*
+ * traceSendOp handles the tracing of messages.
+ */
+
+//@cindex traceSendOp
+/* Trace an outgoing message (opcode, up to two data words, sender,
+   destination).  Note: IF_PAR_DEBUG is #defined away in this file, so
+   this currently compiles to just the getOpName call. */
+static void
+traceSendOp(OpCode op, GlobalTaskId dest UNUSED,
+	    unsigned int data1 UNUSED, unsigned int data2 UNUSED)
+{
+  char *OpName;
+
+  OpName = getOpName(op);
+  IF_PAR_DEBUG(trace,
+	       fprintf(stderr," %s [%x,%x] sent from %x to %x",
+		       OpName, data1, data2, mytid, dest));
+}
+
+/*
+ * sendOp sends a 0-argument message with OpCode {\em op} to
+ * the global task {\em task}.
+ */
+
+//@cindex sendOp
+/* Send a 0-argument message with opcode op to global task `task`. */
+void
+sendOp(OpCode op, GlobalTaskId task)
+{
+  traceSendOp(op, task,0,0);
+
+  pvm_initsend(PvmDataRaw);
+  pvm_send(task, op);
+}
+
+/*
+ * sendOp1 sends a 1-argument message with OpCode {\em op}
+ * to the global task {\em task}.
+ */
+
+//@cindex sendOp1
+/* Send a 1-argument message with opcode op to global task `task`. */
+void
+sendOp1(OpCode op, GlobalTaskId task, StgWord arg1)
+{
+  traceSendOp(op, task, arg1,0);
+
+  pvm_initsend(PvmDataRaw);
+  PutArg1(arg1);
+  pvm_send(task, op);
+}
+
+
+/*
+ * sendOp2 is used by the FP code only.
+ */
+
+//@cindex sendOp2
+/* Send a 2-argument message with opcode op to global task `task`.
+   (Used by the FP code only.) */
+void
+sendOp2(OpCode op, GlobalTaskId task, StgWord arg1, StgWord arg2)
+{
+  traceSendOp(op, task, arg1, arg2);
+
+  pvm_initsend(PvmDataRaw);
+  PutArg1(arg1);
+  PutArg2(arg2);
+  pvm_send(task, op);
+}
+
+/*
+ *
+ * sendOpV takes a variable number of arguments, as specified by {\em n}.
+ * For example,
+ *
+ * sendOpV( PP_STATS, StatsTask, 3, start_time, stop_time, sparkcount);
+ */
+
+//@cindex sendOpV
+/* Send a message carrying n variadic StgWord arguments, e.g.
+     sendOpV(PP_STATS, StatsTask, 3, start_time, stop_time, sparkcount);
+   Every variadic argument must be an StgWord (va_arg reads them as such). */
+void
+sendOpV(OpCode op, GlobalTaskId task, int n, ...)
+{
+  va_list ap;
+  int i;
+  StgWord arg;
+
+  va_start(ap, n);
+
+  traceSendOp(op, task, 0, 0);
+
+  pvm_initsend(PvmDataRaw);
+
+  for (i = 0; i < n; ++i) {
+    arg = va_arg(ap, StgWord);
+    PutArgN(i, arg);
+  }
+  va_end(ap);
+
+  pvm_send(task, op);
+}
+
+/*
+ *
+ * sendOpNV takes a variable-size datablock, as specified by {\em
+ * nelem} and a variable number of arguments, as specified by {\em
+ * narg}. N.B. The datablock and the additional arguments are contiguous
+ * and are copied over together. For example,
+ *
+ * sendOpNV(PP_RESUME, tsoga.pe, 6, nelem, data,
+ * (W_) ga.weight, (W_) ga.loc.gc.gtid, (W_) ga.loc.gc.slot,
+ * (W_) tsoga.weight, (W_) tsoga.loc.gc.gtid, (W_) tsoga.loc.gc.slot);
+ *
+ * Important: The variable arguments must all be StgWords.
+
+ sendOpNV(_, tid, m, n, data, x1, ..., xm):
+
+ | n elems
+ +------------------------------
+ | x1 | ... | xm | n | data ....
+ +------------------------------
+ */
+
+//@cindex sendOpNV
+/*
+ * Send a message carrying narg variadic StgWord arguments followed by a
+ * word holding nelem and then the nelem-word datablock (layout described
+ * in the comment above).  Every variadic argument must be an StgWord.
+ *
+ * Fix: the per-argument debug fprintf used %d for an StgWord, which is
+ * undefined behaviour on LP64 platforms; it now uses %lu with an
+ * explicit cast.
+ */
+void
+sendOpNV(OpCode op, GlobalTaskId task, int nelem,
+	 StgWord *datablock, int narg, ...)
+{
+  va_list ap;
+  int i;
+  StgWord arg;
+
+  va_start(ap, narg);
+
+  traceSendOp(op, task, 0, 0);
+  IF_PAR_DEBUG(trace,
+	       fprintf(stderr,"~~ sendOpNV: op = %x (%s), task = %x, narg = %d, nelem = %d",
+		       op, getOpName(op), task, narg, nelem));
+
+  pvm_initsend(PvmDataRaw);
+
+  for (i = 0; i < narg; ++i) {
+    arg = va_arg(ap, StgWord);
+    IF_PAR_DEBUG(trace,
+		 fprintf(stderr,"~~ sendOpNV: arg = %lu\n",(unsigned long)arg));
+    PutArgN(i, arg);
+  }
+  arg = (StgWord) nelem;
+  PutArgN(narg, arg);  /* trailing word tells the receiver how much data follows */
+
+/*  for (i=0; i < nelem; ++i) fprintf(stderr, "%d ",datablock[i]); */
+/*  fprintf(stderr," in sendOpNV\n");*/
+
+  PutArgs(datablock, nelem);
+  va_end(ap);
+
+  pvm_send(task, op);
+}
+
+/*
+ * sendOpN take a variable size array argument, whose size is given by
+ * {\em n}. For example,
+ *
+ * sendOpN( PP_STATS, StatsTask, 3, stats_array);
+ */
+
+//@cindex sendOpN
+/* Send a message carrying an n-word array: first a word holding n, then
+   the n words of args, e.g.
+     sendOpN(PP_STATS, StatsTask, 3, stats_array); */
+void
+sendOpN(OpCode op, GlobalTaskId task, int n, StgPtr args)
+{
+  long arg;
+
+  traceSendOp(op, task, 0, 0);
+
+  pvm_initsend(PvmDataRaw);
+  arg = (long) n;
+  PutArgN(0, arg);      /* length prefix */
+  PutArgs(args, n);
+  pvm_send(task, op);
+}
+
+/*
+ * broadcastOpN is as sendOpN but broadcasts to all members of a group.
+ */
+
+/* As sendOpN, but broadcasts the length-prefixed array to every member
+   of the PVM group `group` instead of a single task. */
+void
+broadcastOpN(OpCode op, char *group, int n, StgPtr args)
+{
+  long arg;
+
+  //traceSendOp(op, task, 0, 0);
+
+  pvm_initsend(PvmDataRaw);
+  arg = (long) n;
+  PutArgN(0, arg);      /* length prefix */
+  PutArgs(args, n);
+  pvm_bcast(group, op);
+}
+
+/*
+ waitForPEOp waits for a packet from global task who with the
+ OpCode op. If ignore is true all other messages are simply ignored;
+ otherwise they are handled by processUnexpected.
+ */
+//@cindex waitForPEOp
+/*
+ * Block until a packet matching (op, who) arrives and return it.
+ * ANY_OPCODE / ANY_TASK act as wildcards.  Non-matching packets are
+ * passed to processUnexpected when supplied, otherwise silently
+ * dropped (their PVM buffers are not explicitly freed here).
+ */
+rtsPacket
+waitForPEOp(OpCode op, GlobalTaskId who, void(*processUnexpected)(rtsPacket) )
+{
+  rtsPacket p;
+  int nbytes;
+  OpCode opCode;
+  GlobalTaskId sender_id;
+  rtsBool match;
+
+  IF_PAR_DEBUG(verbose,
+	       fprintf(stderr,"~~ waitForPEOp: expecting op = %x (%s), who = [%x]\n",
+		       op, getOpName(op), who));
+
+  do {
+    /* pvm_recv returns a negative buffer id on error; retry after logging */
+    while((p = pvm_recv(ANY_TASK,ANY_OPCODE)) < 0)
+      pvm_perror("waitForPEOp: Waiting for PEOp");
+
+    pvm_bufinfo( p, &nbytes, &opCode, &sender_id );
+    match = (op == ANY_OPCODE || op == opCode) &&
+            (who == ANY_TASK || who == sender_id);
+
+    if (match) {
+      IF_PAR_DEBUG(verbose,
+		   fprintf(stderr,
+			   "~~waitForPEOp: Qapla! received: OpCode = %#x (%s), sender_id = [%x]",
+			   opCode, getOpName(opCode), sender_id));
+
+      return(p);
+    }
+
+    /* Handle the unexpected OpCodes */
+    if (processUnexpected!=NULL) {
+      (*processUnexpected)(p);
+    } else {
+      IF_PAR_DEBUG(verbose,
+		   fprintf(stderr,
+			   "~~ waitForPEOp: ignoring OpCode = %#x (%s), sender_id = [%x]",
+			   opCode, getOpName(opCode), sender_id));
+    }
+
+  } while(rtsTrue);
+}
+
+/*
+ processUnexpected processes unexpected messages. If the message is a
+ FINISH it exits the prgram, and PVM gracefully
+ */
+//@cindex processUnexpectedMessage
+void
+processUnexpectedMessage(rtsPacket packet) {
+ OpCode opCode = getOpcode(packet);
+
+ IF_PAR_DEBUG(verbose,
+ GlobalTaskId sender = senderTask(packet);
+ fprintf(stderr,"~~ [%x] processUnexpected: Received %x (%s), sender %x\n",
+ mytid, opCode, getOpName(opCode), sender));
+
+ switch (opCode) {
+ case PP_FINISH:
+ stg_exit(EXIT_SUCCESS);
+ break;
+
+ /* Anything we're not prepared to deal with. Note that ALL OpCodes
+ are discarded during termination -- this helps prevent bizarre
+ race conditions. */
+ default:
+ // if (!GlobalStopPending)
+ {
+ GlobalTaskId errorTask;
+ OpCode opCode;
+
+ getOpcodeAndSender(packet, &opCode, &errorTask);
+ fprintf(stderr,"== Task %x: Unexpected OpCode %x from %x in processUnexpected",
+ mytid, opCode, errorTask );
+
+ stg_exit(EXIT_FAILURE);
+ }
+ }
+}
+
+//@cindex getOpcode
+OpCode
+getOpcode(rtsPacket p)
+{
+ int nbytes;
+ OpCode OpCode;
+ GlobalTaskId sender_id;
+ /* read PVM buffer */
+ pvm_bufinfo(p, &nbytes, &OpCode, &sender_id);
+ /* return tag of the buffer as opcode */
+ return(OpCode);
+}
+
+//@cindex getOpcodeAndSender
+void
+getOpcodeAndSender(rtsPacket p, OpCode *opCodep, GlobalTaskId *senderIdp)
+{
+ int nbytes;
+ /* read PVM buffer */
+ pvm_bufinfo(p, &nbytes, opCodep, senderIdp);
+}
+
+//@cindex senderTask
+GlobalTaskId
+senderTask(rtsPacket p)
+{
+ int nbytes;
+ OpCode opCode;
+ GlobalTaskId sender_id;
+ /* read PVM buffer */
+ pvm_bufinfo(p, &nbytes, &opCode, &sender_id);
+ return(sender_id);
+}
+
+/*
+ * startUpPE does the low-level comms specific startup stuff for a
+ * PE. It initialises the comms system, joins the appropriate groups
+ * allocates the PE buffer
+ */
+
+//@cindex startUpPE
+void
+startUpPE(void)
+{
+ mytid = _my_gtid; /* Initialise PVM and get task id into global var.*/
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"== [%x] PEStartup: Task id = [%x], No. PEs = %d \n",
+ mytid, mytid, nPEs));
+ checkComms(pvm_joingroup(PEGROUP), "PEStartup");
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"== [%x] PEStartup: Joined PEGROUP\n", mytid));
+}
+
+/*
+ * PEShutdown does the low-level comms-specific shutdown stuff for a
+ * single PE. It leaves the groups and then exits from pvm.
+ */
+//@cindex shutDownPE
+void
+shutDownPE(void)
+{
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr, "== [%x] PEshutdown\n", mytid));
+
+ checkComms(pvm_lvgroup(PEGROUP),"PEShutDown");
+ checkComms(pvm_exit(),"PEShutDown");
+}
+
+/*
+ Extract the exit code out of a PP_FINISH packet (used in SysMan)
+*/
+int
+getExitCode(int nbytes, GlobalTaskId *sender_idp) {
+ int exitCode=0;
+
+ if (nbytes==4) { // Notification from a task doing pvm_exit
+ GetArgs(sender_idp,1); // Presumably this must be MainPE Id
+ exitCode = -1;
+ } else if (nbytes==8) { // Doing a controlled shutdown
+ GetArgs(&exitCode,1); // HACK: controlled shutdown == 2 values
+ GetArgs(&exitCode,1);
+ } else {
+ exitCode = -2; // everything else
+ }
+ return exitCode;
+}
+
+#endif /* PAR -- whole file */
+
+//@node Index, , Auxiliary functions, GUM Low-Level Inter-Task Communication
+//@subsection Index
+
+//@index
+//* getOpName:: @cindex\s-+getOpName
+//* traceSendOp:: @cindex\s-+traceSendOp
+//* sendOp:: @cindex\s-+sendOp
+//* sendOp1:: @cindex\s-+sendOp1
+//* sendOp2:: @cindex\s-+sendOp2
+//* sendOpV:: @cindex\s-+sendOpV
+//* sendOpNV:: @cindex\s-+sendOpNV
+//* sendOpN:: @cindex\s-+sendOpN
+//* waitForPEOp:: @cindex\s-+waitForPEOp
+//* processUnexpectedMessage:: @cindex\s-+processUnexpectedMessage
+//* getOpcode:: @cindex\s-+getOpcode
+//* getOpcodeAndSender:: @cindex\s-+getOpcodeAndSender
+//* senderTask:: @cindex\s-+senderTask
+//* startUpPE:: @cindex\s-+startUpPE
+//* shutDownPE:: @cindex\s-+shutDownPE
+//@end index
diff --git a/rts/parallel/PEOpCodes.h b/rts/parallel/PEOpCodes.h
new file mode 100644
index 0000000000..2d18b439f2
--- /dev/null
+++ b/rts/parallel/PEOpCodes.h
@@ -0,0 +1,58 @@
+#ifndef PEOPCODES_H
+#define PEOPCODES_H
+
+/************************************************************************
+* PEOpCodes.h *
+* *
+* This file contains definitions for all the GUM PE Opcodes *
+* It's based on the GRAPH for PVM version *
+* Phil Trinder, Glasgow University 8th December 1994 *
+* *
+ RFPointon, December 1999
+ - removed PP_SYSMAN_TID, introduced PP_READY
+ - removed PP_MAIN_TASK, introduced PP_NEWPE
+ - added PP_REVAL
+************************************************************************/
+
+#define REPLY_OK 0x00
+
+/*Startup + Shutdown*/
+#define PP_READY 0x50 /* sent PEs -> SysMan */
+#define PP_NEWPE 0x51 /* sent via newHost notify -> SysMan */
+#define PP_FINISH 0x52 /* sent PEs & via taskExit notfiy -> SysMan */
+#define PP_PETIDS 0x53 /* sent sysman -> PEs */
+
+/* Stats stuff */
+#define PP_STATS 0x54
+#define PP_STATS_ON 0x55
+#define PP_STATS_OFF 0x56
+
+//#define PP_FAIL 0x57
+
+/*Garbage Collection*/
+#define PP_GC_INIT 0x58
+#define PP_FULL_SYSTEM 0x59
+#define PP_GC_POLL 0x5a
+
+/*GUM Messages*/
+#define PP_FETCH 0x5b
+#define PP_RESUME 0x5c
+#define PP_ACK 0x5d
+#define PP_FISH 0x5e
+#define PP_SCHEDULE 0x5f
+#define PP_FREE 0x60
+#define PP_REVAL 0x61
+
+
+#define MIN_PEOPS 0x50
+#define MAX_PEOPS 0x61
+
+#define PEOP_NAMES "Ready", "NewPE", \
+ "Finish", "PETIDS", \
+ "Stats", "Stats_On", "Stats_Off", \
+ "Fail", \
+ "GCInit", "FullSystem", "GCPoll", \
+ "Fetch","Resume","ACK","Fish","Schedule", \
+ "Free","REval"
+
+#endif /* PEOPCODES_H */
diff --git a/rts/parallel/Pack.c b/rts/parallel/Pack.c
new file mode 100644
index 0000000000..e8653f6303
--- /dev/null
+++ b/rts/parallel/Pack.c
@@ -0,0 +1,4293 @@
+/*
+ Time-stamp: <Wed Mar 21 2001 16:32:47 Stardate: [-30]6363.44 hwloidl>
+
+ Graph packing and unpacking code for sending it to another processor
+ and retrieving the original graph structure from the packet.
+ In the old RTS the code was split into Pack.c and Unpack.c (now deceased)
+ Used in GUM and GrAnSim.
+
+ The GrAnSim version of the code defines routines for *simulating* the
+ packing of closures in the same way it is done in the parallel runtime
+ system. Basically GrAnSim only puts the addresses of the closures to be
+ transferred into a buffer. This buffer will then be associated with the
+ event of transferring the graph. When this event is scheduled, the
+ @UnpackGraph@ routine is called and the buffer can be discarded
+ afterwards.
+
+ Note that in GranSim we need many buffers, not just one per PE.
+*/
+
+//@node Graph packing, , ,
+//@section Graph packing
+
+#if defined(PAR) || defined(GRAN) /* whole file */
+
+//@menu
+//* Includes::
+//* Prototypes::
+//* Global variables::
+//* ADT of Closure Queues::
+//* Initialisation for packing::
+//* Packing Functions::
+//* Low level packing routines::
+//* Unpacking routines::
+//* Aux fcts for packing::
+//* Printing Packet Contents::
+//* End of file::
+//@end menu
+//*/
+
+//@node Includes, Prototypes, Graph packing, Graph packing
+//@subsection Includes
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "ClosureTypes.h"
+#include "Storage.h"
+#include "Hash.h"
+#include "Parallel.h"
+#include "GranSimRts.h"
+#include "ParallelRts.h"
+# if defined(DEBUG)
+# include "Sanity.h"
+# include "Printer.h"
+# include "ParallelDebug.h"
+# endif
+#include "FetchMe.h"
+
+/* Which RTS flag should be used to get the size of the pack buffer ? */
+# if defined(PAR)
+# define RTS_PACK_BUFFER_SIZE RtsFlags.ParFlags.packBufferSize
+# else /* GRAN */
+# define RTS_PACK_BUFFER_SIZE RtsFlags.GranFlags.packBufferSize
+# endif
+
+//@node Prototypes, Global variables, Includes, Graph packing
+//@subsection Prototypes
+/*
+ Code declarations.
+*/
+
+//@node ADT of closure queues, Init for packing, Prototypes, Prototypes
+//@subsubsection ADT of closure queues
+
+static inline void InitClosureQueue(void);
+static inline rtsBool QueueEmpty(void);
+static inline void QueueClosure(StgClosure *closure);
+static inline StgClosure *DeQueueClosure(void);
+
+//@node Init for packing, Packing routines, ADT of closure queues, Prototypes
+//@subsubsection Init for packing
+
+static void InitPacking(rtsBool unpack);
+# if defined(PAR)
+rtsBool InitPackBuffer(void);
+# elif defined(GRAN)
+rtsPackBuffer *InstantiatePackBuffer (void);
+static void reallocPackBuffer (void);
+# endif
+
+//@node Packing routines, Low level packing fcts, Init for packing, Prototypes
+//@subsubsection Packing routines
+
+static void PackClosure (StgClosure *closure);
+
+//@node Low level packing fcts, Unpacking routines, Packing routines, Prototypes
+//@subsubsection Low level packing fcts
+
+# if defined(GRAN)
+static void Pack (StgClosure *data);
+# else
+static void Pack (StgWord data);
+
+static void PackGeneric(StgClosure *closure);
+static void PackArray(StgClosure *closure);
+static void PackPLC (StgPtr addr);
+static void PackOffset (int offset);
+static void PackPAP(StgPAP *pap);
+static rtsPackBuffer *PackTSO(StgTSO *tso, nat *packBufferSize);
+static rtsPackBuffer *PackStkO(StgPtr stko, nat *packBufferSize);
+static void PackFetchMe(StgClosure *closure);
+
+static void GlobaliseAndPackGA (StgClosure *closure);
+# endif
+
+//@node Unpacking routines, Aux fcts for packing, Low level packing fcts, Prototypes
+//@subsubsection Unpacking routines
+
+# if defined(PAR)
+void InitPendingGABuffer(nat size);
+void CommonUp(StgClosure *src, StgClosure *dst);
+static StgClosure *SetGAandCommonUp(globalAddr *gaP, StgClosure *closure,
+ rtsBool hasGA);
+static nat FillInClosure(StgWord ***bufptrP, StgClosure *graph);
+static void LocateNextParent(StgClosure **parentP,
+ nat *pptrP, nat *pptrsP, nat *sizeP);
+StgClosure *UnpackGraph(rtsPackBuffer *packBuffer,
+ globalAddr **gamap,
+ nat *nGAs);
+static StgClosure *UnpackClosure (StgWord ***bufptrP, StgClosure **graphP,
+ globalAddr *ga);
+static StgWord **UnpackGA(StgWord **bufptr, globalAddr *ga);
+static StgClosure *UnpackOffset(globalAddr *ga);
+static StgClosure *UnpackPLC(globalAddr *ga);
+static void UnpackArray(StgWord ***bufptrP, StgClosure *graph);
+static nat UnpackPAP(StgWord ***bufptrP, StgClosure *graph);
+
+# elif defined(GRAN)
+void CommonUp(StgClosure *src, StgClosure *dst);
+StgClosure *UnpackGraph(rtsPackBuffer* buffer);
+#endif
+
+//@node Aux fcts for packing, , Unpacking routines, Prototypes
+//@subsubsection Aux fcts for packing
+
+# if defined(PAR)
+static void DonePacking(void);
+static void AmPacking(StgClosure *closure);
+static int OffsetFor(StgClosure *closure);
+static rtsBool NotYetPacking(int offset);
+static inline rtsBool RoomToPack (nat size, nat ptrs);
+static inline rtsBool isOffset(globalAddr *ga);
+static inline rtsBool isFixed(globalAddr *ga);
+static inline rtsBool isConstr(globalAddr *ga);
+static inline rtsBool isUnglobalised(globalAddr *ga);
+# elif defined(GRAN)
+static void DonePacking(void);
+static rtsBool NotYetPacking(StgClosure *closure);
+# endif
+
+//@node Global variables, ADT of Closure Queues, Prototypes, Graph packing
+//@subsection Global variables
+/*
+ Static data declarations
+*/
+
+static nat pack_locn, /* ptr to first free loc in pack buffer */
+ clq_size, clq_pos,
+ buf_id = 1; /* identifier for buffer */
+static nat unpacked_size;
+static rtsBool roomInBuffer;
+#if defined(PAR)
+static GlobalTaskId dest_gtid=0; /* destination for message to send */
+#endif
+
+/*
+ The pack buffer
+ To be pedantic: in GrAnSim we're packing *addresses* of closures,
+ not the closures themselves.
+*/
+static rtsPackBuffer *globalPackBuffer = NULL, /* for packing a graph */
+ *globalUnpackBuffer = NULL; /* for unpacking a graph */
+
+
+/*
+ Bit of a hack for testing if a closure is the root of the graph. This is
+ set in @PackNearbyGraph@ and tested in @PackClosure@.
+*/
+
+static nat packed_thunks = 0;
+static StgClosure *graph_root;
+
+# if defined(PAR)
+/*
+ The offset hash table is used during packing to record the location in
+ the pack buffer of each closure which is packed.
+*/
+//@cindex offsetTable
+static HashTable *offsetTable;
+
+//@cindex PendingGABuffer
+static globalAddr *PendingGABuffer, *gaga;
+
+# endif /* PAR */
+
+
+//@node ADT of Closure Queues, Initialisation for packing, Global variables, Graph packing
+//@subsection ADT of Closure Queues
+
+//@menu
+//* Closure Queues::
+//* Init routines::
+//* Basic routines::
+//@end menu
+
+//@node Closure Queues, Init routines, ADT of Closure Queues, ADT of Closure Queues
+//@subsubsection Closure Queues
+/*
+ Closure Queues
+
+ These routines manage the closure queue.
+*/
+
+static nat clq_pos, clq_size;
+
+static StgClosure **ClosureQueue = NULL; /* HWL: init in main */
+
+#if defined(DEBUG)
+static char graphFingerPrint[MAX_FINGER_PRINT_LEN];
+#endif
+
+//@node Init routines, Basic routines, Closure Queues, ADT of Closure Queues
+//@subsubsection Init routines
+
+/* @InitClosureQueue@ allocates and initialises the closure queue. */
+
+//@cindex InitClosureQueue
+static inline void
+InitClosureQueue(void)
+{
+ clq_pos = clq_size = 0;
+
+ if (ClosureQueue==NULL)
+ ClosureQueue = (StgClosure**) stgMallocWords(RTS_PACK_BUFFER_SIZE,
+ "InitClosureQueue");
+}
+
+//@node Basic routines, Types of Global Addresses, Init routines, ADT of Closure Queues
+//@subsubsection Basic routines
+
+/*
+ QueueEmpty returns rtsTrue if the closure queue is empty; rtsFalse otherwise.
+*/
+
+//@cindex QueueEmpty
+static inline rtsBool
+QueueEmpty(void)
+{
+ return(clq_pos >= clq_size);
+}
+
+/* QueueClosure adds its argument to the closure queue. */
+
+//@cindex QueueClosure
+static inline void
+QueueClosure(closure)
+StgClosure *closure;
+{
+ if(clq_size < RTS_PACK_BUFFER_SIZE ) {
+ IF_PAR_DEBUG(paranoia,
+ belch(">__> <<%d>> Q: %p (%s); %d elems in q",
+ globalPackBuffer->id, closure, info_type(closure), clq_size-clq_pos));
+ ClosureQueue[clq_size++] = closure;
+ } else {
+ barf("Closure Queue Overflow (EnQueueing %p (%s))",
+ closure, info_type(closure));
+ }
+}
+
+/* DeQueueClosure returns the head of the closure queue. */
+
+//@cindex DeQueueClosure
+static inline StgClosure *
+DeQueueClosure(void)
+{
+ if(!QueueEmpty()) {
+ IF_PAR_DEBUG(paranoia,
+ belch(">__> <<%d>> DeQ: %p (%s); %d elems in q",
+ globalPackBuffer->id, ClosureQueue[clq_pos], info_type(ClosureQueue[clq_pos]),
+ clq_size-clq_pos));
+ return(ClosureQueue[clq_pos++]);
+ } else {
+ return((StgClosure*)NULL);
+ }
+}
+
+/* DeQueueClosure returns the head of the closure queue. */
+
+#if defined(DEBUG)
+//@cindex PrintQueueClosure
+static void
+PrintQueueClosure(void)
+{
+ nat i;
+
+ fputs("Closure queue:", stderr);
+ for (i=clq_pos; i < clq_size; i++)
+ fprintf(stderr, "%p (%s), ",
+ (StgClosure *)ClosureQueue[clq_pos++],
+ info_type(ClosureQueue[clq_pos++]));
+ fputc('\n', stderr);
+}
+#endif
+
+//@node Types of Global Addresses, , Basic routines, ADT of Closure Queues
+//@subsubsection Types of Global Addresses
+
+/*
+ Types of Global Addresses
+
+ These routines determine whether a GA is one of a number of special types
+ of GA.
+*/
+
+# if defined(PAR)
+//@cindex isOffset
+static inline rtsBool
+isOffset(globalAddr *ga)
+{
+ return (ga->weight == 1U && ga->payload.gc.gtid == (GlobalTaskId)0);
+}
+
+//@cindex isFixed
+static inline rtsBool
+isFixed(globalAddr *ga)
+{
+ return (ga->weight == 0U);
+}
+
+//@cindex isConstr
+static inline rtsBool
+isConstr(globalAddr *ga)
+{
+ return (ga->weight == 2U);
+}
+
+//@cindex isUnglobalised
+static inline rtsBool
+isUnglobalised(globalAddr *ga)
+{
+ return (ga->weight == 2U);
+}
+# endif
+
+//@node Initialisation for packing, Packing Functions, ADT of Closure Queues, Graph packing
+//@subsection Initialisation for packing
+/*
+ Simple Packing Routines
+
+ About packet sizes in GrAnSim: In GrAnSim we use a malloced block of
+ gransim_pack_buffer_size words to simulate a packet of pack_buffer_size
+ words. In the simulated PackBuffer we only keep the addresses of the
+ closures that would be packed in the parallel system (see Pack). To
+ decide if a packet overflow occurs pack_buffer_size must be compared
+ versus unpacked_size (see RoomToPack). Currently, there is no multi
+ packet strategy implemented, so in the case of an overflow we just stop
+ adding closures to the closure queue. If an overflow of the simulated
+ packet occurs, we just realloc some more space for it and carry on as
+ usual. -- HWL
+*/
+
+# if defined(GRAN)
+rtsPackBuffer *
+InstantiatePackBuffer (void) {
+ extern rtsPackBuffer *globalPackBuffer;
+
+ globalPackBuffer = (rtsPackBuffer *) stgMallocWords(sizeofW(rtsPackBuffer),
+ "InstantiatePackBuffer: failed to alloc packBuffer");
+ globalPackBuffer->size = RtsFlags.GranFlags.packBufferSize_internal;
+ globalPackBuffer->buffer = (StgWord **) stgMallocWords(RtsFlags.GranFlags.packBufferSize_internal,
+ "InstantiatePackBuffer: failed to alloc GranSim internal packBuffer");
+ /* NB: gransim_pack_buffer_size instead of pack_buffer_size -- HWL */
+ /* stgMallocWords is now simple allocate in Storage.c */
+
+ return (globalPackBuffer);
+}
+
+/*
+ Reallocate the GranSim internal pack buffer to make room for more closure
+ pointers. This is independent of the check for packet overflow as in GUM
+*/
+static void
+reallocPackBuffer (void) {
+
+ ASSERT(pack_locn >= (int)globalPackBuffer->size+sizeofW(rtsPackBuffer));
+
+ IF_GRAN_DEBUG(packBuffer,
+ belch("** Increasing size of PackBuffer %p to %d words (PE %u @ %d)\n",
+ globalPackBuffer, globalPackBuffer->size+REALLOC_SZ,
+ CurrentProc, CurrentTime[CurrentProc]));
+
+ globalPackBuffer = (rtsPackBuffer*)realloc(globalPackBuffer,
+ sizeof(StgClosure*)*(REALLOC_SZ +
+ (int)globalPackBuffer->size +
+ sizeofW(rtsPackBuffer))) ;
+ if (globalPackBuffer==(rtsPackBuffer*)NULL)
+ barf("Failing to realloc %d more words for PackBuffer %p (PE %u @ %d)\n",
+ REALLOC_SZ, globalPackBuffer, CurrentProc, CurrentTime[CurrentProc]);
+
+ globalPackBuffer->size += REALLOC_SZ;
+
+ ASSERT(pack_locn < globalPackBuffer->size+sizeofW(rtsPackBuffer));
+}
+# endif
+
+# if defined(PAR)
+/* @initPacking@ initialises the packing buffer etc. */
+//@cindex InitPackBuffer
+rtsBool
+InitPackBuffer(void)
+{
+ if (globalPackBuffer==(rtsPackBuffer*)NULL) {
+ if ((globalPackBuffer = (rtsPackBuffer *)
+ stgMallocWords(sizeofW(rtsPackBuffer)+RtsFlags.ParFlags.packBufferSize+DEBUG_HEADROOM,
+ "InitPackBuffer")) == NULL)
+ return rtsFalse;
+ }
+ return rtsTrue;
+}
+
+# endif
+//@cindex InitPacking
+static void
+InitPacking(rtsBool unpack)
+{
+# if defined(GRAN)
+ globalPackBuffer = InstantiatePackBuffer(); /* for GrAnSim only -- HWL */
+ /* NB: free in UnpackGraph */
+# elif defined(PAR)
+ if (unpack) {
+ /* allocate a GA-to-GA map (needed for ACK message) */
+ InitPendingGABuffer(RtsFlags.ParFlags.packBufferSize);
+ } else {
+ /* allocate memory to pack the graph into */
+ InitPackBuffer();
+ }
+# endif
+ /* init queue of closures seen during packing */
+ InitClosureQueue();
+
+ if (unpack)
+ return;
+
+ globalPackBuffer->id = buf_id++; /* buffer id are only used for debugging! */
+ pack_locn = 0; /* the index into the actual pack buffer */
+ unpacked_size = 0; /* the size of the whole graph when unpacked */
+ roomInBuffer = rtsTrue;
+ packed_thunks = 0; /* total number of thunks packed so far */
+# if defined(PAR)
+ offsetTable = allocHashTable();
+# endif
+}
+
+//@node Packing Functions, Low level packing routines, Initialisation for packing, Graph packing
+//@subsection Packing Functions
+
+//@menu
+//* Packing Sections of Nearby Graph::
+//* Packing Closures::
+//@end menu
+
+//@node Packing Sections of Nearby Graph, Packing Closures, Packing Functions, Packing Functions
+//@subsubsection Packing Sections of Nearby Graph
+/*
+ Packing Sections of Nearby Graph
+
+ @PackNearbyGraph@ packs a closure and associated graph into a static
+ buffer (@PackBuffer@). It returns the address of this buffer and the
+ size of the data packed into the buffer (in its second parameter,
+ @packBufferSize@). The associated graph is packed in a depth first
+ manner, hence it uses an explicit queue of closures to be packed rather
+ than simply using a recursive algorithm. Once the packet is full,
+ closures (other than primitive arrays) are packed as FetchMes, and their
+ children are not queued for packing. */
+
+//@cindex PackNearbyGraph
+
+/* NB: this code is shared between GranSim and GUM;
+ tso only used in GranSim */
+rtsPackBuffer *
+PackNearbyGraph(closure, tso, packBufferSize, dest)
+StgClosure* closure;
+StgTSO* tso;
+nat *packBufferSize;
+GlobalTaskId dest;
+{
+ IF_PAR_DEBUG(resume,
+ graphFingerPrint[0] = '\0');
+
+ ASSERT(RTS_PACK_BUFFER_SIZE > 0);
+ ASSERT(_HS==1); // HWL HACK; compile time constant
+
+#if defined(PAR_TICKY) // HWL HAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACK
+ PAR_TICKY_PACK_NEARBY_GRAPH_START();
+#endif
+
+ /* ToDo: check that we have enough heap for the packet
+ ngoq ngo'
+ if (Hp + PACK_HEAP_REQUIRED > HpLim)
+ return NULL;
+ */
+ InitPacking(rtsFalse);
+# if defined(PAR)
+ dest_gtid=dest; //-1 to disable
+# elif defined(GRAN)
+ graph_root = closure;
+# endif
+
+ IF_GRAN_DEBUG(pack,
+ belch(">>> Packing <<%d>> (buffer @ %p); graph root @ %p [PE %d]\n demanded by TSO %d (%p) [PE %u]",
+ globalPackBuffer->id, globalPackBuffer, closure, where_is(closure),
+ tso->id, tso, where_is((StgClosure*)tso)));
+
+ IF_GRAN_DEBUG(pack,
+ belch("** PrintGraph of %p is:", closure);
+ PrintGraph(closure,0));
+
+ IF_PAR_DEBUG(resume,
+ GraphFingerPrint(closure, graphFingerPrint);
+ ASSERT(strlen(graphFingerPrint)<=MAX_FINGER_PRINT_LEN);
+ belch(">>> Packing <<%d>> (buffer @ %p); graph root @ %p [%x]\n demanded by TSO %d (%p); Finger-print is\n {%s}",
+ globalPackBuffer->id, globalPackBuffer, closure, mytid,
+ tso->id, tso, graphFingerPrint));
+
+ IF_PAR_DEBUG(packet,
+ belch("** PrintGraph of %p is:", closure);
+ belch("** pack_locn=%d", pack_locn);
+ PrintGraph(closure,0));
+
+ QueueClosure(closure);
+ do {
+ PackClosure(DeQueueClosure());
+ } while (!QueueEmpty());
+
+# if defined(PAR)
+
+ /* Record how much space the graph needs in packet and in heap */
+ globalPackBuffer->tso = tso; // currently unused, I think (debugging?)
+ globalPackBuffer->unpacked_size = unpacked_size;
+ globalPackBuffer->size = pack_locn;
+
+ /* Check for buffer overflow (again) */
+ ASSERT(pack_locn <= RtsFlags.ParFlags.packBufferSize+DEBUG_HEADROOM);
+ IF_DEBUG(sanity, // write magic end-of-buffer word
+ globalPackBuffer->buffer[pack_locn] = END_OF_BUFFER_MARKER);
+ *packBufferSize = pack_locn;
+
+# else /* GRAN */
+
+ /* Record how much space is needed to unpack the graph */
+ // PackBuffer[PACK_FLAG_LOCN] = (P_) MAGIC_PACK_FLAG; for testing
+ globalPackBuffer->tso = tso;
+ globalPackBuffer->unpacked_size = unpacked_size;
+
+ // ASSERT(pack_locn <= PackBuffer[PACK_SIZE_LOCN]+PACK_HDR_SIZE);
+ /* ToDo: Print an earlier, more meaningful message */
+ if (pack_locn==0) /* i.e. packet is empty */
+ barf("EMPTY PACKET! Can't transfer closure %p at all!!\n",
+ closure);
+ globalPackBuffer->size = pack_locn;
+ *packBufferSize = pack_locn;
+
+# endif
+
+ DonePacking(); /* {GrAnSim}vaD 'ut'Ha' */
+
+# if defined(GRAN)
+ IF_GRAN_DEBUG(pack ,
+ belch("** Finished <<%d>> packing graph %p; closures packed: %d; thunks packed: %d; size of graph: %d",
+ globalPackBuffer->id, closure, globalPackBuffer->size, packed_thunks, globalPackBuffer->unpacked_size));
+ if (RtsFlags.GranFlags.GranSimStats.Global) {
+ globalGranStats.tot_packets++;
+ globalGranStats.tot_packet_size += pack_locn;
+ }
+
+ IF_GRAN_DEBUG(pack, PrintPacket(globalPackBuffer));
+# elif defined(PAR)
+ IF_PAR_DEBUG(packet,
+ belch("** Finished <<%d>> packing graph %p (%s); closures packed: %d; thunks packed: %d; size of graph: %d",
+ globalPackBuffer->id, closure, info_type(closure),
+ globalPackBuffer->size, packed_thunks,
+ globalPackBuffer->unpacked_size));;
+
+ IF_DEBUG(sanity, // do a sanity check on the packet just constructed
+ checkPacket(globalPackBuffer));
+# endif /* GRAN */
+
+#if defined(PAR_TICKY) // HWL HAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACK
+ PAR_TICKY_PACK_NEARBY_GRAPH_END(globalPackBuffer->size, packed_thunks);
+#endif
+
+ return (globalPackBuffer);
+}
+
+//@cindex PackOneNode
+
+# if defined(GRAN)
+/* This version is used when the node is already local */
+
+rtsPackBuffer *
+PackOneNode(closure, tso, packBufferSize)
+StgClosure* closure;
+StgTSO* tso;
+nat *packBufferSize;
+{
+ extern rtsPackBuffer *globalPackBuffer;
+ int i, clpack_locn;
+
+ InitPacking(rtsFalse);
+
+ IF_GRAN_DEBUG(pack,
+ belch("** PackOneNode: %p (%s)[PE %d] requested by TSO %d (%p) [PE %d]",
+ closure, info_type(closure),
+ where_is(closure), tso->id, tso, where_is((StgClosure *)tso)));
+
+ Pack(closure);
+
+ /* Record how much space is needed to unpack the graph */
+ globalPackBuffer->tso = tso;
+ globalPackBuffer->unpacked_size = unpacked_size;
+
+ /* Set the size parameter */
+ ASSERT(pack_locn <= RTS_PACK_BUFFER_SIZE);
+ globalPackBuffer->size = pack_locn;
+ *packBufferSize = pack_locn;
+
+ if (RtsFlags.GranFlags.GranSimStats.Global) {
+ globalGranStats.tot_packets++;
+ globalGranStats.tot_packet_size += pack_locn;
+ }
+ IF_GRAN_DEBUG(pack,
+ PrintPacket(globalPackBuffer));
+
+ return (globalPackBuffer);
+}
+# endif /* GRAN */
+
+#if defined(GRAN)
+
+/*
+ PackTSO and PackStkO are entry points for two special kinds of closure
+ which are used in the parallel RTS. Compared with other closures they
+ are rather awkward to pack because they don't follow the normal closure
+ layout (where all pointers occur before all non-pointers). Luckily,
+ they're only needed when migrating threads between processors. */
+
+//@cindex PackTSO
+rtsPackBuffer*
+PackTSO(tso, packBufferSize)
+StgTSO *tso;
+nat *packBufferSize;
+{
+ extern rtsPackBuffer *globalPackBuffer;
+ IF_GRAN_DEBUG(pack,
+ belch("** Packing TSO %d (%p)", tso->id, tso));
+ *packBufferSize = 0;
+ // PackBuffer[0] = PackBuffer[1] = 0; ???
+ return(globalPackBuffer);
+}
+
+//@cindex PackStkO
+static rtsPackBuffer*
+PackStkO(stko, packBufferSize)
+StgPtr stko;
+nat *packBufferSize;
+{
+ extern rtsPackBuffer *globalPackBuffer;
+ IF_GRAN_DEBUG(pack,
+ belch("** Packing STKO %p", stko));
+ *packBufferSize = 0;
+ // PackBuffer[0] = PackBuffer[1] = 0;
+ return(globalPackBuffer);
+}
+
+static void
+PackFetchMe(StgClosure *closure)
+{
+ barf("{PackFetchMe}Daq Qagh: no FetchMe closures in GRAN!");
+}
+
+#elif defined(PAR)
+
+static rtsPackBuffer*
+PackTSO(tso, packBufferSize)
+StgTSO *tso;
+nat *packBufferSize;
+{
+ barf("{PackTSO}Daq Qagh: trying to pack a TSO %d (%p) of size %d; thread migrations not supported, yet",
+ tso->id, tso, packBufferSize);
+}
+
+rtsPackBuffer*
+PackStkO(stko, packBufferSize)
+StgPtr stko;
+nat *packBufferSize;
+{
+ barf("{PackStkO}Daq Qagh: trying to pack a STKO (%p) of size %d; thread migrations not supported, yet",
+ stko, packBufferSize);
+}
+
+//@cindex PackFetchMe
+static void
+PackFetchMe(StgClosure *closure)
+{
+ StgInfoTable *ip;
+ nat i;
+ int offset;
+#if defined(DEBUG)
+ nat x = pack_locn;
+#endif
+
+#if defined(GRAN)
+ barf("{PackFetchMe}Daq Qagh: no FetchMe closures in GRAN!");
+#else
+ offset = OffsetFor(closure);
+ if (!NotYetPacking(offset)) {
+ IF_PAR_DEBUG(pack,
+ belch("*>.. Packing FETCH_ME for closure %p (s) as offset to %d",
+ closure, info_type(closure), offset));
+ PackOffset(offset);
+ // unpacked_size += 0; // unpacked_size unchanged (closure is shared!!)
+ return;
+ }
+
+ /* Need a GA even when packing a constructed FETCH_ME (cruel world!) */
+ AmPacking(closure);
+ /* FMs must be always globalised */
+ GlobaliseAndPackGA(closure);
+
+ IF_PAR_DEBUG(pack,
+ belch("*>.. Packing FETCH_ME for closure %p (%s) with GA: ((%x, %d, %x))",
+ closure, info_type(closure),
+ globalPackBuffer->buffer[pack_locn-2],
+ globalPackBuffer->buffer[pack_locn-1],
+ globalPackBuffer->buffer[pack_locn-3]));
+
+ /* Pack a FetchMe closure instead of closure */
+ ip = &stg_FETCH_ME_info;
+ /* this assumes that the info ptr is always the first word in a closure*/
+ Pack((StgWord)ip);
+ for (i = 1; i < _HS; ++i) // pack rest of fixed header
+ Pack((StgWord)*(((StgPtr)closure)+i));
+
+ unpacked_size += sizeofW(StgFetchMe);
+ /* size of FETCHME in packed is the same as that constant */
+ ASSERT(pack_locn-x==PACK_FETCHME_SIZE);
+ /* In the pack buffer the pointer to a GA (in the FetchMe closure)
+ is expanded to the full GA; this is a compile-time const */
+ //ASSERT(PACK_FETCHME_SIZE == sizeofW(StgFetchMe)-1+PACK_GA_SIZE);
+#endif
+}
+
+#endif
+
+#ifdef DIST
+static void
+PackRemoteRef(StgClosure *closure)
+{
+ StgInfoTable *ip;
+ nat i;
+ int offset;
+
+ offset = OffsetFor(closure);
+ if (!NotYetPacking(offset)) {
+ PackOffset(offset);
+ unpacked_size += 2;
+ return;
+ }
+
+ /* Need a GA even when packing a constructed REMOTE_REF (cruel world!) */
+ AmPacking(closure);
+
+ /* basically we just Globalise, but for sticky things we can't have multiple GAs,
+ so we must prevent the GAs being split.
+
+ In returning things to the true sticky owner, this case is already handled, but for
+ anything else we just give up at the moment... This needs to be fixed!
+ */
+ { globalAddr *ga;
+ ga = LAGAlookup(closure); // surely this ga must exist?
+
+ // ***************************************************************************
+ // ***************************************************************************
+ // REMOTE_REF HACK - dual is in SetGAandCommonUp
+ // - prevents the weight from ever reaching zero
+ if(ga != NULL)
+ ga->weight=0x06660666; //anything apart from 0 really...
+ // ***************************************************************************
+ // ***************************************************************************
+
+ if((ga != NULL)&&(ga->weight / 2 <= 2))
+ barf("Cant split the weight any further when packing REMOTE_REF for closure %p (%s) with GA: ((%x, %d, %x))",
+ closure, info_type(closure),
+ ga->payload.gc.gtid, ga->payload.gc.slot, ga->weight);
+ }
+ GlobaliseAndPackGA(closure);
+
+ IF_PAR_DEBUG(pack,
+ belch("*>.. Packing REMOTE_REF for closure %p (%s) with GA: ((%x, %d, %x))",
+ closure, info_type(closure),
+ globalPackBuffer->buffer[pack_locn-2],
+ globalPackBuffer->buffer[pack_locn-1],
+ globalPackBuffer->buffer[pack_locn-3]));
+
+ /* Pack a REMOTE_REF closure instead of closure */
+ ip = &stg_REMOTE_REF_info;
+ /* this assumes that the info ptr is always the first word in a closure*/
+ Pack((StgWord)ip);
+ for (i = 1; i < _HS; ++i) // pack rest of fixed header
+ Pack((StgWord)*(((StgPtr)closure)+i));
+
+ unpacked_size += PACK_FETCHME_SIZE;
+}
+#endif /* DIST */
+
+//@node Packing Closures, , Packing Sections of Nearby Graph, Packing Functions
+//@subsubsection Packing Closures
+/*
+ Packing Closures
+
+ @PackClosure@ is the heart of the normal packing code. It packs a single
+ closure into the pack buffer, skipping over any indirections and
+ globalising it as necessary, queues any child pointers for further
+ packing, and turns it into a @FetchMe@ or revertible black hole (@RBH@)
+ locally if it was a thunk. Before the actual closure is packed, a
+ suitable global address (GA) is inserted in the pack buffer. There is
+ always room to pack a fetch-me to the closure (guaranteed by the
+ RoomToPack calculation), and this is packed if there is no room for the
+ entire closure.
+
+ Space is allocated for any primitive array children of a closure, and
+ hence a primitive array can always be packed along with it's parent
+ closure. */
+
+//@cindex PackClosure
+
+# if defined(PAR)
+
+/*
+  PackClosure (GUM version): pack a single closure into the global pack
+  buffer, dispatching on its info-table type.  Indirections are shorted
+  out first; a closure that is already in the buffer is packed as an
+  offset (back-reference) so that sharing survives unpacking.  Static
+  closures become PLCs, black-hole-like closures become FETCH_MEs, and
+  ordinary data/thunks are delegated to PackGeneric/PackArray/PackPAP.
+*/
+void
+PackClosure(closure)
+StgClosure *closure;
+{
+ StgInfoTable *info;
+ nat clpack_locn;
+
+ ASSERT(LOOKS_LIKE_GHC_INFO(get_itbl(closure)));
+
+ closure = UNWIND_IND(closure);
+ /* now closure is the thing we want to pack */
+ info = get_itbl(closure);
+
+ clpack_locn = OffsetFor(closure);
+
+ /* If the closure has been packed already, just pack an indirection to it
+ to guarantee that the graph doesn't become a tree when unpacked */
+ if (!NotYetPacking(clpack_locn)) {
+ PackOffset(clpack_locn);
+ return;
+ }
+
+ switch (info->type) {
+
+ case CONSTR_CHARLIKE:
+ /* charlike: pack a pointer to the pre-allocated static closure */
+ IF_PAR_DEBUG(pack,
+ belch("*>^^ Packing a charlike closure %d",
+ ((StgIntCharlikeClosure*)closure)->data));
+
+ PackPLC((StgPtr)CHARLIKE_CLOSURE(((StgIntCharlikeClosure*)closure)->data));
+ // NB: unpacked_size of a PLC is 0
+ return;
+
+ case CONSTR_INTLIKE:
+ {
+ StgInt val = ((StgIntCharlikeClosure*)closure)->data;
+
+ /* small ints have pre-allocated static closures; big ones don't */
+ if ((val <= MAX_INTLIKE) && (val >= MIN_INTLIKE)) {
+ IF_PAR_DEBUG(pack,
+ belch("*>^^ Packing a small intlike %d as a PLC",
+ val));
+ PackPLC((StgPtr)INTLIKE_CLOSURE(val));
+ // NB: unpacked_size of a PLC is 0
+ return;
+ } else {
+ IF_PAR_DEBUG(pack,
+ belch("*>^^ Packing a big intlike %d as a normal closure",
+ val));
+ PackGeneric(closure);
+ return;
+ }
+ }
+
+ case CONSTR:
+ case CONSTR_1_0:
+ case CONSTR_0_1:
+ case CONSTR_2_0:
+ case CONSTR_1_1:
+ case CONSTR_0_2:
+ /* it's a constructor (i.e. plain data) */
+ IF_PAR_DEBUG(pack,
+ belch("*>^^ Packing a CONSTR %p (%s) using generic packing",
+ closure, info_type(closure)));
+ PackGeneric(closure);
+ return;
+
+ case THUNK_STATIC: // ToDo: check whether that's ok
+ case FUN_STATIC: // ToDo: check whether that's ok
+ case CONSTR_STATIC:
+ case CONSTR_NOCAF_STATIC:// For now we ship indirections to CAFs: They are
+ // evaluated on each PE if needed
+ IF_PAR_DEBUG(pack,
+ belch("*>~~ Packing a %p (%s) as a PLC",
+ closure, info_type(closure)));
+
+ PackPLC((StgPtr)closure);
+ // NB: unpacked_size of a PLC is 0
+ return;
+
+ case THUNK_SELECTOR:
+ {
+ StgClosure *selectee = ((StgSelector *)closure)->selectee;
+
+ IF_PAR_DEBUG(pack,
+ belch("*>** Found THUNK_SELECTOR at %p (%s) pointing to %p (%s); using PackGeneric",
+ closure, info_type(closure),
+ selectee, info_type(selectee)));
+ PackGeneric(closure);
+ /* inlined code; probably could use PackGeneric
+ Pack((StgWord)(*(StgPtr)closure));
+ Pack((StgWord)(selectee));
+ QueueClosure(selectee);
+ unpacked_size += 2;
+ */
+ }
+ return;
+
+ /* ordinary functions and thunks: plain generic packing */
+ case FUN:
+ case FUN_1_0:
+ case FUN_0_1:
+ case FUN_2_0:
+ case FUN_1_1:
+ case FUN_0_2:
+ case THUNK:
+ case THUNK_1_0:
+ case THUNK_0_1:
+ case THUNK_2_0:
+ case THUNK_1_1:
+ case THUNK_0_2:
+ PackGeneric(closure);
+ return;
+
+ case AP_UPD:
+ case PAP:
+ /*
+ barf("*> Packing of PAP not implemented %p (%s)",
+ closure, info_type(closure));
+
+ Currently we don't pack PAPs; we pack a FETCH_ME to the closure,
+ instead. Note that since PAPs contain a chunk of stack as payload,
+ implementing packing of PAPs is a first step towards thread migration.
+ IF_PAR_DEBUG(pack,
+ belch("*>.. Packing a PAP closure at %p (%s) as a FETCH_ME",
+ closure, info_type(closure)));
+ PackFetchMe(closure);
+ */
+ PackPAP((StgPAP *)closure);
+ return;
+
+ case CAF_BLACKHOLE:
+ case BLACKHOLE:
+ case BLACKHOLE_BQ:
+ case SE_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ case RBH:
+ case FETCH_ME:
+ case FETCH_ME_BQ:
+
+ /* If it's a (revertible) black-hole, pack a FetchMe closure to it */
+ //ASSERT(pack_locn > PACK_HDR_SIZE);
+
+ IF_PAR_DEBUG(pack,
+ belch("*>.. Packing a BH-like closure at %p (%s) as a FETCH_ME",
+ closure, info_type(closure)));
+ /* NB: in case of a FETCH_ME this might build up a chain of FETCH_MEs;
+ phps short-cut the GA here */
+ PackFetchMe(closure);
+ return;
+
+#ifdef DIST
+ case REMOTE_REF:
+ IF_PAR_DEBUG(pack,
+ belch("*>.. Packing %p (%s) as a REMOTE_REF",
+ closure, info_type(closure)));
+ PackRemoteRef(closure);
+ /* we hopefully don't end up with a chain of REMOTE_REFs!!!!!!!!!! */
+
+ return;
+#endif
+
+ case TSO:
+ case MVAR:
+#ifdef DIST
+ /* only the GdH (DIST) variant can ship stateful objects, as refs */
+ IF_PAR_DEBUG(pack,
+ belch("*>.. Packing %p (%s) as a RemoteRef",
+ closure, info_type(closure)));
+ PackRemoteRef(closure);
+#else
+ barf("{Pack}Daq Qagh: Only GdH can pack %p (%s)",
+ closure, info_type(closure));
+#endif
+ return;
+
+ case ARR_WORDS:
+ PackArray(closure);
+ return;
+
+ case MUT_ARR_PTRS:
+ case MUT_ARR_PTRS_FROZEN:
+ case MUT_VAR:
+ /*
+ Eventually, this should use the same packing routine as ARR_WORDS
+
+ GlobaliseAndPackGA(closure);
+ PackArray(closure);
+ return;
+ */
+ barf("Qagh{Pack}Doq: packing of mutable closures not yet implemented: %p (%s)",
+ closure, info_type(closure));
+
+# ifdef DEBUG
+ /* the cases below are diagnosed only in a DEBUG build; otherwise
+ they fall into the default barf */
+ case BCO:
+ barf("{Pack}Daq Qagh: found BCO closure %p (%s); GUM hates interpreted code",
+ closure, info_type(closure));
+ /* never reached */
+
+ // check error cases only in a debugging setup
+ case RET_BCO:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ case RET_BIG:
+ case RET_VEC_BIG:
+ case RET_DYN:
+ barf("{Pack}Daq Qagh: found return vector %p (%s) when packing (thread migration not implemented)",
+ closure, info_type(closure));
+ /* never reached */
+
+ case UPDATE_FRAME:
+ case STOP_FRAME:
+ case CATCH_FRAME:
+ case SEQ_FRAME:
+ barf("{Pack}Daq Qagh: found stack frame %p (%s) when packing (thread migration not implemented)",
+ closure, info_type(closure));
+ /* never reached */
+
+ case BLOCKED_FETCH:
+ case EVACUATED:
+ /* something's very wrong */
+ barf("{Pack}Daq Qagh: found %s (%p) when packing",
+ info_type(closure), closure);
+ /* never reached */
+
+ case IND:
+ case IND_OLDGEN:
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ case IND_STATIC:
+ barf("Pack: found IND_... after shorting out indirections %d (%s)",
+ (nat)(info->type), info_type(closure));
+
+ case WEAK:
+ case FOREIGN:
+ case STABLE_NAME:
+ barf("Pack: found foreign thingy; not yet implemented in %d (%s)",
+ (nat)(info->type), info_type(closure));
+#endif
+
+ default:
+ barf("Pack: strange closure %d", (nat)(info->type));
+ } /* switch */
+}
+
+/*
+ Pack a constructor of unknown size.
+ Similar to PackGeneric but without creating GAs.
+*/
+#if 0
+//@cindex PackConstr
+/*
+  PackConstr (currently compiled out by the surrounding #if 0):
+  pack a constructor closure of unknown size.  Like PackGeneric, but
+  never allocates a global address -- it always packs the literal 2,
+  the marker for an unglobalised closure.
+*/
+static void
+PackConstr(StgClosure *closure)
+{
+ StgInfoTable *info;
+ nat size, ptrs, nonptrs, vhs, i;
+ char str[80];
+
+ ASSERT(LOOKS_LIKE_GHC_INFO(closure->header.info));
+
+ /* get info about basic layout of the closure */
+ info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+
+ ASSERT(info->type == CONSTR ||
+ info->type == CONSTR_1_0 ||
+ info->type == CONSTR_0_1 ||
+ info->type == CONSTR_2_0 ||
+ info->type == CONSTR_1_1 ||
+ info->type == CONSTR_0_2);
+
+ IF_PAR_DEBUG(pack,
+ fprintf(stderr, "*>^^ packing a constructor at %p (%s) (size=%d, ptrs=%d, nonptrs=%d)\n",
+ closure, info_type(closure), size, ptrs, nonptrs));
+
+ /* Primitive arrays have gone; now we have (MUT_)ARR_WORDS etc */
+
+ /* if the buffer is full, fall back to packing a FETCH_ME instead */
+ if (!RoomToPack(PACK_GA_SIZE + _HS + vhs + nonptrs, ptrs)) {
+ IF_PAR_DEBUG(pack,
+ belch("*>&& pack buffer is full; packing FETCH_ME for closure %p (%s)",
+ closure, info_type(closure)));
+ PackFetchMe(closure);
+ return;
+ }
+
+ /* Record the location of the GA */
+ AmPacking(closure);
+
+ /* Pack Constructor marker */
+ Pack((StgWord)2);
+
+ /* pack fixed and variable header */
+ for (i = 0; i < _HS + vhs; ++i)
+ Pack((StgWord)*(((StgPtr)closure)+i));
+
+ /* register all ptrs for further packing */
+ for (i = 0; i < ptrs; ++i)
+ QueueClosure(((StgClosure *) *(((StgPtr)closure)+(_HS+vhs)+i)));
+
+ /* pack non-ptrs */
+ for (i = 0; i < nonptrs; ++i)
+ Pack((StgWord)*(((StgPtr)closure)+(_HS+vhs)+ptrs+i));
+}
+#endif
+
+/*
+ Generic packing code.
+ This code is performed for `ordinary' closures such as CONSTR, THUNK etc.
+*/
+//@cindex PackGeneric
+/*
+  PackGeneric: pack an `ordinary' closure (CONSTR, FUN, THUNK, ...).
+  Packed layout: | GA or marker | FIXED HDR | VAR HDR | PTRS | NON-PTRS |.
+  Pointer fields are not packed inline -- each one is queued (QueueClosure)
+  for later packing.  If the closure is an updatable thunk it is turned
+  into a revertible black hole (RBH) locally after packing, so that its
+  result can be sent back via the GA recorded here.  Falls back to
+  packing a FETCH_ME when the buffer is full or the per-packet thunk
+  quota (RtsFlags.ParFlags.thunksToPack) is exhausted.
+*/
+static void
+PackGeneric(StgClosure *closure)
+{
+ StgInfoTable *info;
+ StgClosure *rbh;
+ nat size, ptrs, nonptrs, vhs, i, m;
+ char str[80];
+
+ ASSERT(LOOKS_LIKE_COOL_CLOSURE(closure));
+
+ /* get info about basic layout of the closure */
+ info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+
+ ASSERT(!IS_BLACK_HOLE(closure));
+
+ IF_PAR_DEBUG(pack,
+ fprintf(stderr, "*>== %p (%s): generic packing (size=%d, ptrs=%d, nonptrs=%d)\n",
+ closure, info_type(closure), size, ptrs, nonptrs));
+
+ /* packing strategies: how many thunks to add to a packet;
+ default is infinity i.e. RtsFlags.ParFlags.thunksToPack==0 */
+ if (RtsFlags.ParFlags.thunksToPack &&
+ packed_thunks >= RtsFlags.ParFlags.thunksToPack &&
+ closure_THUNK(closure)) {
+ IF_PAR_DEBUG(pack,
+ belch("*>&& refusing to pack more than %d thunks per packet; packing FETCH_ME for closure %p (%s)",
+ packed_thunks, closure, info_type(closure)));
+ PackFetchMe(closure);
+ return;
+ }
+
+ /* Primitive arrays have gone; now we have (MUT_)ARR_WORDS etc */
+
+ if (!RoomToPack(PACK_GA_SIZE + _HS + vhs + nonptrs, ptrs)) {
+ IF_PAR_DEBUG(pack,
+ belch("*>&& pack buffer is full; packing FETCH_ME for closure %p (%s)",
+ closure, info_type(closure)));
+ PackFetchMe(closure);
+ return;
+ }
+
+ /* Record the location of the GA */
+ AmPacking(closure);
+ /* Allocate a GA for this closure and put it into the buffer */
+ /* Checks for globalisation scheme; default: globalise everything thunks */
+ if ( RtsFlags.ParFlags.globalising == 0 ||
+ (closure_THUNK(closure) && !closure_UNPOINTED(closure)) )
+ GlobaliseAndPackGA(closure);
+ else
+ Pack((StgWord)2); // marker for unglobalised closure
+
+
+ ASSERT(!(info->type == ARR_WORDS || info->type == MUT_ARR_PTRS ||
+ info->type == MUT_ARR_PTRS_FROZEN || info->type == MUT_VAR));
+
+ /* At last! A closure we can actually pack! */
+ /* BUGFIX: this exemption formerly used `||':
+ (t != FETCH_ME) || (t != REMOTE_REF) is a tautology (always true),
+ so EVERY mutable closure -- including the two types meant to be
+ exempt -- would barf. The intended condition needs `&&'. */
+ if (ip_MUTABLE(info) && ((info->type != FETCH_ME) && (info->type != REMOTE_REF)))
+ barf("*>// %p (%s) PackClosure: trying to replicate a Mutable closure!",
+ closure, info_type(closure));
+
+ /*
+ Remember, the generic closure layout is as follows:
+ +-------------------------------------------------+
+ | FIXED HEADER | VARIABLE HEADER | PTRS | NON-PRS |
+ +-------------------------------------------------+
+ */
+ /* pack fixed and variable header */
+ for (i = 0; i < _HS + vhs; ++i)
+ Pack((StgWord)*(((StgPtr)closure)+i));
+
+ /* register all ptrs for further packing */
+ for (i = 0; i < ptrs; ++i)
+ QueueClosure(((StgClosure *) *(((StgPtr)closure)+(_HS+vhs)+i)));
+
+ /* pack non-ptrs */
+ for (i = 0; i < nonptrs; ++i)
+ Pack((StgWord)*(((StgPtr)closure)+(_HS+vhs)+ptrs+i));
+
+ // ASSERT(_HS+vhs+ptrs+nonptrs==size);
+ if ((m=_HS+vhs+ptrs+nonptrs)<size) {
+ IF_PAR_DEBUG(pack,
+ belch("*>** WARNING: slop in closure %p (%s); filling %d words; SHOULD NEVER HAPPEN",
+ closure, info_type(closure), size-m));
+ for (i=m; i<size; i++)
+ Pack((StgWord)*(((StgPtr)closure)+i));
+ }
+
+ unpacked_size += size;
+ //unpacked_size += (size < MIN_UPD_SIZE) ? MIN_UPD_SIZE : size;
+
+ /*
+ * Record that this is a revertable black hole so that we can fill in
+ * its address from the fetch reply. Problem: unshared thunks may cause
+ * space leaks this way, their GAs should be deallocated following an
+ * ACK.
+ */
+
+ if (closure_THUNK(closure) && !closure_UNPOINTED(closure)) {
+ rbh = convertToRBH(closure);
+ ASSERT(size>=_HS+MIN_UPD_SIZE); // min size for any updatable closure
+ ASSERT(rbh == closure); // rbh at the same position (minced version)
+ packed_thunks++;
+ } else if ( closure==graph_root ) {
+ packed_thunks++; // root of graph is counted as a thunk
+ }
+}
+/*
+ Pack an array of words.
+ ToDo: implement packing of MUT_ARRAYs
+*/
+
+//@cindex PackArray
<docstring/>
+/*
+  PackArray: pack an ARR_WORDS closure as |GA/marker|info|words|payload...|.
+  Mutable array variants appear in the ASSERT but their packing is not
+  implemented yet (PackClosure barfs on them before reaching here).
+  NOTE(review): the RoomToPack call uses the nonptrs count returned by
+  get_closure_info; this assumes that count covers the n payload words
+  packed below -- confirm against get_closure_info for ARR_WORDS.
+*/
+static void
+PackArray(StgClosure *closure)
+{
+ StgInfoTable *info;
+ nat size, ptrs, nonptrs, vhs;
+ nat i, n;
+ char str[80];
+
+ /* get info about basic layout of the closure */
+ info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+
+ ASSERT(info->type == ARR_WORDS || info->type == MUT_ARR_PTRS ||
+ info->type == MUT_ARR_PTRS_FROZEN || info->type == MUT_VAR);
+
+ n = ((StgArrWords *)closure)->words;
+ // this includes the header!: arr_words_sizeW(stgCast(StgArrWords*,q));
+
+ IF_PAR_DEBUG(pack,
+ belch("*>== %p (%s): packing an array of %d words (size=%d)\n",
+ closure, info_type(closure), n,
+ arr_words_sizeW((StgArrWords *)closure)));
+
+ /* check that we have enough room in the pack buffer */
+ if (!RoomToPack(PACK_GA_SIZE + _HS + vhs + nonptrs, ptrs)) {
+ IF_PAR_DEBUG(pack,
+ belch("*>&& pack buffer is full; packing FETCH_ME for closure %p (%s)",
+ closure, info_type(closure)));
+ PackFetchMe(closure);
+ return;
+ }
+
+ /* global stats about arrays sent */
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.tot_arrs++;
+ globalParStats.tot_arr_size += ((StgArrWords *)closure)->words;
+ }
+
+ /* record offset of the closure and allocate a GA */
+ AmPacking(closure);
+ /* Checks for globalisation scheme; default: globalise everything thunks */
+ if ( RtsFlags.ParFlags.globalising == 0 ||
+ (closure_THUNK(closure) && !closure_UNPOINTED(closure)) )
+ GlobaliseAndPackGA(closure);
+ else
+ Pack((StgWord)2); // marker for unglobalised closure
+
+ /* Pack the header (2 words: info ptr and the number of words to follow) */
+ Pack((StgWord)*(StgPtr)closure);
+ Pack(((StgArrWords *)closure)->words);
+
+ /* pack the payload of the closure (all non-ptrs) */
+ for (i=0; i<n; i++)
+ Pack((StgWord)((StgArrWords *)closure)->payload[i]);
+
+ unpacked_size += arr_words_sizeW((StgArrWords *)closure);
+}
+
+/*
+ Pack a PAP closure.
+ Note that the representation of a PAP in the buffer is different from
+ its representation in the heap. In particular, pointers to local
+ closures are packed directly as FETCHME closures, using
+ PACK_FETCHME_SIZE words to represent a 1 word pointer in the original graph
+ structure. To account for the difference in size we store the packed
+ size of the closure as part of the PAP's variable header in the buffer.
+*/
+
+//@cindex PackPAP
+/*
+  PackPAP: pack a PAP (partial application).  The buffer representation
+  differs from the heap one: heap pointers found in the stack-chunk
+  payload are packed as PLCs or FETCH_MEs (PACK_FETCHME_SIZE words each),
+  so the packed payload size is recorded in an extra word reserved at
+  pack_start and filled in at the very end of this function.
+*/
+static void
+PackPAP(StgPAP *pap) {
+ nat n, i, j, pack_start;
+ StgPtr p, q;
+ const StgInfoTable* info;
+ StgWord bitmap;
+ /* debugging only */
+ StgPtr end;
+ nat size, ptrs, nonptrs, vhs;
+ char str[80];
+ nat unpacked_size_before_PAP, FMs_in_PAP=0; // debugging only
+
+ /* This is actually a setup invariant; checked here 'cause it affects PAPs*/
+ //ASSERT(PACK_FETCHME_SIZE == sizeofW(StgFetchMe)-1+PACK_GA_SIZE);
+ ASSERT(NotYetPacking(OffsetFor((StgClosure *)pap)));
+ IF_DEBUG(sanity,
+ unpacked_size_before_PAP = unpacked_size);
+
+ n = (nat)(pap->n_args);
+
+ /* get info about basic layout of the closure */
+ info = get_closure_info((StgClosure *)pap, &size, &ptrs, &nonptrs, &vhs, str);
+ ASSERT(ptrs==0 && nonptrs==0 && size==pap_sizeW(pap));
+
+ IF_PAR_DEBUG(pack,
+ belch("*>** %p (%s): PackPAP: packing PAP with %d words (size=%d; ptrs=%d; nonptrs=%d:",
+ (StgClosure *)pap, info_type((StgClosure *)pap),
+ n, size, ptrs, nonptrs);
+ printClosure((StgClosure *)pap));
+
+ /* check that we have enough room in the pack buffer */
+ if (!RoomToPack(PACK_GA_SIZE + _HS + vhs + nonptrs, ptrs)) {
+ IF_PAR_DEBUG(pack,
+ belch("*>&& pack buffer is full; packing FETCH_ME for closure %p (%s)",
+ (StgClosure *)pap, info_type((StgClosure *)pap)));
+ PackFetchMe((StgClosure *)pap);
+ return;
+ }
+
+ /* record offset of the closure and allocate a GA */
+ AmPacking((StgClosure *)pap);
+ /* Checks for globalisation scheme; default: globalise everything thunks */
+ if ( RtsFlags.ParFlags.globalising == 0 ||
+ (closure_THUNK(pap) && !closure_UNPOINTED(pap)) )
+ GlobaliseAndPackGA((StgClosure *)pap);
+ else
+ Pack((StgWord)2); // marker for unglobalised closure
+
+ /* Pack the PAP header */
+ Pack((StgWord)(pap->header.info));
+ Pack((StgWord)(pap->n_args));
+ Pack((StgWord)(pap->fun));
+ pack_start = pack_locn; // to compute size of PAP in buffer
+ Pack((StgWord)0); // this will be filled in later (size of PAP in buffer)
+
+ /* Pack the payload of a PAP i.e. a stack chunk */
+ /* pointers to start of stack chunk */
+ p = (StgPtr)(pap->payload);
+ end = (StgPtr)((nat)pap+pap_sizeW(pap)*sizeof(StgWord)); // (StgPtr)((nat)pap+sizeof(StgPAP)+sizeof(StgPtr)*n);
+ while (p<end) {
+ /* the loop body has been borrowed from scavenge_stack */
+ q = (StgPtr)*p;
+
+ /* If we've got a tag, pack all words in that block */
+ if (IS_ARG_TAG((W_)q)) { // q stands for the no. of non-ptrs to follow
+ nat m = ARG_TAG((W_)q); // first word after this block
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: packing %d words (tagged), starting @ %p",
+ p, m, p));
+ for (i=0; i<m+1; i++)
+ Pack((StgWord)*(p+i));
+ p += m+1; // m words + the tag
+ continue;
+ }
+
+ /* If q is is a pointer to a (heap allocated) closure we pack a FETCH_ME
+ ToDo: provide RTS flag to also pack these closures
+ */
+ if (! LOOKS_LIKE_GHC_INFO(q) ) {
+ /* distinguish static closure (PLC) from other closures (FM) */
+ switch (get_itbl((StgClosure*)q)->type) {
+ case CONSTR_CHARLIKE:
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP: packing a charlike closure %d",
+ ((StgIntCharlikeClosure*)q)->data));
+
+ PackPLC((StgPtr)CHARLIKE_CLOSURE(((StgIntCharlikeClosure*)q)->data));
+ p++;
+ break;
+
+ case CONSTR_INTLIKE:
+ {
+ StgInt val = ((StgIntCharlikeClosure*)q)->data;
+
+ if ((val <= MAX_INTLIKE) && (val >= MIN_INTLIKE)) {
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP: Packing ptr to a small intlike %d as a PLC", val));
+ PackPLC((StgPtr)INTLIKE_CLOSURE(val));
+ p++;
+ break;
+ } else {
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP: Packing a ptr to a big intlike %d as a FM",
+ val));
+ Pack((StgWord)(ARGTAG_MAX+1));
+ PackFetchMe((StgClosure *)q);
+ p++;
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ break;
+ }
+ }
+ case THUNK_STATIC: // ToDo: check whether that's ok
+ case FUN_STATIC: // ToDo: check whether that's ok
+ case CONSTR_STATIC:
+ case CONSTR_NOCAF_STATIC:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP: packing a ptr to a %p (%s) as a PLC",
+ q, info_type((StgClosure *)q)));
+
+ PackPLC((StgPtr)q);
+ p++;
+ break;
+ }
+ default:
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: packing FM to %p (%s)",
+ p, q, info_type((StgClosure*)q)));
+ Pack((StgWord)(ARGTAG_MAX+1));
+ PackFetchMe((StgClosure *)q);
+ p++;
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ break;
+ }
+ continue;
+ }
+
+ /*
+ * Otherwise, q must be the info pointer of an activation
+ * record. All activation records have 'bitmap' style layout
+ * info.
+ */
+ info = get_itbl((StgClosure *)p);
+ switch (info->type) {
+
+ /* Dynamic bitmap: the mask is stored on the stack */
+ case RET_DYN:
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: RET_DYN",
+ p));
+
+ /* Pack the header as is */
+ Pack((StgWord)(((StgRetDyn *)p)->info));
+ Pack((StgWord)(((StgRetDyn *)p)->liveness));
+ Pack((StgWord)(((StgRetDyn *)p)->ret_addr));
+
+ bitmap = ((StgRetDyn *)p)->liveness;
+ p = (P_)&((StgRetDyn *)p)->payload[0];
+ goto small_bitmap;
+
+ /* probably a slow-entry point return address: */
+ case FUN:
+ case FUN_STATIC:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: FUN or FUN_STATIC",
+ p));
+
+ Pack((StgWord)(((StgClosure *)p)->header.info));
+ p++;
+
+ goto follow_srt; //??
+ }
+
+ /* Using generic code here; could inline as in scavenge_stack */
+ case UPDATE_FRAME:
+ {
+ StgUpdateFrame *frame = (StgUpdateFrame *)p;
+ nat type = get_itbl(frame->updatee)->type;
+
+ ASSERT(type==BLACKHOLE || type==CAF_BLACKHOLE || type==BLACKHOLE_BQ);
+
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: UPDATE_FRAME (updatee=%p; link=%p)",
+ p, frame->updatee, frame->link));
+
+ Pack((StgWord)(frame->header.info));
+ Pack((StgWord)(frame->link)); // ToDo: fix intra-stack pointer
+ Pack((StgWord)(frame->updatee)); // ToDo: follow link
+
+ p += 3;
+ }
+ /* NOTE(review): no `continue' here -- control falls through into
+ the STOP_FRAME case below and packs another header at the
+ advanced p; this looks like a missing `continue'. Confirm
+ against the PAP unpacking code before changing. */
+
+ /* small bitmap (< 32 entries, or 64 on a 64-bit machine) */
+ case STOP_FRAME:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: STOP_FRAME",
+ p));
+ Pack((StgWord)((StgStopFrame *)p)->header.info);
+ p++;
+ }
+ /* NOTE(review): falls through into CATCH_FRAME -- same concern
+ as for UPDATE_FRAME above. */
+
+ case CATCH_FRAME:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: CATCH_FRAME (handler=%p)",
+ p, ((StgCatchFrame *)p)->handler));
+
+ Pack((StgWord)((StgCatchFrame *)p)->header.info);
+ Pack((StgWord)((StgCatchFrame *)p)->link); // ToDo: fix intra-stack pointer
+ Pack((StgWord)((StgCatchFrame *)p)->exceptions_blocked);
+ Pack((StgWord)((StgCatchFrame *)p)->handler);
+ p += 4;
+ }
+ /* NOTE(review): falls through into SEQ_FRAME -- same concern
+ as for UPDATE_FRAME above. */
+
+ case SEQ_FRAME:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: UPDATE_FRAME (link=%p)",
+ p, ((StgSeqFrame *)p)->link));
+
+ Pack((StgWord)((StgSeqFrame *)p)->header.info);
+ Pack((StgWord)((StgSeqFrame *)p)->link); // ToDo: fix intra-stack pointer
+
+ // ToDo: handle bitmap
+ bitmap = info->layout.bitmap;
+
+ p = (StgPtr)&(((StgClosure *)p)->payload);
+ goto small_bitmap;
+ }
+ case RET_BCO:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: RET_{BCO,SMALL,VEC_SMALL} (bitmap=%o)",
+ p, info->layout.bitmap));
+
+
+ Pack((StgWord)((StgClosure *)p)->header.info);
+ p++;
+ // ToDo: handle bitmap
+ bitmap = info->layout.bitmap;
+ /* this assumes that the payload starts immediately after the info-ptr */
+
+ small_bitmap:
+ /* a clear bitmap bit marks a pointer: pack it as a FETCH_ME */
+ while (bitmap != 0) {
+ if ((bitmap & 1) == 0) {
+ Pack((StgWord)(ARGTAG_MAX+1));
+ PackFetchMe((StgClosure *)*p++); // pack a FetchMe to the closure
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ } else {
+ Pack((StgWord)*p++);
+ }
+ bitmap = bitmap >> 1;
+ }
+
+ follow_srt:
+ IF_PAR_DEBUG(pack,
+ belch("*>-- PackPAP: nothing to do for follow_srt"));
+ continue;
+
+ /* large bitmap (> 32 entries) */
+ case RET_BIG:
+ case RET_VEC_BIG:
+ {
+ StgPtr q;
+ StgLargeBitmap *large_bitmap;
+
+ IF_PAR_DEBUG(pack,
+ belch("*>** PackPAP @ %p: RET_{BIG,VEC_BIG} (large_bitmap=%p)",
+ p, info->layout.large_bitmap));
+
+
+ Pack((StgWord)((StgClosure *)p)->header.info);
+ p++;
+
+ large_bitmap = info->layout.large_bitmap;
+
+ for (j=0; j<large_bitmap->size; j++) {
+ bitmap = large_bitmap->bitmap[j];
+ q = p + BITS_IN(W_);
+ while (bitmap != 0) {
+ if ((bitmap & 1) == 0) {
+ Pack((StgWord)(ARGTAG_MAX+1));
+ PackFetchMe((StgClosure *)*p++); // ToDo: pack pointer(StgClosure *)*p = evacuate((StgClosure *)*p);
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ } else {
+ Pack((StgWord)*p++);
+ }
+ bitmap = bitmap >> 1;
+ }
+ if (j+1 < large_bitmap->size) {
+ while (p < q) {
+ Pack((StgWord)(ARGTAG_MAX+1));
+ PackFetchMe((StgClosure *)*p++); // ToDo: pack pointer (StgClosure *)*p = evacuate((StgClosure *)*p);
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ }
+ }
+ }
+
+ /* and don't forget to follow the SRT */
+ goto follow_srt;
+ }
+
+ default:
+ barf("PackPAP: weird activation record found on stack (@ %p): %d",
+ p, (int)(info->type));
+ }
+ }
+ // fill in size of the PAP (only the payload!) in buffer
+ globalPackBuffer->buffer[pack_start] = (StgWord)(pack_locn - pack_start - 1*sizeofW(StgWord));
+ /*
+ We can use the generic pap_sizeW macro to compute the size of the
+ unpacked PAP because whenever we pack a new FETCHME as part of the
+ PAP's payload we also adjust unpacked_size accordingly (smart, aren't we?)
+
+ NB: the current PAP (un-)packing code relies on the fact that
+ the size of the unpacked PAP + size of all unpacked FMs is the same as
+ the size of the packed PAP!!
+ */
+ unpacked_size += pap_sizeW(pap); // sizeofW(pap) + (nat)(globalPackBuffer->buffer[pack_start]);
+ IF_DEBUG(sanity,
+ ASSERT(unpacked_size-unpacked_size_before_PAP==pap_sizeW(pap)+FMs_in_PAP*sizeofW(StgFetchMe)));
+}
+# else /* GRAN */
+
+/* Fake the packing of a closure */
+
+/*
+  PackClosure (GranSim version): simulate the packing of a closure.
+  Nothing is serialised; the closure pointer itself is recorded in the
+  internal pack buffer (via Pack) and unpacked_size is adjusted so that
+  the simulation sees realistic packet sizes.  Indirections are NOT
+  shorted out here, and updatable thunks are converted to RBHs just as
+  in GUM.  Fixes in this revision: swapped %s/%p specifiers in one
+  debug belch, a format string that lacked a %s for its fifth argument,
+  %#x used for a pointer, and a `default:' label with no statement
+  (invalid C) at the end of the switch.
+*/
+void
+PackClosure(closure)
+StgClosure *closure;
+{
+ StgInfoTable *info, *childInfo;
+ nat size, ptrs, nonptrs, vhs;
+ char info_hdr_ty[80];
+ nat i;
+ StgClosure *indirectee, *rbh;
+ char str[80];
+ rtsBool is_mutable, will_be_rbh, no_more_thunks_please;
+
+ is_mutable = rtsFalse;
+
+ /* In GranSim we don't pack and unpack closures -- we just simulate
+ packing by updating the bitmask. So, the graph structure is unchanged
+ i.e. we don't short out indirections here. -- HWL */
+
+ /* Nothing to do with packing but good place to (sanity) check closure;
+ if the closure is a thunk, it must be unique; otherwise we have copied
+ work at some point before that which violates one of our main global
+ assertions in GranSim/GUM */
+ ASSERT(!closure_THUNK(closure) || is_unique(closure));
+
+ IF_GRAN_DEBUG(pack,
+ belch("** Packing closure %p (%s)",
+ closure, info_type(closure)));
+
+ if (where_is(closure) != where_is(graph_root)) {
+ IF_GRAN_DEBUG(pack,
+ belch("** faking a FETCHME [current PE: %d, closure's PE: %d]",
+ where_is(graph_root), where_is(closure)));
+
+ /* GUM would pack a FETCHME here; simulate that by increasing the */
+ /* unpacked size accordingly but don't pack anything -- HWL */
+ unpacked_size += _HS + 2 ; // sizeofW(StgFetchMe);
+ return;
+ }
+
+ /* If the closure's not already being packed */
+ if (!NotYetPacking(closure))
+ /* Don't have to do anything in GrAnSim if closure is already */
+ /* packed -- HWL */
+ {
+ IF_GRAN_DEBUG(pack,
+ belch("** Closure %p is already packed and omitted now!",
+ closure));
+ return;
+ }
+
+ switch (get_itbl(closure)->type) {
+ /* ToDo: check for sticky bit here? */
+ /* BH-like closures which must not be moved to another PE */
+ case CAF_BLACKHOLE: /* # of ptrs, nptrs: 0,2 */
+ case SE_BLACKHOLE: /* # of ptrs, nptrs: 0,2 */
+ case SE_CAF_BLACKHOLE: /* # of ptrs, nptrs: 0,2 */
+ case BLACKHOLE: /* # of ptrs, nptrs: 0,2 */
+ case BLACKHOLE_BQ: /* # of ptrs, nptrs: 1,1 */
+ case RBH: /* # of ptrs, nptrs: 1,1 */
+ /* same for these parallel specific closures */
+ case BLOCKED_FETCH:
+ case FETCH_ME:
+ case FETCH_ME_BQ:
+ IF_GRAN_DEBUG(pack,
+ belch("** Avoid packing BH-like closures (%p, %s)!",
+ closure, info_type(closure)));
+ /* Just ignore RBHs i.e. they stay where they are */
+ return;
+
+ case THUNK_SELECTOR:
+ {
+ StgClosure *selectee = ((StgSelector *)closure)->selectee;
+
+ IF_GRAN_DEBUG(pack,
+ belch("** Avoid packing THUNK_SELECTOR (%p, %s) but queuing %p (%s)!",
+ closure, info_type(closure), selectee, info_type(selectee)));
+ QueueClosure(selectee);
+ IF_GRAN_DEBUG(pack,
+ belch("** [%p (%s) (Queueing closure) ....]",
+ selectee, info_type(selectee)));
+ }
+ return;
+
+ case CONSTR_STATIC:
+ case CONSTR_NOCAF_STATIC:
+ /* For now we ship indirections to CAFs:
+ * They are evaluated on each PE if needed */
+ IF_GRAN_DEBUG(pack,
+ belch("** Nothing to pack for %p (%s)!",
+ closure, info_type(closure)));
+ // Pack(closure); GUM only
+ return;
+
+ case CONSTR_CHARLIKE:
+ case CONSTR_INTLIKE:
+ /* BUGFIX: format specifiers were swapped (%s got the pointer,
+ %p got the string); now matches the argument order */
+ IF_GRAN_DEBUG(pack,
+ belch("** Nothing to pack for %p (%s)!",
+ closure, info_type(closure)));
+ // PackPLC(((StgIntCharlikeClosure *)closure)->data); GUM only
+ return;
+
+ case AP_UPD:
+ case PAP:
+ /* partial applications; special treatment necessary? */
+ break;
+
+ case MVAR:
+ barf("{PackClosure}Daq Qagh: found an MVAR (%p, %s); ToDo: implement proper treatment of MVARs",
+ closure, info_type(closure));
+
+ case ARR_WORDS:
+ case MUT_VAR:
+ case MUT_ARR_PTRS:
+ case MUT_ARR_PTRS_FROZEN:
+ /* Mutable objects; require special treatment to ship all data */
+ is_mutable = rtsTrue;
+ break;
+
+ case WEAK:
+ case FOREIGN:
+ case STABLE_NAME:
+ /* weak pointers and other FFI objects */
+ barf("{PackClosure}Daq Qagh: found an FFI object (%p, %s); FFI not yet supported by GranSim, sorry",
+ closure, info_type(closure));
+
+ case TSO:
+ /* parallel objects */
+ barf("{PackClosure}Daq Qagh: found a TSO when packing (%p, %s); thread migration not yet implemented, sorry",
+ closure, info_type(closure));
+
+ case BCO:
+ /* Hugs objects (i.e. closures used by the interpreter) */
+ barf("{PackClosure}Daq Qagh: found a Hugs closure when packing (%p, %s); GranSim not yet integrated with Hugs, sorry",
+ closure, info_type(closure));
+
+ case IND: /* # of ptrs, nptrs: 1,0 */
+ case IND_STATIC: /* # of ptrs, nptrs: 1,0 */
+ case IND_PERM: /* # of ptrs, nptrs: 1,1 */
+ case IND_OLDGEN: /* # of ptrs, nptrs: 1,1 */
+ case IND_OLDGEN_PERM: /* # of ptrs, nptrs: 1,1 */
+ /* we shouldn't find an indirection here, because we have shorted them
+ out at the beginning of this functions already.
+ */
+ break;
+ /* should be:
+ barf("{PackClosure}Daq Qagh: found indirection when packing (%p, %s)",
+ closure, info_type(closure));
+ */
+
+ case UPDATE_FRAME:
+ case CATCH_FRAME:
+ case SEQ_FRAME:
+ case STOP_FRAME:
+ /* stack frames; should never be found when packing for now;
+ once we support thread migration these have to be covered properly
+ */
+ barf("{PackClosure}Daq Qagh: found stack frame when packing (%p, %s)",
+ closure, info_type(closure));
+
+ case RET_BCO:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ case RET_BIG:
+ case RET_VEC_BIG:
+ case RET_DYN:
+ /* vectored returns; should never be found when packing; */
+ barf("{PackClosure}Daq Qagh: found vectored return (%p, %s)",
+ closure, info_type(closure));
+
+ case INVALID_OBJECT:
+ barf("{PackClosure}Daq Qagh: found Invalid object (%p, %s)",
+ closure, info_type(closure));
+
+ default:
+ /*
+ Here we know that the closure is a CONSTR, FUN or THUNK (maybe
+ a specialised version with wired in #ptr/#nptr info; currently
+ we treat these specialised versions like the generic version)
+ */
+ break; /* BUGFIX: a label must precede a statement; fall through
+ to the generic packing code below */
+ } /* switch */
+
+ /* Otherwise it's not Fixed */
+
+ info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+ will_be_rbh = closure_THUNK(closure) && !closure_UNPOINTED(closure);
+
+ /* BUGFIX: the format string lacked a %s for the fifth argument
+ (the will-become-RBH note), leaving it unprinted */
+ IF_GRAN_DEBUG(pack,
+ belch("** Info on closure %p (%s): size=%d; ptrs=%d; %s",
+ closure, info_type(closure),
+ size, ptrs,
+ (will_be_rbh) ? "will become RBH" : "will NOT become RBH"));
+
+ // check whether IS_UPDATABLE(closure) == !closure_UNPOINTED(closure) -- HWL
+ no_more_thunks_please =
+ (RtsFlags.GranFlags.ThunksToPack>0) &&
+ (packed_thunks>=RtsFlags.GranFlags.ThunksToPack);
+
+ /*
+ should be covered by get_closure_info
+ if (info->type == FETCH_ME || info->type == FETCH_ME_BQ ||
+ info->type == BLACKHOLE || info->type == RBH )
+ size = ptrs = nonptrs = vhs = 0;
+ */
+ /* Now peek ahead to see whether the closure has any primitive */
+ /* array children */
+ /*
+ ToDo: fix this code
+ for (i = 0; i < ptrs; ++i) {
+ P_ childInfo;
+ W_ childSize, childPtrs, childNonPtrs, childVhs;
+
+ childInfo = get_closure_info(((StgPtrPtr) (closure))[i + _HS + vhs],
+ &childSize, &childPtrs, &childNonPtrs,
+ &childVhs, junk_str);
+ if (IS_BIG_MOTHER(childInfo)) {
+ reservedPAsize += PACK_GA_SIZE + _HS +
+ childVhs + childNonPtrs +
+ childPtrs * PACK_FETCHME_SIZE;
+ PAsize += PACK_GA_SIZE + _HS + childSize;
+ PAptrs += childPtrs;
+ }
+ }
+ */
+ /* Don't pack anything (GrAnSim) if it's a black hole, or the buffer
+ * is full and it isn't a primitive array. N.B. Primitive arrays are
+ * always packed (because their parents index into them directly) */
+
+ if (IS_BLACK_HOLE(closure))
+ /*
+ ToDo: fix this code
+ ||
+ !(RoomToPack(PACK_GA_SIZE + _HS + vhs + nonptrs, ptrs)
+ || IS_BIG_MOTHER(info)))
+ */
+ return;
+
+ /* At last! A closure we can actually pack! */
+
+ if (closure_MUTABLE(closure)) // not nec. && (info->type != FETCHME))
+ belch("ghuH: Replicated a Mutable closure!");
+
+ if (RtsFlags.GranFlags.GranSimStats.Global &&
+ no_more_thunks_please && will_be_rbh) {
+ globalGranStats.tot_cuts++;
+ if ( RtsFlags.GranFlags.Debug.pack )
+ /* BUGFIX: %#x is wrong for a pointer (truncates on 64 bit); use %p */
+ belch("** PackClosure (w/ ThunksToPack=%d): Cutting tree with root at %p\n",
+ RtsFlags.GranFlags.ThunksToPack, closure);
+ } else if (will_be_rbh || (closure==graph_root) ) {
+ packed_thunks++;
+ globalGranStats.tot_thunks++;
+ }
+
+ if (no_more_thunks_please && will_be_rbh)
+ return; /* don't pack anything */
+
+ /* actual PACKING done here -- HWL */
+ Pack(closure);
+ for (i = 0; i < ptrs; ++i) {
+ /* extract i-th pointer from closure */
+ QueueClosure((StgClosure *)(closure->payload[i]));
+ IF_GRAN_DEBUG(pack,
+ belch("** [%p (%s) (Queueing closure) ....]",
+ closure->payload[i],
+ info_type(*stgCast(StgPtr*,((closure)->payload+(i))))));
+ //^^^^^^^^^^^ payloadPtr(closure,i))));
+ }
+
+ /*
+ for packing words (GUM only) do something like this:
+
+ for (i = 0; i < ptrs; ++i) {
+ Pack(payloadWord(obj,i+j));
+ }
+ */
+ /* Turn thunk into a revertible black hole. */
+ if (will_be_rbh) {
+ rbh = convertToRBH(closure);
+ ASSERT(rbh != NULL);
+ }
+}
+# endif /* PAR */
+
+//@node Low level packing routines, Unpacking routines, Packing Functions, Graph packing
+//@subsection Low level packing routines
+
+/*
+ @Pack@ is the basic packing routine. It just writes a word of data into
+ the pack buffer and increments the pack location. */
+
+//@cindex Pack
+
+# if defined(PAR)
+/* GUM: write one word of data into the pack buffer and advance the pack
+   location.  The ASSERT guards against overrunning the configured buffer;
+   callers presumably checked for room beforehand (cf. RoomToPack) --
+   TODO confirm against call sites. */
+static void
+Pack(data)
+StgWord data;
+{
+  ASSERT(pack_locn < RtsFlags.ParFlags.packBufferSize);
+  globalPackBuffer->buffer[pack_locn++] = data;
+}
+#endif
+
+#if defined(GRAN)
+/* GranSim variant: packs a closure *pointer* (not a raw word) into the
+   internal pack buffer, growing the buffer if necessary, and adds the
+   closure's (heap) size to unpacked_size for simulation accounting. */
+static void
+Pack(closure)
+StgClosure *closure;
+{
+  StgInfoTable *info;
+  nat size, ptrs, nonptrs, vhs;
+  char str[80];
+
+  /* This checks the size of the GrAnSim internal pack buffer. The simulated
+     pack buffer is checked via RoomToPack (as in GUM) */
+  if (pack_locn >= (int)globalPackBuffer->size+sizeofW(rtsPackBuffer))
+    reallocPackBuffer();
+
+  if (closure==(StgClosure*)NULL)
+    belch("Qagh {Pack}Daq: Trying to pack 0");
+  globalPackBuffer->buffer[pack_locn++] = closure;
+  /* ASSERT: Data is a closure in GrAnSim here */
+  info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+  // ToDo: is check for MIN_UPD_SIZE really needed? */
+  /* account at least MIN_UPD_SIZE payload so the closure stays updatable */
+  unpacked_size += _HS + (size < MIN_UPD_SIZE ?
+                          MIN_UPD_SIZE :
+                          size);
+}
+# endif  /* GRAN */
+
+/*
+ If a closure is local, make it global. Then, divide its weight for
+ export. The GA is then packed into the pack buffer. */
+
+# if defined(PAR)
+//@cindex GlobaliseAndPackGA
+/* Ensure `closure` has a global address (creating one via makeGlobal if it
+   is not yet in the LAGA table), then pack a GA for it into the buffer as
+   three words: weight, gtid, slot.  If the destination PE already owns the
+   closure, a sentinel weight of 0xFFFFFFFF is packed instead of splitting
+   the weight (0,1,2 are reserved tag values -- see UnpackGA). */
+static void
+GlobaliseAndPackGA(closure)
+StgClosure *closure;
+{
+  globalAddr *ga;
+  globalAddr packGA;
+
+  if ((ga = LAGAlookup(closure)) == NULL) {
+    ga = makeGlobal(closure, rtsTrue);
+
+    // Global statistics: increase amount of global data by closure-size
+    if (RtsFlags.ParFlags.ParStats.Global &&
+        RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+      StgInfoTable *info;
+      nat size, ptrs, nonptrs, vhs, i, m; // stats only!!
+      char str[80]; // stats only!!
+
+      info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+      globalParStats.tot_global += size;
+    }
+  }
+  ASSERT(ga->weight==MAX_GA_WEIGHT || ga->weight > 2);
+
+  if(dest_gtid==ga->payload.gc.gtid)
+  { packGA.payload = ga->payload;
+    packGA.weight = 0xFFFFFFFF; // 0,1,2 are used already
+  }
+  else
+  { splitWeight(&packGA, ga);
+    ASSERT(packGA.weight > 0);
+  }
+
+  IF_PAR_DEBUG(pack,
+               fprintf(stderr, "*>## %p (%s): Globalising (%s) closure with GA ",
+                       closure, info_type(closure),
+                       ( (ga->payload.gc.gtid==dest_gtid)?"returning":
+                         ( (ga->payload.gc.gtid==mytid)?"creating":"sharing" ) ));
+               printGA(&packGA);
+               fputc('\n', stderr));
+
+
+  Pack((StgWord) packGA.weight);
+  Pack((StgWord) packGA.payload.gc.gtid);
+  Pack((StgWord) packGA.payload.gc.slot);
+}
+
+/*
+ @PackPLC@ makes up a bogus GA for a PLC. Weight 0 implies that a PLC
+ address follows instead of PE, slot. */
+
+//@cindex PackPLC
+
+/* Pack a bogus two-word GA for a static closure (PLC): weight 0 signals
+   that a raw static address follows instead of (PE, slot). */
+static void
+PackPLC(addr)
+StgPtr addr;
+{
+  Pack(0L);			/* weight */
+  Pack((StgWord) addr);		/* address */
+}
+
+/*
+ @PackOffset@ packs a special GA value that will be interpreted as an
+ offset to a closure in the pack buffer. This is used to avoid unfolding
+ the graph structure into a tree. */
+
+/* Pack a special three-word GA (weight 1, pe 0, slot = offset) that the
+   unpacker interprets as an offset to a closure already in the pack
+   buffer; this preserves sharing instead of unfolding the graph. */
+static void
+PackOffset(offset)
+int offset;
+{
+  /*
+  IF_PAR_DEBUG(pack,
+	       belch("** Packing Offset %d at pack location %u",
+		     offset, pack_locn));
+  */
+  Pack(1L);			/* weight */
+  Pack(0L);			/* pe */
+  Pack(offset);		        /* slot/offset */
+}
+# endif /* PAR */
+
+//@node Unpacking routines, Aux fcts for packing, Low level packing routines, Graph packing
+//@subsection Unpacking routines
+
+/*
+ This was formerly in the (now deceased) module Unpack.c
+
+ Unpacking closures which have been exported to remote processors
+
+ This module defines routines for unpacking closures in the parallel
+ runtime system (GUM).
+
+ In the case of GrAnSim, this module defines routines for *simulating* the
+ unpacking of closures as it is done in the parallel runtime system.
+*/
+
+//@node GUM code, GranSim Code, Unpacking routines, Unpacking routines
+//@subsubsection GUM code
+
+#if defined(PAR)
+
+//@cindex InitPendingGABuffer
+/* Allocate (once) the buffer holding (oldGA,newGA) pairs produced during
+   unpacking, and reset the cursor `gaga` to its start.  The buffer holds
+   `size` pairs, i.e. size*2 globalAddr entries.  Never freed here --
+   presumably lives for the whole run; TODO confirm. */
+void
+InitPendingGABuffer(size)
+nat size;
+{
+  if (PendingGABuffer==(globalAddr *)NULL)
+    PendingGABuffer = (globalAddr *)
+      stgMallocBytes(size*2*sizeof(globalAddr),
+		     "InitPendingGABuffer");
+
+  /* current location in the buffer */
+  gaga = PendingGABuffer;
+}
+
+/*
+ @CommonUp@ commons up two closures which we have discovered to be
+ variants of the same object. One is made an indirection to the other. */
+
+//@cindex CommonUp
+/* Common up two closures discovered to be variants of the same object:
+   `src` is overwritten with an indirection to `dst` (UPD_IND), which also
+   awakens any blocking queue attached to src.  Closures too small to hold
+   an indirection, and types not listed below, are left untouched. */
+void
+CommonUp(StgClosure *src, StgClosure *dst)
+{
+  /* NOTE(review): bqe is extracted per closure type but not used after the
+     switch in this function; UPD_IND appears to handle queue wake-up. */
+  StgBlockingQueueElement *bqe;
+#if defined(DEBUG)
+  StgInfoTable *info;
+  nat size, ptrs, nonptrs, vhs, i;
+  char str[80];
+
+  /* get info about basic layout of the closure */
+  /* `size` is only available in DEBUG builds; the ASSERTs below that use
+     it compile away otherwise, so this is consistent. */
+  info = get_closure_info(src, &size, &ptrs, &nonptrs, &vhs, str);
+#endif
+
+  ASSERT(src != (StgClosure *)NULL && dst != (StgClosure *)NULL);
+  ASSERT(src != dst);
+
+  IF_PAR_DEBUG(pack,
+	       belch("*___ CommonUp %p (%s) --> %p (%s)",
+		     src, info_type(src), dst, info_type(dst)));
+
+  switch (get_itbl(src)->type) {
+  case BLACKHOLE_BQ:
+    bqe = ((StgBlockingQueue *)src)->blocking_queue;
+    break;
+
+  case FETCH_ME_BQ:
+    bqe = ((StgFetchMeBlockingQueue *)src)->blocking_queue;
+    break;
+
+  case RBH:
+    bqe = ((StgRBH *)src)->blocking_queue;
+    break;
+
+  case BLACKHOLE:
+  case FETCH_ME:
+    bqe = END_BQ_QUEUE;
+    break;
+
+    /* These closures are too small to be updated with an indirection!!! */
+  case CONSTR_1_0:
+  case CONSTR_0_1:
+    ASSERT(size<_HS+MIN_UPD_SIZE); // that's why we have to avoid UPD_IND
+    return;
+
+    /* currently we also common up 2 CONSTRs; this should reduce heap
+     * consumption but also does more work; not sure whether it's worth doing
+     */
+  case CONSTR:
+  case CONSTR_2_0:
+  case CONSTR_1_1:
+  case CONSTR_0_2:
+  case ARR_WORDS:
+  case MUT_ARR_PTRS:
+  case MUT_ARR_PTRS_FROZEN:
+  case MUT_VAR:
+    break;
+
+  default:
+    /* Don't common up anything else */
+    return;
+  }
+
+  /* closure must be big enough to permit update with ind */
+  ASSERT(size>=_HS+MIN_UPD_SIZE);
+  /* NB: this also awakens the blocking queue for src */
+  UPD_IND(src, dst);
+}
+
+/*
+ * Common up the new closure with any existing closure having the same
+ * GA
+ */
+//@cindex SetGAandCommonUp
+/* Bind the freshly unpacked `closure` to its global address `ga`, or -- if
+   a closure with that GA already exists locally -- common the two up and
+   return the surviving closure.  Also records (oldGA,newGA) pairs in the
+   gaga buffer for thunks, so the sender can be sent an ACK mapping.
+   Returns the closure the caller should link into the graph (may differ
+   from the argument when commoning-up kept the pre-existing copy). */
+static StgClosure *
+SetGAandCommonUp(globalAddr *ga, StgClosure *closure, rtsBool hasGA)
+{
+  StgClosure *existing;
+  StgInfoTable *ip, *oldip;
+  globalAddr *newGA;
+
+  if (!hasGA)
+    return closure;
+
+  /* should we already have a local copy? */
+  /* 0xFFFFFFFF is the sentinel packed by GlobaliseAndPackGA when the
+     destination PE already owned the closure */
+  if (ga->weight==0xFFFFFFFF) {
+    ASSERT(ga->payload.gc.gtid==mytid); //sanity
+    ga->weight=0;
+    /* probably should also ASSERT that a commonUp takes place...*/
+  }
+
+  ip = get_itbl(closure);
+  if ((existing = GALAlookup(ga)) == NULL) {
+    /* Just keep the new object */
+    IF_PAR_DEBUG(pack,
+		 belch("*<## New local object for GA ((%x, %d, %x)) is %p (%s)",
+		       ga->payload.gc.gtid, ga->payload.gc.slot, ga->weight,
+		       closure, info_type(closure)));
+
+    // make an entry binding closure to ga in the RemoteGA table
+    newGA = setRemoteGA(closure, ga, rtsTrue);
+    // if local closure is a FETCH_ME etc fill in the global indirection
+    if (ip->type == FETCH_ME || ip->type == REMOTE_REF)
+      ((StgFetchMe *)closure)->ga = newGA;
+  } else {
+
+
+#ifdef DIST
+// ***************************************************************************
+// ***************************************************************************
+// REMOTE_REF HACK  - dual is in PackRemoteRef
+// - prevents the weight ever being updated
+    if (ip->type == REMOTE_REF)
+      ga->weight=0;
+// ***************************************************************************
+// ***************************************************************************
+#endif /* DIST */
+
+    /* Two closures, one global name.  Someone loses */
+    oldip = get_itbl(existing);
+    /* keep the new copy only if the existing one is "worthless" (a
+       FETCH_ME, a black hole, or a shareable CONSTR) and the new one is
+       not itself a FETCH_ME */
+    if ((oldip->type == FETCH_ME ||
+	 IS_BLACK_HOLE(existing) ||
+	 /* try to share evaluated closures */
+	 oldip->type == CONSTR ||
+	 oldip->type == CONSTR_1_0 ||
+	 oldip->type == CONSTR_0_1 ||
+	 oldip->type == CONSTR_2_0 ||
+	 oldip->type == CONSTR_1_1 ||
+	 oldip->type == CONSTR_0_2
+	 ) &&
+	ip->type != FETCH_ME)
+    {
+      IF_PAR_DEBUG(pack,
+		   belch("*<#- Duplicate local object for GA ((%x, %d, %x)); redirecting %p (%s) -> %p (%s)",
+			 ga->payload.gc.gtid, ga->payload.gc.slot, ga->weight,
+			 existing, info_type(existing), closure, info_type(closure)));
+
+      /*
+       * What we had wasn't worth keeping, so make the old closure an
+       * indirection to the new closure (copying BQs if necessary) and
+       * make sure that the old entry is not the preferred one for this
+       * closure.
+       */
+      CommonUp(existing, closure);
+      //GALAdeprecate(ga);
+#if defined(DEBUG)
+      {
+	StgInfoTable *info;
+	nat size, ptrs, nonptrs, vhs, i;
+	char str[80];
+
+	/* get info about basic layout of the closure */
+	info = get_closure_info(GALAlookup(ga), &size, &ptrs, &nonptrs, &vhs, str);
+
+	/* now ga indirectly refers to the new closure */
+	ASSERT(size<_HS+MIN_UPD_SIZE ||
+	       UNWIND_IND(GALAlookup(ga))==closure);
+      }
+#endif
+    } else {
+      /*
+       * Either we already had something worthwhile by this name or
+       * the new thing is just another FetchMe.  However, the thing we
+       * just unpacked has to be left as-is, or the child unpacking
+       * code will fail.  Remember that the way pointer words are
+       * filled in depends on the info pointers of the parents being
+       * the same as when they were packed.
+       */
+      IF_PAR_DEBUG(pack,
+		   belch("*<#@ Duplicate local object for GA ((%x, %d, %x)); keeping %p (%s) nuking unpacked %p (%s)",
+			 ga->payload.gc.gtid, ga->payload.gc.slot, ga->weight,
+			 existing, info_type(existing), closure, info_type(closure)));
+
+      /* overwrite 2nd word; indicates that the closure is garbage */
+      IF_DEBUG(sanity,
+	       ((StgFetchMe*)closure)->ga = (globalAddr*)GARBAGE_MARKER;
+	       IF_PAR_DEBUG(pack,
+			    belch("++++ unpacked closure %p (%s) is garbage: %p",
+				  closure, info_type(closure), *(closure+1))));
+
+      closure = existing;
+#if 0
+      // HACK
+      ty = get_itbl(closure)->type;
+      if (ty == CONSTR ||
+	  ty == CONSTR_1_0 ||
+	  ty == CONSTR_0_1 ||
+	  ty == CONSTR_2_0 ||
+	  ty == CONSTR_1_1 ||
+	  ty == CONSTR_0_2)
+	CommonUp(closure, graph);
+#endif
+    }
+    /* We don't use this GA after all, so give back the weight */
+    (void) addWeight(ga);
+  }
+
+  /* if we have unpacked a FETCH_ME, we have a GA, too */
+  ASSERT(get_itbl(closure)->type!=FETCH_ME ||
+	 looks_like_ga(((StgFetchMe*)closure)->ga));
+
+  /* Sort out the global address mapping */
+  if (ip_THUNK(ip)){
+    // || // (ip_THUNK(ip) && !ip_UNPOINTED(ip)) ||
+    //(ip_MUTABLE(ip) && ip->type != FETCH_ME)) {
+    /* Make up new GAs for single-copy closures */
+    globalAddr *newGA = makeGlobal(closure, rtsTrue);
+
+    // It's a new GA and therefore has the full weight
+    ASSERT(newGA->weight==0);
+
+    /* Create an old GA to new GA mapping */
+    *gaga++ = *ga;
+    splitWeight(gaga, newGA);
+    /* inlined splitWeight; we know that newGALA has full weight
+    newGA->weight = gaga->weight = 1L << (BITS_IN(unsigned) - 1);
+    gaga->payload = newGA->payload;
+    */
+    ASSERT(gaga->weight == 1U << (BITS_IN(unsigned) - 1));
+    gaga++;
+  }
+  return closure;
+}
+
+/*
+ Copies a segment of the buffer, starting at @bufptr@, representing a closure
+ into the heap at @graph@.
+ */
+//@cindex FillInClosure
+/* Copy one closure from the pack buffer (at *bufptrP) into the heap at
+   `graph`: fixed header, variable header and non-pointers are copied;
+   pointer fields are left to be filled in later by the parent-tracking
+   loop in UnpackGraph.  ARR_WORDS and PAP/AP_UPD closures are delegated
+   to their specialised unpackers.  Advances *bufptrP past the closure and
+   returns the closure's size in words in the heap. */
+static nat
+FillInClosure(StgWord ***bufptrP, StgClosure *graph)
+{
+  StgInfoTable *ip;
+  StgWord **bufptr = *bufptrP;
+  nat ptrs, nonptrs, vhs, i, size;
+  char str[80];
+
+  ASSERT(LOOKS_LIKE_GHC_INFO(((StgClosure*)bufptr)->header.info));
+
+  /*
+   * Close your eyes.  You don't want to see where we're looking. You
+   * can't get closure info until you've unpacked the variable header,
+   * but you don't know how big it is until you've got closure info.
+   * So...we trust that the closure in the buffer is organized the
+   * same way as they will be in the heap...at least up through the
+   * end of the variable header.
+   */
+  ip = get_closure_info((StgClosure *)bufptr, &size, &ptrs, &nonptrs, &vhs, str);
+
+  /* Make sure that nothing sans the fixed header is filled in
+     The ga field of the FETCH_ME is filled in in SetGAandCommonUp */
+  if (ip->type == FETCH_ME || ip->type == REMOTE_REF) {
+    ASSERT(size>=_HS+MIN_UPD_SIZE);    // size of the FM in the heap
+    ptrs = nonptrs = vhs = 0;      // i.e. only unpack FH from buffer
+  }
+  /* ToDo: check whether this is really needed */
+  if (ip->type == ARR_WORDS) {
+    UnpackArray(bufptrP, graph);
+    return arr_words_sizeW((StgArrWords *)bufptr);
+  }
+
+  if (ip->type == PAP || ip->type == AP_UPD) {
+    return UnpackPAP(bufptrP, graph); // includes size of unpacked FMs
+  }
+
+  /*
+     Remember, the generic closure layout is as follows:
+        +-------------------------------------------------+
+        | FIXED HEADER | VARIABLE HEADER | PTRS | NON-PRS |
+        +-------------------------------------------------+
+  */
+  /* Fill in the fixed header */
+  for (i = 0; i < _HS; i++)
+    ((StgPtr)graph)[i] = (StgWord)*bufptr++;
+
+  /* Fill in the packed variable header */
+  for (i = 0; i < vhs; i++)
+    ((StgPtr)graph)[_HS + i] = (StgWord)*bufptr++;
+
+  /* Pointers will be filled in later */
+
+  /* Fill in the packed non-pointers */
+  for (i = 0; i < nonptrs; i++)
+    ((StgPtr)graph)[_HS + i + vhs + ptrs] = (StgWord)*bufptr++;
+
+  /* Indirections are never packed */
+  // ASSERT(INFO_PTR(graph) != (W_) Ind_info_TO_USE);
+  // return bufptr;
+   *bufptrP = bufptr;
+   ASSERT(((ip->type==FETCH_ME || ip->type==REMOTE_REF)&& sizeofW(StgFetchMe)==size) ||
+	  _HS+vhs+ptrs+nonptrs == size);
+  return size;
+}
+
+/*
+ Find the next pointer field in the parent closure.
+ If the current parent has been completely unpacked already, get the
+ next closure from the global closure queue.
+*/
+//@cindex LocateNextParent
+/* Advance *pptrP to the next pointer field of the current parent closure;
+   when the parent is exhausted, dequeue the next parent from the global
+   closure queue and reset the pointer index.  On return *parentP may be
+   NULL (queue empty => unpacking done); otherwise *pptrP/*pptrsP/*sizeP
+   describe the (possibly new) parent. */
+static void
+LocateNextParent(parentP, pptrP, pptrsP, sizeP)
+StgClosure **parentP;
+nat *pptrP, *pptrsP, *sizeP;
+{
+  StgInfoTable *ip; // debugging
+  nat nonptrs, pvhs;
+  char str[80];
+
+  /* pptr as an index into the current parent; find the next pointer field
+     in the parent by increasing pptr; if that takes us off the closure
+     (i.e. *pptr + 1 > *pptrs) grab a new parent from the closure queue
+  */
+  (*pptrP)++;
+  while (*pptrP + 1 > *pptrsP) {
+    /* *parentP has been constructed (all pointer set); so check it now */
+    IF_DEBUG(sanity,
+	     if ((*parentP!=(StgClosure*)NULL) &&         // not root
+		 (*((StgPtr)(*parentP)+1)!=GARBAGE_MARKER) && // not commoned up
+		 (get_itbl(*parentP)->type != FETCH_ME))
+	     checkClosure(*parentP));
+
+    *parentP = DeQueueClosure();
+
+    if (*parentP == NULL)
+      break;
+    else {
+      ip = get_closure_info(*parentP, sizeP, pptrsP, &nonptrs,
+			    &pvhs, str);
+      *pptrP = 0;
+    }
+  }
+  /* *parentP points to the new (or old) parent; */
+  /* *pptr, *pptrs and *size have been updated referring to the new parent */
+}
+
+/*
+ UnpackClosure is the heart of the unpacking routine. It is called for
+ every closure found in the packBuffer. Any prefix such as GA, PLC marker
+ etc has been unpacked into the *ga structure.
+ UnpackClosure does the following:
+ - check for the kind of the closure (PLC, Offset, std closure)
+ - copy the contents of the closure from the buffer into the heap
+ - update LAGA tables (in particular if we end up with 2 closures
+ having the same GA, we make one an indirection to the other)
+ - set the GAGA map in order to send back an ACK message
+
+ At the end of this function *graphP has been updated to point to the
+ next free word in the heap for unpacking the rest of the graph and
+ *bufptrP points to the next word in the pack buffer to be unpacked.
+*/
+
+/* Unpack one closure from the buffer.  Dispatches on the kind of prefix
+   already decoded into *ga: a PLC (static-closure address), an offset
+   (back-reference into the buffer), or a real closure body, which is
+   copied into the heap at *graphP, queued for child processing, and
+   possibly commoned up with an existing local copy.  *bufptrP and *graphP
+   are advanced; the (possibly commoned-up) closure is returned. */
+static  StgClosure*
+UnpackClosure (StgWord ***bufptrP, StgClosure **graphP, globalAddr *ga) {
+  StgClosure *closure;
+  nat size;
+  rtsBool hasGA = rtsFalse, unglobalised = rtsFalse;
+
+  /* Now unpack the closure body, if there is one; three cases:
+     - PLC: closure is just a pointer to a static closure
+     - Offset: closure has been unpacked already
+     - else: copy data from packet into closure
+  */
+  if (isFixed(ga)) {
+    closure = UnpackPLC(ga);
+  } else if (isOffset(ga)) {
+    closure = UnpackOffset(ga);
+  } else {
+    /* if not PLC or Offset it must be a GA and then the closure */
+    ASSERT(RtsFlags.ParFlags.globalising!=0 || LOOKS_LIKE_GA(ga));
+    /* check whether this is an unglobalised closure */
+    unglobalised = isUnglobalised(ga);
+    /* Now we have to build something. */
+    hasGA = !isConstr(ga);
+    /* the new closure will be built here */
+    closure = *graphP;
+
+    /* fill in the closure from the buffer */
+    size = FillInClosure(/*in/out*/bufptrP, /*in*/closure);
+    /* if it is unglobalised, it may not be a thunk!! */
+    ASSERT(!unglobalised || !closure_THUNK(closure));
+
+    /* Add to queue for processing */
+    QueueClosure(closure);
+
+    /* common up with other graph if necessary */
+    if (!unglobalised)
+      closure = SetGAandCommonUp(ga, closure, hasGA);
+
+    /* if we unpacked a THUNK, check that it is large enough to update */
+    ASSERT(!closure_THUNK(closure) || size>=_HS+MIN_UPD_SIZE);
+    /* graph shall point to next free word in the heap */
+    *graphP += size;
+    //*graphP += (size < _HS+MIN_UPD_SIZE) ? _HS+MIN_UPD_SIZE : size; // see ASSERT
+  }
+  return closure;
+}
+
+/*
+ @UnpackGraph@ unpacks the graph contained in a message buffer. It
+ returns a pointer to the new graph. The @gamap@ parameter is set to
+ point to an array of (oldGA,newGA) pairs which were created as a result
+ of unpacking the buffer; @nGAs@ is set to the number of GA pairs which
+ were created.
+
+ The format of graph in the pack buffer is as defined in @Pack.lc@. */
+
+//@cindex UnpackGraph
+/* Top-level unpacker: reconstructs in the heap the graph contained in
+   `packBuffer` and returns its root.  *gamap is set to the array of
+   (oldGA,newGA) pairs created while unpacking (for the ACK message) and
+   *nGAs to the number of such pairs.  Allocates packBuffer->unpacked_size
+   words of heap up front, then walks the buffer closure by closure,
+   wiring each unpacked closure into the next pointer slot of its parent. */
+StgClosure *
+UnpackGraph(packBuffer, gamap, nGAs)
+rtsPackBuffer *packBuffer;
+globalAddr **gamap;
+nat *nGAs;
+{
+  StgWord **bufptr, **slotptr;
+  globalAddr gaS;
+  StgClosure *closure, *graphroot, *graph, *parent;
+  nat size, heapsize, bufsize,
+      pptr = 0, pptrs = 0, pvhs = 0;
+  nat unpacked_closures = 0, unpacked_thunks = 0; // stats only
+
+  IF_PAR_DEBUG(resume,
+	       graphFingerPrint[0] = '\0');
+
+  ASSERT(_HS==1); // HWL HACK; compile time constant
+
+#if defined(PAR_TICKY) // HWL HAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACK
+  PAR_TICKY_UNPACK_GRAPH_START();
+#endif
+
+  /* Initialisation */
+  InitPacking(rtsTrue);      // same as in PackNearbyGraph
+  globalUnpackBuffer = packBuffer;
+
+  IF_DEBUG(sanity, // do a sanity check on the incoming packet
+	   checkPacket(packBuffer));
+
+  ASSERT(gaga==PendingGABuffer);
+  graphroot = (StgClosure *)NULL;
+
+  /* Unpack the header */
+  bufsize = packBuffer->size;
+  heapsize = packBuffer->unpacked_size;
+  bufptr = packBuffer->buffer;
+
+  /* allocate heap */
+  if (heapsize > 0) {
+    graph = (StgClosure *)allocate(heapsize);
+    ASSERT(graph != NULL);
+    // parallel global statistics: increase amount of global data
+    if (RtsFlags.ParFlags.ParStats.Global &&
+	RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+      globalParStats.tot_global += heapsize;
+    }
+  }
+
+  /* iterate over the buffer contents and unpack all closures */
+  parent = (StgClosure *)NULL;
+  do {
+    /* check that we aren't at the end of the buffer, yet */
+    IF_DEBUG(sanity, ASSERT(*bufptr != END_OF_BUFFER_MARKER));
+
+    /* This is where we will ultimately save the closure's address */
+    slotptr = bufptr;
+
+    /* fill in gaS from buffer; gaS may receive GA, PLC- or offset-marker */
+    bufptr = UnpackGA(/*in*/bufptr, /*out*/&gaS);
+
+    /* this allocates heap space, updates LAGA tables etc */
+    closure = UnpackClosure (/*in/out*/&bufptr, /*in/out*/&graph, /*in*/&gaS);
+    unpacked_closures++; // stats only; doesn't count FMs in PAP!!!
+    unpacked_thunks += (closure_THUNK(closure)) ? 1 : 0; // stats only
+
+    /*
+     * Set parent pointer to point to chosen closure.  If we're at the top of
+     * the graph (our parent is NULL), then we want to arrange to return the
+     * chosen closure to our caller (possibly in place of the allocated graph
+     * root.)
+     */
+    if (parent == NULL)
+      graphroot = closure;
+    else
+      ((StgPtr)parent)[_HS + pvhs + pptr] = (StgWord) closure;
+
+    /* Save closure pointer for resolving offsets */
+    *slotptr = (StgWord*) closure;
+
+    /* Locate next parent pointer */
+    LocateNextParent(&parent, &pptr, &pptrs, &size);
+
+    IF_DEBUG(sanity,
+	     gaS.weight = 0xdeadffff;
+	     gaS.payload.gc.gtid = 0xdead;
+	     gaS.payload.gc.slot = 0xdeadbeef;);
+  } while (parent != NULL);
+
+  IF_PAR_DEBUG(resume,
+	       GraphFingerPrint(graphroot, graphFingerPrint);
+	       ASSERT(strlen(graphFingerPrint)<=MAX_FINGER_PRINT_LEN);
+	       belch(">>> Fingerprint of graph rooted at %p (after unpacking <<%d>>:\n    {%s}",
+		     graphroot, packBuffer->id, graphFingerPrint));
+
+  /* we unpacked exactly as many words as there are in the buffer */
+  ASSERT(bufsize == (nat) (bufptr-(packBuffer->buffer)));
+  /* we filled no more heap closure than we allocated at the beginning;
+     ideally this should be a ==;
+     NB: test is only valid if we unpacked anything at all (graphroot might
+         end up to be a PLC!), therefore the strange test for HEAP_ALLOCED
+  */
+
+  /*
+  {
+   StgInfoTable *info = get_itbl(graphroot);
+   ASSERT(!HEAP_ALLOCED(graphroot) || heapsize >= (nat) (graph-graphroot) ||
+	  // ToDo: check whether CAFs are really a special case here!!
+	  info->type==CAF_BLACKHOLE || info->type==FETCH_ME || info->type==FETCH_ME_BQ);
+  }
+  */
+
+  /* check for magic end-of-buffer word */
+  IF_DEBUG(sanity, ASSERT(*bufptr == END_OF_BUFFER_MARKER));
+
+  *gamap = PendingGABuffer;
+  *nGAs = (gaga - PendingGABuffer) / 2;
+
+  IF_PAR_DEBUG(tables,
+	       belch("** LAGA table after unpacking closure %p:",
+		     graphroot);
+	       printLAGAtable());
+
+  /* ToDo: are we *certain* graphroot has been set??? WDP 95/07 */
+  ASSERT(graphroot!=NULL);
+
+  IF_DEBUG(sanity,
+           {
+	     StgPtr p;
+
+	     /* check the unpacked graph */
+	     //checkHeapChunk(graphroot,graph-sizeof(StgWord));
+
+	     // if we do sanity checks, then wipe the pack buffer after unpacking
+	     for (p=(StgPtr)packBuffer->buffer; p<(StgPtr)(packBuffer->buffer)+(packBuffer->size); )
+	       *p++ = 0xdeadbeef;
+            });
+
+  /* reset the global variable */
+  globalUnpackBuffer = (rtsPackBuffer*)NULL;
+
+#if defined(PAR_TICKY) // HWL HAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACK
+  PAR_TICKY_UNPACK_GRAPH_END(unpacked_closures, unpacked_thunks);
+#endif
+
+  return (graphroot);
+}
+
+//@cindex UnpackGA
+/* Decode the GA/PLC prefix of a packed closure into *ga.  The first word
+   is the weight, which doubles as a tag:
+     == 2 : unglobalised closure follows immediately (no further GA words);
+     >  0 : a proper GA (or offset marker): gtid and slot words follow;
+     == 0 : a PLC: a single static-closure address follows.
+   Returns the buffer pointer advanced past the prefix. */
+static  StgWord **
+UnpackGA(StgWord **bufptr, globalAddr *ga)
+{
+  /* First, unpack the next GA or PLC */
+  ga->weight = (rtsWeight) *bufptr++;
+
+  if (ga->weight == 2) {  // unglobalised closure to follow
+    // nothing to do; closure starts at *bufptr
+  } else if (ga->weight > 0) { // fill in GA
+    ga->payload.gc.gtid = (GlobalTaskId) *bufptr++;
+    ga->payload.gc.slot = (int) *bufptr++;
+  } else {
+    ga->payload.plc = (StgPtr) *bufptr++;
+  }
+  return bufptr;
+}
+
+//@cindex UnpackPLC
+/* A PLC needs no unpacking: the packed "GA" already carries the static
+   closure's local address; just return it. */
+static  StgClosure *
+UnpackPLC(globalAddr *ga)
+{
+  /* No more to unpack; just set closure to local address */
+  IF_PAR_DEBUG(pack,
+	       belch("*<^^ Unpacked PLC at %x", ga->payload.plc));
+  return (StgClosure*)ga->payload.plc;
+}
+
+//@cindex UnpackOffset
+/* Resolve an offset marker: the closure was unpacked earlier and its heap
+   address was cached back into the pack buffer (see UnpackGraph's
+   *slotptr write); fetch it from the slot recorded in ga->payload.gc.slot. */
+static  StgClosure *
+UnpackOffset(globalAddr *ga)
+{
+  /* globalUnpackBuffer is a global var init in UnpackGraph */
+  ASSERT(globalUnpackBuffer!=(rtsPackBuffer*)NULL);
+  /* No more to unpack; just set closure to cached address */
+  IF_PAR_DEBUG(pack,
+	       belch("*<__ Unpacked indirection to %p (was OFFSET %d)",
+		     (StgClosure *)((globalUnpackBuffer->buffer)[ga->payload.gc.slot]),
+		     ga->payload.gc.slot));
+  return (StgClosure *)(globalUnpackBuffer->buffer)[ga->payload.gc.slot];
+}
+
+/*
+ Input: *bufptrP, *graphP ... ptrs to the pack buffer and into the heap.
+
+ *bufptrP points to something that should be unpacked as a FETCH_ME:
+ |
+ v
+ +-------------------------------
+ | GA | FH of FM
+ +-------------------------------
+
+ The first 3 words starting at *bufptrP are the GA address; the next
+ word is the generic FM info ptr followed by the remaining FH (if any)
+ The result after unpacking will be a FETCH_ME closure, pointed to by
+ *graphP at the start of the fct;
+ |
+ v
+ +------------------------+
+ | FH of FM | ptr to a GA |
+ +------------------------+
+
+ The ptr field points into the RemoteGA table, which holds the actual GA.
+ *bufptrP has been updated to point to the next word in the buffer.
+ *graphP has been updated to point to the first free word at the end.
+*/
+
+/* Unpack a FETCH_ME closure from *bufptrP into the heap at *graphP.
+   If the packed prefix is an offset, return the previously unpacked
+   closure instead (leaving *graphP unchanged).  Otherwise build the FM,
+   common it up via SetGAandCommonUp (which also installs its ga field),
+   advance *graphP by sizeofW(StgFetchMe) and return the surviving closure. */
+static StgClosure*
+UnpackFetchMe (StgWord ***bufptrP, StgClosure **graphP) {
+  StgClosure *closure, *foo;
+  globalAddr gaS;
+
+  /* This fct relies on size of FM < size of FM in pack buffer */
+  ASSERT(sizeofW(StgFetchMe)<=PACK_FETCHME_SIZE);
+
+  /* fill in gaS from buffer */
+  *bufptrP = UnpackGA(*bufptrP, &gaS);
+  /* might be an offset to a closure in the pack buffer */
+  if (isOffset(&gaS)) {
+    belch("*<   UnpackFetchMe: found OFFSET to %d when unpacking FM at buffer loc %p",
+	  gaS.payload.gc.slot, *bufptrP);
+
+    closure = UnpackOffset(&gaS);
+    /* return address of previously unpacked closure; leaves *graphP unchanged */
+    return closure;
+  }
+
+  /* we have a proper GA at hand */
+  ASSERT(LOOKS_LIKE_GA(&gaS));
+
+  IF_DEBUG(sanity,
+	   if (isFixed(&gaS))
+	   barf("*<   UnpackFetchMe: found PLC where FM was expected %p (%s)",
+		*bufptrP, info_type((StgClosure*)*bufptrP)));
+
+  IF_PAR_DEBUG(pack,
+	       belch("*<_- Unpacked @ %p a FETCH_ME to GA ",
+		     *graphP);
+	       printGA(&gaS);
+	       fputc('\n', stderr));
+
+  /* the next thing must be the IP to a FETCH_ME closure */
+  ASSERT(get_itbl((StgClosure *)*bufptrP)->type == FETCH_ME);
+
+  closure = *graphP;
+  /* fill in the closure from the buffer */
+  FillInClosure(bufptrP, closure);
+
+  /* the newly built closure is a FETCH_ME */
+  ASSERT(get_itbl(closure)->type == FETCH_ME);
+
+  /* common up with other graph if necessary
+     this also assigns the contents of gaS to the ga field of the FM closure */
+  foo = SetGAandCommonUp(&gaS, closure, rtsTrue);
+
+  ASSERT(foo!=closure || LOOKS_LIKE_GA(((StgFetchMe*)closure)->ga));
+
+  IF_PAR_DEBUG(pack,
+	       if (foo==closure) {  // only if not commoned up
+		 belch("*<_- current FM @ %p next FM @ %p; unpacked FM @ %p is ",
+		       *graphP, *graphP+sizeofW(StgFetchMe), closure);
+		 printClosure(closure);
+	       });
+  *graphP += sizeofW(StgFetchMe);
+  return foo;
+}
+
+/*
+ Unpack an array of words.
+ Could use generic unpack most of the time, but cleaner to separate it.
+ ToDo: implement packing of MUT_ARRAYs
+*/
+
+//@cindex UnpackArray
+/* Unpack an ARR_WORDS-style closure: copy the two header words (info ptr
+   and word count -- note the _HS==1 assumption) and then the raw payload
+   words from the buffer into the heap closure at `graph`.  Advances
+   *bufptrP past the whole array. */
+static void
+UnpackArray(StgWord ***bufptrP, StgClosure *graph)
+{
+  StgInfoTable *info;
+  StgWord **bufptr=*bufptrP;
+  nat size, ptrs, nonptrs, vhs, i, n;
+  char str[80];
+
+  /* yes, I know I am paranoid; but who's asking !? */
+  IF_DEBUG(sanity,
+	   info = get_closure_info((StgClosure*)bufptr,
+				   &size, &ptrs, &nonptrs, &vhs, str);
+	   ASSERT(info->type == ARR_WORDS || info->type == MUT_ARR_PTRS ||
+		  info->type == MUT_ARR_PTRS_FROZEN || info->type == MUT_VAR));
+
+  n = ((StgArrWords *)bufptr)->words;
+  // this includes the header!: arr_words_sizeW(stgCast(StgArrWords*,q));
+
+  IF_PAR_DEBUG(pack,
+	       if (n<100)
+	         belch("*<== unpacking an array of %d words %p (%s) (size=%d) |%s|\n",
+		     n, (StgClosure*)bufptr, info_type((StgClosure*)bufptr),
+		     arr_words_sizeW((StgArrWords *)bufptr),
+		     /* print array (string?) */
+		     ((StgArrWords *)graph)->payload);
+	       else
+	         belch("*<== unpacking an array of %d words %p (%s) (size=%d)\n",
+		     n, (StgClosure*)bufptr, info_type((StgClosure*)bufptr),
+		     arr_words_sizeW((StgArrWords *)bufptr)));
+
+  /* Unpack the header (2 words: info ptr and the number of words to follow) */
+  ((StgArrWords *)graph)->header.info = (StgInfoTable*)*bufptr++;  // assumes _HS==1; yuck!
+  ((StgArrWords *)graph)->words = (StgWord)*bufptr++;
+
+  /* unpack the payload of the closure (all non-ptrs) */
+  for (i=0; i<n; i++)
+    ((StgArrWords *)graph)->payload[i] = (StgWord)*bufptr++;
+
+  ASSERT(bufptr==*bufptrP+arr_words_sizeW((StgArrWords *)*bufptrP));
+  *bufptrP = bufptr;
+}
+
+/*
+ Unpack a PAP in the buffer into a heap closure.
+ For each FETCHME we find in the packed PAP we have to unpack a separate
+ FETCHME closure and insert a pointer to this closure into the PAP.
+ We unpack all FETCHMEs into an area after the PAP proper (the `FM area').
+ Note that the size of a FETCHME in the buffer is exactly the same as
+ the size of an unpacked FETCHME plus 1 word for the pointer to it.
+ Therefore, we just allocate packed_size words in the heap for the unpacking.
+ After this routine the heap starting from *graph looks like this:
+
+ graph
+ |
+ v PAP closure | FM area |
+ +------------------------------------------------------------+
+ | PAP header | n_args | fun | payload ... | FM_1 | FM_2 .... |
+ +------------------------------------------------------------+
+
+ where payload contains pointers to each of the unpacked FM_1, FM_2 ...
+ The size of the PAP closure plus all FMs is _HS+2+packed_size.
+*/
+
+//@cindex UnpackPAP
+static nat
+UnpackPAP(StgWord ***bufptrP, StgClosure *graph)
+{
+ nat n, i, j, packed_size = 0;
+ StgPtr p, q, end, payload_start, p_FMs;
+ const StgInfoTable* info;
+ StgWord bitmap;
+ StgWord **bufptr = *bufptrP;
+#if defined(DEBUG)
+ nat FMs_in_PAP=0;
+ void checkPAPSanity(StgPAP *graph, StgPtr p_FM_begin, StgPtr p_FM_end);
+#endif
+
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP: unpacking PAP @ %p with %d words to closure %p",
+ *bufptr, *(bufptr+1), graph));
+
+ /* Unpack the PAP header (both fixed and variable) */
+ ((StgPAP *)graph)->header.info = (StgInfoTable*)*bufptr++;
+ n = ((StgPAP *)graph)->n_args = (StgWord)*bufptr++;
+ ((StgPAP *)graph)->fun = (StgClosure*)*bufptr++;
+ packed_size = (nat)*bufptr++;
+
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP: PAP header is [%p, %d, %p] %d",
+ ((StgPAP *)graph)->header.info,
+ ((StgPAP *)graph)->n_args,
+ ((StgPAP *)graph)->fun,
+ packed_size));
+
+ payload_start = (StgPtr)bufptr;
+ /* p points to the current word in the heap */
+ p = (StgPtr)((StgPAP *)graph)->payload; // payload of PAP will be unpacked here
+ p_FMs = (StgPtr)graph+pap_sizeW((StgPAP*)graph); // FMs will be unpacked here
+ end = (StgPtr) payload_start+packed_size;
+ /*
+ The main loop unpacks the PAP in *bufptr into *p, with *p_FMS as the
+ FM area for unpacking all FETCHMEs encountered during unpacking.
+ */
+ while ((StgPtr)bufptr<end) {
+ /* be sure that we don't write more than we allocated for this closure */
+ ASSERT(p_FMs <= (StgPtr)(graph+_HS+2+packed_size));
+ /* be sure that the unpacked PAP doesn't run into the FM area */
+ ASSERT(p < (StgPtr)(graph+pap_sizeW((StgPAP*)graph)));
+ /* the loop body has been borrowed from scavenge_stack */
+ q = *bufptr; // let q be the contents of the current pointer into the buffer
+
+ /* Test whether the next thing is a FETCH_ME.
+ In PAPs FETCH_ME are encoded via a starting marker of ARGTAG_MAX+1
+ */
+ if (q==(StgPtr)(ARGTAG_MAX+1)) {
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: unpacking FM; filling in ptr to FM area: %p",
+ p, p_FMs));
+ bufptr++; // skip ARGTAG_MAX+1 marker
+ // Unpack a FM into the FM area after the PAP proper and insert pointer
+ *p++ = (StgWord)UnpackFetchMe(&bufptr, (StgClosure**)&p_FMs);
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ continue;
+ }
+
+ /* Test whether it is a PLC */
+ if (q==(StgPtr)0) { // same as isFixed(q)
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: unpacking PLC to %p",
+ p, *(bufptr+1)));
+ bufptr++; // skip 0 marker
+ *p++ = (StgWord)*bufptr++;
+ continue;
+ }
+
+ /* If we've got a tag, pack all words in that block */
+ if (IS_ARG_TAG((W_)q)) { // q stands for the no. of non-ptrs to follow
+ nat m = ARG_SIZE(q); // first word after this block
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: unpacking %d words (tagged), starting @ %p",
+ p, m, p));
+ for (i=0; i<m+1; i++)
+ *p++ = (StgWord)*bufptr++;
+ continue;
+ }
+
+ /*
+ * Otherwise, q must be the info pointer of an activation
+ * record. All activation records have 'bitmap' style layout
+ * info.
+ */
+ info = get_itbl((StgClosure *)q);
+ switch (info->type) {
+
+ /* Dynamic bitmap: the mask is stored on the stack */
+ case RET_DYN:
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: RET_DYN",
+ p));
+
+ /* Pack the header as is */
+ ((StgRetDyn *)p)->info = (StgWord)*bufptr++;
+ ((StgRetDyn *)p)->liveness = (StgWord)*bufptr++;
+ ((StgRetDyn *)p)->ret_addr = (StgWord)*bufptr++;
+ p += 3;
+
+ //bitmap = ((StgRetDyn *)p)->liveness;
+ //p = (P_)&((StgRetDyn *)p)->payload[0];
+ goto small_bitmap;
+
+ /* probably a slow-entry point return address: */
+ case FUN:
+ case FUN_STATIC:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: FUN or FUN_STATIC",
+ p));
+
+ ((StgClosure *)p)->header.info = (StgInfoTable*)*bufptr;
+ p++;
+
+ goto follow_srt; //??
+ }
+
+ /* Using generic code here; could inline as in scavenge_stack */
+ case UPDATE_FRAME:
+ {
+ StgUpdateFrame *frame = (StgUpdateFrame *)p;
+ //nat type = get_itbl(frame->updatee)->type;
+
+ //ASSERT(type==BLACKHOLE || type==CAF_BLACKHOLE || type==BLACKHOLE_BQ);
+
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnackPAP @ %p: UPDATE_FRAME",
+ p));
+
+ ((StgUpdateFrame *)p)->header.info = (StgInfoTable*)*bufptr++;
+ ((StgUpdateFrame *)p)->link = (StgUpdateFrame*)*bufptr++; // ToDo: fix intra-stack pointer
+ ((StgUpdateFrame *)p)->updatee = (StgClosure*)*bufptr++; // ToDo: follow link
+
+ p += 3;
+ }
+
+ /* small bitmap (< 32 entries, or 64 on a 64-bit machine) */
+ case STOP_FRAME:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: STOP_FRAME",
+ p));
+ ((StgStopFrame *)p)->header.info = (StgInfoTable*)*bufptr;
+ p++;
+ }
+
+ case CATCH_FRAME:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: CATCH_FRAME",
+ p));
+
+ ((StgCatchFrame *)p)->header.info = (StgInfoTable*)*bufptr++;
+ ((StgCatchFrame *)p)->link = (StgUpdateFrame*)*bufptr++;
+ ((StgCatchFrame *)p)->exceptions_blocked = (StgInt)*bufptr++;
+ ((StgCatchFrame *)p)->handler = (StgClosure*)*bufptr++;
+ p += 4;
+ }
+
+ case SEQ_FRAME:
+ {
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: UPDATE_FRAME",
+ p));
+
+ ((StgSeqFrame *)p)->header.info = (StgInfoTable*)*bufptr++;
+ ((StgSeqFrame *)p)->link = (StgUpdateFrame*)*bufptr++;
+
+ // ToDo: handle bitmap
+ bitmap = info->layout.bitmap;
+
+ p = (StgPtr)&(((StgClosure *)p)->payload);
+ goto small_bitmap;
+ }
+ case RET_BCO:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: RET_{BCO,SMALL,VEC_SMALL}",
+ p));
+
+
+ ((StgClosure *)p)->header.info = (StgInfoTable*)*bufptr++;
+ p++;
+ // ToDo: handle bitmap
+ bitmap = info->layout.bitmap;
+ /* this assumes that the payload starts immediately after the info-ptr */
+
+ small_bitmap:
+ while (bitmap != 0) {
+ if ((bitmap & 1) == 0) {
+ *p++ = (StgWord)UnpackFetchMe(&bufptr, (StgClosure**)&p_FMs);
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ } else {
+ *p++ = (StgWord)*bufptr++;
+ }
+ bitmap = bitmap >> 1;
+ }
+
+ follow_srt:
+ belch("*<-- UnpackPAP: nothing to do for follow_srt");
+ continue;
+
+ /* large bitmap (> 32 entries) */
+ case RET_BIG:
+ case RET_VEC_BIG:
+ {
+ StgPtr q;
+ StgLargeBitmap *large_bitmap;
+
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP @ %p: RET_{BIG,VEC_BIG} (large_bitmap=%p)",
+ p, info->layout.large_bitmap));
+
+
+ ((StgClosure *)p)->header.info = (StgInfoTable*)*bufptr++;
+ p++;
+
+ large_bitmap = info->layout.large_bitmap;
+
+ for (j=0; j<large_bitmap->size; j++) {
+ bitmap = large_bitmap->bitmap[j];
+ q = p + BITS_IN(W_);
+ while (bitmap != 0) {
+ if ((bitmap & 1) == 0) {
+ *p++ = (StgWord)UnpackFetchMe(&bufptr, (StgClosure**)&p_FMs);
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ } else {
+ *p++ = (StgWord)*bufptr;
+ }
+ bitmap = bitmap >> 1;
+ }
+ if (j+1 < large_bitmap->size) {
+ while (p < q) {
+ *p++ = (StgWord)UnpackFetchMe(&bufptr, (StgClosure**)&p_FMs);
+ IF_DEBUG(sanity, FMs_in_PAP++);
+ }
+ }
+ }
+
+ /* and don't forget to follow the SRT */
+ goto follow_srt;
+ }
+
+ default:
+ barf("UnpackPAP: weird activation record found on stack: %d",
+ (int)(info->type));
+ }
+ }
+ IF_PAR_DEBUG(pack,
+ belch("*<** UnpackPAP finished; unpacked closure @ %p is:",
+ (StgClosure *)graph);
+ printClosure((StgClosure *)graph));
+
+ IF_DEBUG(sanity, /* check sanity of unpacked PAP */
+ checkClosure(graph));
+
+ *bufptrP = bufptr;
+ /*
+ Now p points to the first word after the PAP proper and p_FMs points
+ to the next free word in the heap; everything between p and p_FMs are
+ FETCHMEs
+ */
+ IF_DEBUG(sanity,
+ checkPAPSanity(graph, p, p_FMs));
+
+ /* we have to return the size of PAP + FMs as size of the unpacked thing */
+ ASSERT(graph+pap_sizeW((StgPAP*)graph)==p);
+ return (nat)((StgClosure*)p_FMs-graph);
+}
+
+#if defined(DEBUG)
+/*
+ Check sanity of a PAP after unpacking the PAP.
+ This means that there is slice of heap after the PAP containing FETCHMEs
+*/
+void
+checkPAPSanity(StgPAP *graph, StgPtr p_FM_begin, StgPtr p_FM_end)
+{
+ StgPtr xx;
+
+ /* check that the main unpacked closure is a PAP */
+ ASSERT(graph->header.info = &stg_PAP_info);
+ checkClosure(graph);
+ /* check that all of the closures in the FM-area are FETCHMEs */
+ for (xx=p_FM_begin; xx<p_FM_end; xx += sizeofW(StgFetchMe)) {
+ /* must be a FETCHME closure */
+ ASSERT(((StgClosure*)xx)->header.info == &stg_FETCH_ME_info);
+ /* it might have been commoned up (=> marked as garbage);
+ otherwise it points to a GA */
+ ASSERT((((StgFetchMe*)xx)->ga)==GARBAGE_MARKER ||
+ LOOKS_LIKE_GA(((StgFetchMe*)xx)->ga));
+ }
+ /* traverse the payload of the PAP */
+ for (xx=graph->payload; xx-(StgPtr)(graph->payload)<graph->n_args; xx++) {
+ /* if the current elem is a pointer into the FM area, check that
+ the GA field is ok */
+ ASSERT(!(p_FM_begin<(StgPtr)*xx && (StgPtr)*xx<p_FM_end) ||
+ LOOKS_LIKE_GA(((StgFetchMe*)*xx)->ga));
+ }
+}
+#endif /* DEBUG */
+#endif /* PAR */
+
+//@node GranSim Code, , GUM code, Unpacking routines
+//@subsubsection GranSim Code
+
+/*
+ For GrAnSim: No actual unpacking should be necessary. We just
+ have to walk over the graph and set the bitmasks appropriately.
+ Since we use RBHs similarly to GUM but without an ACK message/event
+ we have to revert the RBH from within the UnpackGraph routine (good luck!)
+ -- HWL
+*/
+
+#if defined(GRAN)
+void
+CommonUp(StgClosure *src, StgClosure *dst)
+{
+ barf("CommonUp: should never be entered in a GranSim setup");
+}
+
+StgClosure*
+UnpackGraph(buffer)
+rtsPackBuffer* buffer;
+{
+ nat size, ptrs, nonptrs, vhs,
+ bufptr = 0;
+ StgClosure *closure, *graphroot, *graph;
+ StgInfoTable *ip;
+ StgWord bufsize, unpackedsize,
+ pptr = 0, pptrs = 0, pvhs;
+ StgTSO* tso;
+ char str[240], str1[80];
+ int i;
+
+ bufptr = 0;
+ graphroot = buffer->buffer[0];
+
+ tso = buffer->tso;
+
+ /* Unpack the header */
+ unpackedsize = buffer->unpacked_size;
+ bufsize = buffer->size;
+
+ IF_GRAN_DEBUG(pack,
+ belch("<<< Unpacking <<%d>> (buffer @ %p):\n (root @ %p, PE %d,size=%d), demanded by TSO %d (%p)[PE %d]",
+ buffer->id, buffer, graphroot, where_is(graphroot),
+ bufsize, tso->id, tso,
+ where_is((StgClosure *)tso)));
+
+ do {
+ closure = buffer->buffer[bufptr++]; /* that's all we need for GrAnSim -- HWL */
+
+ /* Actually only ip is needed; rest is useful for TESTING -- HWL */
+ ip = get_closure_info(closure,
+ &size, &ptrs, &nonptrs, &vhs, str);
+
+ IF_GRAN_DEBUG(pack,
+ sprintf(str, "** (%p): Changing bitmask[%s]: 0x%x ",
+ closure, (closure_HNF(closure) ? "NF" : "__"),
+ PROCS(closure)));
+
+ if (get_itbl(closure)->type == RBH) {
+ /* if it's an RBH, we have to revert it into a normal closure, thereby
+ awakening the blocking queue; not that this is code currently not
+ needed in GUM, but it should be added with the new features in
+ GdH (and the implementation of an NACK message)
+ */
+ // closure->header.gran.procs = PE_NUMBER(CurrentProc);
+ SET_GRAN_HDR(closure, PE_NUMBER(CurrentProc)); /* Move node */
+
+ IF_GRAN_DEBUG(pack,
+ strcat(str, " (converting RBH) "));
+
+ convertFromRBH(closure); /* In GUM that's done by convertToFetchMe */
+
+ IF_GRAN_DEBUG(pack,
+ belch(":: closure %p (%s) is a RBH; after reverting: IP=%p",
+ closure, info_type(closure), get_itbl(closure)));
+ } else if (IS_BLACK_HOLE(closure)) {
+ IF_GRAN_DEBUG(pack,
+ belch(":: closure %p (%s) is a BH; copying node to %d",
+ closure, info_type(closure), CurrentProc));
+ closure->header.gran.procs |= PE_NUMBER(CurrentProc); /* Copy node */
+ } else if ( (closure->header.gran.procs & PE_NUMBER(CurrentProc)) == 0 ) {
+ if (closure_HNF(closure)) {
+ IF_GRAN_DEBUG(pack,
+ belch(":: closure %p (%s) is a HNF; copying node to %d",
+ closure, info_type(closure), CurrentProc));
+ closure->header.gran.procs |= PE_NUMBER(CurrentProc); /* Copy node */
+ } else {
+ IF_GRAN_DEBUG(pack,
+ belch(":: closure %p (%s) is no (R)BH or HNF; moving node to %d",
+ closure, info_type(closure), CurrentProc));
+ closure->header.gran.procs = PE_NUMBER(CurrentProc); /* Move node */
+ }
+ }
+
+ IF_GRAN_DEBUG(pack,
+ sprintf(str1, "0x%x", PROCS(closure)); strcat(str, str1));
+ IF_GRAN_DEBUG(pack, belch(str));
+
+ } while (bufptr<buffer->size) ; /* (parent != NULL); */
+
+ /* In GrAnSim we allocate pack buffers dynamically! -- HWL */
+ free(buffer->buffer);
+ free(buffer);
+
+ IF_GRAN_DEBUG(pack,
+ belch("PrintGraph of %p is:", graphroot); PrintGraph(graphroot,0));
+
+ return (graphroot);
+}
+#endif /* GRAN */
+
+//@node Aux fcts for packing, Printing Packet Contents, Unpacking routines, Graph packing
+//@subsection Aux fcts for packing
+
+//@menu
+//* Offset table::
+//* Packet size::
+//* Types of Global Addresses::
+//* Closure Info::
+//@end menu
+
+//@node Offset table, Packet size, Aux fcts for packing, Aux fcts for packing
+//@subsubsection Offset table
+
+/*
+ DonePacking is called when we've finished packing. It releases memory
+ etc. */
+
+//@cindex DonePacking
+
+# if defined(PAR)
+
+static void
+DonePacking(void)
+{
+ freeHashTable(offsetTable, NULL);
+ offsetTable = NULL;
+}
+
+/*
+ AmPacking records that the closure is being packed. Note the abuse of
+ the data field in the hash table -- this saves calling @malloc@! */
+
+//@cindex AmPacking
+
+static void
+AmPacking(closure)
+StgClosure *closure;
+{
+ insertHashTable(offsetTable, (StgWord) closure, (void *) (StgWord) pack_locn);
+}
+
+/*
+ OffsetFor returns an offset for a closure which is already being packed. */
+
+//@cindex OffsetFor
+
+static int
+OffsetFor(closure)
+StgClosure *closure;
+{
+ return (int) (StgWord) lookupHashTable(offsetTable, (StgWord) closure);
+}
+
+/*
+ NotYetPacking determines whether the closure's already being packed.
+ Offsets $<$ @PACK_HDR_SIZE@ (e.g. 0) mean no. */
+
+//@cindex NotYetPacking
+
+static rtsBool
+NotYetPacking(offset)
+int offset;
+{
+ return(offset == 0); // ToDo: what if root is found again?? FIX
+}
+
+# else /* GRAN */
+
+static void
+DonePacking(void)
+{
+ /* nothing */
+}
+
+/*
+ NotYetPacking searches through the whole pack buffer for closure. */
+
+static rtsBool
+NotYetPacking(closure)
+StgClosure *closure;
+{ nat i;
+ rtsBool found = rtsFalse;
+
+ for (i=0; (i<pack_locn) && !found; i++)
+ found = globalPackBuffer->buffer[i]==closure;
+
+ return (!found);
+}
+# endif
+
+//@node Packet size, Closure Info, Offset table, Aux fcts for packing
+//@subsubsection Packet size
+
+/*
+ The size needed if all currently queued closures are packed as FETCH_ME
+ closures. This represents the headroom we must have when packing the
+ buffer in order to maintain all links in the graphs.
+*/
+// ToDo: check and merge cases
+#if defined(PAR)
+static nat
+QueuedClosuresMinSize (nat ptrs) {
+ return ((clq_size - clq_pos) + ptrs) * PACK_FETCHME_SIZE;
+}
+#else /* GRAN */
+static nat
+QueuedClosuresMinSize (nat ptrs) {
+ return ((clq_size - clq_pos) + ptrs) * PACK_FETCHME_SIZE;
+}
+#endif
+
+/*
+ RoomToPack determines whether there's room to pack the closure into
+ the pack buffer based on
+
+ o how full the buffer is already,
+ o the closures' size and number of pointers (which must be packed as GAs),
+ o the size and number of pointers held by any primitive arrays that it
+ points to
+
+ It has a *side-effect* (naughty, naughty) in assigning roomInBuffer
+ to rtsFalse.
+*/
+
+//@cindex RoomToPack
+static rtsBool
+RoomToPack(size, ptrs)
+nat size, ptrs;
+{
+# if defined(PAR)
+ if (roomInBuffer &&
+ (pack_locn + // where we are in the buffer right now
+ size + // space needed for the current closure
+ QueuedClosuresMinSize(ptrs) // space for queued closures as FETCH_MEs
+ + 1 // headroom (DEBUGGING only)
+ >=
+ RTS_PACK_BUFFER_SIZE))
+ {
+ roomInBuffer = rtsFalse;
+ }
+# else /* GRAN */
+ if (roomInBuffer &&
+ (unpacked_size +
+ size +
+ QueuedClosuresMinSize(ptrs)
+ >=
+ RTS_PACK_BUFFER_SIZE))
+ {
+ roomInBuffer = rtsFalse;
+ }
+# endif
+ return (roomInBuffer);
+}
+
+//@node Closure Info, , Packet size, Aux fcts for packing
+//@subsubsection Closure Info
+
+/*
+ Closure Info
+
+ @get_closure_info@ determines the size, number of pointers etc. for this
+ type of closure -- see @SMInfoTables.lh@ for the legal info. types etc.
+
+[Can someone please keep this function up to date. I keep needing it
+ (or something similar) for interpretive code, and it keeps
+ bit-rotting. {\em It really belongs somewhere else too}. KH @@ 17/2/95] */
+
+#if 0
+
+// {Parallel.h}Daq ngoqvam vIroQpu'
+
+# if defined(GRAN) || defined(PAR)
+/* extracting specific info out of closure; currently only used in GRAN -- HWL */
+//@cindex get_closure_info
+StgInfoTable*
+get_closure_info(node, size, ptrs, nonptrs, vhs, info_hdr_ty)
+StgClosure* node;
+nat *size, *ptrs, *nonptrs, *vhs;
+char *info_hdr_ty;
+{
+ StgInfoTable *info;
+
+ info = get_itbl(node);
+ /* the switch shouldn't be necessary, really; just use default case */
+ switch (info->type) {
+#if 0
+ case CONSTR_1_0:
+ case THUNK_1_0:
+ case FUN_1_0:
+ *size = sizeW_fromITBL(info);
+ *ptrs = (nat) 1; // (info->layout.payload.ptrs);
+ *nonptrs = (nat) 0; // (info->layout.payload.nptrs);
+ *vhs = (nat) 0; // unknown
+ info_hdr_type(node, info_hdr_ty);
+ return info;
+
+ case CONSTR_0_1:
+ case THUNK_0_1:
+ case FUN_0_1:
+ *size = sizeW_fromITBL(info);
+ *ptrs = (nat) 0; // (info->layout.payload.ptrs);
+ *nonptrs = (nat) 1; // (info->layout.payload.nptrs);
+ *vhs = (nat) 0; // unknown
+ info_hdr_type(node, info_hdr_ty);
+ return info;
+
+ case CONSTR_2_0:
+ case THUNK_2_0:
+ case FUN_2_0:
+ *size = sizeW_fromITBL(info);
+ *ptrs = (nat) 2; // (info->layout.payload.ptrs);
+ *nonptrs = (nat) 0; // (info->layout.payload.nptrs);
+ *vhs = (nat) 0; // unknown
+ info_hdr_type(node, info_hdr_ty);
+ return info;
+
+ case CONSTR_1_1:
+ case THUNK_1_1:
+ case FUN_1_1:
+ *size = sizeW_fromITBL(info);
+ *ptrs = (nat) 1; // (info->layout.payload.ptrs);
+ *nonptrs = (nat) 1; // (info->layout.payload.nptrs);
+ *vhs = (nat) 0; // unknown
+ info_hdr_type(node, info_hdr_ty);
+ return info;
+
+ case CONSTR_0_2:
+ case THUNK_0_2:
+ case FUN_0_2:
+ *size = sizeW_fromITBL(info);
+ *ptrs = (nat) 0; // (info->layout.payload.ptrs);
+ *nonptrs = (nat) 2; // (info->layout.payload.nptrs);
+ *vhs = (nat) 0; // unknown
+ info_hdr_type(node, info_hdr_ty);
+ return info;
+#endif
+ case RBH:
+ {
+ StgInfoTable *rip = REVERT_INFOPTR(info); // closure to revert to
+ *size = sizeW_fromITBL(rip);
+ *ptrs = (nat) (rip->layout.payload.ptrs);
+ *nonptrs = (nat) (rip->layout.payload.nptrs);
+ *vhs = (nat) 0; // unknown
+ info_hdr_type(node, info_hdr_ty);
+ return rip; // NB: we return the reverted info ptr for a RBH!!!!!!
+ }
+
+ default:
+ *size = sizeW_fromITBL(info);
+ *ptrs = (nat) (info->layout.payload.ptrs);
+ *nonptrs = (nat) (info->layout.payload.nptrs);
+ *vhs = (nat) 0; // unknown
+ info_hdr_type(node, info_hdr_ty);
+ return info;
+ }
+}
+
+//@cindex IS_BLACK_HOLE
+rtsBool
+IS_BLACK_HOLE(StgClosure* node)
+{
+ StgInfoTable *info;
+ info = get_itbl(node);
+ return ((info->type == BLACKHOLE || info->type == RBH) ? rtsTrue : rtsFalse);
+}
+
+//@cindex IS_INDIRECTION
+StgClosure *
+IS_INDIRECTION(StgClosure* node)
+{
+ StgInfoTable *info;
+ info = get_itbl(node);
+ switch (info->type) {
+ case IND:
+ case IND_OLDGEN:
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ case IND_STATIC:
+ /* relies on indirectee being at same place for all these closure types */
+ return (((StgInd*)node) -> indirectee);
+ default:
+ return NULL;
+ }
+}
+
+/*
+rtsBool
+IS_THUNK(StgClosure* node)
+{
+ StgInfoTable *info;
+ info = get_itbl(node);
+ return ((info->type == THUNK ||
+ info->type == THUNK_STATIC ||
+ info->type == THUNK_SELECTOR) ? rtsTrue : rtsFalse);
+}
+*/
+
+# endif /* GRAN */
+#endif /* 0 */
+
+# if 0
+/* ngoq ngo' */
+
+P_
+get_closure_info(closure, size, ptrs, nonptrs, vhs, type)
+P_ closure;
+W_ *size, *ptrs, *nonptrs, *vhs;
+char *type;
+{
+ P_ ip = (P_) INFO_PTR(closure);
+
+ if (closure==NULL) {
+ fprintf(stderr, "Qagh {get_closure_info}Daq: NULL closure\n");
+ *size = *ptrs = *nonptrs = *vhs = 0;
+ strcpy(type,"ERROR in get_closure_info");
+ return;
+ } else if (closure==PrelBase_Z91Z93_closure) {
+ /* fprintf(stderr, "Qagh {get_closure_info}Daq: PrelBase_Z91Z93_closure closure\n"); */
+ *size = *ptrs = *nonptrs = *vhs = 0;
+ strcpy(type,"PrelBase_Z91Z93_closure");
+ return;
+ };
+
+ ip = (P_) INFO_PTR(closure);
+
+ switch (INFO_TYPE(ip)) {
+ case INFO_SPEC_U_TYPE:
+ case INFO_SPEC_S_TYPE:
+ case INFO_SPEC_N_TYPE:
+ *size = SPEC_CLOSURE_SIZE(closure);
+ *ptrs = SPEC_CLOSURE_NoPTRS(closure);
+ *nonptrs = SPEC_CLOSURE_NoNONPTRS(closure);
+ *vhs = 0 /*SPEC_VHS*/;
+ strcpy(type,"SPEC");
+ break;
+
+ case INFO_GEN_U_TYPE:
+ case INFO_GEN_S_TYPE:
+ case INFO_GEN_N_TYPE:
+ *size = GEN_CLOSURE_SIZE(closure);
+ *ptrs = GEN_CLOSURE_NoPTRS(closure);
+ *nonptrs = GEN_CLOSURE_NoNONPTRS(closure);
+ *vhs = GEN_VHS;
+ strcpy(type,"GEN");
+ break;
+
+ case INFO_DYN_TYPE:
+ *size = DYN_CLOSURE_SIZE(closure);
+ *ptrs = DYN_CLOSURE_NoPTRS(closure);
+ *nonptrs = DYN_CLOSURE_NoNONPTRS(closure);
+ *vhs = DYN_VHS;
+ strcpy(type,"DYN");
+ break;
+
+ case INFO_TUPLE_TYPE:
+ *size = TUPLE_CLOSURE_SIZE(closure);
+ *ptrs = TUPLE_CLOSURE_NoPTRS(closure);
+ *nonptrs = TUPLE_CLOSURE_NoNONPTRS(closure);
+ *vhs = TUPLE_VHS;
+ strcpy(type,"TUPLE");
+ break;
+
+ case INFO_DATA_TYPE:
+ *size = DATA_CLOSURE_SIZE(closure);
+ *ptrs = DATA_CLOSURE_NoPTRS(closure);
+ *nonptrs = DATA_CLOSURE_NoNONPTRS(closure);
+ *vhs = DATA_VHS;
+ strcpy(type,"DATA");
+ break;
+
+ case INFO_IMMUTUPLE_TYPE:
+ case INFO_MUTUPLE_TYPE:
+ *size = MUTUPLE_CLOSURE_SIZE(closure);
+ *ptrs = MUTUPLE_CLOSURE_NoPTRS(closure);
+ *nonptrs = MUTUPLE_CLOSURE_NoNONPTRS(closure);
+ *vhs = MUTUPLE_VHS;
+ strcpy(type,"(IM)MUTUPLE");
+ break;
+
+ case INFO_STATIC_TYPE:
+ *size = STATIC_CLOSURE_SIZE(closure);
+ *ptrs = STATIC_CLOSURE_NoPTRS(closure);
+ *nonptrs = STATIC_CLOSURE_NoNONPTRS(closure);
+ *vhs = STATIC_VHS;
+ strcpy(type,"STATIC");
+ break;
+
+ case INFO_CAF_TYPE:
+ case INFO_IND_TYPE:
+ *size = IND_CLOSURE_SIZE(closure);
+ *ptrs = IND_CLOSURE_NoPTRS(closure);
+ *nonptrs = IND_CLOSURE_NoNONPTRS(closure);
+ *vhs = IND_VHS;
+ strcpy(type,"CAF|IND");
+ break;
+
+ case INFO_CONST_TYPE:
+ *size = CONST_CLOSURE_SIZE(closure);
+ *ptrs = CONST_CLOSURE_NoPTRS(closure);
+ *nonptrs = CONST_CLOSURE_NoNONPTRS(closure);
+ *vhs = CONST_VHS;
+ strcpy(type,"CONST");
+ break;
+
+ case INFO_SPEC_RBH_TYPE:
+ *size = SPEC_RBH_CLOSURE_SIZE(closure);
+ *ptrs = SPEC_RBH_CLOSURE_NoPTRS(closure);
+ *nonptrs = SPEC_RBH_CLOSURE_NoNONPTRS(closure);
+ if (*ptrs <= 2) {
+ *nonptrs -= (2 - *ptrs);
+ *ptrs = 1;
+ } else
+ *ptrs -= 1;
+ *vhs = SPEC_RBH_VHS;
+ strcpy(type,"SPEC_RBH");
+ break;
+
+ case INFO_GEN_RBH_TYPE:
+ *size = GEN_RBH_CLOSURE_SIZE(closure);
+ *ptrs = GEN_RBH_CLOSURE_NoPTRS(closure);
+ *nonptrs = GEN_RBH_CLOSURE_NoNONPTRS(closure);
+ if (*ptrs <= 2) {
+ *nonptrs -= (2 - *ptrs);
+ *ptrs = 1;
+ } else
+ *ptrs -= 1;
+ *vhs = GEN_RBH_VHS;
+ strcpy(type,"GEN_RBH");
+ break;
+
+ case INFO_CHARLIKE_TYPE:
+ *size = CHARLIKE_CLOSURE_SIZE(closure);
+ *ptrs = CHARLIKE_CLOSURE_NoPTRS(closure);
+ *nonptrs = CHARLIKE_CLOSURE_NoNONPTRS(closure);
+ *vhs = CHARLIKE_VHS;
+ strcpy(type,"CHARLIKE");
+ break;
+
+ case INFO_INTLIKE_TYPE:
+ *size = INTLIKE_CLOSURE_SIZE(closure);
+ *ptrs = INTLIKE_CLOSURE_NoPTRS(closure);
+ *nonptrs = INTLIKE_CLOSURE_NoNONPTRS(closure);
+ *vhs = INTLIKE_VHS;
+ strcpy(type,"INTLIKE");
+ break;
+
+# if !defined(GRAN)
+ case INFO_FETCHME_TYPE:
+ *size = FETCHME_CLOSURE_SIZE(closure);
+ *ptrs = FETCHME_CLOSURE_NoPTRS(closure);
+ *nonptrs = FETCHME_CLOSURE_NoNONPTRS(closure);
+ *vhs = FETCHME_VHS;
+ strcpy(type,"FETCHME");
+ break;
+
+ case INFO_FMBQ_TYPE:
+ *size = FMBQ_CLOSURE_SIZE(closure);
+ *ptrs = FMBQ_CLOSURE_NoPTRS(closure);
+ *nonptrs = FMBQ_CLOSURE_NoNONPTRS(closure);
+ *vhs = FMBQ_VHS;
+ strcpy(type,"FMBQ");
+ break;
+# endif
+
+ case INFO_BQ_TYPE:
+ *size = BQ_CLOSURE_SIZE(closure);
+ *ptrs = BQ_CLOSURE_NoPTRS(closure);
+ *nonptrs = BQ_CLOSURE_NoNONPTRS(closure);
+ *vhs = BQ_VHS;
+ strcpy(type,"BQ");
+ break;
+
+ case INFO_BH_TYPE:
+ *size = BH_CLOSURE_SIZE(closure);
+ *ptrs = BH_CLOSURE_NoPTRS(closure);
+ *nonptrs = BH_CLOSURE_NoNONPTRS(closure);
+ *vhs = BH_VHS;
+ strcpy(type,"BH");
+ break;
+
+ case INFO_TSO_TYPE:
+ *size = 0; /* TSO_CLOSURE_SIZE(closure); */
+ *ptrs = 0; /* TSO_CLOSURE_NoPTRS(closure); */
+ *nonptrs = 0; /* TSO_CLOSURE_NoNONPTRS(closure); */
+ *vhs = TSO_VHS;
+ strcpy(type,"TSO");
+ break;
+
+ case INFO_STKO_TYPE:
+ *size = 0;
+ *ptrs = 0;
+ *nonptrs = 0;
+ *vhs = STKO_VHS;
+ strcpy(type,"STKO");
+ break;
+
+ default:
+ fprintf(stderr, "get_closure_info: Unexpected closure type (%lu), closure %lx\n",
+ INFO_TYPE(ip), (StgWord) closure);
+ EXIT(EXIT_FAILURE);
+ }
+
+ return ip;
+}
+# endif
+
+# if 0
+// Use allocate in Storage.c instead
+/*
+ @AllocateHeap@ will bump the heap pointer by @size@ words if the space
+ is available, but it will not perform garbage collection.
+ ToDo: check whether we can use an existing STG allocation routine -- HWL
+*/
+
+
+//@cindex AllocateHeap
+StgPtr
+AllocateHeap(size)
+nat size;
+{
+ StgPtr newClosure;
+
+ /* Allocate a new closure */
+ if (Hp + size > HpLim)
+ return NULL;
+
+ newClosure = Hp + 1;
+ Hp += size;
+
+ return newClosure;
+}
+# endif
+
+# if defined(PAR)
+
+//@cindex doGlobalGC
+void
+doGlobalGC(void)
+{
+ fprintf(stderr,"Splat -- we just hit global GC!\n");
+ stg_exit(EXIT_FAILURE);
+ //fishing = rtsFalse;
+ outstandingFishes--;
+}
+
+# endif /* PAR */
+
+//@node Printing Packet Contents, End of file, Aux fcts for packing, Graph packing
+//@subsection Printing Packet Contents
+/*
+ Printing Packet Contents
+ */
+
+#if defined(DEBUG) || defined(GRAN_CHECK)
+
+//@cindex PrintPacket
+
+#if defined(PAR)
+void
+PrintPacket(packBuffer)
+rtsPackBuffer *packBuffer;
+{
+ StgClosure *parent, *graphroot, *closure_start;
+ const StgInfoTable *ip;
+ globalAddr ga;
+ StgWord **bufptr, **slotptr;
+
+ nat bufsize;
+ nat pptr = 0, pptrs = 0, pvhs;
+ nat locn = 0;
+ nat i;
+ nat size, ptrs, nonptrs, vhs;
+ char str[80];
+
+ /* disable printing if a non-std globalisation scheme is used; ToDo: FIX */
+ if (RtsFlags.ParFlags.globalising != 0)
+ return;
+
+ /* NB: this whole routine is more or less a copy of UnpackGraph with all
+ unpacking components replaced by printing fcts
+ Long live higher-order fcts!
+ */
+ /* Initialisation */
+ //InitPackBuffer(); /* in case it isn't already init'd */
+ InitClosureQueue();
+ // ASSERT(gaga==PendingGABuffer);
+ graphroot = (StgClosure *)NULL;
+
+ /* Unpack the header */
+ bufsize = packBuffer->size;
+ bufptr = packBuffer->buffer;
+
+ fprintf(stderr, "*. Printing <<%d>> (buffer @ %p):\n",
+ packBuffer->id, packBuffer);
+ fprintf(stderr, "*. size: %d; unpacked_size: %d; tso: %p; buffer: %p\n",
+ packBuffer->size, packBuffer->unpacked_size,
+ packBuffer->tso, packBuffer->buffer);
+
+ parent = (StgClosure *)NULL;
+
+ do {
+ /* This is where we will ultimately save the closure's address */
+ slotptr = bufptr;
+ locn = slotptr-(packBuffer->buffer); // index of closure in buffer
+
+ /* First, unpack the next GA or PLC */
+ ga.weight = (rtsWeight) *bufptr++;
+
+ if (ga.weight == 2) { // unglobalised closure to follow
+ // nothing to do; closure starts at *bufptr
+ } else if (ga.weight > 0) { // fill in GA
+ ga.payload.gc.gtid = (GlobalTaskId) *bufptr++;
+ ga.payload.gc.slot = (int) *bufptr++;
+ } else
+ ga.payload.plc = (StgPtr) *bufptr++;
+
+ /* Now unpack the closure body, if there is one */
+ if (isFixed(&ga)) {
+ fprintf(stderr, "*. %u: PLC @ %p\n", locn, ga.payload.plc);
+ // closure = ga.payload.plc;
+ } else if (isOffset(&ga)) {
+ fprintf(stderr, "*. %u: OFFSET TO %d\n", locn, ga.payload.gc.slot);
+ // closure = (StgClosure *) buffer[ga.payload.gc.slot];
+ } else {
+ /* Print normal closures */
+
+ ASSERT(bufsize > 0);
+
+ fprintf(stderr, "*. %u: ((%x, %d, %x)) ", locn,
+ ga.payload.gc.gtid, ga.payload.gc.slot, ga.weight);
+
+ closure_start = (StgClosure*)bufptr;
+ ip = get_closure_info((StgClosure *)bufptr,
+ &size, &ptrs, &nonptrs, &vhs, str);
+
+ /* ToDo: check whether this is really needed */
+ if (ip->type == FETCH_ME || ip->type == REMOTE_REF) {
+ size = _HS;
+ ptrs = nonptrs = vhs = 0;
+ }
+ /* ToDo: check whether this is really needed */
+ if (ip->type == ARR_WORDS) {
+ ptrs = vhs = 0;
+ nonptrs = ((StgArrWords *)bufptr)->words;
+ size = arr_words_sizeW((StgArrWords *)bufptr);
+ }
+
+ /* special code for printing a PAP in a buffer */
+ if (ip->type == PAP || ip->type == AP_UPD) {
+ vhs = 3;
+ ptrs = 0;
+ nonptrs = (nat)((StgPAP *)bufptr)->payload[0];
+ size = _HS+vhs+ptrs+nonptrs;
+ }
+
+ /*
+ Remember, the generic closure layout is as follows:
+ +-------------------------------------------------+
+ | FIXED HEADER | VARIABLE HEADER | PTRS | NON-PRS |
+ +-------------------------------------------------+
+ */
+ /* Print fixed header */
+ fprintf(stderr, "FH [");
+ for (i = 0; i < _HS; i++)
+ fprintf(stderr, " %p", *bufptr++);
+
+ if (ip->type == FETCH_ME || ip->type == REMOTE_REF)
+ size = ptrs = nonptrs = vhs = 0;
+
+ // VH is always empty in the new RTS
+ ASSERT(vhs==0 ||
+ ip->type == PAP || ip->type == AP_UPD);
+ /* Print variable header */
+ fprintf(stderr, "] VH [");
+ for (i = 0; i < vhs; i++)
+ fprintf(stderr, " %p", *bufptr++);
+
+ //fprintf(stderr, "] %d PTRS [", ptrs);
+ /* Pointers will be filled in later */
+
+ fprintf(stderr, " ] (%d, %d) [", ptrs, nonptrs);
+ /* Print non-pointers */
+ for (i = 0; i < nonptrs; i++)
+ fprintf(stderr, " %p", *bufptr++);
+
+ fprintf(stderr, "] (%s)\n", str);
+
+ /* Indirections are never packed */
+ // ASSERT(INFO_PTR(graph) != (W_) Ind_info_TO_USE);
+
+ /* Add to queue for processing
+ When just printing the packet we do not have an unpacked closure
+ in hand, so we feed it the packet entry;
+ again, this assumes that at least the fixed header of the closure
+ has the same layout in the packet; also we may not overwrite entries
+ in the packet (done in Unpack), but for printing that's a bad idea
+ anyway */
+ QueueClosure((StgClosure *)closure_start);
+
+ /* No Common up needed for printing */
+
+ /* No Sort out the global address mapping for printing */
+
+ } /* normal closure case */
+
+ /* Locate next parent pointer */
+ pptr++;
+ while (pptr + 1 > pptrs) {
+ parent = DeQueueClosure();
+
+ if (parent == NULL)
+ break;
+ else {
+ (void) get_closure_info(parent, &size, &pptrs, &nonptrs,
+ &pvhs, str);
+ pptr = 0;
+ }
+ }
+ } while (parent != NULL);
+ fprintf(stderr, "*. --- End packet <<%d>> (claimed size=%d; real size=%d)---\n",
+ packBuffer->id, packBuffer->size, size);
+
+}
+
+/*
+ Doing a sanity check on a packet.
+ This does a full iteration over the packet, as in PrintPacket.
+*/
+//@cindex checkPacket
+void
+checkPacket(packBuffer)
+rtsPackBuffer *packBuffer;
+{
+ StgClosure *parent, *graphroot, *closure_start;
+ const StgInfoTable *ip;
+ globalAddr ga;
+ StgWord **bufptr, **slotptr;
+
+ nat bufsize;
+ nat pptr = 0, pptrs = 0, pvhs;
+ nat locn = 0;
+ nat size, ptrs, nonptrs, vhs;
+ char str[80];
+
+ /* NB: this whole routine is more or less a copy of UnpackGraph with all
+ unpacking components replaced by printing fcts
+ Long live higher-order fcts!
+ */
+ /* Initialisation */
+ //InitPackBuffer(); /* in case it isn't already init'd */
+ InitClosureQueue();
+ // ASSERT(gaga==PendingGABuffer);
+ graphroot = (StgClosure *)NULL;
+
+ /* Unpack the header */
+ bufsize = packBuffer->size;
+ bufptr = packBuffer->buffer;
+ parent = (StgClosure *)NULL;
+ ASSERT(bufsize > 0);
+ do {
+ /* check that we are not at the end of the buffer, yet */
+ IF_DEBUG(sanity, ASSERT(*bufptr != END_OF_BUFFER_MARKER));
+
+ /* This is where we will ultimately save the closure's address */
+ slotptr = bufptr;
+ locn = slotptr-(packBuffer->buffer); // index of closure in buffer
+ ASSERT(locn<=bufsize);
+
+ /* First, check whether we have a GA, a PLC, or an OFFSET at hand */
+ ga.weight = (rtsWeight) *bufptr++;
+
+ if (ga.weight == 2) { // unglobalised closure to follow
+ // nothing to do; closure starts at *bufptr
+ } else if (ga.weight > 0) { // fill in GA
+ ga.payload.gc.gtid = (GlobalTaskId) *bufptr++;
+ ga.payload.gc.slot = (int) *bufptr++;
+ } else
+ ga.payload.plc = (StgPtr) *bufptr++;
+
+ /* Now unpack the closure body, if there is one */
+ if (isFixed(&ga)) {
+ /* It's a PLC */
+ ASSERT(LOOKS_LIKE_STATIC(ga.payload.plc));
+ } else if (isOffset(&ga)) {
+ ASSERT(ga.payload.gc.slot<=(int)bufsize);
+ } else {
+ /* normal closure */
+ ASSERT(!RtsFlags.ParFlags.globalising==0 || LOOKS_LIKE_GA(&ga));
+
+ closure_start = (StgClosure*)bufptr;
+ ASSERT(LOOKS_LIKE_GHC_INFO((StgPtr)*bufptr));
+ ip = get_closure_info((StgClosure *)bufptr,
+ &size, &ptrs, &nonptrs, &vhs, str);
+
+ /* ToDo: check whether this is really needed */
+ if (ip->type == FETCH_ME || ip->type == REMOTE_REF) {
+ size = _HS;
+ ptrs = nonptrs = vhs = 0;
+ }
+ /* ToDo: check whether this is really needed */
+ if (ip->type == ARR_WORDS) {
+ ptrs = vhs = 0;
+ nonptrs = ((StgArrWords *)bufptr)->words+1; // payload+words
+ size = arr_words_sizeW((StgArrWords *)bufptr);
+ ASSERT(size==_HS+vhs+nonptrs);
+ }
+ /* special code for printing a PAP in a buffer */
+ if (ip->type == PAP || ip->type == AP_UPD) {
+ vhs = 3;
+ ptrs = 0;
+ nonptrs = (nat)((StgPAP *)bufptr)->payload[0];
+ size = _HS+vhs+ptrs+nonptrs;
+ }
+
+ /* no checks on contents of closure (pointers aren't packed anyway) */
+ ASSERT(_HS+vhs+nonptrs>=MIN_NONUPD_SIZE);
+ bufptr += _HS+vhs+nonptrs;
+
+ /* Add to queue for processing */
+ QueueClosure((StgClosure *)closure_start);
+
+ /* No Common up needed for checking */
+
+ /* No Sort out the global address mapping for checking */
+
+ } /* normal closure case */
+
+ /* Locate next parent pointer */
+ pptr++;
+ while (pptr + 1 > pptrs) {
+ parent = DeQueueClosure();
+
+ if (parent == NULL)
+ break;
+ else {
+ //ASSERT(LOOKS_LIKE_GHC_INFO((StgPtr)*parent));
+ (void) get_closure_info(parent, &size, &pptrs, &nonptrs,
+ &pvhs, str);
+ pptr = 0;
+ }
+ }
+ } while (parent != NULL);
+ /* we unpacked exactly as many words as there are in the buffer */
+ ASSERT(packBuffer->size == bufptr-(packBuffer->buffer));
+ /* check for magic end-of-buffer word */
+ IF_DEBUG(sanity, ASSERT(*bufptr == END_OF_BUFFER_MARKER));
+}
+#else /* GRAN */
+void
+PrintPacket(buffer)
+rtsPackBuffer *buffer;
+{
+ // extern char *info_hdr_type(P_ infoptr); /* defined in Threads.lc */
+ // extern char *display_info_type(P_ infoptr); /* defined in Threads.lc */
+
+ StgInfoTable *info;
+ nat size, ptrs, nonptrs, vhs;
+ char info_hdr_ty[80];
+ char str1[80], str2[80], junk_str[80];
+
+ /* globalAddr ga; */
+
+ nat bufsize, unpacked_size ;
+ StgClosure *parent;
+ nat pptr = 0, pptrs = 0, pvhs;
+
+ nat unpack_locn = 0;
+ nat gastart = unpack_locn;
+ nat closurestart = unpack_locn;
+
+ StgTSO *tso;
+ StgClosure *closure, *p;
+
+ nat i;
+
+ fprintf(stderr, "*** Printing <<%d>> (buffer @ %p):\n", buffer->id, buffer);
+ fprintf(stderr, " size: %d; unpacked_size: %d; tso: %d (%p); buffer: %p\n",
+ buffer->size, buffer->unpacked_size, buffer->tso, buffer->buffer);
+ fputs(" contents: ", stderr);
+ for (unpack_locn=0; unpack_locn<buffer->size; unpack_locn++) {
+ closure = buffer->buffer[unpack_locn];
+ fprintf(stderr, ", %p (%s)",
+ closure, info_type(closure));
+ }
+ fputc('\n', stderr);
+
+#if 0
+ /* traverse all elements of the graph; omitted for now, but might be usefule */
+ InitClosureQueue();
+
+ tso = buffer->tso;
+
+ /* Unpack the header */
+ unpacked_size = buffer->unpacked_size;
+ bufsize = buffer->size;
+
+ fprintf(stderr, "Packet %p, size %u (unpacked size is %u); demanded by TSO %d (%p)[PE %d]\n--- Begin ---\n",
+ buffer, bufsize, unpacked_size,
+ tso->id, tso, where_is((StgClosure*)tso));
+
+ do {
+ closurestart = unpack_locn;
+ closure = buffer->buffer[unpack_locn++];
+
+ fprintf(stderr, "[%u]: (%p) ", closurestart, closure);
+
+ info = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str1);
+ strcpy(str2, str1);
+ fprintf(stderr, "(%s|%s) ", str1, str2);
+
+ if (info->type == FETCH_ME || info->type == FETCH_ME_BQ ||
+ IS_BLACK_HOLE(closure))
+ size = ptrs = nonptrs = vhs = 0;
+
+ if (closure_THUNK(closure)) {
+ if (closure_UNPOINTED(closure))
+ fputs("UNPOINTED ", stderr);
+ else
+ fputs("POINTED ", stderr);
+ }
+ if (IS_BLACK_HOLE(closure)) {
+ fputs("BLACK HOLE\n", stderr);
+ } else {
+ /* Fixed header */
+ fprintf(stderr, "FH [");
+ for (i = 0, p = (StgClosure*)&(closure->header); i < _HS; i++, p++)
+ fprintf(stderr, " %p", *p);
+
+ /* Variable header
+ if (vhs > 0) {
+ fprintf(stderr, "] VH [%p", closure->payload[_HS]);
+
+ for (i = 1; i < vhs; i++)
+ fprintf(stderr, " %p", closure->payload[_HS+i]);
+ }
+ */
+ fprintf(stderr, "] PTRS %u", ptrs);
+
+ /* Non-pointers */
+ if (nonptrs > 0) {
+ fprintf(stderr, " NPTRS [%p", closure->payload[_HS+vhs]);
+
+ for (i = 1; i < nonptrs; i++)
+ fprintf(stderr, " %p", closure->payload[_HS+vhs+i]);
+
+ putc(']', stderr);
+ }
+ putc('\n', stderr);
+ }
+ } while (unpack_locn<bufsize) ; /* (parent != NULL); */
+
+ fprintf(stderr, "--- End ---\n\n");
+#endif /* 0 */
+}
+#endif /* PAR */
+#endif /* DEBUG || GRAN_CHECK */
+
+#endif /* PAR || GRAN -- whole file */
+
+//@node End of file, , Printing Packet Contents, Graph packing
+//@subsection End of file
+
+//@index
+//* AllocateHeap:: @cindex\s-+AllocateHeap
+//* AmPacking:: @cindex\s-+AmPacking
+//* CommonUp:: @cindex\s-+CommonUp
+//* DeQueueClosure:: @cindex\s-+DeQueueClosure
+//* DeQueueClosure:: @cindex\s-+DeQueueClosure
+//* DonePacking:: @cindex\s-+DonePacking
+//* FillInClosure:: @cindex\s-+FillInClosure
+//* IS_BLACK_HOLE:: @cindex\s-+IS_BLACK_HOLE
+//* IS_INDIRECTION:: @cindex\s-+IS_INDIRECTION
+//* InitClosureQueue:: @cindex\s-+InitClosureQueue
+//* InitPendingGABuffer:: @cindex\s-+InitPendingGABuffer
+//* LocateNextParent:: @cindex\s-+LocateNextParent
+//* NotYetPacking:: @cindex\s-+NotYetPacking
+//* OffsetFor:: @cindex\s-+OffsetFor
+//* Pack:: @cindex\s-+Pack
+//* PackArray:: @cindex\s-+PackArray
+//* PackClosure:: @cindex\s-+PackClosure
+//* PackFetchMe:: @cindex\s-+PackFetchMe
+//* PackGeneric:: @cindex\s-+PackGeneric
+//* PackNearbyGraph:: @cindex\s-+PackNearbyGraph
+//* PackOneNode:: @cindex\s-+PackOneNode
+//* PackPAP:: @cindex\s-+PackPAP
+//* PackPLC:: @cindex\s-+PackPLC
+//* PackStkO:: @cindex\s-+PackStkO
+//* PackTSO:: @cindex\s-+PackTSO
+//* PendingGABuffer:: @cindex\s-+PendingGABuffer
+//* PrintPacket:: @cindex\s-+PrintPacket
+//* QueueClosure:: @cindex\s-+QueueClosure
+//* QueueEmpty:: @cindex\s-+QueueEmpty
+//* RoomToPack:: @cindex\s-+RoomToPack
+//* SetGAandCommonUp:: @cindex\s-+SetGAandCommonUp
+//* UnpackGA:: @cindex\s-+UnpackGA
+//* UnpackGraph:: @cindex\s-+UnpackGraph
+//* UnpackOffset:: @cindex\s-+UnpackOffset
+//* UnpackPLC:: @cindex\s-+UnpackPLC
+//* doGlobalGC:: @cindex\s-+doGlobalGC
+//* get_closure_info:: @cindex\s-+get_closure_info
+//* InitPackBuffer:: @cindex\s-+initPackBuffer
+//* isFixed:: @cindex\s-+isFixed
+//* isOffset:: @cindex\s-+isOffset
+//* offsetTable:: @cindex\s-+offsetTable
+//@end index
+
diff --git a/rts/parallel/ParInit.c b/rts/parallel/ParInit.c
new file mode 100644
index 0000000000..22c9119c89
--- /dev/null
+++ b/rts/parallel/ParInit.c
@@ -0,0 +1,322 @@
+/* --------------------------------------------------------------------------
+ Time-stamp: <Wed Mar 21 2001 16:37:16 Stardate: [-30]6363.46 hwloidl>
+
+ Initialising the parallel RTS
+
+ An extension based on Kevin Hammond's GRAPH for PVM version
+ P. Trinder, January 17th 1995.
+ Adapted for the new RTS
+ P. Trinder, July 1997.
+ H-W. Loidl, November 1999.
+
+ ------------------------------------------------------------------------ */
+
+#ifdef PAR /* whole file */
+
+//@menu
+//* Includes::
+//* Global variables::
+//* Initialisation Routines::
+//@end menu
+
+//@node Includes, Global variables
+//@subsection Includes
+
+/* Evidently not Posix */
+/* #include "PosixSource.h" */
+
+#include <setjmp.h>
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "ParallelRts.h"
+#include "Sparks.h"
+#include "LLC.h"
+#include "HLC.h"
+
+//@node Global variables, Initialisation Routines, Includes
+//@subsection Global variables
+
+/* Global conditions defined here. */
+
+rtsBool IAmMainThread = rtsFalse; /* Set for the main thread */
+
+/* Task identifiers for various interesting global tasks. */
+
+GlobalTaskId IOTask = 0, /* The IO Task Id */
+ SysManTask = 0, /* The System Manager Task Id */
+ mytid = 0; /* This PE's Task Id */
+
+rtsTime main_start_time; /* When the program started */
+rtsTime main_stop_time; /* When the program finished */
+jmp_buf exit_parallel_system; /* How to abort from the RTS */
+
+
+//rtsBool fishing = rtsFalse; /* We have no fish out in the stream */
+rtsTime last_fish_arrived_at = 0; /* Time of arrival of most recent fish*/
+nat outstandingFishes = 0; /* Number of active fishes */
+
+//@cindex spark queue
+/* GranSim: a globally visible array of spark queues */
+rtsSpark *pending_sparks_hd[SPARK_POOLS], /* ptr to start of a spark pool */
+ *pending_sparks_tl[SPARK_POOLS], /* ptr to end of a spark pool */
+ *pending_sparks_lim[SPARK_POOLS],
+ *pending_sparks_base[SPARK_POOLS];
+
+//@cindex spark_limit
+/* max number of sparks permitted on the PE;
+ see RtsFlags.ParFlags.maxLocalSparks */
+nat spark_limit[SPARK_POOLS];
+
+//@cindex PendingFetches
+/* A list of fetch reply messages not yet processed; this list is filled
+ by awaken_blocked_queue and processed by processFetches */
+StgBlockedFetch *PendingFetches = END_BF_QUEUE;
+
+//@cindex allPEs
+GlobalTaskId *allPEs;
+
+//@cindex nPEs
+nat nPEs = 0;
+
+//@cindex sparksIgnored
+nat sparksIgnored = 0, sparksCreated = 0,
+ threadsIgnored = 0, threadsCreated = 0;
+
+//@cindex advisory_thread_count
+nat advisory_thread_count = 0;
+
+globalAddr theGlobalFromGA;
+
+/* For flag handling see RtsFlags.h */
+
+//@node Prototypes
+//@subsection Prototypes
+
+/* Needed for FISH messages (initialisation of random number generator) */
+void srand48 (long);
+time_t time (time_t *);
+
+//@node Initialisation Routines, , Global variables
+//@subsection Initialisation Routines
+
+/*
+ par_exit defines how to terminate the program. If the exit code is
+ non-zero (i.e. an error has occurred), the PE should not halt until
+ outstanding error messages have been processed. Otherwise, messages
+ might be sent to non-existent Task Ids. The infinite loop will actually
+ terminate, since STG_Exception will call myexit\tr{(0)} when
+ it received a PP_FINISH from the system manager task.
+*/
+//@cindex shutdownParallelSystem
+void
+shutdownParallelSystem(StgInt n)
+{
+ /* use the file specified via -S */
+ FILE *sf = RtsFlags.GcFlags.statsFile;
+
+ IF_PAR_DEBUG(verbose,
+ if (n==0)
+ belch("==== entered shutdownParallelSystem ...");
+ else
+ belch("==== entered shutdownParallelSystem (ERROR %d)...", n);
+ );
+
+ stopPEComms(n);
+
+#if 0
+ if (sf!=(FILE*)NULL)
+ fprintf(sf, "PE %x: %u sparks created, %u sparks Ignored, %u threads created, %u threads Ignored",
+ (W_) mytid, sparksCreated, sparksIgnored,
+ threadsCreated, threadsIgnored);
+#endif
+
+ ShutdownEachPEHook();
+}
+
+//@cindex initParallelSystem
+void
+initParallelSystem(void)
+{
+ /* Don't buffer standard channels... */
+ setbuf(stdout,NULL);
+ setbuf(stderr,NULL);
+
+ srand48(time(NULL) * getpid()); /* Initialise Random-number generator seed*/
+ /* used to select target of FISH message*/
+ if (!InitPackBuffer())
+ barf("InitPackBuffer");
+
+ if (!initMoreBuffers())
+ barf("initMoreBuffers");
+
+ if (!initSparkPools())
+ barf("initSparkPools");
+}
+
+/*
+ * SynchroniseSystem synchronises the reduction task with the system
+ * manager, and initialises the Global address tables (LAGA & GALA)
+ */
+
+//@cindex synchroniseSystem
+void
+synchroniseSystem(void)
+{
+ /* Only in debug mode? */
+ fprintf(stderr, "==== Starting parallel execution on %d processors ...\n", nPEs);
+
+ InitEachPEHook(); /* HWL: hook to be execed on each PE */
+
+ /* Initialize global address tables */
+ initGAtables();
+
+ initParallelSystem();
+
+ startPEComms();
+}
+
+/*
+ Do the startup stuff (this is PVM specific!).
+ Determines global vars: mytid, IAmMainThread, SysManTask, nPEs
+ Called at the beginning of RtsStartup.startupHaskell
+*/
+void
+startupParallelSystem(char *argv[]) {
+ mytid = pvm_mytid(); /* Connect to PVM */
+
+ if (*argv[0] == '-') { /* Look to see whether we're the Main Thread */
+ IAmMainThread = rtsTrue;
+ sscanf(argv[0],"-%0X",&SysManTask); /* extract SysMan task ID*/
+ argv++; /* Strip off flag argument */
+ } else {
+ SysManTask = pvm_parent();
+ }
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr, "==== [%x] %s PE located SysMan at %x\n",
+ mytid, IAmMainThread?"Main":"Remote", SysManTask));
+
+ nPEs = atoi(argv[1]);
+}
+
+/*
+ Exception handler during startup.
+*/
+void *
+processUnexpectedMessageDuringStartup(rtsPacket p) {
+ OpCode opCode;
+ GlobalTaskId sender_id;
+
+ getOpcodeAndSender(p, &opCode, &sender_id);
+
+ switch(opCode) {
+ case PP_FISH:
+ bounceFish();
+ break;
+#if defined(DIST)
+ case PP_REVAL:
+ bounceReval();
+ break;
+#endif
+ case PP_FINISH:
+ stg_exit(EXIT_SUCCESS);
+ break;
+ default:
+ fprintf(stderr,"== Task %x: Unexpected OpCode %x (%s) from %x in startPEComms\n",
+ mytid, opCode, getOpName(opCode), sender_id);
+ }
+}
+
+void
+startPEComms(void){
+
+ startUpPE();
+ allPEs = (GlobalTaskId *) stgMallocBytes(sizeof(GlobalTaskId) * MAX_PES,
+ "(PEs)");
+
+ /* Send our tid and IAmMainThread flag back to SysMan */
+ sendOp1(PP_READY, SysManTask, (StgWord)IAmMainThread);
+ /* Wait until we get the PE-Id table from Sysman */
+ waitForPEOp(PP_PETIDS, SysManTask, processUnexpectedMessageDuringStartup);
+
+ IF_PAR_DEBUG(verbose,
+ belch("==-- startPEComms: methinks we just received a PP_PETIDS message"));
+
+ /* Digest the PE table we received */
+ processPEtids();
+}
+
+void
+processPEtids(void) {
+ long newPE;
+ nat i, sentPEs, currentPEs;
+
+ nPEs=0;
+
+ currentPEs = nPEs;
+
+ IF_PAR_DEBUG(verbose,
+ belch("==-- processPEtids: starting to iterate over a PVM buffer"));
+ /* ToDo: this has to go into LLComms !!! */
+ GetArgs(&sentPEs,1);
+
+ ASSERT(sentPEs > currentPEs);
+ ASSERT(sentPEs < MAX_PES); /* enforced by SysMan too*/
+
+ for (i = 0; i < sentPEs; i++) {
+ GetArgs(&newPE,1);
+ if (i<currentPEs) {
+ ASSERT(newPE == allPEs[i]);
+ } else {
+#if defined(DIST)
+ // breaks with PAR && !DEBUG
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr, "[%x] registering %d'th %x\n", mytid, i, newPE));
+ if(!looks_like_tid(newPE))
+ barf("unacceptable taskID %x\n",newPE);
+#endif
+ allPEs[i] = newPE;
+ nPEs++;
+ registerTask(newPE);
+ }
+ }
+
+ IF_PAR_DEBUG(verbose,
+ /* debugging */
+ belch("++++ [%x] PE table as I see it:", mytid);
+ for (i = 0; i < sentPEs; i++) {
+ belch("++++ allPEs[%d] = %x", i, allPEs[i]);
+ });
+}
+
+void
+stopPEComms(StgInt n) {
+ if (n != 0) {
+ /* In case sysman doesn't know about us yet...
+ pvm_initsend(PvmDataDefault);
+ PutArgs(&IAmMainThread,1);
+ pvm_send(SysManTask, PP_READY);
+ */
+ sendOp(PP_READY, SysManTask);
+ }
+
+ sendOp2(PP_FINISH, SysManTask, n, n);
+ waitForPEOp(PP_FINISH, SysManTask, NULL);
+ fflush(gr_file);
+ shutDownPE();
+}
+
+#endif /* PAR -- whole file */
+
+//@index
+//* PendingFetches:: @cindex\s-+PendingFetches
+//* SynchroniseSystem:: @cindex\s-+SynchroniseSystem
+//* allPEs:: @cindex\s-+allPEs
+//* initParallelSystem:: @cindex\s-+initParallelSystem
+//* nPEs:: @cindex\s-+nPEs
+//* par_exit:: @cindex\s-+par_exit
+//* spark queue:: @cindex\s-+spark queue
+//* sparksIgnored:: @cindex\s-+sparksIgnored
+//@end index
+
diff --git a/rts/parallel/ParInit.h b/rts/parallel/ParInit.h
new file mode 100644
index 0000000000..a22a50bae6
--- /dev/null
+++ b/rts/parallel/ParInit.h
@@ -0,0 +1,19 @@
+/* -----------------------------------------------------------------------------
+ * ParInit.h,1
+ *
+ * Phil Trinder
+ * July 1998
+ *
+ * External Parallel Initialisation Interface
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef PARINIT_H
+#define PARINIT_H
+
+extern void RunParallelSystem (P_);
+extern void initParallelSystem(void);
+extern void SynchroniseSystem(void);
+extern void par_exit(I_);
+
+#endif /* PARINIT_H */
diff --git a/rts/parallel/ParTicky.c b/rts/parallel/ParTicky.c
new file mode 100644
index 0000000000..347c2b8bca
--- /dev/null
+++ b/rts/parallel/ParTicky.c
@@ -0,0 +1,450 @@
+/* -------------------------------------------------------------------------
+ *
+ * (c) Hans-Wolfgang Loidl, 2000-
+ *
+ * Parallel ticky profiling, monitoring basic RTS operations in GUM.
+ * Similar in structure to TICKY_TICKY profiling, but doesn't need a
+ * separate way of building GHC.
+ *-------------------------------------------------------------------------- */
+
+#if defined(PAR) && defined(PAR_TICKY)
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+//#include "StoragePriv.h"
+//#include "MBlock.h"
+//#include "Schedule.h"
+#include "GC.h"
+#include "Stats.h"
+#include "ParTicky.h" // ToDo: move into Rts.h
+#include "ParallelRts.h"
+
+#if defined(PAR) && defined(HAVE_GETRUSAGE)
+#include <sys/resource.h>
+#endif
+
+/* external data */
+extern double ElapsedTimeStart;
+
+extern ullong GC_tot_alloc;
+extern ullong GC_tot_copied;
+
+extern lnat MaxResidency; /* in words; for stats only */
+extern lnat ResidencySamples; /* for stats only */
+
+/* ngIplu' {Stats.c}vo' */
+#define BIG_STRING_LEN 512
+
+/* ngIplu' {Ticky.c}vo' */
+#define INTAVG(a,b) ((b == 0) ? 0.0 : ((double) (a) / (double) (b)))
+#define PC(a) (100.0 * a)
+
+#define AVG(thing) \
+ StgDouble avg##thing = INTAVG(tot##thing,ctr##thing)
+
+
+#if 0
+void
+set_foo_time(double *x) {
+ *x = usertime();
+}
+
+double
+get_foo_time(double x) {
+ fprintf(stderr, "get_foo_time: %7.2f (%7.5f,%7.5f) \n",
+ usertime()-x,usertime(),x);
+ return (usertime()-x);
+}
+#endif
+
+static double start_time_GA = 0.0;
+static double start_mark = 0.0;
+static double start_pack = 0.0;
+static double start_unpack = 0.0;
+
+void
+par_ticky_Par_start (void) {
+# if !defined(HAVE_GETRUSAGE) || irix_HOST_OS || defined(_WIN32)
+ fprintf(stderr, "|| sorry don't have RUSAGE\n");
+ return ;
+# else
+ FILE *sf = RtsFlags.GcFlags.statsFile;
+ struct rusage t;
+ double utime, stime;
+
+ if (RtsFlags.GcFlags.giveStats>1 && sf != NULL) {
+ getrusage(RUSAGE_SELF, &t);
+
+ utime = t.ru_utime.tv_sec + 1e-6*t.ru_utime.tv_usec;
+ stime = t.ru_stime.tv_sec + 1e-6*t.ru_stime.tv_usec;
+
+ fprintf(stderr, "|| user time: %5.2f; system time: %5.2f\n",
+ utime, stime);
+ fprintf(stderr, "|| max RSS: %ld; int SM size: %ld; int USM data size: %ld; int USS size: %ld\n",
+ t.ru_maxrss, t.ru_ixrss, t.ru_idrss, t.ru_isrss);
+ }
+#endif
+}
+
+#if 0
+FYI:
+ struct rusage
+ {
+ struct timeval ru_utime; /* user time used */
+ struct timeval ru_stime; /* system time used */
+ long ru_maxrss; /* maximum resident set size */
+ long ru_ixrss; /* integral shared memory size */
+ long ru_idrss; /* integral unshared data size */
+ long ru_isrss; /* integral unshared stack size */
+ long ru_minflt; /* page reclaims */
+ long ru_majflt; /* page faults */
+ long ru_nswap; /* swaps */
+ long ru_inblock; /* block input operations */
+ long ru_oublock; /* block output operations */
+ long ru_msgsnd; /* messages sent */
+ long ru_msgrcv; /* messages received */
+ long ru_nsignals; /* signals received */
+ long ru_nvcsw; /* voluntary context switches */
+ long ru_nivcsw; /* involuntary context switches */
+ };
+#endif
+
+
+void
+par_ticky_rebuildGAtables_start(void) {
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ //set_foo_time(&start_time_GA);
+ start_time_GA = usertime();
+ }
+}
+
+void
+par_ticky_rebuildGAtables_end(nat n, nat size_GA) {
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ static double foo = 0.0;
+ foo = usertime() - start_time_GA; // get_foo_time(start_time_GA);
+ globalParStats.cnt_rebuild_GA++;
+ globalParStats.tot_rebuild_GA += n;
+ if ( n > globalParStats.res_rebuild_GA )
+ globalParStats.res_rebuild_GA = n;
+ // fprintf(stderr, "rebuildGAtables: footime=%7.2f (%11.5f, %11.5f)\n",
+ // foo, usertime(), start_time_GA);
+ globalParStats.time_rebuild_GA += foo;
+ globalParStats.tot_size_GA += size_GA;
+ if ( size_GA > globalParStats.res_size_GA )
+ globalParStats.res_size_GA = size_GA;
+ }
+ // fprintf(stderr, ">> n: %d; size: %d;; tot: %d; res: %d\n",
+ // n, size_GA, globalParStats.tot_size_GA, globalParStats.res_size_GA);
+}
+
+void
+par_ticky_markLocalGAs_start(void) {
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ start_time_GA = usertime();
+ }
+}
+
+void
+par_ticky_markLocalGAs_end(nat n) {
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.cnt_mark_GA++;
+ globalParStats.tot_mark_GA += n;
+ if ( n > globalParStats.res_mark_GA )
+ globalParStats.res_mark_GA = n;
+ globalParStats.time_mark_GA += usertime() - start_time_GA;
+ }
+}
+
+void
+par_ticky_markSparkQueue_start(void) {
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ start_mark=usertime();
+ }
+}
+
+void
+par_ticky_markSparkQueue_end(nat n) {
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.time_sparks += usertime() - start_mark;
+
+ globalParStats.tot_sparks_marked += n;
+ if ( n > globalParStats.res_sparks_marked )
+ globalParStats.res_sparks_marked = n;
+ }
+}
+
+void
+par_ticky_PackNearbyGraph_start (void) {
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ start_pack=usertime();
+ }
+}
+
+void
+par_ticky_PackNearbyGraph_end(nat n, nat thunks) {
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.time_pack += usertime() - start_pack;
+
+ globalParStats.tot_packets++;
+ globalParStats.tot_packet_size += n;
+ if ( n > globalParStats.res_packet_size )
+ globalParStats.res_packet_size = n;
+ globalParStats.tot_thunks += thunks;
+ if ( thunks > globalParStats.res_thunks )
+ globalParStats.res_thunks = thunks;
+ }
+}
+
+void
+par_ticky_UnpackGraph_start (void) {
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ start_unpack=usertime();
+ }
+}
+
+void
+par_ticky_UnpackGraph_end(nat n, nat thunks) {
+ // collect parallel global statistics (currently done together with GC stats)
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ globalParStats.time_unpack += usertime() - start_unpack;
+
+ globalParStats.rec_packets++;
+ globalParStats.rec_packet_size += n;
+ /*
+ if ( n > globalParStats.res_packet_size )
+ globalParStats.res_packet_size = n;
+ */
+ globalParStats.rec_thunks += thunks;
+ /*
+ if ( thunks > globalParStats.res_thunks )
+ globalParStats.res_thunks = thunks;
+ */
+ }
+}
+
+void
+par_ticky_TP (void) {
+ StgSparkPool *pool;
+ nat tp_size, sp_size; // stats only
+
+ // Global stats gathering
+ /* the spark pool for the current PE */
+ pool = &(MainRegTable.rSparks); // generalise to cap = &MainRegTable
+
+ // Global statistics: residency of thread and spark pool
+ if (RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS) {
+ tp_size = run_queue_len() + 1; // add the TSO just poped
+ // No: there may be many blocked threads being awoken at the same time
+ // ASSERT(tp_size <= RtsFlags.ParFlags.maxThreads);
+ globalParStats.tot_tp += tp_size;
+ globalParStats.emp_tp += (tp_size==0) ? 1 : 0;
+ globalParStats.cnt_tp++;
+ if ( tp_size > globalParStats.res_tp)
+ globalParStats.res_tp = tp_size;
+ // fprintf(stderr, "run_queue_len() = %d (max %d)\n", run_queue_len(), globalParStats.res_tp);
+ sp_size = spark_queue_len(pool);
+ //ASSERT(sp_size <= RtsFlags.ParFlags.maxLocalSparks);
+ globalParStats.tot_sp += sp_size;
+ globalParStats.emp_sp += (sp_size==0) ? 1 : 0;
+ globalParStats.cnt_sp++;
+ if ( sp_size > globalParStats.res_sp)
+ globalParStats.res_sp = sp_size;
+ // fprintf(stderr, "spark_queue_len(pool) = %d (max %d)\n", spark_queue_len(pool), globalParStats.res_sp);
+ }
+}
+
+void
+globalParStat_exit(void)
+{
+ FILE *sf = RtsFlags.GcFlags.statsFile;
+ double time, etime;
+
+ /* print only if GC stats is enabled, too; i.e. -sstderr */
+ if (!(RtsFlags.ParFlags.ParStats.Global &&
+ RtsFlags.GcFlags.giveStats > NO_GC_STATS))
+ return;
+
+ time = usertime();
+ etime = elapsedtime() - ElapsedTimeStart;
+ // fprintf(stderr, "foo=%7.2f\n", time);
+
+ if (sf != NULL){
+ char temp[BIG_STRING_LEN];
+
+ // GC_tot_alloc += alloc;
+ fprintf(sf,"\n");
+
+ fprintf(sf, "%11d threads created\n",
+ globalParStats.tot_threads_created);
+ /*
+ Would need to add a ++ to the par macro to use this
+
+ fprintf(sf, "%11d sparks created\n",
+ globalParStats.tot_sparks_created);
+ fprintf(sf, "%11d sparks ignored\n",
+ globalParStats.tot_sparks_ignored);
+ */
+ ullong_format_string(globalParStats.res_tp, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s thread pool residency", temp);
+ fprintf(sf, " (avg: %3.2f; %d times (%2.2f%%) of %d empty)\n",
+ (double)globalParStats.tot_tp/(double)globalParStats.cnt_tp,
+ globalParStats.emp_tp,
+ globalParStats.emp_tp*100.0/(double)globalParStats.cnt_tp,
+ globalParStats.cnt_tp);
+ ullong_format_string(globalParStats.res_sp, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s spark pool residency", temp);
+
+ fprintf(sf, " (avg: %3.2f; %d times (%2.2f%%) of %d empty)\n",
+ (double)globalParStats.tot_sp/(double)globalParStats.cnt_sp,
+ globalParStats.emp_sp,
+ globalParStats.emp_sp*100.0/(double)globalParStats.cnt_sp,
+ globalParStats.cnt_sp);
+ //ullong_format_string(globalParStats.tot_fishes, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11d messages sent (%d fish, %d fetch, %d resume, %d schedule",
+ globalParStats.tot_fish_mess+globalParStats.tot_fetch_mess+
+ globalParStats.tot_resume_mess+globalParStats.tot_schedule_mess,
+ globalParStats.tot_fish_mess, globalParStats.tot_fetch_mess,
+ globalParStats.tot_resume_mess, globalParStats.tot_schedule_mess);
+#if defined(DIST)
+ fprintf(sf, "%d revals", globalParStats.tot_reval_mess);
+#endif
+ fprintf(sf,")\n");
+ fprintf(sf, "%11d messages received (%d fish, %d fetch, %d resume, %d schedule",
+ globalParStats.rec_fish_mess+globalParStats.rec_fetch_mess+
+ globalParStats.rec_resume_mess+globalParStats.rec_schedule_mess,
+ globalParStats.rec_fish_mess, globalParStats.rec_fetch_mess,
+ globalParStats.rec_resume_mess, globalParStats.rec_schedule_mess);
+#if defined(DIST)
+ fprintf(sf, "%d revals", globalParStats.rec_reval_mess);
+#endif
+ fprintf(sf,")\n\n");
+
+ ullong_format_string(globalParStats.tot_size_GA*sizeof(W_), temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s bytes of global heap in total ", temp);
+ fprintf(sf, "(%5.2f%% of total allocated heap)\n",
+ globalParStats.tot_size_GA*sizeof(W_)*100.0/(double)GC_tot_alloc*sizeof(W_));
+ ullong_format_string(globalParStats.res_size_GA*sizeof(W_), temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s bytes global heap residency ", temp);
+ fprintf(sf, "(%5.2f%% of max heap residency)\n",
+ globalParStats.res_size_GA*sizeof(W_)*100.0/(double)MaxResidency*sizeof(W_));
+
+ //ullong_format_string(globalParStats.res_mark_GA, temp, rtsTrue/*commas*/);
+ //fprintf(sf, "%11s GAs residency in GALA table ", temp);
+ // ullong_format_string(globalParStats.tot_mark_GA, temp, rtsTrue/*commas*/);
+ //fprintf(sf, "(avg %5.2f; %d samples)\n",
+ // (double)globalParStats.tot_mark_GA/(double)globalParStats.cnt_mark_GA,
+ // globalParStats.cnt_mark_GA);
+
+ ullong_format_string(globalParStats.local_alloc_GA, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s GAs locally allocated (calls to makeGlobal)\n", temp);
+
+ ullong_format_string(globalParStats.tot_rebuild_GA, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s live GAs in total (after rebuilding tables)\n", temp);
+ ullong_format_string(globalParStats.res_rebuild_GA, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s GAs residency (after rebuilding tables) ", temp);
+ fprintf(sf, "(avg %5.2f; %d samples)\n",
+ (double)globalParStats.tot_rebuild_GA/(double)globalParStats.cnt_rebuild_GA,
+ globalParStats.cnt_rebuild_GA);
+ ullong_format_string(globalParStats.res_free_GA, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s residency of freeing GAs", temp);
+ fprintf(sf, " (avg %5.2f; %d samples)\n",
+ (double)globalParStats.tot_free_GA/(double)globalParStats.cnt_free_GA,
+ globalParStats.cnt_free_GA);
+
+ fprintf(sf, "%11.2fs spent marking GAs (%7.2f%% of %7.2fs)\n",
+ globalParStats.time_mark_GA,
+ globalParStats.time_mark_GA*100./time, time);
+ fprintf(sf, "%11.2fs spent rebuilding GALA tables (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs)\n",
+ globalParStats.time_rebuild_GA,
+ globalParStats.time_rebuild_GA*100./time, time,
+ globalParStats.time_rebuild_GA*100./etime, etime);
+
+ ullong_format_string(globalParStats.tot_sparks_marked, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s sparks marked\t", temp);
+ ullong_format_string(globalParStats.res_sparks_marked, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%6s spark mark residency\n", temp);
+ fprintf(sf, "%11.2fs spent marking sparks (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs elapsed)\n",
+ globalParStats.time_sparks,
+ globalParStats.time_sparks*100./time, time,
+ globalParStats.time_sparks*100./etime, etime);
+
+ fprintf(sf,"\n");
+
+ ullong_format_string(globalParStats.tot_packets, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s packets sent\n", temp);
+ ullong_format_string(globalParStats.tot_packet_size, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s bytes of graph sent in total (max %d; avg %.2f)\n",
+ temp, globalParStats.res_packet_size,
+ (double)globalParStats.tot_packet_size/(double)globalParStats.tot_packets);
+ ullong_format_string(globalParStats.tot_thunks, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s thunks sent in total (max %d; avg %.2f)\n",
+ temp, globalParStats.res_thunks,
+ (double)globalParStats.tot_thunks/(double)globalParStats.tot_packets);
+ fprintf(sf, "%11.2fs spent packing graph structures (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs elapsed)\n",
+ globalParStats.time_pack,
+ globalParStats.time_pack*100./time, time,
+ globalParStats.time_pack*100./etime, etime);
+
+ ullong_format_string(globalParStats.rec_packets, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s packets received\n", temp);
+ ullong_format_string(globalParStats.rec_packet_size, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s bytes of graph received in total (max %d; avg %.2f)\n",
+ temp, globalParStats.rec_res_packet_size,
+ (double)globalParStats.rec_packet_size/(double)globalParStats.rec_packets);
+ ullong_format_string(globalParStats.rec_thunks, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s thunks received in total (max %d; avg %.2f)\n",
+ temp, globalParStats.rec_res_thunks,
+ (double)globalParStats.rec_thunks/(double)globalParStats.rec_packets);
+ fprintf(sf, "%11.2fs spent unpacking graph structures (%7.2f%% of %7.2fs; %7.2f%% of %7.2fs elapsed)\n",
+ globalParStats.time_unpack,
+ globalParStats.time_unpack*100./time, time,
+ globalParStats.time_unpack*100./etime, etime);
+
+ fprintf(sf,"\n");
+
+ ullong_format_string(globalParStats.tot_arrs, temp, rtsTrue/*commas*/);
+ fprintf(sf, "%11s bytearrays sent; ", temp);
+ ullong_format_string(globalParStats.tot_arr_size, temp, rtsTrue/*commas*/);
+ fprintf(sf, " %s bytes in total (avg %.2f)\n",
+ temp,
+ (double)globalParStats.tot_arr_size/(double)globalParStats.tot_arrs);
+
+ fprintf(sf,"\n");
+
+ fprintf(sf, "%11d yields, %d stack overflows, %d heap overflows\n",
+ globalParStats.tot_yields, globalParStats.tot_stackover,
+ globalParStats.tot_heapover);
+
+ fprintf(sf,"\n");
+
+ //fprintf(stderr, "Printing this pathetic statistics took %7.2fs (start @ %7.2f)\n",
+ // usertime()-time, time);
+
+ fflush(sf);
+ // Open filehandle needed by other stats printing fcts
+ // fclose(sf);
+ }
+}
+
+#endif
+
diff --git a/rts/parallel/ParTicky.h b/rts/parallel/ParTicky.h
new file mode 100644
index 0000000000..1d6e7435c9
--- /dev/null
+++ b/rts/parallel/ParTicky.h
@@ -0,0 +1,60 @@
+/* --------------------------------------------------------------------------
+ *
+ * (c) Hans-Wolfgang Loidl, 2000-
+ *
+ * Header for ParTicky.c
+ *
+ * --------------------------------------------------------------------------*/
+
+#if defined(PAR_TICKY)
+
+/* macros */
+#define PAR_TICKY_PAR_START() par_ticky_Par_start ()
+#define PAR_TICKY_PAR_END() globalParStat_exit ()
+#define PAR_TICKY_REBUILD_GA_TABLES_START() par_ticky_rebuildGAtables_start()
+#define PAR_TICKY_REBUILD_GA_TABLES_END(n, size_GA) par_ticky_rebuildGAtables_end(n, size_GA)
+#define PAR_TICKY_MARK_LOCAL_GAS_START() par_ticky_markLocalGAs_start()
+#define PAR_TICKY_MARK_LOCAL_GAS_END(n) par_ticky_markLocalGAs_end(n)
+#define PAR_TICKY_MARK_SPARK_QUEUE_START() par_ticky_markSparkQueue_start()
+#define PAR_TICKY_MARK_SPARK_QUEUE_END(n) par_ticky_markSparkQueue_end(n)
+#define PAR_TICKY_PACK_NEARBY_GRAPH_START() (par_ticky_PackNearbyGraph_start())
+#define PAR_TICKY_PACK_NEARBY_GRAPH_END(n, thunks) par_ticky_PackNearbyGraph_end(n, thunks)
+#define PAR_TICKY_UNPACK_GRAPH_START() par_ticky_UnpackGraph_start()
+#define PAR_TICKY_UNPACK_GRAPH_END(n,thunks) par_ticky_UnpackGraph_end(n,thunks)
+#define PAR_TICKY_TP() par_ticky_TP()
+#define PAR_TICKY_CNT_FREE_GA() stats_CntFreeGA()
+
+/* prototypes */
+extern void par_ticky_Par_start (void) ;
+extern void par_ticky_rebuildGAtables_start(void) ;
+extern void par_ticky_rebuildGAtables_end(nat n, nat size_GA) ;
+extern void par_ticky_markLocalGAs_start(void) ;
+extern void par_ticky_markLocalGAs_end(nat n) ;
+extern void par_ticky_markSparkQueue_start(void) ;
+extern void par_ticky_markSparkQueue_end(nat n) ;
+extern void par_ticky_PackNearbyGraph_start (void) ;
+extern void par_ticky_PackNearbyGraph_end(nat n, nat thunks) ;
+extern void par_ticky_UnpackGraph_start (void) ;
+extern void par_ticky_UnpackGraph_end(nat n, nat thunks) ;
+extern void par_ticky_TP (void) ;
+extern void globalParStat_exit(void);
+
+#else
+
+#define PAR_TICKY_PAR_START()
+#define PAR_TICKY_PAR_END()
+#define PAR_TICKY_REBUILD_GA_TABLES_START()
+#define PAR_TICKY_REBUILD_GA_TABLES_END(n, size_GA)
+#define PAR_TICKY_MARK_LOCAL_GAS_START()
+#define PAR_TICKY_MARK_LOCAL_GAS_END(n)
+#define PAR_TICKY_MARK_SPARK_QUEUE_START()
+#define PAR_TICKY_MARK_SPARK_QUEUE_END(n)
+#define PAR_TICKY_PACK_NEARBY_GRAPH_START ()
+#define PAR_TICKY_PACK_NEARBY_GRAPH_END(n, thunks)
+#define PAR_TICKY_UNPACK_GRAPH_START ()
+#define PAR_TICKY_UNPACK_GRAPH_END(n, thunks)
+#define PAR_TICKY_TP ()
+#define PAR_TICKY_CNT_FREE_GA()
+
+#endif
+
diff --git a/rts/parallel/ParTypes.h b/rts/parallel/ParTypes.h
new file mode 100644
index 0000000000..910a6f2d99
--- /dev/null
+++ b/rts/parallel/ParTypes.h
@@ -0,0 +1,38 @@
+/* ---------------------------------------------------------------------------
+ * Time-stamp: <Tue Nov 09 1999 16:31:38 Stardate: [-30]3873.44 hwloidl>
+ *
+ * Runtime system types for GUM
+ *
+ * ------------------------------------------------------------------------- */
+
+#ifndef PARTYPES_H
+#define PARTYPES_H
+
+#ifdef PAR /* all of it */
+
+// now in Parallel.h
+//typedef struct hashtable HashTable;
+//typedef struct hashlist HashList;
+
+/* Global addresses now live in Parallel.h (needed in Closures.h) */
+// gaddr
+
+// now in Parallel.h
+/* (GA, LA) pairs
+typedef struct gala {
+ globalAddr ga;
+ StgPtr la;
+ struct gala *next;
+ rtsBool preferred;
+} rtsGaLa;
+*/
+
+#if defined(GRAN)
+typedef unsigned long TIME;
+typedef unsigned char Proc;
+typedef unsigned char EVTTYPE;
+#endif
+
+#endif /* PAR */
+
+#endif /* ! PARTYPES_H */
diff --git a/rts/parallel/Parallel.c b/rts/parallel/Parallel.c
new file mode 100644
index 0000000000..414b7e4406
--- /dev/null
+++ b/rts/parallel/Parallel.c
@@ -0,0 +1,1140 @@
+/*
+ Time-stamp: <Wed Mar 21 2001 16:42:40 Stardate: [-30]6363.48 hwloidl>
+
+ Basic functions for use in either GranSim or GUM.
+*/
+
+#if defined(GRAN) || defined(PAR) /* whole file */
+
+//@menu
+//* Includes::
+//* Variables and constants::
+//* Writing to the log-file::
+//* Global statistics::
+//* Dumping routines::
+//@end menu
+//*/ fool highlight
+
+//@node Includes, Variables and constants
+//@subsection Includes
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "Storage.h"
+#include "GranSimRts.h"
+#include "ParallelRts.h"
+
+//@node Variables and constants, Writing to the log-file, Includes
+//@subsection Variables and constants
+
+/* Where to write the log file */
+FILE *gr_file = NULL;
+char gr_filename[STATS_FILENAME_MAXLEN];
+
+#if defined(PAR)
+/* Global statistics */
+GlobalParStats globalParStats;
+#endif
+
+#if defined(PAR)
+ullong startTime = 0;
+#endif
+
+#if defined(PAR) && !defined(DEBUG)
+// HAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCKKKKKKKKKKKK
+// Definitely the wrong place for info_type in !DEBUG (see Printer.c) -- HWL
+
/* Human-readable names of the closure types, indexed by the `type'
   field of an info table; used by the info_type* helpers below.
   NOTE(review): must stay in sync with the closure-type enumeration
   (see Printer.c, which owns the DEBUG version of this table). */
+static char *closure_type_names[] = {
+  "INVALID_OBJECT",          	/* 0  */
+  "CONSTR",                  	/* 1  */
+  "CONSTR_1_0",			/* 2  */
+  "CONSTR_0_1",			/* 3  */
+  "CONSTR_2_0",			/* 4  */
+  "CONSTR_1_1",			/* 5  */
+  "CONSTR_0_2",			/* 6  */
+  "CONSTR_INTLIKE",          	/* 7  */
+  "CONSTR_CHARLIKE",          	/* 8  */
+  "CONSTR_STATIC",           	/* 9  */
+  "CONSTR_NOCAF_STATIC",     	/* 10 */
+  "FUN",                     	/* 11 */
+  "FUN_1_0",			/* 12 */
+  "FUN_0_1",			/* 13 */
+  "FUN_2_0",			/* 14 */
+  "FUN_1_1",			/* 15 */
+  "FUN_0_2",			/* 16 */
+  "FUN_STATIC",              	/* 17 */
+  "THUNK",                   	/* 18 */
+  "THUNK_1_0",			/* 19 */
+  "THUNK_0_1",			/* 20 */
+  "THUNK_2_0",			/* 21 */
+  "THUNK_1_1",			/* 22 */
+  "THUNK_0_2",			/* 23 */
+  "THUNK_STATIC",            	/* 24 */
+  "THUNK_SELECTOR",          	/* 25 */
+  "BCO",                     	/* 26 */
+  "AP_UPD",                  	/* 27 */
+  "PAP",                     	/* 28 */
+  "IND",                     	/* 29 */
+  "IND_OLDGEN",              	/* 30 */
+  "IND_PERM",                	/* 31 */
+  "IND_OLDGEN_PERM",         	/* 32 */
+  "IND_STATIC",              	/* 33 */
+  "CAF_UNENTERED",           	/* 34 */
+  "CAF_ENTERED",		/* 35 */
+  "CAF_BLACKHOLE",		/* 36 */
+  "RET_BCO",                 	/* 37 */
+  "RET_SMALL",               	/* 38 */
+  "RET_VEC_SMALL",           	/* 39 */
+  "RET_BIG",                 	/* 40 */
+  "RET_VEC_BIG",             	/* 41 */
+  "RET_DYN",                 	/* 42 */
+  "UPDATE_FRAME",            	/* 43 */
+  "CATCH_FRAME",             	/* 44 */
+  "STOP_FRAME",              	/* 45 */
+  "SEQ_FRAME",               	/* 46 */
+  "BLACKHOLE",               	/* 47 */
+  "BLACKHOLE_BQ",            	/* 48 */
+  "SE_BLACKHOLE",		/* 49 */
+  "SE_CAF_BLACKHOLE",		/* 50 */
+  "MVAR",                    	/* 51 */
+  "ARR_WORDS",               	/* 52 */
+  "MUT_ARR_PTRS",            	/* 53 */
+  "MUT_ARR_PTRS_FROZEN",     	/* 54 */
+  "MUT_VAR",                 	/* 55 */
+  "WEAK",                    	/* 56 */
+  "FOREIGN",                 	/* 57 */
+  "STABLE_NAME",             	/* 58 */
+  "TSO",                     	/* 59 */
+  "BLOCKED_FETCH",           	/* 60 */
+  "FETCH_ME",                	/* 61 */
+  "FETCH_ME_BQ",             	/* 62 */
+  "RBH",                     	/* 63 */
+  "EVACUATED",               	/* 64 */
+  "REMOTE_REF",              	/* 65 */
+  "N_CLOSURE_TYPES"         	/* 66 */
+};
+
/* Return the printable name of `closure's closure type, looked up in
   closure_type_names above via the info-table `type' field. */
+char *
+info_type(StgClosure *closure){ 
+  return closure_type_names[get_itbl(closure)->type];
+}
+
/* As info_type, but starting from an info-table pointer rather than a
   closure (useful when only the info pointer is at hand). */
+char *
+info_type_by_ip(StgInfoTable *ip){ 
+  return closure_type_names[ip->type];
+}
+
/* Copy the closure-type name of `closure' into the caller-supplied
   buffer `res'.
   NOTE(review): strcpy is unbounded here -- `res' must hold at least
   the longest table entry ("CONSTR_NOCAF_STATIC" + NUL, 20 bytes);
   confirm at call sites. */
+void
+info_hdr_type(StgClosure *closure, char *res){ 
+  strcpy(res,closure_type_names[get_itbl(closure)->type]);
+}
+#endif
+
+//@node Writing to the log-file, Global statistics, Variables and constants
+//@subsection Writing to the log-file
+/*
+ Writing to the log-file
+
+ These routines dump event-based info to the main log-file.
+ The code for writing log files is shared between GranSim and GUM.
+*/
+
+/*
+ * If you're not using GNUC and you're on a 32-bit machine, you're
+ * probably out of luck here. However, since CONCURRENT currently
+ * requires GNUC, I'm not too worried about it. --JSM
+ */
+
+//@cindex init_gr_simulation
+#if defined(GRAN)
+void
+init_gr_simulation(rts_argc, rts_argv, prog_argc, prog_argv)
+char *prog_argv[], *rts_argv[];
+int prog_argc, rts_argc;
+{
+ nat i;
+ char *extension = RtsFlags.GranFlags.GranSimStats.Binary ? "gb" : "gr";
+
+ if (RtsFlags.GranFlags.GranSimStats.Global)
+ init_gr_stats();
+
+ /* init global constants for costs of basic operations */
+ gran_arith_cost = RtsFlags.GranFlags.Costs.arith_cost;
+ gran_branch_cost = RtsFlags.GranFlags.Costs.branch_cost;
+ gran_load_cost = RtsFlags.GranFlags.Costs.load_cost;
+ gran_store_cost = RtsFlags.GranFlags.Costs.store_cost;
+ gran_float_cost = RtsFlags.GranFlags.Costs.float_cost;
+
+ if (RtsFlags.GranFlags.GranSimStats.Suppressed)
+ return;
+
+ if (!RtsFlags.GranFlags.GranSimStats.Full)
+ return;
+
+ sprintf(gr_filename, GR_FILENAME_FMT, prog_argv[0], extension);
+
+ if ((gr_file = fopen(gr_filename, "w")) == NULL) {
+ barf("Can't open granularity simulation report file %s\n",
+ gr_filename);
+ }
+
+ setbuf(gr_file, NULL); /* turn buffering off */
+
+ /* write header with program name, options and setup to gr_file */
+ fputs("Granularity Simulation for ", gr_file);
+ for (i = 0; i < prog_argc; ++i) {
+ fputs(prog_argv[i], gr_file);
+ fputc(' ', gr_file);
+ }
+
+ if (rts_argc > 0) {
+ fputs("+RTS ", gr_file);
+
+ for (i = 0; i < rts_argc; ++i) {
+ fputs(rts_argv[i], gr_file);
+ fputc(' ', gr_file);
+ }
+ }
+
+ fputs("\nStart time: ", gr_file);
+ fputs(time_str(), gr_file); /* defined in RtsUtils.c */
+ fputc('\n', gr_file);
+
+ fputs("\n\n--------------------\n\n", gr_file);
+
+ fputs("General Parameters:\n\n", gr_file);
+
+ if (RtsFlags.GranFlags.Light)
+ fprintf(gr_file, "GrAnSim-Light\nPEs infinite, %s Scheduler, %sMigrate Threads %s, %s\n",
+ RtsFlags.GranFlags.DoFairSchedule?"Fair":"Unfair",
+ RtsFlags.GranFlags.DoThreadMigration?"":"Don't ",
+ RtsFlags.GranFlags.DoThreadMigration && RtsFlags.GranFlags.DoStealThreadsFirst?" Before Sparks":"",
+ RtsFlags.GranFlags.DoAsyncFetch ? "Asynchronous Fetch" :
+ "Block on Fetch");
+ else
+ fprintf(gr_file, "PEs %u, %s Scheduler, %sMigrate Threads %s, %s\n",
+ RtsFlags.GranFlags.proc,RtsFlags.GranFlags.DoFairSchedule?"Fair":"Unfair",
+ RtsFlags.GranFlags.DoThreadMigration?"":"Don't ",
+ RtsFlags.GranFlags.DoThreadMigration && RtsFlags.GranFlags.DoStealThreadsFirst?" Before Sparks":"",
+ RtsFlags.GranFlags.DoAsyncFetch ? "Asynchronous Fetch" :
+ "Block on Fetch");
+
+ if (RtsFlags.GranFlags.DoBulkFetching)
+ if (RtsFlags.GranFlags.ThunksToPack)
+ fprintf(gr_file, "Bulk Fetching: Fetch %d Thunks in Each Packet (Packet Size = %d closures)\n",
+ RtsFlags.GranFlags.ThunksToPack,
+ RtsFlags.GranFlags.packBufferSize);
+ else
+ fprintf(gr_file, "Bulk Fetching: Fetch as many closures as possible (Packet Size = %d closures)\n",
+ RtsFlags.GranFlags.packBufferSize);
+ else
+ fprintf(gr_file, "Incremental Fetching: Fetch Exactly One Closure in Each Packet\n");
+
+ fprintf(gr_file, "Fetch Strategy(%u):If outstanding fetches %s\n",
+ RtsFlags.GranFlags.FetchStrategy,
+ RtsFlags.GranFlags.FetchStrategy==0 ?
+ " block (block-on-fetch)":
+ RtsFlags.GranFlags.FetchStrategy==1 ?
+ "only run runnable threads":
+ RtsFlags.GranFlags.FetchStrategy==2 ?
+ "create threads only from local sparks":
+ RtsFlags.GranFlags.FetchStrategy==3 ?
+ "create threads from local or global sparks":
+ RtsFlags.GranFlags.FetchStrategy==4 ?
+ "create sparks and steal threads if necessary":
+ "unknown");
+
+ if (RtsFlags.GranFlags.DoPrioritySparking)
+ fprintf(gr_file, "Priority Sparking (i.e. keep sparks ordered by priority)\n");
+
+ if (RtsFlags.GranFlags.DoPriorityScheduling)
+ fprintf(gr_file, "Priority Scheduling (i.e. keep threads ordered by priority)\n");
+
+ fprintf(gr_file, "Thread Creation Time %u, Thread Queue Time %u\n",
+ RtsFlags.GranFlags.Costs.threadcreatetime,
+ RtsFlags.GranFlags.Costs.threadqueuetime);
+ fprintf(gr_file, "Thread DeSchedule Time %u, Thread Schedule Time %u\n",
+ RtsFlags.GranFlags.Costs.threaddescheduletime,
+ RtsFlags.GranFlags.Costs.threadscheduletime);
+ fprintf(gr_file, "Thread Context-Switch Time %u\n",
+ RtsFlags.GranFlags.Costs.threadcontextswitchtime);
+ fputs("\n\n--------------------\n\n", gr_file);
+
+ fputs("Communication Metrics:\n\n", gr_file);
+ fprintf(gr_file,
+ "Latency %u (1st) %u (rest), Fetch %u, Notify %u (Global) %u (Local)\n",
+ RtsFlags.GranFlags.Costs.latency,
+ RtsFlags.GranFlags.Costs.additional_latency,
+ RtsFlags.GranFlags.Costs.fetchtime,
+ RtsFlags.GranFlags.Costs.gunblocktime,
+ RtsFlags.GranFlags.Costs.lunblocktime);
+ fprintf(gr_file,
+ "Message Creation %u (+ %u after send), Message Read %u\n",
+ RtsFlags.GranFlags.Costs.mpacktime,
+ RtsFlags.GranFlags.Costs.mtidytime,
+ RtsFlags.GranFlags.Costs.munpacktime);
+ fputs("\n\n--------------------\n\n", gr_file);
+
+ fputs("Instruction Metrics:\n\n", gr_file);
+ fprintf(gr_file, "Arith %u, Branch %u, Load %u, Store %u, Float %u, Alloc %u\n",
+ RtsFlags.GranFlags.Costs.arith_cost,
+ RtsFlags.GranFlags.Costs.branch_cost,
+ RtsFlags.GranFlags.Costs.load_cost,
+ RtsFlags.GranFlags.Costs.store_cost,
+ RtsFlags.GranFlags.Costs.float_cost,
+ RtsFlags.GranFlags.Costs.heapalloc_cost);
+ fputs("\n\n++++++++++++++++++++\n\n", gr_file);
+
+# if 0
+ /* binary log files are currently not supported */
+ if (RtsFlags.GranFlags.GranSimStats.Binary)
+ grputw(sizeof(rtsTime));
+# endif
+
+ return (0);
+}
+
+#elif defined(PAR)
+
+void init_gr_stats (void);
+
+void
+init_gr_simulation(rts_argc, rts_argv, prog_argc, prog_argv)
+char *prog_argv[], *rts_argv[];
+int prog_argc, rts_argc;
+{
+ nat i;
+ char time_string[TIME_STR_LEN], node_str[NODE_STR_LEN];
+ char *extension = RtsFlags.ParFlags.ParStats.Binary ? "gb" : "gr";
+
+ sprintf(gr_filename, GR_FILENAME_FMT_GUM, prog_argv[0], thisPE, extension);
+
+ if (!RtsFlags.ParFlags.ParStats.Full)
+ return;
+
+ if (RtsFlags.ParFlags.ParStats.Global)
+ init_gr_stats();
+
+ if ((gr_file = fopen(gr_filename, "w")) == NULL)
+ barf("Can't open activity report file %s\n", gr_filename);
+
+ setbuf(gr_file, NULL); /* turn buffering off */
+
+ /* write header with program name, options and setup to gr_file */
+ for (i = 0; i < prog_argc; ++i) {
+ fputs(prog_argv[i], gr_file);
+ fputc(' ', gr_file);
+ }
+
+ if (rts_argc > 0) {
+ fputs("+RTS ", gr_file);
+
+ for (i = 0; i < rts_argc; ++i) {
+ fputs(rts_argv[i], gr_file);
+ fputc(' ', gr_file);
+ }
+ }
+ fputc('\n', gr_file);
+
+ /* record the absolute start time to allow synchronisation of log-files */
+ fputs("Start-Time: ", gr_file);
+ fputs(time_str(), gr_file);
+ fputc('\n', gr_file);
+
+ ASSERT(startTime==0);
+ // startTime = msTime();
+ startTime = CURRENT_TIME;
+ ullong_format_string(CURRENT_TIME, time_string, rtsFalse/*no commas!*/);
+ fprintf(gr_file, "PE %2u [%s]: TIME\n", thisPE, time_string);
+
+# if 0
+ ngoq Dogh'q' vImuS
+ IF_PAR_DEBUG(verbose,
+ belch("== Start-time: %ld (%s)",
+ startTime, time_string));
+
+ if (startTime > LL(1000000000)) {
+ fprintf(gr_file, "PE %2u [%lu%lu]: TIME\n", thisPE,
+ (rtsTime) (startTime / LL(1000000000)),
+ (rtsTime) (startTime % LL(1000000000)));
+ } else {
+ fprintf(gr_file, "PE %2u [%lu]: TIME\n", thisPE, (TIME) startTime);
+ }
+ /* binary log files are currently not supported */
+ if (RtsFlags.GranFlags.GranSimStats.Binary)
+ grputw(sizeof(rtsTime));
+# endif
+
+ return;
+}
+
+void
+init_gr_stats (void) {
+ // memset(&globalParStats, '\0', sizeof(GlobalParStats));
+
+ globalParStats.tot_mark_GA = globalParStats.tot_rebuild_GA = globalParStats.tot_free_GA = globalParStats.res_mark_GA = globalParStats.res_rebuild_GA = globalParStats.res_free_GA = globalParStats.tot_size_GA = globalParStats.res_size_GA = globalParStats.tot_global = globalParStats.tot_local = 0;
+ globalParStats.cnt_mark_GA = globalParStats.cnt_rebuild_GA = globalParStats.cnt_free_GA = globalParStats.res_free_GA = globalParStats.local_alloc_GA = 0;
+
+ globalParStats.time_mark_GA = 0.0;
+ globalParStats.time_rebuild_GA = 0.0;
+ globalParStats.time_sparks = 0.0;
+ globalParStats.time_pack = 0.0;
+
+ globalParStats.res_sp = globalParStats.res_tp = globalParStats.tot_sp = globalParStats.tot_tp = globalParStats.cnt_sp = globalParStats.cnt_tp = globalParStats.emp_sp = globalParStats.emp_tp = 0;
+ globalParStats.tot_packets = globalParStats.tot_packet_size = globalParStats.tot_thunks = globalParStats.res_packet_size = globalParStats.res_thunks = globalParStats.rec_res_packet_size = globalParStats.rec_res_thunks = 0;
+
+ globalParStats.tot_fish_mess = globalParStats.tot_fetch_mess = globalParStats.tot_resume_mess = globalParStats.tot_schedule_mess = 0;
+ globalParStats.rec_fish_mess = globalParStats.rec_resume_mess = globalParStats.rec_schedule_mess = 0;
+ globalParStats.rec_fetch_mess = 0;
+#if defined(DIST)
+ globalParStats.tot_reval_mess = 0;
+ globalParStats.rec_reval_mess = 0;
+#endif
+
+ globalParStats.tot_threads_created = globalParStats.tot_sparks_created = globalParStats.tot_sparks_ignored = globalParStats.tot_sparks_marked = globalParStats.res_sparks_created = globalParStats.res_sparks_ignored = globalParStats.res_sparks_marked = 0;
+ globalParStats.tot_yields = globalParStats.tot_stackover = globalParStats.tot_heapover = 0;
+
+ globalParStats.tot_arrs = globalParStats.tot_arr_size = 0;
+}
+
+#endif /* PAR */
+
+//@cindex end_gr_simulation
+#if defined(GRAN)
+void
+end_gr_simulation(void)
+{
+ char time_string[TIME_STR_LEN];
+
+ ullong_format_string(CURRENT_TIME, time_string, rtsFalse/*no commas!*/);
+
+ if (RtsFlags.GranFlags.GranSimStats.Suppressed)
+ return;
+
+ /* Print event stats */
+ if (RtsFlags.GranFlags.GranSimStats.Global) {
+ nat i;
+
+ fprintf(stderr,"Total yields: %d\n",
+ globalGranStats.tot_yields);
+
+ fprintf(stderr,"Total number of threads created: %d ; per PE:\n",
+ globalGranStats.tot_threads_created);
+ for (i=0; i<RtsFlags.GranFlags.proc; i++) {
+ fprintf(stderr," PE %d: %d\t",
+ i, globalGranStats.threads_created_on_PE[i]);
+ if (i+1 % 4 == 0) fprintf(stderr,"\n");
+ }
+ if (RtsFlags.GranFlags.proc+1 % 4 != 0) fprintf(stderr,"\n");
+ fprintf(stderr,"Total number of threads migrated: %d\n",
+ globalGranStats.tot_TSOs_migrated);
+
+ fprintf(stderr,"Total number of sparks created: %d ; per PE:\n",
+ globalGranStats.tot_sparks_created);
+ for (i=0; i<RtsFlags.GranFlags.proc; i++) {
+ fprintf(stderr," PE %d: %d\t",
+ i, globalGranStats.sparks_created_on_PE[i]);
+ if (i+1 % 4 == 0) fprintf(stderr,"\n");
+ }
+ if (RtsFlags.GranFlags.proc+1 % 4 != 0) fprintf(stderr,"\n");
+
+ fprintf(stderr,"Event statistics (number of events: %d):\n",
+ globalGranStats.noOfEvents);
+ for (i=0; i<=MAX_EVENT; i++) {
+ fprintf(stderr," %s (%d): \t%d \t%f%%\t%f%%\n",
+ event_names[i],i,globalGranStats.event_counts[i],
+ (float)(100*globalGranStats.event_counts[i])/(float)(globalGranStats.noOfEvents),
+ (i==ContinueThread ? 0.0 :
+ (float)(100*(globalGranStats.event_counts[i])/(float)(globalGranStats.noOfEvents-globalGranStats.event_counts[ContinueThread])) ));
+ }
+ fprintf(stderr,"Randomized steals: %ld sparks, %ld threads \n \t(Sparks: #%u (avg ntimes=%f; avg fl=%f)\n\t(Threads: %ld)",
+ globalGranStats.rs_sp_count,
+ globalGranStats.rs_t_count,
+ globalGranStats.no_of_steals,
+ (float)globalGranStats.ntimes_total/(float)stg_max(globalGranStats.no_of_steals,1),
+ (float)globalGranStats.fl_total/(float)stg_max(globalGranStats.no_of_steals,1),
+ globalGranStats.no_of_migrates);
+ fprintf(stderr,"Moved sparks: %d Withered sparks: %d (%.2f %%)\n",
+ globalGranStats.tot_sparks, globalGranStats.withered_sparks,
+ ( globalGranStats.tot_sparks == 0 ? 0 :
+ (float)(100*globalGranStats.withered_sparks)/(float)(globalGranStats.tot_sparks)) );
+ /* Print statistics about priority sparking */
+ if (RtsFlags.GranFlags.DoPrioritySparking) {
+ fprintf(stderr,"About Priority Sparking:\n");
+ fprintf(stderr," Total no. NewThreads: %d Avg. spark queue len: %.2f \n", globalGranStats.tot_sq_probes, (float)globalGranStats.tot_sq_len/(float)globalGranStats.tot_sq_probes);
+ }
+ /* Print statistics about priority sparking */
+ if (RtsFlags.GranFlags.DoPriorityScheduling) {
+ fprintf(stderr,"About Priority Scheduling:\n");
+ fprintf(stderr," Total no. of StartThreads: %d (non-end: %d) Avg. thread queue len: %.2f\n",
+ globalGranStats.tot_add_threads, globalGranStats.non_end_add_threads,
+ (float)globalGranStats.tot_tq_len/(float)globalGranStats.tot_add_threads);
+ }
+ /* Blocking queue statistics */
+ if (1) {
+ fprintf(stderr,"Blocking queue statistcs:\n");
+ fprintf(stderr," Total no. of FMBQs generated: %d\n",
+ globalGranStats.tot_FMBQs);
+ fprintf(stderr," Total no. of bqs awakened: %d\n",
+ globalGranStats.tot_awbq);
+ fprintf(stderr," Total length of all bqs: %d\tAvg length of bqs: %.2f\n",
+ globalGranStats.tot_bq_len, (float)globalGranStats.tot_bq_len/(float)globalGranStats.tot_awbq);
+ fprintf(stderr," Percentage of local TSOs in BQs: %.2f\n",
+ (float)globalGranStats.tot_bq_len*100.0/(float)globalGranStats.tot_bq_len);
+ fprintf(stderr," Total time spent processing BQs: %lx\n",
+ globalGranStats.tot_bq_processing_time);
+ }
+
+ /* Fetch misses and thunk stealing */
+ fprintf(stderr,"Number of fetch misses: %d\n",
+ globalGranStats.fetch_misses);
+
+ /* Print packet statistics if GUMM fetching is turned on */
+ if (RtsFlags.GranFlags.DoBulkFetching) {
+ fprintf(stderr,"Packet statistcs:\n");
+ fprintf(stderr," Total no. of packets: %d Avg. packet size: %.2f \n", globalGranStats.tot_packets, (float)globalGranStats.tot_packet_size/(float)globalGranStats.tot_packets);
+ fprintf(stderr," Total no. of thunks: %d Avg. thunks/packet: %.2f \n", globalGranStats.tot_thunks, (float)globalGranStats.tot_thunks/(float)globalGranStats.tot_packets);
+ fprintf(stderr," Total no. of cuts: %d Avg. cuts/packet: %.2f\n", globalGranStats.tot_cuts, (float)globalGranStats.tot_cuts/(float)globalGranStats.tot_packets);
+ /*
+ if (closure_queue_overflows>0)
+ fprintf(stderr," Number of closure queue overflows: %u\n",
+ closure_queue_overflows);
+ */
+ }
+ } /* RtsFlags.GranFlags.GranSimStats.Global */
+
+# if defined(GRAN_COUNT)
+# error "GRAN_COUNT not supported; should be parallel ticky profiling, really"
+ fprintf(stderr,"Update count statistics:\n");
+ fprintf(stderr," Total number of updates: %u\n",nUPDs);
+ fprintf(stderr," Needed to awaken BQ: %u with avg BQ len of: %f\n",
+ nUPDs_BQ,(float)BQ_lens/(float)nUPDs_BQ);
+ fprintf(stderr," Number of PAPs: %u\n",nPAPs);
+# endif
+
+ fprintf(stderr, "Simulation finished after @ %s @ cycles. %d sparks created, %d sparks ignored. Check %s for details.\n",
+ time_string, sparksCreated, sparksIgnored, gr_filename);
+
+ if (RtsFlags.GranFlags.GranSimStats.Full)
+ fclose(gr_file);
+}
+
+#elif defined(PAR)
+
+/*
+ Under GUM we print only one line.
+*/
+void
+end_gr_simulation(void)
+{
+ char time_string[TIME_STR_LEN];
+
+ ullong_format_string(CURRENT_TIME-startTime, time_string, rtsFalse/*no commas!*/);
+
+ fprintf(stderr, "Computation finished after @ %s @ ms. %d sparks created, %d sparks ignored. Check %s for details.\n",
+ time_string, sparksCreated, sparksIgnored, gr_filename);
+
+ if (RtsFlags.ParFlags.ParStats.Full)
+ fclose(gr_file);
+}
+#endif /* PAR */
+
+//@node Global statistics, Dumping routines, Writing to the log-file
+//@subsection Global statistics
+/*
+ Called at the end of execution
+*/
+
+//@node Dumping routines, , Global statistics
+//@subsection Dumping routines
+
+//@cindex DumpGranEvent
+void
+DumpGranEvent(name, tso)
+GranEventType name;
+StgTSO *tso;
+{
+ DumpRawGranEvent(CURRENT_PROC, (PEs)0, name, tso, &stg_END_TSO_QUEUE_closure, (StgInt)0, (StgInt)0);
+}
+
+//@cindex DumpRawGranEvent
+void
+DumpRawGranEvent(proc, p, name, tso, node, sparkname, len)
+PEs proc, p; /* proc ... where it happens; p ... where node lives */
+GranEventType name;
+StgTSO *tso;
+StgClosure *node;
+StgInt sparkname, len;
+{
+# if defined(GRAN)
+ DumpVeryRawGranEvent(TIME_ON_PROC(proc),
+ proc, p, name, tso, node, sparkname, len);
+# elif defined(PAR)
+ DumpVeryRawGranEvent(CURRENT_TIME,
+ proc, p, name, tso, node, sparkname, len);
+# endif
+}
+
+//@cindex DumpVeryRawGranEvent
+void
+DumpVeryRawGranEvent(time, proc, p, name, tso, node, sparkname, len)
+rtsTime time;
+PEs proc, p; /* proc ... where it happens; p ... where node lives */
+GranEventType name;
+StgTSO *tso;
+StgClosure *node;
+StgInt sparkname, len;
+{
+ FILE *output_file; // DEBUGGING ONLY !!!!!!!!!!!!!!!!!!!!!!!!!1
+ StgWord id;
+ char time_string[TIME_STR_LEN], node_str[NODE_STR_LEN];
+# if defined(GRAN)
+ ullong_format_string(time,
+ time_string, rtsFalse/*no commas!*/);
+# elif defined(PAR)
+ ullong_format_string(time,
+ time_string, rtsFalse/*no commas!*/);
+# endif
+ output_file = gr_file;
+
+# if defined(GRAN)
+
+ if (RtsFlags.GranFlags.GranSimStats.Full)
+ ASSERT(output_file!=NULL);
+
+ if (RtsFlags.GranFlags.GranSimStats.Suppressed)
+ return;
+# elif defined(PAR)
+
+ if (RtsFlags.ParFlags.ParStats.Full)
+ ASSERT(output_file!=NULL);
+
+ if (RtsFlags.ParFlags.ParStats.Suppressed)
+ return;
+
+# endif
+
+ id = tso == NULL ? -1 : tso->id;
+ if (node==stgCast(StgClosure*,&stg_END_TSO_QUEUE_closure))
+ strcpy(node_str,"________"); /* "END_TSO_QUEUE"); */
+ else
+ sprintf(node_str,"0x%-6lx",node);
+
+ if (name > GR_EVENT_MAX)
+ name = GR_EVENT_MAX;
+
+ if (BINARY_STATS)
+ barf("binary log files not yet supported");
+#if 0
+ /* ToDo: fix code for writing binary GrAnSim statistics */
+ switch (name) {
+ case GR_START:
+ case GR_STARTQ:
+ grputw(name);
+ grputw(proc);
+ abort(); /* die please: a single word */
+ /* doesn't represent long long times */
+ grputw(TIME_ON_PROC(proc));
+ grputw((StgWord)node);
+ break;
+ case GR_FETCH:
+ case GR_REPLY:
+ case GR_BLOCK:
+ grputw(name);
+ grputw(proc);
+ abort(); /* die please: a single word */
+ /* doesn't represent long long times */
+ grputw(TIME_ON_PROC(proc)); /* this line is bound to */
+ grputw(id); /* do the wrong thing */
+ break;
+ default:
+ grputw(name);
+ grputw(proc);
+ abort(); /* die please: a single word */
+ /* doesn't represent long long times */
+ grputw(TIME_ON_PROC(proc));
+ grputw((StgWord)node);
+ }
+#endif
+ else /* !BINARY_STATS */
+ switch (name) {
+ case GR_START:
+ case GR_STARTQ:
+ fprintf(output_file,"PE %2u [%s]: %-9s\t%lx\t%s\t[SN %u]\t[sparks %u]\n",
+ proc,time_string,gran_event_names[name],
+ id,node_str,sparkname,len);
+ break;
+ case GR_FETCH:
+ case GR_REPLY:
+ case GR_BLOCK:
+ case GR_STOLEN:
+ case GR_STOLENQ:
+ case GR_STEALING:
+ fprintf(output_file, "PE %2u [%s]: %-9s\t%lx \t%s\t(from %2u)\n",
+ proc, time_string, gran_event_names[name],
+ id,node_str,p);
+ break;
+ case GR_RESUME:
+ case GR_RESUMEQ:
+ case GR_SCHEDULE:
+ case GR_DESCHEDULE:
+ fprintf(output_file,"PE %2u [%s]: %-9s\t%lx \n",
+ proc,time_string,gran_event_names[name],id);
+ break;
+ case GR_ALLOC:
+ fprintf(output_file,"PE %2u [%s]: %-9s\t%lx\t \tallocating %u words\n",
+ proc,time_string,gran_event_names[name],id,len);
+ break;
+ default:
+ fprintf(output_file,"PE %2u [%s]: %-9s\t%lx\t%s\t[sparks %u]\n",
+ proc,time_string,gran_event_names[name],id,node_str,len);
+ }
+}
+
+//@cindex DumpGranInfo
+void
+DumpEndEvent(proc, tso, mandatory_thread)
+PEs proc;
+StgTSO *tso;
+rtsBool mandatory_thread;
+{
+ FILE *output_file; // DEBUGGING ONLY !!!!!!!!!!!!!!!!!!!!!!!!!1
+ char time_string[TIME_STR_LEN];
+# if defined(GRAN)
+ ullong_format_string(TIME_ON_PROC(proc),
+ time_string, rtsFalse/*no commas!*/);
+# elif defined(PAR)
+ ullong_format_string(CURRENT_TIME,
+ time_string, rtsFalse/*no commas!*/);
+# endif
+
+ output_file = gr_file;
+ ASSERT(output_file!=NULL);
+#if defined(GRAN)
+ if (RtsFlags.GranFlags.GranSimStats.Suppressed)
+ return;
+#endif
+
+ if (BINARY_STATS) {
+ barf("binary log files not yet supported");
+#if 0
+ grputw(GR_END);
+ grputw(proc);
+ abort(); /* die please: a single word doesn't represent long long times */
+ grputw(CURRENT_TIME); /* this line is bound to fail */
+ grputw(tso->id);
+#ifdef PAR
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+ grputw(0);
+#else
+ grputw(tso->gran.sparkname);
+ grputw(tso->gran.startedat);
+ grputw(tso->gran.exported);
+ grputw(tso->gran.basicblocks);
+ grputw(tso->gran.allocs);
+ grputw(tso->gran.exectime);
+ grputw(tso->gran.blocktime);
+ grputw(tso->gran.blockcount);
+ grputw(tso->gran.fetchtime);
+ grputw(tso->gran.fetchcount);
+ grputw(tso->gran.localsparks);
+ grputw(tso->gran.globalsparks);
+#endif
+ grputw(mandatory_thread);
+#endif /* 0 */
+ } else {
+
+ /*
+ * NB: DumpGranEvent cannot be used because PE may be wrong
+ * (as well as the extra info)
+ */
+ fprintf(output_file, "PE %2u [%s]: END %lx, SN %u, ST %lu, EXP %s, BB %u, HA %u, RT %u, BT %u (%u), FT %u (%u), LS %u, GS %u, MY %s\n"
+ ,proc
+ ,time_string
+ ,tso->id
+#if defined(GRAN)
+ ,tso->gran.sparkname
+ ,tso->gran.startedat
+ ,((tso->gran.exported) ? 'T' : 'F')
+ ,tso->gran.basicblocks
+ ,tso->gran.allocs
+ ,tso->gran.exectime
+ ,tso->gran.blocktime
+ ,tso->gran.blockcount
+ ,tso->gran.fetchtime
+ ,tso->gran.fetchcount
+ ,tso->gran.localsparks
+ ,tso->gran.globalsparks
+#elif defined(PAR)
+ ,tso->par.sparkname
+ ,tso->par.startedat
+ ,(tso->par.exported) ? "T" : "F"
+ ,tso->par.basicblocks
+ ,tso->par.allocs
+ ,tso->par.exectime
+ ,tso->par.blocktime
+ ,tso->par.blockcount
+ ,tso->par.fetchtime
+ ,tso->par.fetchcount
+ ,tso->par.localsparks
+ ,tso->par.globalsparks
+#endif
+ ,(mandatory_thread ? "T" : "F")
+ );
+ }
+}
+
+//@cindex DumpTSO
+void
+DumpTSO(tso)
+StgTSO *tso;
+{
+ FILE *output_file; // DEBUGGING ONLY !!!!!!!!!!!!!!!!!!!!!!!!!1
+
+ output_file = gr_file;
+ ASSERT(output_file!=NULL);
+ fprintf(stderr,"TSO 0x%lx, NAME 0x%lx, ID %u, LINK 0x%lx, TYPE %s\n"
+ ,tso
+#if defined(GRAN)
+ ,tso->gran.sparkname
+#elif defined(PAR)
+ ,tso->par.sparkname
+#endif
+ ,tso->id
+ ,tso->link
+ ,/*tso->state==T_MAIN?"MAIN":
+ TSO_TYPE(tso)==T_FAIL?"FAIL":
+ TSO_TYPE(tso)==T_REQUIRED?"REQUIRED":
+ TSO_TYPE(tso)==T_ADVISORY?"ADVISORY":
+ */
+ "???"
+ );
+
+ fprintf(output_file,"TSO %lx: SN %u, ST %u, GBL %c, BB %u, HA %u, RT %u, BT %u (%u), FT %u (%u) LS %u, GS %u\n"
+ ,tso->id
+#if defined(GRAN)
+ ,tso->gran.sparkname
+ ,tso->gran.startedat
+ ,tso->gran.exported?'T':'F'
+ ,tso->gran.basicblocks
+ ,tso->gran.allocs
+ ,tso->gran.exectime
+ ,tso->gran.blocktime
+ ,tso->gran.blockcount
+ ,tso->gran.fetchtime
+ ,tso->gran.fetchcount
+ ,tso->gran.localsparks
+ ,tso->gran.globalsparks
+#elif defined(PAR)
+ ,tso->par.sparkname
+ ,tso->par.startedat
+ ,tso->par.exported?'T':'F'
+ ,tso->par.basicblocks
+ ,tso->par.allocs
+ ,tso->par.exectime
+ ,tso->par.blocktime
+ ,tso->par.blockcount
+ ,tso->par.fetchtime
+ ,tso->par.fetchcount
+ ,tso->par.localsparks
+ ,tso->par.globalsparks
+#endif
+ );
+}
+
+#if 0
+/*
+ ToDo: fix binary output of log files, and support new log file format.
+*/
+/*
+ Output a terminate event and an 8-byte time.
+*/
+
+//@cindex grterminate
+void
+grterminate(v)
+rtsTime v;
+{
+ if (!BINARY_STATS)
+ barf("grterminate: binary statistics not enabled\n");
+
+# if defined(GRAN)
+ if (RtsFlags.GranFlags.GranSimStats.Suppressed)
+ return;
+# endif
+
+ DumpGranEvent(GR_TERMINATE, stgCast(StgTSO*,&stg_END_TSO_QUEUE_closure));
+
+ if (sizeof(rtsTime) == 4) {
+ putc('\0', gr_file);
+ putc('\0', gr_file);
+ putc('\0', gr_file);
+ putc('\0', gr_file);
+ } else {
+ putc(v >> 56l, gr_file);
+ putc((v >> 48l) & 0xffl, gr_file);
+ putc((v >> 40l) & 0xffl, gr_file);
+ putc((v >> 32l) & 0xffl, gr_file);
+ }
+ putc((v >> 24l) & 0xffl, gr_file);
+ putc((v >> 16l) & 0xffl, gr_file);
+ putc((v >> 8l) & 0xffl, gr_file);
+ putc(v & 0xffl, gr_file);
+}
+
+/*
+ Length-coded output: first 3 bits contain length coding
+
+ 00x 1 byte
+ 01x 2 bytes
+ 10x 4 bytes
+ 110 8 bytes
+ 111 5 or 9 bytes
+*/
+
+//@cindex grputw
+void
+grputw(v)
+rtsTime v;
+{
+ if (!BINARY_STATS)
+ barf("grputw: binary statistics not enabled\n");
+
+# if defined(GRAN)
+ if (RtsFlags.GranFlags.GranSimStats.Suppressed)
+ return;
+# endif
+
+ if (v <= 0x3fl) { /* length v = 1 byte */
+ fputc(v & 0x3f, gr_file);
+ } else if (v <= 0x3fffl) { /* length v = 2 byte */
+ fputc((v >> 8l) | 0x40l, gr_file);
+ fputc(v & 0xffl, gr_file);
+ } else if (v <= 0x3fffffffl) { /* length v = 4 byte */
+ fputc((v >> 24l) | 0x80l, gr_file);
+ fputc((v >> 16l) & 0xffl, gr_file);
+ fputc((v >> 8l) & 0xffl, gr_file);
+ fputc(v & 0xffl, gr_file);
+ } else if (sizeof(TIME) == 4) {
+ fputc(0x70, gr_file);
+ fputc((v >> 24l) & 0xffl, gr_file);
+ fputc((v >> 16l) & 0xffl, gr_file);
+ fputc((v >> 8l) & 0xffl, gr_file);
+ fputc(v & 0xffl, gr_file);
+ } else {
+ if (v <= 0x3fffffffffffffl)
+ putc((v >> 56l) | 0x60l, gr_file);
+ else {
+ putc(0x70, gr_file);
+ putc((v >> 56l) & 0xffl, gr_file);
+ }
+
+ putc((v >> 48l) & 0xffl, gr_file);
+ putc((v >> 40l) & 0xffl, gr_file);
+ putc((v >> 32l) & 0xffl, gr_file);
+ putc((v >> 24l) & 0xffl, gr_file);
+ putc((v >> 16l) & 0xffl, gr_file);
+ putc((v >> 8l) & 0xffl, gr_file);
+ putc(v & 0xffl, gr_file);
+ }
+}
+#endif /* 0 */
+
+/*
+ extracting specific info out of a closure; used in packing (GranSim, GUM)
+*/
+//@cindex get_closure_info
/*
 * Decompose a closure for packing.  On return:
 *   *size    -- total closure size in words (including the header)
 *   *ptrs    -- number of pointer fields in the payload
 *   *nonptrs -- number of non-pointer words in the payload
 *   *vhs     -- variable header size: size - ptrs - nonptrs - header
 *   info_hdr_ty -- printable tag for the closure kind (the real name
 *                  lookup is only enabled in the #if 0 debug branches)
 * Returns the closure's info-table pointer -- except for an RBH, where
 * the *reverted* info table is both used for the layout and returned.
 */
+StgInfoTable*
+get_closure_info(StgClosure* node, nat *size, nat *ptrs, nat *nonptrs,
+		 nat *vhs, char *info_hdr_ty)
+{
+  StgInfoTable *info;
+
+  ASSERT(LOOKS_LIKE_COOL_CLOSURE(node));
+  info = get_itbl(node);
+  /* the switch shouldn't be necessary, really; just use default case */
+  switch (info->type) {
/* Revertible black hole: describe the closure it will revert to,
   not the RBH itself. */
+  case RBH:
+    {
+      StgInfoTable *rip = REVERT_INFOPTR(info); // closure to revert to
+      *size = sizeW_fromITBL(rip);
+      *ptrs = (nat) (rip->layout.payload.ptrs);
+      *nonptrs = (nat) (rip->layout.payload.nptrs);
+      *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+#if 0 /* DEBUG */
+      info_hdr_type(node, info_hdr_ty);
+#else
+      strcpy(info_hdr_ty, "RBH");
+#endif
+      return rip;  // NB: we return the reverted info ptr for a RBH!!!!!!
+    }
+
+#if defined(PAR)
/* GUM-specific closures: fixed sizes, no payload described by the
   info table, hence ptrs = nonptrs = 0 and everything else is vhs. */
+  /* Closures specific to GUM */
+  case FETCH_ME:
+    *size = sizeofW(StgFetchMe);
+    *ptrs = (nat)0;
+    *nonptrs = (nat)0;
+    *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+#if 0 /* DEBUG */
+    info_hdr_type(node, info_hdr_ty);
+#else
+    strcpy(info_hdr_ty, "FETCH_ME");
+#endif
+    return info;
+
+#ifdef DIST
+  case REMOTE_REF:  //same as for FETCH_ME...
+    *size = sizeofW(StgFetchMe);
+    *ptrs = (nat)0;
+    *nonptrs = (nat)0;
+    *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+#if 0 /* DEBUG */
+    info_hdr_type(node, info_hdr_ty);
+#else
+    strcpy(info_hdr_ty, "REMOTE_REF");
+#endif
+    return info;
+#endif /* DIST */
+
+  case FETCH_ME_BQ:
+    *size = sizeofW(StgFetchMeBlockingQueue);
+    *ptrs = (nat)0;
+    *nonptrs = (nat)0;
+    *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+#if 0 /* DEBUG */
+    info_hdr_type(node, info_hdr_ty);
+#else
+    strcpy(info_hdr_ty, "FETCH_ME_BQ");
+#endif
+    return info;
+
+  case BLOCKED_FETCH:
+    *size = sizeofW(StgBlockedFetch);
+    *ptrs = (nat)0;
+    *nonptrs = (nat)0;
+    *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+#if 0 /* DEBUG */
+    info_hdr_type(node, info_hdr_ty);
+#else
+    strcpy(info_hdr_ty, "BLOCKED_FETCH");
+#endif
+    return info;
+#endif /* PAR */
+
/* Closures whose info tables do not describe their true layout;
   sizes are computed from the objects themselves instead.
   NOTE(review): info_hdr_ty is NOT written in the THUNK_SELECTOR,
   ARR_WORDS, PAP and AP_UPD cases -- callers must not rely on it
   being set for these types. */
+  /* these magic constants are outrageous!! why does the ITBL lie about it? */
+  case THUNK_SELECTOR:
+    *size = THUNK_SELECTOR_sizeW();
+    *ptrs = 1;
+    *nonptrs = MIN_UPD_SIZE-*ptrs;   // weird
+    *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+    return info;
+
+  case ARR_WORDS:
+    /* ToDo: check whether this can be merged with the default case */
+    *size = arr_words_sizeW((StgArrWords *)node);
+    *ptrs = 0;
+    *nonptrs = ((StgArrWords *)node)->words;
+    *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+    return info;
+
+  case PAP:
+    /* ToDo: check whether this can be merged with the default case */
+    *size = pap_sizeW((StgPAP *)node);
+    *ptrs = 0;
+    *nonptrs = 0;
+    *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+    return info;
+
+  case AP_UPD:
+    /* ToDo: check whether this can be merged with the default case */
+    *size = AP_sizeW(((StgAP_UPD *)node)->n_args);
+    *ptrs = 0;
+    *nonptrs = 0;
+    *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+    return info;
+
/* Everything else: trust the info table's layout description. */
+  default:
+    *size = sizeW_fromITBL(info);
+    *ptrs = (nat) (info->layout.payload.ptrs);
+    *nonptrs = (nat) (info->layout.payload.nptrs);
+    *vhs = *size - *ptrs - *nonptrs - sizeofW(StgHeader);
+#if 0 /* DEBUG */
+      info_hdr_type(node, info_hdr_ty);
+#else
+      strcpy(info_hdr_ty, "UNKNOWN");
+#endif
+    return info;
+  }
+}
+
+//@cindex IS_BLACK_HOLE
+rtsBool
+IS_BLACK_HOLE(StgClosure* node)
+{
+ // StgInfoTable *info;
+ ASSERT(LOOKS_LIKE_COOL_CLOSURE(node));
+ switch (get_itbl(node)->type) {
+ case BLACKHOLE:
+ case BLACKHOLE_BQ:
+ case RBH:
+ case FETCH_ME:
+ case FETCH_ME_BQ:
+ return rtsTrue;
+ default:
+ return rtsFalse;
+ }
+//return ((info->type == BLACKHOLE || info->type == RBH) ? rtsTrue : rtsFalse);
+}
+
+//@cindex IS_INDIRECTION
+StgClosure *
+IS_INDIRECTION(StgClosure* node)
+{
+ StgInfoTable *info;
+ ASSERT(LOOKS_LIKE_COOL_CLOSURE(node));
+ info = get_itbl(node);
+ switch (info->type) {
+ case IND:
+ case IND_OLDGEN:
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ case IND_STATIC:
+ /* relies on indirectee being at same place for all these closure types */
+ return (((StgInd*)node) -> indirectee);
+#if 0
+ case EVACUATED: // counting as ind to use in GC routines, too
+ // could use the same code as above (evacuee is at same pos as indirectee)
+ return (((StgEvacuated *)node) -> evacuee);
+#endif
+ default:
+ return NULL;
+ }
+}
+
+//@cindex unwindInd
+StgClosure *
+UNWIND_IND (StgClosure *closure)
+{
+ StgClosure *next;
+
+ while ((next = IS_INDIRECTION((StgClosure *)closure)) != NULL)
+ closure = next;
+
+ ASSERT(next==(StgClosure *)NULL);
+ ASSERT(LOOKS_LIKE_COOL_CLOSURE(closure));
+ return closure;
+}
+
+#endif /* GRAN || PAR whole file */
diff --git a/rts/parallel/ParallelDebug.c b/rts/parallel/ParallelDebug.c
new file mode 100644
index 0000000000..b357af6379
--- /dev/null
+++ b/rts/parallel/ParallelDebug.c
@@ -0,0 +1,1955 @@
+/*
+ Time-stamp: <Sun Mar 18 2001 19:32:56 Stardate: [-30]6349.07 hwloidl>
+
+ Various debugging routines for GranSim and GUM
+*/
+
+#if defined(DEBUG) && (defined(GRAN) || defined(PAR)) /* whole file */
+
+//@node Debugging routines for GranSim and GUM, , ,
+//@section Debugging routines for GranSim and GUM
+
+//@menu
+//* Includes::
+//* Constants and Variables::
+//* Closures::
+//* Threads::
+//* Events::
+//* Sparks::
+//* Processors::
+//* Shortcuts::
+//* Printing info type::
+//* Printing Packet Contents::
+//* End of File::
+//@end menu
+//*/
+
+//@node Includes, Prototypes, Debugging routines for GranSim and GUM, Debugging routines for GranSim and GUM
+//@subsection Includes
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "GranSimRts.h"
+#include "ParallelRts.h"
+#include "StgMiscClosures.h"
+#include "Printer.h"
+# if defined(DEBUG)
+# include "Hash.h"
+# include "Storage.h"
+# include "ParallelDebug.h"
+# endif
+
+//@node Prototypes, Constants and Variables, Includes, Debugging routines for GranSim and GUM
+//@subsection Prototypes
+/*
+rtsBool isOffset(globalAddr *ga);
+rtsBool isFixed(globalAddr *ga);
+*/
+//@node Constants and Variables, Closures, Prototypes, Debugging routines for GranSim and GUM
+//@subsection Constants and Variables
+
+static HashTable *tmpClosureTable; // used in GraphFingerPrint and PrintGraph
+
+#if defined(PAR)
+/* One printable character per closure type, indexed by the closure-type
+ enumeration; used to build compact "fingerprints" of heap graphs. */
+static char finger_print_char[] = {
+ '/', /* INVALID_OBJECT 0 */
+ 'C', /* CONSTR 1 */
+ 'C', /* CONSTR_1_0 2 */
+ 'C', /* CONSTR_0_1 3 */
+ 'C', /* CONSTR_2_0 4 */
+ 'C', /* CONSTR_1_1 5 */
+ 'C', /* CONSTR_0_2 6 */
+ 'I', /* CONSTR_INTLIKE 7 */
+ 'I', /* CONSTR_CHARLIKE 8 */
+ 'S', /* CONSTR_STATIC 9 */
+ 'S', /* CONSTR_NOCAF_STATIC 10 */
+ 'F', /* FUN 11 */
+ 'F', /* FUN_1_0 12 */
+ 'F', /* FUN_0_1 13 */
+ 'F', /* FUN_2_0 14 */
+ 'F', /* FUN_1_1 15 */
+ 'F', /* FUN_0_2 16 */
+ 'S', /* FUN_STATIC 17 */
+ 'T', /* THUNK 18 */
+ 'T', /* THUNK_1_0 19 */
+ 'T', /* THUNK_0_1 20 */
+ 'T', /* THUNK_2_0 21 */
+ 'T', /* THUNK_1_1 22 */
+ 'T', /* THUNK_0_2 23 */
+ 'S', /* THUNK_STATIC 24 */
+ 'E', /* THUNK_SELECTOR 25 */
+ 'b', /* BCO 26 */
+ 'p', /* AP_UPD 27 */
+ 'p', /* PAP 28 */
+ '_', /* IND 29 */
+ '_', /* IND_OLDGEN 30 */
+ '_', /* IND_PERM 31 */
+ '_', /* IND_OLDGEN_PERM 32 */
+ '_', /* IND_STATIC 33 */
+ '?', /* ***unused*** 34 */
+ '?', /* ***unused*** 35 */
+ '^', /* RET_BCO 36 */
+ '^', /* RET_SMALL 37 */
+ '^', /* RET_VEC_SMALL 38 */
+ '^', /* RET_BIG 39 */
+ '^', /* RET_VEC_BIG 40 */
+ '^', /* RET_DYN 41 */
+ '~', /* UPDATE_FRAME 42 */
+ '~', /* CATCH_FRAME 43 */
+ '~', /* STOP_FRAME 44 */
+ '~', /* SEQ_FRAME 45 */
+ 'o', /* CAF_BLACKHOLE 46 */
+ 'o', /* BLACKHOLE 47 */
+ 'o', /* BLACKHOLE_BQ 48 */
+ 'o', /* SE_BLACKHOLE 49 */
+ 'o', /* SE_CAF_BLACKHOLE 50 */
+ 'm', /* MVAR 51 */
+ 'a', /* ARR_WORDS 52 */
+ 'a', /* MUT_ARR_PTRS 53 */
+ 'a', /* MUT_ARR_PTRS_FROZEN 54 */
+ 'q', /* MUT_VAR 55 */
+ 'w', /* WEAK 56 */
+ 'f', /* FOREIGN 57 */
+ 's', /* STABLE_NAME 58 */
+ '@', /* TSO 59 */
+ '#', /* BLOCKED_FETCH 60 */
+ '>', /* FETCH_ME 61 */
+ '>', /* FETCH_ME_BQ 62 */
+ '$', /* RBH 63 */
+ 'v', /* EVACUATED 64 */
+ '>' /* REMOTE_REF 65 */
+ /* ASSERT(there are N_CLOSURE_TYPES (==66) in this array) */
+};
+#endif /* PAR */
+
+#if defined(GRAN) && defined(GRAN_CHECK)
+//@node Closures, Threads, Constants and Variables, Debugging routines for GranSim and GUM
+//@subsection Closures
+
+/* Print one closure in full: address, optional GA / cost centre /
+ processor bitmask, info pointer and type, then its pointer and
+ non-pointer payload words. For blocking closures the attached
+ blocking queue is printed via G_PRINT_BQ. */
+void
+G_PRINT_NODE(node)
+StgClosure* node;
+{
+ StgInfoTable *info_ptr;
+ StgTSO* bqe;
+ nat size = 0, ptrs = 0, nonptrs = 0, i, vhs = 0;
+ char info_hdr_ty[80], info_ty[80];
+
+ if (node==NULL) {
+ fprintf(stderr,"NULL\n");
+ return;
+ } else if (node==END_TSO_QUEUE) {
+ fprintf(stderr,"END_TSO_QUEUE\n");
+ return;
+ }
+ /* size_and_ptrs(node,&size,&ptrs); */
+ info_ptr = get_closure_info(node, &size, &ptrs, &nonptrs, &vhs, info_hdr_ty);
+
+ /* vhs = var_hdr_size(node); */
+ display_info_type(info_ptr,info_ty);
+
+ fprintf(stderr,"Node: 0x%lx", node);
+
+#if defined(PAR)
+ fprintf(stderr," [GA: 0x%lx]",GA(node));
+#endif
+
+#if defined(USE_COST_CENTRES)
+ fprintf(stderr," [CC: 0x%lx]",CC_HDR(node));
+#endif
+
+#if defined(GRAN)
+ fprintf(stderr," [Bitmask: 0%lo]",PROCS(node));
+#endif
+
+ if (info_ptr->type==TSO)
+ fprintf(stderr," TSO: 0x%lx (%x) IP: 0x%lx (%s), type %s \n ",
+ (StgTSO*)node, ((StgTSO*)node)->id, info_ptr, info_hdr_ty, info_ty);
+ else
+ fprintf(stderr," IP: 0x%lx (%s), type %s \n VHS: %d, size: %ld, ptrs:%ld, nonptrs: %ld\n ",
+ info_ptr,info_hdr_ty,info_ty,vhs,size,ptrs,nonptrs);
+
+ /* For now, we ignore the variable header */
+
+ fprintf(stderr," Ptrs: ");
+ for(i=0; i < ptrs; ++i)
+ {
+ if ( (i+1) % 6 == 0)
+ fprintf(stderr,"\n ");
+ fprintf(stderr," 0x%lx[P]",node->payload[i]);
+ };
+
+ fprintf(stderr," Data: ");
+ for(i=0; i < nonptrs; ++i)
+ {
+ if( (i+1) % 6 == 0)
+ fprintf(stderr,"\n ");
+ fprintf(stderr," %lu[D]",node->payload[ptrs+i]);
+ }
+ fprintf(stderr, "\n");
+
+
+ switch (info_ptr->type)
+ {
+ case TSO:
+ fprintf(stderr,"\n TSO_LINK: %#lx",
+ ((StgTSO*)node)->link);
+ break;
+
+ case BLACKHOLE:
+ case RBH:
+ bqe = ((StgBlockingQueue*)node)->blocking_queue;
+ fprintf(stderr," BQ of %#lx: ", node);
+ G_PRINT_BQ(bqe);
+ break;
+ case FETCH_ME:
+ case FETCH_ME_BQ:
+ printf("Panic: found FETCH_ME or FETCH_ME_BQ Infotable in GrAnSim system.\n");
+ break;
+ default:
+ /* do nothing */
+ break; /* BUGFIX: a label must be followed by a statement (ISO C) */
+ }
+}
+
+/* Print a node in the compact one-line "packet" format used by
+ PrintPacket: fixed header words, variable header, pointer count and
+ non-pointer words. Blocking/fetch closures are shown with empty
+ layout since their payload is not meaningful here. */
+void
+G_PPN(node) /* Extracted from PrintPacket in Pack.lc */
+StgClosure* node;
+{
+ StgInfoTable *info ;
+ nat size = 0, ptrs = 0, nonptrs = 0, i, vhs = 0, locn = 0;
+ char info_type[80];
+
+ /* size_and_ptrs(node,&size,&ptrs); */
+ info = get_closure_info(node, &size, &ptrs, &nonptrs, &vhs, info_type);
+
+ if (info->type == FETCH_ME || info->type == FETCH_ME_BQ ||
+ info->type == BLACKHOLE || info->type == RBH )
+ size = ptrs = nonptrs = vhs = 0;
+
+ if (closure_THUNK(node)) {
+ if (!closure_UNPOINTED(node))
+ fputs("SHARED ", stderr);
+ else
+ fputs("UNSHARED ", stderr);
+ }
+ if (info->type==BLACKHOLE) {
+ fputs("BLACK HOLE\n", stderr);
+ } else {
+ /* Fixed header: the first _HS words of the closure */
+ fprintf(stderr, "(%s) FH [%#lx", info_type, node[locn++]);
+ for (i = 1; i < _HS; i++)
+ fprintf(stderr, " %#lx", node[locn++]);
+
+ /* Variable header */
+ if (vhs > 0) {
+ fprintf(stderr, "] VH [%#lx", node->payload[0]);
+
+ for (i = 1; i < vhs; i++)
+ fprintf(stderr, " %#lx", node->payload[i]);
+ }
+
+ fprintf(stderr, "] PTRS %u", ptrs);
+
+ /* Non-pointers */
+ if (nonptrs > 0) {
+ fprintf(stderr, " NPTRS [%#lx", node->payload[ptrs]);
+
+ for (i = 1; i < nonptrs; i++)
+ fprintf(stderr, " %#lx", node->payload[ptrs+i]);
+
+ putc(']', stderr);
+ }
+ putc('\n', stderr);
+ }
+
+}
+
+#if 0
+// ToDo: fix this!! -- HWL
+/* Disabled: dump the fields of a closure's info table. */
+void
+G_INFO_TABLE(node)
+StgClosure *node;
+{
+ StgInfoTable *info_ptr;
+ nat size = 0, ptrs = 0, nonptrs = 0, vhs = 0;
+ char info_type[80], hdr_type[80];
+
+ /* NOTE(review): info_ptr is read here before it is assigned below --
+ one reason this block is compiled out */
+ info_hdr_type(info_ptr, hdr_type);
+
+ // get_itbl(node);
+ info_ptr = get_closure_info(node, &size, &ptrs, &nonptrs, &vhs, info_type);
+ fprintf(stderr,"%s Info Ptr @0x%lx; Entry: 0x%lx; Size: %lu; Ptrs: %lu\n\n",
+ info_type,info_ptr,(W_) ENTRY_CODE(info_ptr),
+ size, ptrs);
+ // INFO_SIZE(info_ptr),INFO_NoPTRS(info_ptr));
+
+ if (closure_THUNK(node) && !closure_UNPOINTED(node) ) {
+ fprintf(stderr," RBH InfoPtr: %#lx\n",
+ RBH_INFOPTR(info_ptr));
+ }
+
+#if defined(PAR)
+ fprintf(stderr,"Enter Flush Entry: 0x%lx;\tExit Flush Entry: 0x%lx\n",INFO_FLUSHENT(info_ptr),INFO_FLUSH(info_ptr));
+#endif
+
+#if defined(USE_COST_CENTRES)
+ fprintf(stderr,"Cost Centre (?): 0x%lx\n",INFO_CAT(info_ptr));
+#endif
+
+#if defined(_INFO_COPYING)
+ fprintf(stderr,"Evacuate Entry: 0x%lx;\tScavenge Entry: 0x%lx\n",
+ INFO_EVAC_2S(info_ptr),INFO_SCAV_2S(info_ptr));
+#endif
+
+#if defined(_INFO_COMPACTING)
+ fprintf(stderr,"Scan Link: 0x%lx;\tScan Move: 0x%lx\n",
+ (W_) INFO_SCAN_LINK_1S(info_ptr), (W_) INFO_SCAN_MOVE_1S(info_ptr));
+ fprintf(stderr,"Mark: 0x%lx;\tMarked: 0x%lx;\t",
+ (W_) INFO_MARK_1S(info_ptr), (W_) INFO_MARKED_1S(info_ptr));
+#if 0 /* avoid INFO_TYPE */
+ if(BASE_INFO_TYPE(info_ptr)==INFO_SPEC_TYPE)
+ fprintf(stderr,"plus specialised code\n");
+ else
+ fprintf(stderr,"Marking: 0x%lx\n",(W_) INFO_MARKING_1S(info_ptr));
+#endif /* 0 */
+#endif /* _INFO_COMPACTING */
+}
+#endif /* 0 */
+
+//@cindex G_PRINT_BQ
+/* Print the blocking queue hanging off 'node' (a closure with a
+ blocking_queue field, e.g. BLACKHOLE_BQ or RBH). Each element is
+ shown with its address, link and whether it is local (L) to
+ CurrentProc or global (G). */
+void
+G_PRINT_BQ(node)
+StgClosure* node;
+{
+ StgInfoTable *info;
+ StgTSO *tso, *last;
+ char str[80], str0[80];
+
+ fprintf(stderr,"\n[PE %d] @ %lu BQ: ",
+ CurrentProc,CurrentTime[CurrentProc]);
+ if ( node == (StgClosure*)NULL ) {
+ fprintf(stderr," NULL.\n");
+ return;
+ }
+ if ( node == END_TSO_QUEUE ) {
+ fprintf(stderr," _|_\n");
+ return;
+ }
+ tso = ((StgBlockingQueue*)node)->blocking_queue;
+ /* BUGFIX: the loop previously tested and inspected 'node', which is
+ never advanced, so it looped forever printing the queue head; the
+ traversal variable is 'tso'. */
+ while (tso != END_TSO_QUEUE) {
+ PEs proc;
+
+ /* Find where the tso lives */
+ proc = where_is((StgClosure *)tso);
+ info = get_itbl(tso);
+
+ switch (info->type) {
+ case TSO:
+ strcpy(str0,"TSO");
+ break;
+ case BLOCKED_FETCH:
+ strcpy(str0,"BLOCKED_FETCH");
+ break;
+ default:
+ strcpy(str0,"???");
+ break;
+ }
+
+ if(proc == CurrentProc)
+ fprintf(stderr," %#lx (%x) L %s,",
+ tso, tso->link, str0);
+ else
+ fprintf(stderr," %#lx (%x) G (PE %d) %s,",
+ tso, tso->link, proc, str0);
+
+ last = tso;
+ tso = last->link;
+ }
+ if ( tso == END_TSO_QUEUE )
+ fprintf(stderr," _|_\n");
+}
+
+//@node Threads, Events, Closures, Debugging routines for GranSim and GUM
+//@subsection Threads
+
+/* Print the run queue of the current processor (run_queue_hd). */
+void
+G_CURR_THREADQ(verbose)
+StgInt verbose;
+{
+ fprintf(stderr,"Thread Queue on proc %d: ", CurrentProc);
+ G_THREADQ(run_queue_hd, verbose);
+}
+
+/* Print a thread queue starting at 'closure'; with 'verbose' each TSO
+ is dumped via G_TSO, otherwise only its address is printed. Prints
+ "NIL" when the queue is empty. */
+void
+G_THREADQ(closure, verbose)
+StgTSO* closure;
+StgInt verbose;
+{
+ StgTSO* x;
+
+ fprintf(stderr,"Thread Queue: ");
+ for (x=closure; x!=END_TSO_QUEUE; x=x->link)
+ if (verbose)
+ G_TSO(x,0);
+ else
+ fprintf(stderr," %#lx",x);
+
+ if (closure==END_TSO_QUEUE)
+ fprintf(stderr,"NIL\n");
+ else
+ fprintf(stderr,"\n");
+}
+
+/* Print the contents of a TSO. 'verbose' is a bit mask:
+ 0x08 = one-line summary only; 0x04 = include stack pointers;
+ 0x01 = include GranSim statistics fields; 0x02 = also print the
+ blocking queue starting at this TSO. */
+void
+G_TSO(closure,verbose)
+StgTSO* closure;
+StgInt verbose;
+{
+
+ if (closure==END_TSO_QUEUE) {
+ /* BUGFIX: the %#lx conversion had no matching argument */
+ fprintf(stderr,"TSO at %#lx is END_TSO_QUEUE!\n", closure);
+ return;
+ }
+
+ if ( verbose & 0x08 ) { /* short info */
+ fprintf(stderr,"[TSO @ %#lx, PE %d]: Id: %#lx, Link: %#lx\n",
+ closure,where_is(closure),
+ closure->id,closure->link);
+ return;
+ }
+
+ fprintf(stderr,"TSO at %#lx has the following contents:\n",
+ closure);
+
+ fprintf(stderr,"> Id: \t%#lx",closure->id);
+ // fprintf(stderr,"\tstate: \t%#lx",closure->state);
+ fprintf(stderr,"\twhat_next: \t%#lx",closure->what_next);
+ fprintf(stderr,"\tlink: \t%#lx\n",closure->link);
+ fprintf(stderr,"\twhy_blocked: \t%d", closure->why_blocked);
+ fprintf(stderr,"\tblock_info: \t%p\n", closure->block_info);
+ // fprintf(stderr,"\tType: \t%s\n",type_name[TSO_TYPE(closure)]);
+ fprintf(stderr,">PRI: \t%#lx", closure->gran.pri);
+ fprintf(stderr,"\tMAGIC: \t%#lx %s\n", closure->gran.magic,
+ (closure->gran.magic==TSO_MAGIC ? "it IS a TSO" : "THIS IS NO TSO!!"));
+ if ( verbose & 0x04 ) {
+ fprintf(stderr, "Stack: stack @ %#lx (stack_size: %u; max_stack_size: %u)\n",
+ closure->stack, closure->stack_size, closure->max_stack_size);
+ fprintf(stderr, " sp: %#lx, su: %#lx, splim: %#lx\n",
+ closure->sp, closure->su, closure->splim);
+ }
+ // fprintf(stderr,"\n");
+ if (verbose & 0x01) {
+ // fprintf(stderr,"} LOCKED: \t%#lx",closure->locked);
+ fprintf(stderr,"} SPARKNAME: \t%#lx\n", closure->gran.sparkname);
+ fprintf(stderr,"} STARTEDAT: \t%#lx", closure->gran.startedat);
+ fprintf(stderr,"\tEXPORTED: \t%#lx\n", closure->gran.exported);
+ fprintf(stderr,"} BASICBLOCKS: \t%#lx", closure->gran.basicblocks);
+ fprintf(stderr,"\tALLOCS: \t%#lx\n", closure->gran.allocs);
+ fprintf(stderr,"} EXECTIME: \t%#lx", closure->gran.exectime);
+ fprintf(stderr,"\tFETCHTIME: \t%#lx\n", closure->gran.fetchtime);
+ fprintf(stderr,"} FETCHCOUNT: \t%#lx", closure->gran.fetchcount);
+ fprintf(stderr,"\tBLOCKTIME: \t%#lx\n", closure->gran.blocktime);
+ fprintf(stderr,"} BLOCKCOUNT: \t%#lx", closure->gran.blockcount);
+ fprintf(stderr,"\tBLOCKEDAT: \t%#lx\n", closure->gran.blockedat);
+ fprintf(stderr,"} GLOBALSPARKS:\t%#lx", closure->gran.globalsparks);
+ fprintf(stderr,"\tLOCALSPARKS:\t%#lx\n", closure->gran.localsparks);
+ }
+ if ( verbose & 0x02 ) {
+ fprintf(stderr,"BQ that starts with this TSO: ");
+ G_PRINT_BQ(closure);
+ }
+}
+
+//@node Events, Sparks, Threads, Debugging routines for GranSim and GUM
+//@subsection Events
+
+/* Print a single GranSim event: full dump via print_event when
+ 'verbose', otherwise just its address. */
+void
+G_EVENT(event, verbose)
+rtsEventQ event;
+StgInt verbose;
+{
+ if (verbose) {
+ print_event(event);
+ }else{
+ fprintf(stderr," %#lx",event);
+ }
+}
+
+/* Print the whole GranSim event queue, head first. */
+void
+G_EVENTQ(verbose)
+StgInt verbose;
+{
+ extern rtsEventQ EventHd;
+ rtsEventQ x;
+
+ fprintf(stderr,"RtsEventQ (hd @%#lx):\n",EventHd);
+ for (x=EventHd; x!=NULL; x=x->next) {
+ G_EVENT(x,verbose);
+ }
+ if (EventHd==NULL)
+ fprintf(stderr,"NIL\n");
+ else
+ fprintf(stderr,"\n");
+}
+
+/* Print only the events of the event queue that belong to PE 'pe'. */
+void
+G_PE_EQ(pe,verbose)
+PEs pe;
+StgInt verbose;
+{
+ extern rtsEventQ EventHd;
+ rtsEventQ x;
+
+ fprintf(stderr,"RtsEventQ (hd @%#lx):\n",EventHd);
+ for (x=EventHd; x!=NULL; x=x->next) {
+ if (x->proc==pe)
+ G_EVENT(x,verbose);
+ }
+ if (EventHd==NULL)
+ fprintf(stderr,"NIL\n");
+ else
+ fprintf(stderr,"\n");
+}
+
+//@node Sparks, Processors, Events, Debugging routines for GranSim and GUM
+//@subsection Sparks
+
+/* Print one spark: full dump via print_spark when 'verbose',
+ otherwise just its address. NULL is rejected with a message. */
+void
+G_SPARK(spark, verbose)
+rtsSparkQ spark;
+StgInt verbose;
+{
+ if (spark==(rtsSpark*)NULL) {
+ belch("G_SPARK: NULL spark; aborting");
+ return;
+ }
+ if (verbose)
+ print_spark(spark);
+ else
+ fprintf(stderr," %#lx",spark);
+}
+
+/* Print a spark queue starting at 'spark'. NULL is rejected with a
+ message rather than treated as an empty queue. */
+void
+G_SPARKQ(spark,verbose)
+rtsSparkQ spark;
+StgInt verbose;
+{
+ rtsSparkQ x;
+
+ if (spark==(rtsSpark*)NULL) {
+ belch("G_SPARKQ: NULL spark; aborting");
+ return;
+ }
+
+ fprintf(stderr,"RtsSparkQ (hd @%#lx):\n",spark);
+ for (x=spark; x!=NULL; x=x->next) {
+ G_SPARK(x,verbose);
+ }
+ /* NOTE(review): spark is known non-NULL here (early return above),
+ so the "NIL" branch below is dead code */
+ if (spark==NULL)
+ fprintf(stderr,"NIL\n");
+ else
+ fprintf(stderr,"\n");
+}
+
+/* Print the spark queue of the current processor. */
+void
+G_CURR_SPARKQ(verbose)
+StgInt verbose;
+{
+ G_SPARKQ(pending_sparks_hd,verbose);
+}
+
+//@node Processors, Shortcuts, Sparks, Debugging routines for GranSim and GUM
+//@subsection Processors
+
+/* Print the status of processor 'proc': its clock, activity state,
+ run queue, current TSO (if it is the active PE), the next event,
+ and (verbose & 0x1) its spark queues. */
+void
+G_PROC(proc,verbose)
+StgInt proc;
+StgInt verbose;
+{
+ extern rtsEventQ EventHd;
+ extern char *proc_status_names[];
+
+ fprintf(stderr,"Status of proc %d at time %d (%#lx): %s (%s)\n",
+ proc,CurrentTime[proc],CurrentTime[proc],
+ (CurrentProc==proc)?"ACTIVE":"INACTIVE",
+ proc_status_names[procStatus[proc]]);
+ G_THREADQ(run_queue_hds[proc],verbose & 0x2);
+ if ( (CurrentProc==proc) )
+ G_TSO(CurrentTSO,1);
+
+ if (EventHd!=NULL)
+ fprintf(stderr,"Next event (%s) is on proc %d\n",
+ event_names[EventHd->evttype],EventHd->proc);
+
+ if (verbose & 0x1) {
+ fprintf(stderr,"\nREQUIRED sparks: ");
+ G_SPARKQ(pending_sparks_hds[proc],1);
+ fprintf(stderr,"\nADVISORY_sparks: ");
+ /* NOTE(review): this prints the same queue as the REQUIRED line
+ above -- looks like a copy/paste; confirm whether a separate
+ advisory-spark queue was intended */
+ G_SPARKQ(pending_sparks_hds[proc],1);
+ }
+}
+
+//@node Shortcuts, Printing info type, Processors, Debugging routines for GranSim and GUM
+//@subsection Shortcuts
+
+/* The one/two-letter wrappers below exist so the routines above can be
+ invoked conveniently from a debugger (e.g. "call GCP()" in gdb). */
+
+/* Debug Processor */
+void
+GP(proc)
+StgInt proc;
+{ G_PROC(proc,1);
+}
+
+/* Debug Current Processor */
+void
+GCP(){ G_PROC(CurrentProc,2); }
+
+/* Debug TSO */
+void
+GT(StgPtr tso){
+ G_TSO(tso,1);
+}
+
+/* Debug CurrentTSO */
+void
+GCT(){
+ fprintf(stderr,"Current Proc: %d\n",CurrentProc);
+ G_TSO(CurrentTSO,1);
+}
+
+/* Shorthand for debugging event queue */
+void
+GEQ() { G_EVENTQ(1); }
+
+/* Shorthand for debugging thread queue of a processor */
+void
+GTQ(PEs p) { G_THREADQ(run_queue_hds[p],1); }
+
+/* Shorthand for debugging thread queue of current processor */
+void
+GCTQ() { G_THREADQ(run_queue_hds[CurrentProc],1); }
+
+/* Shorthand for debugging spark queue of a processor */
+void
+GSQ(PEs p) { G_SPARKQ(pending_sparks_hds[p],1); }
+
+/* Shorthand for debugging spark queue of current processor */
+void
+GCSQ() { G_CURR_SPARKQ(1); }
+
+/* Shorthand for printing a node */
+void
+GN(StgPtr node) { G_PRINT_NODE(node); }
+
+/* Shorthand for printing info table */
+#if 0
+// ToDo: fix -- HWL
+void
+GIT(StgPtr node) { G_INFO_TABLE(node); }
+#endif
+
+/* Print head and tail pointers of every processor's run queue. */
+void
+printThreadQPtrs(void)
+{
+ PEs p;
+ for (p=0; p<RtsFlags.GranFlags.proc; p++) {
+ /* BUGFIX: the format has three conversions (%d,%p,%p) but only two
+ arguments were supplied; the PE number 'p' was missing */
+ fprintf(stderr,", PE %d: (hd=%p,tl=%p)",
+ p, run_queue_hds[p], run_queue_tls[p]);
+ }
+}
+
+/* Thin wrappers around G_THREADQ/G_SPARKQ in terse and verbose forms.
+ FIX: dropped the stray `;' after each body -- an extra semicolon at
+ file scope is not ISO C. */
+void
+printThreadQ(StgTSO *tso) { G_THREADQ(tso, 0); }
+
+void
+printSparkQ(rtsSpark *spark) { G_SPARKQ(spark, 0); }
+
+void
+printThreadQ_verbose(StgTSO *tso) { G_THREADQ(tso, 1); }
+
+void
+printSparkQ_verbose(rtsSpark *spark) { G_SPARKQ(spark, 1); }
+
+/* Shorthand for some of ADRs debugging functions */
+
+#endif /* GRAN && GRAN_CHECK*/
+
+#if 0
+/* Everything down to the matching #endif is compiled out: old
+ pre-RTS-rewrite debugging helpers (they use the obsolete INFO_PTR /
+ size_and_ptrs / var_hdr_size interface), kept only for reference. */
+void
+DEBUG_PRINT_NODE(node)
+StgPtr node;
+{
+ W_ info_ptr = INFO_PTR(node);
+ StgInt size = 0, ptrs = 0, i, vhs = 0;
+ char info_type[80];
+
+ info_hdr_type(info_ptr, info_type);
+
+ size_and_ptrs(node,&size,&ptrs);
+ vhs = var_hdr_size(node);
+
+ fprintf(stderr,"Node: 0x%lx", (W_) node);
+
+#if defined(PAR)
+ fprintf(stderr," [GA: 0x%lx]",GA(node));
+#endif
+
+#if defined(PROFILING)
+ fprintf(stderr," [CC: 0x%lx]",CC_HDR(node));
+#endif
+
+#if defined(GRAN)
+ fprintf(stderr," [Bitmask: 0%lo]",PROCS(node));
+#endif
+
+ fprintf(stderr," IP: 0x%lx (%s), size %ld, %ld ptrs\n",
+ info_ptr,info_type,size,ptrs);
+
+ /* For now, we ignore the variable header */
+
+ for(i=0; i < size; ++i)
+ {
+ if(i == 0)
+ fprintf(stderr,"Data: ");
+
+ else if(i % 6 == 0)
+ fprintf(stderr,"\n ");
+
+ if(i < ptrs)
+ fprintf(stderr," 0x%lx[P]",*(node+_HS+vhs+i));
+ else
+ fprintf(stderr," %lu[D]",*(node+_HS+vhs+i));
+ }
+ fprintf(stderr, "\n");
+}
+
+
+#define INFO_MASK 0x80000000
+
+/* Recursively print the graph below 'node', marking visited nodes by
+ setting INFO_MASK in their info pointer to avoid cycles. */
+void
+DEBUG_TREE(node)
+StgPtr node;
+{
+ W_ size = 0, ptrs = 0, i, vhs = 0;
+
+ /* Don't print cycles */
+ if((INFO_PTR(node) & INFO_MASK) != 0)
+ return;
+
+ size_and_ptrs(node,&size,&ptrs);
+ vhs = var_hdr_size(node);
+
+ DEBUG_PRINT_NODE(node);
+ fprintf(stderr, "\n");
+
+ /* Mark the node -- may be dangerous */
+ INFO_PTR(node) |= INFO_MASK;
+
+ for(i = 0; i < ptrs; ++i)
+ DEBUG_TREE((StgPtr)node[i+vhs+_HS]);
+
+ /* Unmark the node */
+ INFO_PTR(node) &= ~INFO_MASK;
+}
+
+
+/* Print the info-table fields of 'node' (old info-table layout). */
+void
+DEBUG_INFO_TABLE(node)
+StgPtr node;
+{
+ W_ info_ptr = INFO_PTR(node);
+ char *iStgPtrtype = info_hdr_type(info_ptr);
+
+ fprintf(stderr,"%s Info Ptr @0x%lx; Entry: 0x%lx; Size: %lu; Ptrs: %lu\n\n",
+ iStgPtrtype,info_ptr,(W_) ENTRY_CODE(info_ptr),INFO_SIZE(info_ptr),INFO_NoPTRS(info_ptr));
+#if defined(PAR)
+ fprintf(stderr,"Enter Flush Entry: 0x%lx;\tExit Flush Entry: 0x%lx\n",INFO_FLUSHENT(info_ptr),INFO_FLUSH(info_ptr));
+#endif
+
+#if defined(PROFILING)
+ fprintf(stderr,"Cost Centre (?): 0x%lx\n",INFO_CAT(info_ptr));
+#endif
+
+#if defined(_INFO_COPYING)
+ fprintf(stderr,"Evacuate Entry: 0x%lx;\tScavenge Entry: 0x%lx\n",
+ INFO_EVAC_2S(info_ptr),INFO_SCAV_2S(info_ptr));
+#endif
+
+#if defined(_INFO_COMPACTING)
+ fprintf(stderr,"Scan Link: 0x%lx;\tScan Move: 0x%lx\n",
+ (W_) INFO_SCAN_LINK_1S(info_ptr), (W_) INFO_SCAN_MOVE_1S(info_ptr));
+ fprintf(stderr,"Mark: 0x%lx;\tMarked: 0x%lx;\t",
+ (W_) INFO_MARK_1S(info_ptr), (W_) INFO_MARKED_1S(info_ptr));
+#if 0 /* avoid INFO_TYPE */
+ if(BASE_INFO_TYPE(info_ptr)==INFO_SPEC_TYPE)
+ fprintf(stderr,"plus specialised code\n");
+ else
+ fprintf(stderr,"Marking: 0x%lx\n",(W_) INFO_MARKING_1S(info_ptr));
+#endif /* 0 */
+#endif /* _INFO_COMPACTING */
+}
+#endif /* 0 */
+
+//@node Printing info type, Printing Packet Contents, Shortcuts, Debugging routines for GranSim and GUM
+//@subsection Printing info type
+
+/* Write a short textual tag describing closure-flag properties of
+ 'closure' into caller-supplied buffer 'str' and return 'str'.
+ NOTE(review): because this is an else-if chain after strcpy(str,""),
+ only the FIRST matching property is reported -- confirm whether
+ independent ifs (accumulating all flags) were intended. */
+char *
+display_info_type(closure, str)
+StgClosure *closure;
+char *str;
+{
+ strcpy(str,"");
+ if ( closure_HNF(closure) )
+ strcat(str,"|_HNF ");
+ else if ( closure_BITMAP(closure) )
+ strcat(str,"|_BTM");
+ else if ( !closure_SHOULD_SPARK(closure) )
+ strcat(str,"|_NS");
+ else if ( closure_STATIC(closure) )
+ strcat(str,"|_STA");
+ else if ( closure_THUNK(closure) )
+ strcat(str,"|_THU");
+ else if ( closure_MUTABLE(closure) )
+ strcat(str,"|_MUT");
+ else if ( closure_UNPOINTED(closure) )
+ strcat(str,"|_UPT");
+ else if ( closure_SRT(closure) )
+ strcat(str,"|_SRT");
+
+ return(str);
+}
+
+/*
+ PrintPacket is in Pack.c because it makes use of closure queues
+*/
+
+#if defined(GRAN) || defined(PAR)
+
+/*
+ Print graph rooted at q. The structure of this recursive printing routine
+ should be the same as in the graph traversals when packing a graph in
+ GUM. Thus, it demonstrates the structure of such a generic graph
+ traversal, and in particular, how to extract pointer and non-pointer info
+ from the multitude of different heap objects available.
+
+ {evacuate}Daq ngoqvam nIHlu'pu'!!
+*/
+
+/* Entry point for printing the heap graph rooted at 'p'. Sets up the
+ cycle-detection hash table used by the worker PrintGraph_, runs the
+ traversal, then frees the table again. */
+void
+PrintGraph(StgClosure *p, int indent_level)
+{
+ void PrintGraph_(StgClosure *p, int indent_level);
+
+ ASSERT(tmpClosureTable==NULL);
+
+ /* init hash table */
+ tmpClosureTable = allocHashTable();
+
+ /* now do the real work */
+ PrintGraph_(p, indent_level);
+
+ /* nuke hash table */
+ freeHashTable(tmpClosureTable, NULL);
+ tmpClosureTable = NULL;
+}
+
+/*
+ This is the actual worker function.
+ All recursive calls should be made to this function.
+*/
+void
+PrintGraph_(StgClosure *p, int indent_level)
+{
+ StgPtr x, q;
+ rtsBool printed = rtsFalse;
+ nat i, j;
+ const StgInfoTable *info;
+
+ /* check whether we have met this node already to break cycles */
+ if (lookupHashTable(tmpClosureTable, (StgWord)p)) { // ie. already touched
+ /* indentation */
+ for (j=0; j<indent_level; j++)
+ fputs(" ", stderr);
+
+ fprintf(stderr, "#### cylce to %p", p);
+ return;
+ }
+
+ /* record that we are processing this closure */
+ insertHashTable(tmpClosureTable, (StgWord) p, (void *)rtsTrue/*non-NULL*/);
+
+ q = p; /* save ptr to object */
+
+ /* indentation */
+ for (j=0; j<indent_level; j++)
+ fputs(" ", stderr);
+
+ ASSERT(p!=(StgClosure*)NULL);
+ ASSERT(LOOKS_LIKE_STATIC(p) ||
+ LOOKS_LIKE_GHC_INFO(GET_INFO((StgClosure *)p)) ||
+ IS_HUGS_CONSTR_INFO(GET_INFO((StgClosure *)p)));
+
+ printClosure(p); // prints contents of this one closure
+
+ /* indentation */
+ for (j=0; j<indent_level; j++)
+ fputs(" ", stderr);
+
+ info = get_itbl((StgClosure *)p);
+ /* the rest of this fct recursively traverses the graph */
+ switch (info -> type) {
+
+ case BCO:
+ {
+ StgBCO* bco = stgCast(StgBCO*,p);
+ nat i;
+ fprintf(stderr, "BCO (%p)\n", p);
+ /*
+ for (i = 0; i < bco->n_ptrs; i++) {
+ // bcoConstCPtr(bco,i) =
+ PrintGraph_(bcoConstCPtr(bco,i), indent_level+1);
+ }
+ */
+ // p += bco_sizeW(bco);
+ break;
+ }
+
+ case MVAR:
+ /* treat MVars specially, because we don't want to PrintGraph the
+ * mut_link field in the middle of the closure.
+ */
+ {
+ StgMVar *mvar = ((StgMVar *)p);
+ // evac_gen = 0;
+ fprintf(stderr, "MVAR (%p) with 3 pointers (head, tail, value)\n", p);
+ // (StgClosure *)mvar->head =
+ PrintGraph_((StgClosure *)mvar->head, indent_level+1);
+ // (StgClosure *)mvar->tail =
+ PrintGraph_((StgClosure *)mvar->tail, indent_level+1);
+ //(StgClosure *)mvar->value =
+ PrintGraph_((StgClosure *)mvar->value, indent_level+1);
+ // p += sizeofW(StgMVar);
+ // evac_gen = saved_evac_gen;
+ break;
+ }
+
+ case THUNK_2_0:
+ if (!printed) {
+ fprintf(stderr, "THUNK_2_0 (%p) with 2 pointers\n", p);
+ printed = rtsTrue;
+ }
+ case FUN_2_0:
+ if (!printed) {
+ fprintf(stderr, "FUN_2_0 (%p) with 2 pointers\n", p);
+ printed = rtsTrue;
+ }
+ // scavenge_srt(info);
+ case CONSTR_2_0:
+ if (!printed) {
+ fprintf(stderr, "CONSTR_2_0 (%p) with 2 pointers\n", p);
+ printed = rtsTrue;
+ }
+ // ((StgClosure *)p)->payload[0] =
+ PrintGraph_(((StgClosure *)p)->payload[0],
+ indent_level+1);
+ // ((StgClosure *)p)->payload[1] =
+ PrintGraph_(((StgClosure *)p)->payload[1],
+ indent_level+1);
+ // p += sizeofW(StgHeader) + 2;
+ break;
+
+ case THUNK_1_0:
+ // scavenge_srt(info);
+ fprintf(stderr, "THUNK_1_0 (%p) with 1 pointer\n", p);
+ // ((StgClosure *)p)->payload[0] =
+ PrintGraph_(((StgClosure *)p)->payload[0],
+ indent_level+1);
+ // p += sizeofW(StgHeader) + 2; /* MIN_UPD_SIZE */
+ break;
+
+ case FUN_1_0:
+ if (!printed) {
+ fprintf(stderr, "FUN_1_0 (%p) with 1 pointer\n", p);
+ printed = rtsTrue;
+ }
+ // scavenge_srt(info);
+ case CONSTR_1_0:
+ if (!printed) {
+ fprintf(stderr, "CONSTR_2_0 (%p) with 2 pointers\n", p);
+ printed = rtsTrue;
+ }
+ // ((StgClosure *)p)->payload[0] =
+ PrintGraph_(((StgClosure *)p)->payload[0],
+ indent_level+1);
+ // p += sizeofW(StgHeader) + 1;
+ break;
+
+ case THUNK_0_1:
+ fprintf(stderr, "THUNK_0_1 (%p) with 0 pointers\n", p);
+ // scavenge_srt(info);
+ // p += sizeofW(StgHeader) + 2; /* MIN_UPD_SIZE */
+ break;
+
+ case FUN_0_1:
+ fprintf(stderr, "FUN_0_1 (%p) with 0 pointers\n", p);
+ //scavenge_srt(info);
+ case CONSTR_0_1:
+ fprintf(stderr, "CONSTR_0_1 (%p) with 0 pointers\n", p);
+ //p += sizeofW(StgHeader) + 1;
+ break;
+
+ case THUNK_0_2:
+ if (!printed) {
+ fprintf(stderr, "THUNK_0_2 (%p) with 0 pointers\n", p);
+ printed = rtsTrue;
+ }
+ case FUN_0_2:
+ if (!printed) {
+ fprintf(stderr, "FUN_0_2 (%p) with 0 pointers\n", p);
+ printed = rtsTrue;
+ }
+ // scavenge_srt(info);
+ case CONSTR_0_2:
+ if (!printed) {
+ fprintf(stderr, "CONSTR_0_2 (%p) with 0 pointers\n", p);
+ printed = rtsTrue;
+ }
+ // p += sizeofW(StgHeader) + 2;
+ break;
+
+ case THUNK_1_1:
+ if (!printed) {
+ fprintf(stderr, "THUNK_1_1 (%p) with 1 pointer\n", p);
+ printed = rtsTrue;
+ }
+ case FUN_1_1:
+ if (!printed) {
+ fprintf(stderr, "FUN_1_1 (%p) with 1 pointer\n", p);
+ printed = rtsTrue;
+ }
+ // scavenge_srt(info);
+ case CONSTR_1_1:
+ if (!printed) {
+ fprintf(stderr, "CONSTR_1_1 (%p) with 1 pointer\n", p);
+ printed = rtsTrue;
+ }
+ // ((StgClosure *)p)->payload[0] =
+ PrintGraph_(((StgClosure *)p)->payload[0],
+ indent_level+1);
+ // p += sizeofW(StgHeader) + 2;
+ break;
+
+ case FUN:
+ if (!printed) {
+ fprintf(stderr, "FUN (%p) with %d pointers\n", p, info->layout.payload.ptrs);
+ printed = rtsTrue;
+ }
+ /* fall through */
+
+ case THUNK:
+ if (!printed) {
+ fprintf(stderr, "THUNK (%p) with %d pointers\n", p, info->layout.payload.ptrs);
+ printed = rtsTrue;
+ }
+ // scavenge_srt(info);
+ /* fall through */
+
+ case CONSTR:
+ if (!printed) {
+ fprintf(stderr, "CONSTR (%p) with %d pointers\n", p, info->layout.payload.ptrs);
+ printed = rtsTrue;
+ }
+ /* basically same as loop in STABLE_NAME case */
+ for (i=0; i<info->layout.payload.ptrs; i++)
+ PrintGraph_(((StgClosure *)p)->payload[i],
+ indent_level+1);
+ break;
+ /* NOT fall through */
+
+ case WEAK:
+ if (!printed) {
+ fprintf(stderr, "WEAK (%p) with %d pointers\n", p, info->layout.payload.ptrs);
+ printed = rtsTrue;
+ }
+ /* fall through */
+
+ case FOREIGN:
+ if (!printed) {
+ fprintf(stderr, "FOREIGN (%p) with %d pointers\n", p, info->layout.payload.ptrs);
+ printed = rtsTrue;
+ }
+ /* fall through */
+
+ case STABLE_NAME:
+ {
+ StgPtr end;
+
+ if (!printed) {
+ fprintf(stderr, "STABLE_NAME (%p) with %d pointers (not followed!)\n",
+ p, info->layout.payload.ptrs);
+ printed = rtsTrue;
+ }
+ end = (StgPtr)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+ for (p = (StgPtr)((StgClosure *)p)->payload; p < end; p++) {
+ // (StgClosure *)*p =
+ //PrintGraph_((StgClosure *)*p, indent_level+1);
+ fprintf(stderr, ", %p", *p);
+ }
+ //fputs("\n", stderr);
+ // p += info->layout.payload.nptrs;
+ break;
+ }
+
+ case IND_PERM:
+ //if (step->gen->no != 0) {
+ // SET_INFO(((StgClosure *)p), &IND_OLDGEN_PERM_info);
+ //}
+ if (!printed) {
+ fprintf(stderr, "IND_PERM (%p) with indirection to\n",
+ p, ((StgIndOldGen *)p)->indirectee);
+ printed = rtsTrue;
+ }
+ /* fall through */
+
+ case IND_OLDGEN_PERM:
+ if (!printed) {
+ fprintf(stderr, "IND_OLDGEN_PERM (%p) with indirection to %p\n",
+ p, ((StgIndOldGen *)p)->indirectee);
+ printed = rtsTrue;
+ }
+ // ((StgIndOldGen *)p)->indirectee =
+ PrintGraph_(((StgIndOldGen *)p)->indirectee,
+ indent_level+1);
+ //if (failed_to_evac) {
+ // failed_to_evac = rtsFalse;
+ // recordOldToNewPtrs((StgMutClosure *)p);
+ //}
+ // p += sizeofW(StgIndOldGen);
+ break;
+
+ case MUT_VAR:
+ /* ignore MUT_CONSs */
+ fprintf(stderr, "MUT_VAR (%p) pointing to %p\n", p, ((StgMutVar *)p)->var);
+ if (((StgMutVar *)p)->header.info != &stg_MUT_CONS_info) {
+ //evac_gen = 0;
+ PrintGraph_(((StgMutVar *)p)->var, indent_level+1);
+ //evac_gen = saved_evac_gen;
+ }
+ //p += sizeofW(StgMutVar);
+ break;
+
+ case CAF_BLACKHOLE:
+ if (!printed) {
+ fprintf(stderr, "CAF_BLACKHOLE (%p) with 0 pointers\n", p);
+ printed = rtsTrue;
+ }
+ case SE_CAF_BLACKHOLE:
+ if (!printed) {
+ fprintf(stderr, "SE_CAF_BLACKHOLE (%p) with 0 pointers\n", p);
+ printed = rtsTrue;
+ }
+ case SE_BLACKHOLE:
+ if (!printed) {
+ fprintf(stderr, "SE_BLACKHOLE (%p) with 0 pointers\n", p);
+ printed = rtsTrue;
+ }
+ case BLACKHOLE:
+ if (!printed) {
+ fprintf(stderr, "BLACKHOLE (%p) with 0 pointers\n", p);
+ printed = rtsTrue;
+ }
+ //p += BLACKHOLE_sizeW();
+ break;
+
+ case BLACKHOLE_BQ:
+ {
+ StgBlockingQueue *bh = (StgBlockingQueue *)p;
+ // (StgClosure *)bh->blocking_queue =
+ fprintf(stderr, "BLACKHOLE_BQ (%p) pointing to %p\n",
+ p, (StgClosure *)bh->blocking_queue);
+ PrintGraph_((StgClosure *)bh->blocking_queue, indent_level+1);
+ //if (failed_to_evac) {
+ // failed_to_evac = rtsFalse;
+ // recordMutable((StgMutClosure *)bh);
+ //}
+ // p += BLACKHOLE_sizeW();
+ break;
+ }
+
+ case THUNK_SELECTOR:
+ {
+ StgSelector *s = (StgSelector *)p;
+ fprintf(stderr, "THUNK_SELECTOR (%p) pointing to %p\n",
+ p, s->selectee);
+ PrintGraph_(s->selectee, indent_level+1);
+ // p += THUNK_SELECTOR_sizeW();
+ break;
+ }
+
+ case IND:
+ fprintf(stderr, "IND (%p) pointing to %p\n", p, ((StgInd*)p)->indirectee);
+ PrintGraph_(((StgInd*)p)->indirectee, indent_level+1);
+ break;
+
+ case IND_OLDGEN:
+ fprintf(stderr, "IND_OLDGEN (%p) pointing to %p\n",
+ p, ((StgIndOldGen*)p)->indirectee);
+ PrintGraph_(((StgIndOldGen*)p)->indirectee, indent_level+1);
+ break;
+
+ case CONSTR_INTLIKE:
+ fprintf(stderr, "CONSTR_INTLIKE (%p) with 0 pointers\n", p);
+ break;
+ case CONSTR_CHARLIKE:
+ fprintf(stderr, "CONSTR_CHARLIKE (%p) with 0 pointers\n", p);
+ break;
+ case CONSTR_STATIC:
+ fprintf(stderr, "CONSTR_STATIC (%p) with 0 pointers\n", p);
+ break;
+ case CONSTR_NOCAF_STATIC:
+ fprintf(stderr, "CONSTR_NOCAF_STATIC (%p) with 0 pointers\n", p);
+ break;
+ case THUNK_STATIC:
+ fprintf(stderr, "THUNK_STATIC (%p) with 0 pointers\n", p);
+ break;
+ case FUN_STATIC:
+ fprintf(stderr, "FUN_STATIC (%p) with 0 pointers\n", p);
+ break;
+ case IND_STATIC:
+ fprintf(stderr, "IND_STATIC (%p) with 0 pointers\n", p);
+ break;
+
+ case RET_BCO:
+ fprintf(stderr, "RET_BCO (%p) with 0 pointers\n", p);
+ break;
+ case RET_SMALL:
+ fprintf(stderr, "RET_SMALL (%p) with 0 pointers\n", p);
+ break;
+ case RET_VEC_SMALL:
+ fprintf(stderr, "RET_VEC_SMALL (%p) with 0 pointers\n", p);
+ break;
+ case RET_BIG:
+ fprintf(stderr, "RET_BIG (%p) with 0 pointers\n", p);
+ break;
+ case RET_VEC_BIG:
+ fprintf(stderr, "RET_VEC_BIG (%p) with 0 pointers\n", p);
+ break;
+ case RET_DYN:
+ fprintf(stderr, "RET_DYN (%p) with 0 pointers\n", p);
+ break;
+ case UPDATE_FRAME:
+ fprintf(stderr, "UPDATE_FRAME (%p) with 0 pointers\n", p);
+ break;
+ case STOP_FRAME:
+ fprintf(stderr, "STOP_FRAME (%p) with 0 pointers\n", p);
+ break;
+ case CATCH_FRAME:
+ fprintf(stderr, "CATCH_FRAME (%p) with 0 pointers\n", p);
+ break;
+ case SEQ_FRAME:
+ fprintf(stderr, "SEQ_FRAME (%p) with 0 pointers\n", p);
+ break;
+
+ case AP_UPD: /* same as PAPs */
+ fprintf(stderr, "AP_UPD (%p) with 0 pointers\n", p);
+ case PAP:
+ /* Treat a PAP just like a section of stack, not forgetting to
+ * PrintGraph_ the function pointer too...
+ */
+ {
+ StgPAP* pap = stgCast(StgPAP*,p);
+
+ fprintf(stderr, "PAP (%p) pointing to %p\n", p, pap->fun);
+ // pap->fun =
+ //PrintGraph_(pap->fun, indent_level+1);
+ //scavenge_stack((P_)pap->payload, (P_)pap->payload + pap->n_args);
+ //p += pap_sizeW(pap);
+ break;
+ }
+
+ case ARR_WORDS:
+ /* an array of (non-mutable) words */
+ fprintf(stderr, "ARR_WORDS (%p) of %d non-ptrs (maybe a string?)\n",
+ p, ((StgArrWords *)q)->words);
+ break;
+
+ case MUT_ARR_PTRS:
+ /* follow everything */
+ {
+ StgPtr next;
+
+ fprintf(stderr, "MUT_ARR_PTRS (%p) with %d pointers (not followed)\n",
+ p, mut_arr_ptrs_sizeW((StgMutArrPtrs*)p));
+ // evac_gen = 0; /* repeatedly mutable */
+ next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+ for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+ // (StgClosure *)*p =
+ // PrintGraph_((StgClosure *)*p, indent_level+1);
+ fprintf(stderr, ", %p", *p);
+ }
+ fputs("\n", stderr);
+ //evac_gen = saved_evac_gen;
+ break;
+ }
+
+ case MUT_ARR_PTRS_FROZEN:
+ /* follow everything */
+ {
+ StgPtr start = p, next;
+
+ fprintf(stderr, "MUT_ARR_PTRS (%p) with %d pointers (not followed)",
+ p, mut_arr_ptrs_sizeW((StgMutArrPtrs*)p));
+ next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+ for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+ // (StgClosure *)*p =
+ // PrintGraph_((StgClosure *)*p, indent_level+1);
+ fprintf(stderr, ", %p", *p);
+ }
+ fputs("\n", stderr);
+ //if (failed_to_evac) {
+ /* we can do this easier... */
+ // recordMutable((StgMutClosure *)start);
+ // failed_to_evac = rtsFalse;
+ //}
+ break;
+ }
+
+ case TSO:
+ {
+ StgTSO *tso;
+
+ tso = (StgTSO *)p;
+ fprintf(stderr, "TSO (%p) with link field %p\n", p, (StgClosure *)tso->link);
+ // evac_gen = 0;
+ /* chase the link field for any TSOs on the same queue */
+ // (StgClosure *)tso->link =
+ PrintGraph_((StgClosure *)tso->link, indent_level+1);
+ //if (tso->blocked_on) {
+ // tso->blocked_on = PrintGraph_(tso->blocked_on);
+ //}
+ /* scavenge this thread's stack */
+ //scavenge_stack(tso->sp, &(tso->stack[tso->stack_size]));
+ //evac_gen = saved_evac_gen;
+ //p += tso_sizeW(tso);
+ break;
+ }
+
+#if defined(GRAN) || defined(PAR)
+ case RBH:
+ {
+ StgInfoTable *rip = REVERT_INFOPTR(get_itbl(p));
+ //if (LOOKS_LIKE_GHC_INFO(rip))
+ // fprintf(stderr, "RBH (%p) with 0 pointers (reverted type=%s)\n",
+ // p, info_type_by_ip(rip));
+ //else
+ fprintf(stderr, "RBH (%p) with 0 pointers (reverted IP=%x)\n",
+ p, rip);
+ }
+ break;
+#endif
+#if defined(PAR)
+ case BLOCKED_FETCH:
+ fprintf(stderr, "BLOCKED_FETCH (%p) with 0 pointers (link=%p)\n",
+ p, ((StgBlockedFetch *)p)->link);
+ break;
+ case FETCH_ME:
+ fprintf(stderr, "FETCH_ME (%p) with 0 pointers\n", p);
+ break;
+ case FETCH_ME_BQ:
+ fprintf(stderr, "FETCH_ME_BQ (%p) with 0 pointers (blocking_queue=%p)\n",
+ p, ((StgFetchMeBlockingQueue *)p)->blocking_queue);
+ break;
+#endif
+
+#ifdef DIST
+ case REMOTE_REF:
+ fprintf(stderr, "REMOTE_REF (%p) with 0 pointers\n", p);
+ break;
+#endif
+
+ case EVACUATED:
+ fprintf(stderr, "EVACUATED (%p) with 0 pointers (evacuee=%p)\n",
+ p, ((StgEvacuated *)p)->evacuee);
+ break;
+
+ default:
+ barf("PrintGraph_: unknown closure %d (%s)",
+ info -> type, info_type(info));
+ }
+
+ /* If we didn't manage to promote all the objects pointed to by
+ * the current object, then we have to designate this object as
+ * mutable (because it contains old-to-new generation pointers).
+ */
+ //if (failed_to_evac) {
+ // mkMutCons((StgClosure *)q, &generations[evac_gen]);
+ // failed_to_evac = rtsFalse;
+ //}
+}
+
+# if defined(PAR)
+/*
+ Generate a finger-print for a graph.
+ A finger-print is a string, with each char representing one node;
+ depth-first traversal
+*/
+
+void
+GraphFingerPrint(StgClosure *p, char *finger_print)
+{
+ void GraphFingerPrint_(StgClosure *p, char *finger_print);
+
+ ASSERT(tmpClosureTable==NULL);
+ ASSERT(strlen(finger_print)==0);
+
+ /* init hash table */
+ tmpClosureTable = allocHashTable();
+
+ /* now do the real work */
+ GraphFingerPrint_(p, finger_print);
+
+ /* nuke hash table */
+ freeHashTable(tmpClosureTable, NULL);
+ tmpClosureTable = NULL;
+}
+
+/*
+ This is the actual worker functions.
+ All recursive calls should be made to this function.
+*/
+void
+GraphFingerPrint_(StgClosure *p, char *finger_print)
+{
+ StgPtr x, q;
+ rtsBool printed = rtsFalse;
+ nat i, j, len;
+ const StgInfoTable *info;
+
+ q = p; /* save ptr to object */
+ len = strlen(finger_print);
+ ASSERT(len<=MAX_FINGER_PRINT_LEN);
+ /* at most 7 chars for this node (I think) */
+ if (len+7>=MAX_FINGER_PRINT_LEN)
+ return;
+
+ /* check whether we have met this node already to break cycles */
+ if (lookupHashTable(tmpClosureTable, (StgWord)p)) { // ie. already touched
+ strcat(finger_print, "#");
+ return;
+ }
+
+ /* record that we are processing this closure */
+ insertHashTable(tmpClosureTable, (StgWord) p, (void *)rtsTrue/*non-NULL*/);
+
+ ASSERT(p!=(StgClosure*)NULL);
+ ASSERT(LOOKS_LIKE_STATIC(p) ||
+ LOOKS_LIKE_GHC_INFO(GET_INFO((StgClosure *)p)) ||
+ IS_HUGS_CONSTR_INFO(GET_INFO((StgClosure *)p)));
+
+ info = get_itbl((StgClosure *)p);
+ // append char for this node
+ finger_print[len] = finger_print_char[info->type]; finger_print[len+1] = '\0';
+ /* the rest of this fct recursively traverses the graph */
+ switch (info -> type) {
+
+ case BCO:
+ {
+ StgBCO* bco = stgCast(StgBCO*,p);
+ nat i;
+ //%% fprintf(stderr, "BCO (%p) with %d pointers\n", p, bco->n_ptrs);
+ /*
+ for (i = 0; i < bco->n_ptrs; i++) {
+ // bcoConstCPtr(bco,i) =
+ GraphFingerPrint_(bcoConstCPtr(bco,i), finger_print);
+ }
+ */
+ // p += bco_sizeW(bco);
+ break;
+ }
+
+ case MVAR:
+ break;
+
+ case THUNK_2_0:
+ case FUN_2_0:
+ case CONSTR_2_0:
+ // append char for this node
+ strcat(finger_print, "22(");
+ GraphFingerPrint_(((StgClosure *)p)->payload[0], finger_print);
+ GraphFingerPrint_(((StgClosure *)p)->payload[1], finger_print);
+ if (strlen(finger_print)+2<MAX_FINGER_PRINT_LEN)
+ strcat(finger_print, ")");
+ break;
+
+ case THUNK_1_0:
+ case FUN_1_0:
+ case CONSTR_1_0:
+ // append char for this node
+ strcat(finger_print, "12(");
+ GraphFingerPrint_(((StgClosure *)p)->payload[0], finger_print);
+ if (strlen(finger_print)+2<MAX_FINGER_PRINT_LEN)
+ strcat(finger_print, ")");
+ break;
+
+ case THUNK_0_1:
+ case FUN_0_1:
+ case CONSTR_0_1:
+ // append char for this node
+ strcat(finger_print, "01");
+ break;
+
+ case THUNK_0_2:
+ case FUN_0_2:
+ case CONSTR_0_2:
+ // append char for this node
+ strcat(finger_print, "02");
+ break;
+
+ case THUNK_1_1:
+ case FUN_1_1:
+ case CONSTR_1_1:
+ // append char for this node
+ strcat(finger_print, "11(");
+ GraphFingerPrint_(((StgClosure *)p)->payload[0], finger_print);
+ if (strlen(finger_print)+2<MAX_FINGER_PRINT_LEN)
+ strcat(finger_print, ")");
+ break;
+
+ case FUN:
+ case THUNK:
+ case CONSTR:
+ /* basically same as loop in STABLE_NAME case */
+ {
+ char str[6];
+ sprintf(str,"%d?(",info->layout.payload.ptrs);
+ strcat(finger_print,str);
+ for (i=0; i<info->layout.payload.ptrs; i++)
+ GraphFingerPrint_(((StgClosure *)p)->payload[i], finger_print);
+ if (strlen(finger_print)+2<MAX_FINGER_PRINT_LEN)
+ strcat(finger_print, ")");
+ }
+ break;
+
+ case WEAK:
+ case FOREIGN:
+ case STABLE_NAME:
+ {
+ StgPtr end;
+ char str[6];
+ sprintf(str,"%d?", info->layout.payload.ptrs);
+ strcat(finger_print,str);
+
+ //end = (StgPtr)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+ //for (p = (StgPtr)((StgClosure *)p)->payload; p < end; p++) {
+ // GraphFingerPrint_((StgClosure *)*p, finger_print);
+ //}
+ break;
+ }
+
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ GraphFingerPrint_(((StgIndOldGen *)p)->indirectee, finger_print);
+ break;
+
+ case MUT_VAR:
+ /* ignore MUT_CONSs */
+ if (((StgMutVar *)p)->header.info != &stg_MUT_CONS_info) {
+ GraphFingerPrint_(((StgMutVar *)p)->var, finger_print);
+ }
+ break;
+
+ case CAF_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ case SE_BLACKHOLE:
+ case BLACKHOLE:
+ break;
+
+ case BLACKHOLE_BQ:
+ {
+ StgBlockingQueue *bh = (StgBlockingQueue *)p;
+ // GraphFingerPrint_((StgClosure *)bh->blocking_queue, finger_print);
+ break;
+ }
+
+ case THUNK_SELECTOR:
+ {
+ StgSelector *s = (StgSelector *)p;
+ GraphFingerPrint_(s->selectee, finger_print);
+ break;
+ }
+
+ case IND:
+ GraphFingerPrint_(((StgInd*)p)->indirectee, finger_print);
+ break;
+
+ case IND_OLDGEN:
+ GraphFingerPrint_(((StgIndOldGen*)p)->indirectee, finger_print);
+ break;
+
+ case IND_STATIC:
+ GraphFingerPrint_(((StgIndOldGen*)p)->indirectee, finger_print);
+ break;
+
+ case CONSTR_INTLIKE:
+ case CONSTR_CHARLIKE:
+ case CONSTR_STATIC:
+ case CONSTR_NOCAF_STATIC:
+ case THUNK_STATIC:
+ case FUN_STATIC:
+ break;
+
+ case RET_BCO:
+ case RET_SMALL:
+ case RET_VEC_SMALL:
+ case RET_BIG:
+ case RET_VEC_BIG:
+ case RET_DYN:
+ case UPDATE_FRAME:
+ case STOP_FRAME:
+ case CATCH_FRAME:
+ case SEQ_FRAME:
+ break;
+
+ case AP_UPD: /* same as PAPs */
+ case PAP:
+ /* Treat a PAP just like a section of stack, not forgetting to
+ * GraphFingerPrint_ the function pointer too...
+ */
+ {
+ StgPAP* pap = stgCast(StgPAP*,p);
+ char str[6];
+ sprintf(str,"%d",pap->n_args);
+ strcat(finger_print,str);
+ //GraphFingerPrint_(pap->fun, finger_print); // ??
+ break;
+ }
+
+ case ARR_WORDS:
+ {
+ char str[6];
+ sprintf(str,"%d",((StgArrWords*)p)->words);
+ strcat(finger_print,str);
+ }
+ break;
+
+ case MUT_ARR_PTRS:
+ /* follow everything */
+ {
+ char str[6];
+ sprintf(str,"%d",((StgMutArrPtrs*)p)->ptrs);
+ strcat(finger_print,str);
+ }
+ {
+ StgPtr next;
+ //next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+ //for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+ // GraphFingerPrint_((StgClosure *)*p, finger_print);
+ //}
+ break;
+ }
+
+ case MUT_ARR_PTRS_FROZEN:
+ /* follow everything */
+ {
+ char str[6];
+ sprintf(str,"%d",((StgMutArrPtrs*)p)->ptrs);
+ strcat(finger_print,str);
+ }
+ {
+ StgPtr start = p, next;
+ //next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+ //for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+ // GraphFingerPrint_((StgClosure *)*p, finger_print);
+ //}
+ break;
+ }
+
+ case TSO:
+ {
+ StgTSO *tso = (StgTSO *)p;
+ char str[6];
+ sprintf(str,"%d",tso->id);
+ strcat(finger_print,str);
+ }
+ //GraphFingerPrint_((StgClosure *)tso->link, indent_level+1);
+ break;
+
+#if defined(GRAN) || defined(PAR)
+ case RBH:
+ {
+ // use this
+ // StgInfoTable *rip = REVERT_INFOPTR(get_itbl(p));
+ }
+ break;
+#endif
+#if defined(PAR)
+ case BLOCKED_FETCH:
+ break;
+ case FETCH_ME:
+ break;
+ case FETCH_ME_BQ:
+ break;
+#endif
+#ifdef DIST
+ case REMOTE_REF:
+ break;
+#endif
+ case EVACUATED:
+ break;
+
+ default:
+ barf("GraphFingerPrint_: unknown closure %d (%s)",
+ info -> type, info_type(info));
+ }
+
+}
+# endif /* PAR */
+
+/*
+ Do a sanity check on the whole graph, down to a recursion level of level.
+ Same structure as PrintGraph (nona).
+*/
+void
+checkGraph(StgClosure *p, int rec_level)
+{
+ StgPtr x, q;
+ nat i, j;
+ const StgInfoTable *info;
+
+ if (rec_level==0)
+ return;
+
+ q = p; /* save ptr to object */
+
+ /* First, the obvious generic checks */
+ ASSERT(p!=(StgClosure*)NULL);
+ checkClosure(p); /* see Sanity.c for what's actually checked */
+
+ info = get_itbl((StgClosure *)p);
+ /* the rest of this fct recursively traverses the graph */
+ switch (info -> type) {
+
+ case BCO:
+ {
+ StgBCO* bco = stgCast(StgBCO*,p);
+ nat i;
+ /*
+ for (i = 0; i < bco->n_ptrs; i++) {
+ checkGraph(bcoConstCPtr(bco,i), rec_level-1);
+ }
+ */
+ break;
+ }
+
+ case MVAR:
+ /* treat MVars specially, because we don't want to PrintGraph the
+ * mut_link field in the middle of the closure.
+ */
+ {
+ StgMVar *mvar = ((StgMVar *)p);
+ checkGraph((StgClosure *)mvar->head, rec_level-1);
+ checkGraph((StgClosure *)mvar->tail, rec_level-1);
+ checkGraph((StgClosure *)mvar->value, rec_level-1);
+ break;
+ }
+
+ case THUNK_2_0:
+ case FUN_2_0:
+ case CONSTR_2_0:
+ checkGraph(((StgClosure *)p)->payload[0], rec_level-1);
+ checkGraph(((StgClosure *)p)->payload[1], rec_level-1);
+ break;
+
+ case THUNK_1_0:
+ checkGraph(((StgClosure *)p)->payload[0], rec_level-1);
+ break;
+
+ case FUN_1_0:
+ case CONSTR_1_0:
+ checkGraph(((StgClosure *)p)->payload[0], rec_level-1);
+ break;
+
+ case THUNK_0_1:
+ break;
+
+ case FUN_0_1:
+ case CONSTR_0_1:
+ break;
+
+ case THUNK_0_2:
+ case FUN_0_2:
+ case CONSTR_0_2:
+ break;
+
+ case THUNK_1_1:
+ case FUN_1_1:
+ case CONSTR_1_1:
+ checkGraph(((StgClosure *)p)->payload[0], rec_level-1);
+ break;
+
+ case FUN:
+ case THUNK:
+ case CONSTR:
+ for (i=0; i<info->layout.payload.ptrs; i++)
+ checkGraph(((StgClosure *)p)->payload[i], rec_level-1);
+ break;
+
+ case WEAK:
+ case FOREIGN:
+ case STABLE_NAME:
+ {
+ StgPtr end;
+
+ end = (StgPtr)((StgClosure *)p)->payload + info->layout.payload.ptrs;
+ for (p = (StgPtr)((StgClosure *)p)->payload; p < end; p++) {
+ checkGraph(*(StgClosure **)p, rec_level-1);
+ }
+ break;
+ }
+
+ case IND_PERM:
+ case IND_OLDGEN_PERM:
+ checkGraph(((StgIndOldGen *)p)->indirectee, rec_level-1);
+ break;
+
+ case MUT_VAR:
+ /* ignore MUT_CONSs */
+ if (((StgMutVar *)p)->header.info != &stg_MUT_CONS_info) {
+ checkGraph(((StgMutVar *)p)->var, rec_level-1);
+ }
+ break;
+
+ case CAF_BLACKHOLE:
+ case SE_CAF_BLACKHOLE:
+ case SE_BLACKHOLE:
+ case BLACKHOLE:
+ break;
+
+ case BLACKHOLE_BQ:
+ break;
+
+ case THUNK_SELECTOR:
+ {
+ StgSelector *s = (StgSelector *)p;
+ checkGraph(s->selectee, rec_level-1);
+ break;
+ }
+
+ case IND:
+ checkGraph(((StgInd*)p)->indirectee, rec_level-1);
+ break;
+
+ case IND_OLDGEN:
+ checkGraph(((StgIndOldGen*)p)->indirectee, rec_level-1);
+ break;
+
+ case CONSTR_INTLIKE:
+ break;
+ case CONSTR_CHARLIKE:
+ break;
+ case CONSTR_STATIC:
+ break;
+ case CONSTR_NOCAF_STATIC:
+ break;
+ case THUNK_STATIC:
+ break;
+ case FUN_STATIC:
+ break;
+ case IND_STATIC:
+ break;
+
+ case RET_BCO:
+ break;
+ case RET_SMALL:
+ break;
+ case RET_VEC_SMALL:
+ break;
+ case RET_BIG:
+ break;
+ case RET_VEC_BIG:
+ break;
+ case RET_DYN:
+ break;
+ case UPDATE_FRAME:
+ break;
+ case STOP_FRAME:
+ break;
+ case CATCH_FRAME:
+ break;
+ case SEQ_FRAME:
+ break;
+
+ case AP_UPD: /* same as PAPs */
+ case PAP:
+ /* Treat a PAP just like a section of stack, not forgetting to
+ * checkGraph the function pointer too...
+ */
+ {
+ StgPAP* pap = stgCast(StgPAP*,p);
+
+ checkGraph(pap->fun, rec_level-1);
+ break;
+ }
+
+ case ARR_WORDS:
+ break;
+
+ case MUT_ARR_PTRS:
+ /* follow everything */
+ {
+ StgPtr next;
+
+ next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+ for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+ checkGraph(*(StgClosure **)p, rec_level-1);
+ }
+ break;
+ }
+
+ case MUT_ARR_PTRS_FROZEN:
+ /* follow everything */
+ {
+ StgPtr start = p, next;
+
+ next = p + mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
+ for (p = (P_)((StgMutArrPtrs *)p)->payload; p < next; p++) {
+ checkGraph(*(StgClosure **)p, rec_level-1);
+ }
+ break;
+ }
+
+ case TSO:
+ {
+ StgTSO *tso;
+
+ tso = (StgTSO *)p;
+ checkGraph((StgClosure *)tso->link, rec_level-1);
+ break;
+ }
+
+#if defined(GRAN) || defined(PAR)
+ case RBH:
+ break;
+#endif
+#if defined(PAR)
+ case BLOCKED_FETCH:
+ break;
+ case FETCH_ME:
+ break;
+ case FETCH_ME_BQ:
+ break;
+#endif
+ case EVACUATED:
+ barf("checkGraph: found EVACUATED closure %p (%s)",
+ p, info_type(p));
+ break;
+
+ default:
+ }
+}
+
+#endif /* GRAN */
+
+#endif /* GRAN || PAR */
+
+//@node End of File, , Printing Packet Contents, Debugging routines for GranSim and GUM
+//@subsection End of File
diff --git a/rts/parallel/ParallelDebug.h b/rts/parallel/ParallelDebug.h
new file mode 100644
index 0000000000..f8aaeb85d4
--- /dev/null
+++ b/rts/parallel/ParallelDebug.h
@@ -0,0 +1,79 @@
+/*
+ Time-stamp: <Tue Mar 06 2001 00:25:14 Stardate: [-30]6285.08 hwloidl>
+
+ Prototypes of all parallel debugging functions.
+*/
+
+#ifndef PARALLEL_DEBUG_H
+#define PARALLEL_DEBUG_H
+
+#if defined(DEBUG) && (defined(GRAN) || defined(PAR))
+/* max length of the string holding a finger-print for a graph */
+#define MAX_FINGER_PRINT_LEN 10000
+// (10*RtsFlags.ParFlags.packBufferSize)
+#endif
+
+#if defined(DEBUG) && defined(GRAN)
+void G_PRINT_NODE(StgClosure* node);
+void G_PPN(StgClosure* node);
+void G_INFO_TABLE(StgClosure* node);
+void G_CURR_THREADQ(StgInt verbose);
+void G_THREADQ(StgTSO* closure, StgInt verbose);
+void G_TSO(StgTSO* closure, StgInt verbose);
+void G_EVENT(rtsEventQ event, StgInt verbose);
+void G_EVENTQ(StgInt verbose);
+void G_PE_EQ(PEs pe, StgInt verbose);
+void G_SPARK(rtsSparkQ spark, StgInt verbose);
+void G_SPARKQ(rtsSparkQ spark, StgInt verbose);
+void G_CURR_SPARKQ(StgInt verbose);
+void G_PROC(StgInt proc, StgInt verbose);
+void GP(StgInt proc);
+void GCP(void);
+void GT(StgPtr tso);
+void GCT(void);
+void GEQ(void);
+void GTQ(PEs p);
+void GCTQ(void);
+void GSQ(PEs p);
+void GCSQ(void);
+void GN(StgPtr node);
+void GIT(StgPtr node);
+#endif
+
+#if defined(GRAN) || defined(PAR)
+
+char *display_info_type(StgClosure *closure, char *str);
+void info_hdr_type(StgClosure *closure, char *res);
+char *info_type(StgClosure *closure);
+char *info_type_by_ip(StgInfoTable *ip);
+
+void PrintPacket(rtsPackBuffer *buffer);
+void PrintGraph(StgClosure *p, int indent_level);
+void GraphFingerPrint(StgClosure *p, char *finger_print);
+void checkGraph(StgClosure *p, int rec_level);
+
+void checkPacket(rtsPackBuffer *packBuffer);
+
+#endif /* GRAN || PAR */
+
+#if defined(PAR)
+
+/* don't want to import Schedule.h and Sanity.h everywhere */
+extern void print_bq (StgClosure *node);
+extern void checkBQ (StgBlockingQueueElement *bqe, StgClosure *closure);
+
+void checkGAGAMap(globalAddr *gagamap, int nGAs);
+extern rtsBool isOnLiveIndTable(globalAddr *ga);
+extern void rebuildGAtables(rtsBool full);
+extern void rebuildLAGAtable(void);
+extern void checkLAGAtable(rtsBool check_closures);
+extern void checkHeapChunk(StgPtr start, StgPtr end);
+extern void printGA (globalAddr *ga);
+extern void printGALA (GALA *gala);
+extern void printLiveIndTable(void);
+extern void printRemoteGATable(void);
+extern void printLAGAtable(void);
+
+#endif
+
+#endif /* PARALLEL_DEBUG_H */
diff --git a/rts/parallel/ParallelRts.h b/rts/parallel/ParallelRts.h
new file mode 100644
index 0000000000..d421296d19
--- /dev/null
+++ b/rts/parallel/ParallelRts.h
@@ -0,0 +1,253 @@
+/* --------------------------------------------------------------------------
+ Time-stamp: <Tue Mar 06 2001 00:25:50 Stardate: [-30]6285.08 hwloidl>
+
+ Variables and functions specific to the parallel RTS (i.e. GUM or GranSim)
+ ----------------------------------------------------------------------- */
+
+#ifndef PARALLEL_RTS_H
+#define PARALLEL_RTS_H
+
+#include "ParTicky.h"
+
+/* HWL HACK: compile time sanity checks; shouldn't be necessary at all */
+#if defined(PAR) && defined(GRAN)
+# error "Both PAR and GRAN defined"
+#endif
+
+#if defined(DEBUG)
+/* Paranoia debugging: we add an end-of-buffer marker to every pack buffer
+ (only when sanity checking RTS is enabled, of course) */
+#define DEBUG_HEADROOM 1
+#define END_OF_BUFFER_MARKER 0x1111bbbb
+#define GARBAGE_MARKER 0x1111eeee
+#else
+#define DEBUG_HEADROOM 0
+#endif /* DEBUG */
+
+#if defined(GRAN) || defined(PAR)
+
+#if defined(GRAN)
+
+/* Statistics info */
+extern nat tot_packets, tot_packet_size, tot_cuts, tot_thunks;
+
+/* Pack.c */
+rtsPackBuffer *PackNearbyGraph(StgClosure* closure, StgTSO* tso,
+ nat *packBufferSize, GlobalTaskId dest);
+rtsPackBuffer *PackOneNode(StgClosure* closure, StgTSO* tso,
+ nat *packBufferSize);
+rtsPackBuffer *PackTSO(StgTSO *tso, nat *packBufferSize);
+rtsPackBuffer *PackStkO(StgPtr stko, nat *packBufferSize);
+void PackFetchMe(StgClosure *closure);
+
+/* Unpack.c */
+StgClosure* UnpackGraph(rtsPackBuffer* buffer);
+void InitPendingGABuffer(nat size);
+
+/* RBH.c */
+StgClosure *convertToRBH(StgClosure *closure);
+void convertFromRBH(StgClosure *closure);
+
+/* HLComms.c */
+rtsFetchReturnCode blockFetch(StgTSO* tso, PEs proc, StgClosure* bh);
+void blockThread(StgTSO *tso);
+
+#endif
+#if defined(PAR)
+
+/* Statistics info */
+
+/* global structure for collecting statistics */
+typedef struct GlobalParStats_ {
+ /* GALA and LAGA table info */
+ nat tot_mark_GA, tot_rebuild_GA, tot_free_GA,
+ res_mark_GA, res_rebuild_GA, res_free_GA,
+ cnt_mark_GA, cnt_rebuild_GA, cnt_free_GA,
+ res_size_GA, tot_size_GA, local_alloc_GA, tot_global, tot_local;
+
+ /* time spent managing the GAs */
+ double time_mark_GA, time_rebuild_GA;
+
+ /* spark queue stats */
+ nat res_sp, tot_sp, cnt_sp, emp_sp;
+ // nat tot_sq_len, tot_sq_probes, tot_sparks;
+ /* thread queue stats */
+ nat res_tp, tot_tp, cnt_tp, emp_tp;
+ //nat tot_add_threads, tot_tq_len, non_end_add_threads;
+
+ /* packet statistics */
+ nat tot_packets, tot_packet_size, tot_thunks,
+ res_packet_size, res_thunks,
+ rec_packets, rec_packet_size, rec_thunks,
+ rec_res_packet_size, rec_res_thunks;
+ /* time spent packing stuff */
+ double time_pack, time_unpack;
+
+ /* thread stats */
+ nat tot_threads_created;
+
+ /* spark stats */
+ //nat pruned_sparks, withered_sparks;
+ nat tot_sparks_created, tot_sparks_ignored, tot_sparks_marked,
+ res_sparks_created, res_sparks_ignored, res_sparks_marked; // , sparks_created_on_PE[MAX_PROC];
+ double time_sparks;
+
+ /* scheduling stats */
+ nat tot_yields, tot_stackover, tot_heapover;
+
+ /* message statistics */
+ nat tot_fish_mess, tot_fetch_mess, tot_resume_mess, tot_schedule_mess;
+ nat rec_fish_mess, rec_fetch_mess, rec_resume_mess, rec_schedule_mess;
+#if defined(DIST)
+ nat tot_reval_mess;
+ nat rec_reval_mess;
+#endif
+
+ /* blocking queue statistics
+ rtsTime tot_bq_processing_time;
+ nat tot_bq_len, tot_bq_len_local, tot_awbq, tot_FMBQs;
+ */
+
+ /* specialised info on arrays (for GPH/Maple mainly) */
+ nat tot_arrs, tot_arr_size;
+} GlobalParStats;
+
+extern GlobalParStats globalParStats;
+
+void globalParStat_exit(void);
+
+/* Pack.c */
+rtsBool InitPackBuffer(void);
+rtsPackBuffer *PackNearbyGraph(StgClosure* closure, StgTSO* tso,
+ nat *packBufferSize, GlobalTaskId dest);
+
+/* Unpack.c */
+void CommonUp(StgClosure *src, StgClosure *dst);
+StgClosure *UnpackGraph(rtsPackBuffer *buffer, globalAddr **gamap,
+ nat *nGAs);
+
+/* RBH.c */
+StgClosure *convertToRBH(StgClosure *closure);
+void convertToFetchMe(StgRBH *rbh, globalAddr *ga);
+
+/* HLComms.c */
+void blockFetch(StgBlockedFetch *bf, StgClosure *bh);
+void blockThread(StgTSO *tso);
+
+/* Global.c */
+void GALAdeprecate(globalAddr *ga);
+
+/* HLComms.c */
+nat pending_fetches_len(void);
+
+/* ParInit.c */
+void initParallelSystem(void);
+void shutdownParallelSystem(StgInt n);
+void synchroniseSystem(void);
+void par_exit(I_);
+
+#endif
+
+/* this routine should be moved to a more general module; currently in Pack.c
+StgInfoTable* get_closure_info(StgClosure* node,
+ nat *size, nat *ptrs, nat *nonptrs, nat *vhs,
+ char *info_hdr_ty);
+*/
+void doGlobalGC(void);
+
+//@node GC routines, Debugging routines, Spark handling routines
+//@subsection GC routines
+
+#if defined(PAR)
+/* HLComms.c */
+void freeRemoteGA(int pe, globalAddr *ga);
+void sendFreeMessages(void);
+void markPendingFetches(rtsBool major_gc);
+
+/* Global.c */
+void markLocalGAs(rtsBool full);
+void RebuildGAtables(rtsBool full);
+void RebuildLAGAtable(void);
+#endif
+
+//@node Debugging routines, Generating .gr profiles, GC routines
+//@subsection Debugging routines
+
+#if defined(PAR)
+void printGA (globalAddr *ga);
+void printGALA (GALA *gala);
+void printLAGAtable(void);
+
+rtsBool isOnLiveIndTable(globalAddr *ga);
+rtsBool isOnRemoteGATable(globalAddr *ga);
+void checkFreeGALAList(void);
+void checkFreeIndirectionsList(void);
+#endif
+
+//@node Generating .gr profiles, Index, Debugging routines
+//@subsection Generating .gr profiles
+
+#define STATS_FILENAME_MAXLEN 128
+
+/* Where to write the log file */
+//@cindex gr_file
+//@cindex gr_filename
+extern FILE *gr_file;
+extern char gr_filename[STATS_FILENAME_MAXLEN];
+
+//@cindex init_gr_stats
+//@cindex init_gr_simulation
+//@cindex end_gr_simulation
+void init_gr_stats (void);
+void init_gr_simulation(int rts_argc, char *rts_argv[],
+ int prog_argc, char *prog_argv[]);
+void end_gr_simulation(void);
+
+// TODO: move fcts in here (as static inline)
+StgInfoTable* get_closure_info(StgClosure* node, nat *size, nat *ptrs, nat *nonptrs, nat *vhs, char *info_hdr_ty);
+rtsBool IS_BLACK_HOLE(StgClosure* node);
+StgClosure *IS_INDIRECTION(StgClosure* node) ;
+StgClosure *UNWIND_IND (StgClosure *closure);
+
+
+#endif /* defined(PAR) || defined(GRAN) */
+
+//@node Common macros, Index, Generating .gr profiles
+//@subsection Common macros
+
+#define LOOKS_LIKE_PTR(r) \
+ (LOOKS_LIKE_STATIC_CLOSURE(r) || \
+ ((HEAP_ALLOCED(r) && Bdescr((P_)r)->free != (void *)-1)))
+
+/* see Sanity.c for this kind of test; doing this in these basic fcts
+ is paranoid (nuke it after debugging!)
+*/
+
+/* pathetic version of the check whether p can be a closure */
+#define LOOKS_LIKE_COOL_CLOSURE(p) 1
+
+//LOOKS_LIKE_GHC_INFO(get_itbl(p))
+
+ /* Is it a static closure (i.e. in the data segment)? */ \
+ /*
+#define LOOKS_LIKE_COOL_CLOSURE(p) \
+ ((LOOKS_LIKE_STATIC(p)) ? \
+ closure_STATIC(p) \
+ : !closure_STATIC(p) && LOOKS_LIKE_PTR(p))
+ */
+
+#endif /* PARALLEL_RTS_H */
+
+//@node Index, , Index
+//@subsection Index
+
+//@index
+//* IS_BLACK_HOLE:: @cindex\s-+IS_BLACK_HOLE
+//* IS_INDIRECTION:: @cindex\s-+IS_INDIRECTION
+//* end_gr_simulation:: @cindex\s-+end_gr_simulation
+//* get_closure_info:: @cindex\s-+get_closure_info
+//* gr_file:: @cindex\s-+gr_file
+//* gr_filename:: @cindex\s-+gr_filename
+//* init_gr_simulation:: @cindex\s-+init_gr_simulation
+//* unwindInd:: @cindex\s-+unwindInd
+//@end index
diff --git a/rts/parallel/RBH.c b/rts/parallel/RBH.c
new file mode 100644
index 0000000000..1612209027
--- /dev/null
+++ b/rts/parallel/RBH.c
@@ -0,0 +1,337 @@
+/*
+ Time-stamp: <Tue Mar 13 2001 19:07:13 Stardate: [-30]6323.98 hwloidl>
+
+ Revertible Black Hole Manipulation.
+ Used in GUM and GranSim during the packing of closures. These black holes
+ must be revertible because a GC might occur while the packet is being
+ transmitted. In this case all RBHs have to be reverted.
+ */
+
+#if defined(PAR) || defined(GRAN) /* whole file */
+
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "RtsUtils.h"
+#include "GranSimRts.h"
+#include "ParallelRts.h"
+# if defined(DEBUG)
+# include "ParallelDebug.h"
+# endif
+#include "Storage.h" // for recordMutable
+#include "StgMacros.h" // inlined IS_... fcts
+
+/*
+ Turn a closure into a revertible black hole. After the conversion, the
+ first two words of the closure (after the fixed header, of course) will
+ be a link to the mutables list (if appropriate for the garbage
+ collector), and a pointer to the blocking queue. The blocking queue is
+ terminated by a 2-word SPEC closure which holds the original contents of
+ the first two words of the closure.
+*/
+
+//@menu
+//* Externs and prototypes::
+//* Conversion Functions::
+//* Index::
+//@end menu
+
+//@node Externs and prototypes, Conversion Functions
+//@section Externs and prototypes
+
+EXTFUN(stg_RBH_Save_0_info);
+EXTFUN(stg_RBH_Save_1_info);
+EXTFUN(stg_RBH_Save_2_info);
+
+//@node Conversion Functions, Index, Externs and prototypes
+//@section Conversion Functions
+
+/*
+ A closure is turned into an RBH upon packing it (see PackClosure in Pack.c).
+ This is needed in case we have to do a GC before the packet is turned
+ into a graph on the PE receiving the packet.
+*/
+//@cindex convertToRBH
+StgClosure *
+convertToRBH(closure)
+StgClosure *closure;
+{
+ StgRBHSave *rbh_save;
+ StgInfoTable *info_ptr, *rbh_info_ptr, *old_info;
+ nat size, ptrs, nonptrs, vhs;
+ char str[80];
+
+ /*
+ Closure layout before this routine runs amuck:
+ +-------------------
+ | HEADER | DATA ...
+ +-------------------
+ | FIXED_HS |
+ */
+ /*
+ Turn closure into an RBH. This is done by modifying the info_ptr,
+ grabbing the info_ptr of the RBH for this closure out of its
+ ITBL. Additionally, we have to save the words from the closure, which
+ will hold the link to the blocking queue. For this purpose we use the
+ RBH_Save_N closures, with N being the number of pointers for this
+ closure. */
+ IF_GRAN_DEBUG(pack,
+ belch("*>:: %p (%s): Converting closure into an RBH",
+ closure, info_type(closure)));
+ IF_PAR_DEBUG(pack,
+ belch("*>:: %p (%s): Converting closure into an RBH",
+ closure, info_type(closure)));
+
+ ASSERT(closure_THUNK(closure));
+
+ IF_GRAN_DEBUG(pack,
+ old_info = get_itbl(closure));
+
+ /* Allocate a new closure for the holding data ripped out of closure */
+ if ((rbh_save = (StgRBHSave *)allocate(_HS + 2)) == NULL)
+ return NULL; /* have to Garbage Collect; check that in the caller! */
+
+ info_ptr = get_closure_info(closure, &size, &ptrs, &nonptrs, &vhs, str);
+ ASSERT(size >= _HS+MIN_UPD_SIZE);
+
+ /* Fill in the RBH_Save closure with the original data from closure */
+ rbh_save->payload[0] = (StgPtr) ((StgRBH *)closure)->blocking_queue;
+ rbh_save->payload[1] = (StgPtr) ((StgRBH *)closure)->mut_link;
+
+ /* Set the info_ptr for the rbh_Save closure according to the number of
+ pointers in the original */
+
+ rbh_info_ptr = (StgInfoTable *) (ptrs == 0 ? &stg_RBH_Save_0_info :
+ ptrs == 1 ? &stg_RBH_Save_1_info :
+ &stg_RBH_Save_2_info);
+ SET_INFO(rbh_save, rbh_info_ptr);
+ /* same bitmask as the original closure */
+ SET_GRAN_HDR(rbh_save, PROCS(closure));
+
+ /* Init the blocking queue of the RBH and have it point to the saved data */
+ ((StgRBH *)closure)->blocking_queue = (StgBlockingQueueElement *)rbh_save;
+
+ ASSERT(LOOKS_LIKE_GHC_INFO(RBH_INFOPTR(get_itbl(closure))));
+ /* Turn the closure into a RBH; a great system, indeed! */
+ SET_INFO(closure, RBH_INFOPTR(get_itbl(closure)));
+
+ /*
+ add closure to the mutable list!
+ do this after having turned the closure into an RBH, because an
+ RBH is mutable but the closure it was before wasn't mutable
+ */
+ recordMutable((StgMutClosure *)closure);
+
+ //IF_GRAN_DEBUG(pack,
+ /* sanity check; make sure that reverting the RBH yields the
+ orig closure, again */
+ //ASSERT(REVERT_INFOPTR(get_itbl(closure))==old_info));
+
+ /*
+ Closure layout after this routine has run amuck:
+ +---------------------
+ | RBH-HEADER | | | ...
+ +--------------|---|--
+ | FIXED_HS | | v
+ | Mutable-list ie another StgMutClosure
+ v
+ +---------
+ | RBH_SAVE with 0-2 words of DATA
+ +---------
+ */
+
+ return closure;
+}
+
+/*
+ An RBH closure is turned into a FETCH_ME when reveiving an ACK message
+ indicating that the transferred closure has been unpacked on the other PE
+ (see processAck in HLComms.c). The ACK also contains the new GA of the
+ closure to which the FETCH_ME closure has to point.
+
+ Converting a closure to a FetchMe is trivial, unless the closure has
+ acquired a blocking queue. If that has happened, we first have to awaken
+ the blocking queue. What a nuisance! Fortunately, @AwakenBlockingQueue@
+ should now know what to do.
+
+ A note on GrAnSim: In GrAnSim we don't have FetchMe closures. However,
+ we have to turn a RBH back to its original form when the simulated
+ transfer of the closure has been finished. Therefore we need the
+ @convertFromRBH@ routine below. After converting the RBH back to its
+ original form and awakening all TSOs, the first TSO will reenter the
+ closure which is now local and carry on merrily reducing it (the other
+ TSO will be less merrily blocked on the now local closure; we're costing
+ the difference between local and global blocks in the BQ code). -- HWL
+*/
+
+# if defined(PAR)
+
+EXTFUN(stg_FETCH_ME_info);
+
+//@cindex convertToFetchMe
+void
+convertToFetchMe(rbh, ga)
+StgRBH *rbh;
+globalAddr *ga;
+{
+ // StgInfoTable *ip = get_itbl(rbh);
+ StgBlockingQueueElement *bqe = rbh->blocking_queue;
+
+ ASSERT(get_itbl(rbh)->type==RBH);
+
+ IF_PAR_DEBUG(pack,
+ belch("**:: Converting RBH %p (%s) into a FETCH_ME for GA ((%x, %d, %x))",
+ rbh, info_type(rbh),
+ ga->payload.gc.gtid, ga->payload.gc.slot, ga->weight));
+
+ /* put closure on mutables list, while it is still a RBH */
+ recordMutable((StgMutClosure *)rbh);
+
+ /* actually turn it into a FETCH_ME */
+ SET_INFO((StgClosure *)rbh, &stg_FETCH_ME_info);
+
+ /* set the global pointer in the FETCH_ME closure to the given value */
+ ((StgFetchMe *)rbh)->ga = ga;
+
+ IF_PAR_DEBUG(pack,
+ if (get_itbl(bqe)->type==TSO || get_itbl(bqe)->type==BLOCKED_FETCH)
+ belch("**:: Awakening non-empty BQ of RBH closure %p (first TSO is %d (%p)",
+ rbh, ((StgTSO *)bqe)->id, ((StgTSO *)bqe)));
+
+ /* awaken all TSOs and BLOCKED_FETCHES on the blocking queue */
+ if (get_itbl(bqe)->type==TSO || get_itbl(bqe)->type==BLOCKED_FETCH)
+ awakenBlockedQueue(bqe, (StgClosure *)rbh);
+}
+# else /* GRAN */
+/* Prototype */
+// void UnlinkFromMUT(StgPtr closure);
+
+/*
+ This routine in fact reverts the RBH into its original form; this code
+ should be of interest for GUM, too, but is not needed in the current version.
+ convertFromRBH is called where GUM uses convertToFetchMe.
+*/
+void
+convertFromRBH(closure)
+StgClosure *closure;
+{
+ StgBlockingQueueElement *bqe = ((StgRBH*)closure)->blocking_queue;
+ char str[NODE_STR_LEN]; // debugging only
+ StgInfoTable *rip = REVERT_INFOPTR(get_itbl(closure)); // debugging only
+
+ IF_GRAN_DEBUG(pack,
+ if (get_itbl(bqe)->type==TSO)
+ sprintf(str, "%d (%p)",
+ ((StgTSO *)bqe)->id, ((StgTSO *)bqe));
+ else
+ strcpy(str, "empty");
+ belch("*<:: Reverting RBH %p (%s) into a ??? closure again; BQ start: %s",
+ closure, info_type(closure), str));
+
+ ASSERT(get_itbl(closure)->type==RBH);
+
+ /* awakenBlockedQueue also restores the RBH_Save closure
+ (have to call it even if there are no TSOs in the queue!) */
+ awakenBlockedQueue(bqe, closure);
+
+ /* Put back old info pointer (grabbed from the RBH's info table).
+ We do that *after* awakening the BQ to be sure node is an RBH when
+ calling awakenBlockedQueue (different in GUM!)
+ */
+ SET_INFO(closure, REVERT_INFOPTR(get_itbl(closure)));
+
+ /* put closure on mutables list */
+ recordMutable((StgMutClosure *)closure);
+
+# if 0 /* rest of this fct */
+ /* ngoq ngo' */
+ /* FETCHME_GA(closure) = ga; */
+ if (IS_MUTABLE(INFO_PTR(bqe))) {
+ PROC old_proc = CurrentProc, /* NB: For AwakenBlockingQueue, */
+ new_proc = where_is(closure); /* CurrentProc must be where */
+ /* closure lives. */
+ CurrentProc = new_proc;
+
+# if defined(GRAN_CHECK)
+ if (RTSflags.GranFlags.debug & 0x100)
+ fprintf(stderr,"===== AwBQ of node 0x%lx (%s) [PE %2u]\n",
+ closure, (isSpec ? "SPEC_RBH" : "GEN_RBH"), new_proc);
+# endif
+
+ rbh_save = AwakenBlockingQueue(bqe); /* AwakenBlockingQueue(bqe); */
+ CurrentProc = old_proc;
+ } else {
+ rbh_save = bqe;
+ }
+
+ /* Put data from special RBH save closures back into the closure */
+ if ( rbh_save == NULL ) {
+ fprintf(stderr,"convertFromRBH: No RBH_Save_? closure found at end of BQ!\n");
+ EXIT(EXIT_FAILURE);
+ } else {
+ closure[isSpec ? SPEC_HS : GEN_HS] = rbh_save[SPEC_HS];
+ closure[(isSpec ? SPEC_HS : GEN_HS) + 1] = rbh_save[SPEC_HS + 1];
+ }
+# endif /* 0 */
+
+# if 0 && (defined(GCap) || defined(GCgn))
+ /* ngoq ngo' */
+ /* If we convert from an RBH in the old generation,
+ we have to make sure it goes on the mutables list */
+
+ if(closure <= StorageMgrInfo.OldLim) {
+ if (IS_MUTABLE(INFO_PTR(closure)) && MUT_LINK(closure) == MUT_NOT_LINKED) {
+ MUT_LINK(closure) = (StgWord) StorageMgrInfo.OldMutables;
+ StorageMgrInfo.OldMutables = closure;
+ }
+ }
+# endif /* 0 */
+}
+#endif /* PAR */
+
+/* Remove closure from the mutables list */
+#if 0
+/* ngoq ngo' */
+void
+UnlinkFromMUT(StgPtr closure)
+{
+ StgPtr curr = StorageMgrInfo.OldMutables, prev = NULL;
+
+ while (curr != NULL && curr != closure) {
+ ASSERT(MUT_LINK(curr)!=MUT_NOT_LINKED);
+ prev=curr;
+ curr=MUT_LINK(curr);
+ }
+ if (curr==closure) {
+ if (prev==NULL)
+ StorageMgrInfo.OldMutables = MUT_LINK(curr);
+ else
+ MUT_LINK(prev) = MUT_LINK(curr);
+ MUT_LINK(curr) = MUT_NOT_LINKED;
+ }
+
+# if 0 && (defined(GCap) || defined(GCgn))
+ {
+ closq newclos;
+ extern closq ex_RBH_q;
+
+ newclos = (closq) stgMallocBytes(sizeof(struct clos), "UnlinkFromMUT");
+ CLOS_CLOSURE(newclos) = closure;
+ CLOS_PREV(newclos) = NULL;
+ CLOS_NEXT(newclos) = ex_RBH_q;
+ if (ex_RBH_q!=NULL)
+ CLOS_PREV(ex_RBH_q) = newclos;
+ ex_RBH_q = newclos;
+ }
+# endif
+}
+#endif /* PAR */
+
+#endif /* PAR || GRAN -- whole file */
+
+//@node Index, , Conversion Functions
+//@section Index
+
+//@index
+//* convertToFetchMe:: @cindex\s-+convertToFetchMe
+//* convertToRBH:: @cindex\s-+convertToRBH
+//@end index
diff --git a/rts/parallel/SysMan.c b/rts/parallel/SysMan.c
new file mode 100644
index 0000000000..40bcf6a19e
--- /dev/null
+++ b/rts/parallel/SysMan.c
@@ -0,0 +1,650 @@
+/* ----------------------------------------------------------------------------
+ Time-stamp: <Wed Mar 21 2001 17:16:28 Stardate: [-30]6363.59 hwloidl>
+
+ GUM System Manager Program
+ Handles startup, shutdown and global synchronisation of the parallel system.
+
+ The Parade/AQUA Projects, Glasgow University, 1994-1995.
+ GdH/APART Projects, Heriot-Watt University, Edinburgh, 1997-2000.
+
+ ------------------------------------------------------------------------- */
+
+//@node GUM System Manager Program, , ,
+//@section GUM System Manager Program
+
+//@menu
+//* General docu::
+//* Includes::
+//* Macros etc::
+//* Variables::
+//* Prototypes::
+//* Aux startup and shutdown fcts::
+//* Main fct::
+//* Message handlers::
+//* Auxiliary fcts::
+//* Index::
+//@end menu
+
+//@node General docu, Includes, GUM System Manager Program, GUM System Manager Program
+//@subsection General docu
+
+/*
+The Sysman task currently controls initiation, termination, of a
+parallel Haskell program running under GUM. In the future it may
+control global GC synchronisation and statistics gathering. Based on
+K. Hammond's SysMan.lc in Graph for PVM. SysMan is unusual in that it
+is not part of the executable produced by ghc: it is a free-standing
+program that spawns PVM tasks (logical PEs) to evaluate the
+program. After initialisation it runs in parallel with the PE tasks,
+awaiting messages.
+
+OK children, buckle down for some serious weirdness, it works like this ...
+
+o The argument vector (argv) for SysMan has one of the following 2 shapes:
+
+-------------------------------------------------------------------------------
+| SysMan path | debug flag | pvm-executable path | Num. PEs | Program Args ...|
+-------------------------------------------------------------------------------
+
+-------------------------------------------------------------------
+| SysMan path | pvm-executable path | Num. PEs | Program Args ... |
+-------------------------------------------------------------------
+
+The "pvm-executable path" is an absolute path of where PVM stashes the
+code for each PE. The arguments passed on to each PE-executable
+spawned by PVM are:
+
+-------------------------------
+| Num. PEs | Program Args ... |
+-------------------------------
+
+The arguments passed to the Main-thread PE-executable are
+
+-------------------------------------------------------------------
+| main flag | pvm-executable path | Num. PEs | Program Args ... |
+-------------------------------------------------------------------
+
+o SysMan's algorithm is as follows.
+
+o use PVM to spawn (nPE-1) PVM tasks
+o fork SysMan to create the main-thread PE. This permits the main-thread to
+ read and write to stdin and stdout.
+o Wait for all the PE-tasks to reply back saying they are ready and if they were the
+ main thread or not.
+o Broadcast an array of the PE task-ids out to all of the PE-tasks.
+o Enter a loop awaiting incoming messages, e.g. failure, Garbage-collection,
+ termination.
+
+The forked Main-thread algorithm, in SysMan, is as follows.
+
+o disconnects from PVM.
+o sets a flag in argv to indicate that it is the main thread.
+o `exec's a copy of the pvm-executable (i.e. the program being run)
+
+
+The pvm-executable run by each PE-task, is initialised as follows.
+
+o Registers with PVM, obtaining a task-id.
+o If it was main it gets SysMan's task-id from argv otherwise it can use pvm_parent.
+o Sends a ready message to SysMan together with a flag indicating if it was main or not.
+o Receives from SysMan the array of task-ids of the other PEs.
+o If the number of task-ids sent was larger than expected then it must have been a task
+ generated after the rest of the program had started, so it sends its own task-id message
+ to all the tasks it was told about.
+o Begins execution.
+
+*/
+
+//@node Includes, Macros etc, General docu, GUM System Manager Program
+//@subsection Includes
+
+/* Evidently not Posix */
+/* #include "PosixSource.h" */
+
+#include "Rts.h"
+#include "ParTypes.h"
+#include "LLC.h"
+#include "Parallel.h"
+#include "ParallelRts.h" // stats only
+
+//@node Macros etc, Variables, Includes, GUM System Manager Program
+//@subsection Macros etc
+
+/* SysMan is put on top of the GHC routine that does the RtsFlags handling.
+ So, we cannot use the standard macros. For the time being we use a macro
+ that is fixed at compile time.
+*/
+
+#ifdef IF_PAR_DEBUG
+#undef IF_PAR_DEBUG
+#endif
+
+/* debugging enabled */
+//#define IF_PAR_DEBUG(c,s) { s; }
+/* debugging disabled */
+#define IF_PAR_DEBUG(c,s) /* nothing */
+
+void *stgMallocBytes (int n, char *msg);
+
+//@node Variables, Prototypes, Macros etc, GUM System Manager Program
+//@subsection Variables
+
+/*
+ The following definitions are included so that SysMan can be linked with the Low
+ Level Communications module (LLComms). They are not used in SysMan.
+*/
+GlobalTaskId mytid;
+
+static unsigned PEsArrived = 0;
+static GlobalTaskId gtids[MAX_PES];
+static GlobalTaskId sysman_id, sender_id;
+static unsigned PEsTerminated = 0;
+static rtsBool Finishing = rtsFalse;
+static long PEbuffer[MAX_PES];
+nat nSpawn = 0; // current no. of spawned tasks (see gtids)
+nat nPEs = 0; // number of PEs specified on startup
+nat nextPE;
+/* PVM-ish variables */
+char *petask, *pvmExecutable;
+char **pargv;
+int cc, spawn_flag = PvmTaskDefault;
+
+#if 0 && defined(PAR_TICKY)
+/* ToDo: use allGlobalParStats to collect stats of all PEs */
+GlobalParStats *allGlobalParStats[MAX_PES];
+#endif
+
+//@node Prototypes, Aux startup and shutdown fcts, Variables, GUM System Manager Program
+//@subsection Prototypes
+
+/* prototypes for message handlers called from the main loop of SysMan */
+void newPE(int nbytes, int opcode, int sender_id);
+void readyPE(int nbytes, int opcode, int sender_id);
+void finishPE(int nbytes, int opcode, int sender_id, int exit_code);
+
+//@node Aux startup and shutdown fcts, Main fct, Prototypes, GUM System Manager Program
+//@subsection Aux startup and shutdown fcts
+
+/*
+ Create the PE Tasks. We spawn (nPEs-1) pvm threads: the Main Thread
+ (which starts execution and performs IO) is created by forking SysMan
+*/
+static int
+createPEs(int total_nPEs) {
+ int i, spawn_nPEs, iSpawn = 0, nArch, nHost;
+ struct pvmhostinfo *hostp;
+ int sysman_host;
+
+ spawn_nPEs = total_nPEs-1;
+ if (spawn_nPEs > 0) {
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr, "==== [%x] Spawning %d PEs(%s) ...\n",
+ sysman_id, spawn_nPEs, petask);
+ fprintf(stderr, " args: ");
+ for (i = 0; pargv[i]; ++i)
+ fprintf(stderr, "%s, ", pargv[i]);
+ fprintf(stderr, "\n"));
+
+ pvm_config(&nHost,&nArch,&hostp);
+ sysman_host=pvm_tidtohost(sysman_id);
+
+ /* create PEs on the specific machines in the specified order! */
+ for (i=0; (iSpawn<spawn_nPEs) && (i<nHost); i++)
+ if (hostp[i].hi_tid != sysman_host) {
+ checkComms(pvm_spawn(petask, pargv, spawn_flag+PvmTaskHost,
+ hostp[i].hi_name, 1, gtids+iSpawn),
+ "SysMan startup");
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr, "==== [%x] Spawned PE %d onto %s\n",
+ sysman_id, i, hostp[i].hi_name));
+ iSpawn++;
+ }
+
+ /* create additional PEs anywhere you like */
+ if (iSpawn<spawn_nPEs) {
+ checkComms(pvm_spawn(petask, pargv, spawn_flag, "",
+ spawn_nPEs-iSpawn, gtids+iSpawn),
+ "SysMan startup");
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] Spawned %d additional PEs anywhere\n",
+ sysman_id, spawn_nPEs-iSpawn));
+ }
+ }
+
+#if 0
+ /* old code with random placement of PEs; make that a variant? */
+# error "Broken startup in SysMan"
+ { /* let pvm place the PEs anywhere; not used anymore */
+ checkComms(pvm_spawn(petask, pargv, spawn_flag, "", spawn_nPEs, gtids),"SysMan startup");
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] Spawned\n", sysman_id));
+
+ }
+#endif
+
+ // iSpawn=spawn_nPEs;
+
+ return iSpawn;
+}
+
+/*
+ Check if this pvm task is in the list of tasks we spawned and are waiting
+ on, if so then remove it.
+*/
+
+static rtsBool
+alreadySpawned (GlobalTaskId g) {
+ unsigned int i;
+
+ for (i=0; i<nSpawn; i++)
+ if (g==gtids[i]) {
+ nSpawn--;
+ gtids[i] = gtids[nSpawn]; //the last takes its place
+ return rtsTrue;
+ }
+ return rtsFalse;
+}
+
+static void
+broadcastFinish(void) {
+ int i,j;
+ int tids[MAX_PES]; /* local buffer of all surviving PEs */
+
+ for (i=0, j=0; i<nPEs; i++)
+ if (PEbuffer[i])
+ tids[j++]=PEbuffer[i]; //extract valid tids
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] Broadcasting Finish to %d PEs; initiating shutdown\n",
+ sysman_id, j));
+
+ /* ToDo: move into LLComms.c */
+ pvm_initsend(PvmDataDefault);
+ pvm_mcast(tids,j,PP_FINISH);
+}
+
+static void
+broadcastPEtids (void) {
+ nat i;
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] SysMan sending PE table to all PEs\n", sysman_id);
+ /* debugging */
+ fprintf(stderr,"++++ [%x] PE table as seen by SysMan:\n", mytid);
+ for (i = 0; i < nPEs; i++) {
+ fprintf(stderr,"++++ PEbuffer[%d] = %x\n", i, PEbuffer[i]);
+ }
+ )
+
+ broadcastOpN(PP_PETIDS, PEGROUP, nPEs, &PEbuffer);
+}
+
+//@node Main fct, Message handlers, Aux startup and shutdown fcts, GUM System Manager Program
+//@subsection Main fct
+
+//@cindex main
+int
+main (int argc, char **argv) {
+ int rbufid;
+ int opcode, nbytes, nSpawn;
+ unsigned int i;
+
+ setbuf(stdout, NULL); // disable buffering of stdout
+ setbuf(stderr, NULL); // disable buffering of stderr
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,
+ "==== RFP: GdH enabled SysMan reporting for duty\n"));
+
+ if (argc > 1) {
+ if (*argv[1] == '-') {
+ spawn_flag = PvmTaskDebug;
+ argv[1] = argv[0];
+ argv++; argc--;
+ }
+ sysman_id = pvm_mytid(); /* This must be the first PVM call */
+
+ if (sysman_id<0) {
+ fprintf(stderr, "==== PVM initialisation failure\n");
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ Get the full path and filename of the pvm executable (stashed in some
+ PVM directory), and the number of PEs from the command line.
+ */
+ pvmExecutable = argv[1];
+ nPEs = atoi(argv[2]);
+
+ if (nPEs==0) {
+ /* as usual 0 means infinity: use all PEs specified in PVM config */
+ int nArch, nHost;
+ struct pvmhostinfo *hostp;
+
+ /* get info on PVM config */
+ pvm_config(&nHost,&nArch,&hostp);
+ nPEs=nHost;
+ sprintf(argv[2],"%d",nPEs); /* ToCheck: does this work on all archs */
+ }
+
+ /* get the name of the binary to execute */
+ if ((petask = getenv(PETASK)) == NULL) // PETASK set by driver
+ petask = PETASK;
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] nPEs: %d; executable: |%s|\n",
+ sysman_id, nPEs, petask));
+
+ /* Check that we can create the number of PE and IMU tasks requested.
+ ^^^
+ This comment is most entertaining since we haven't been using IMUs
+ for the last 10 years or so -- HWL */
+ if ((nPEs > MAX_PES) || (nPEs<1)) {
+ fprintf(stderr,"==** SysMan: No more than %d PEs allowed (%d requested)\n Reconfigure GUM setting MAX_PE in ghc/includes/Parallel.h to a higher value\n",
+ MAX_PES, nPEs);
+ exit(EXIT_FAILURE);
+ }
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] is SysMan Task\n", sysman_id));
+
+ /* Initialise the PE task arguments from Sysman's arguments */
+ pargv = argv + 2;
+
+ /* Initialise list of all PE identifiers */
+ PEsArrived=0;
+ nextPE=1;
+ for (i=0; i<nPEs; i++)
+ PEbuffer[i]=0;
+
+ /* start up the required number of PEs */
+ nSpawn = createPEs(nPEs);
+
+ /*
+ Create the MainThread PE by forking SysMan. This arcane coding
+ is required to allow MainThread to read stdin and write to stdout.
+ PWT 18/1/96
+ */
+ //nPEs++; /* Record that the number of PEs is increasing */
+ if ((cc = fork())) {
+ checkComms(cc,"SysMan fork"); /* Parent continues as SysMan */
+
+ PEbuffer[0]=0; /* we accept the first main and assume its valid. */
+ PEsArrived=1; /* assume you've got main */
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] Sysman successfully initialized!\n",
+ sysman_id));
+
+//@cindex message handling loop
+ /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
+ /* Main message handling loop */
+ /* ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
+ /* Process incoming messages */
+ while (1) {
+ if ((rbufid = pvm_recv(ANY_TASK, ANY_OPCODE)) < 0) {
+ pvm_perror("==** Sysman: Receiving Message (pvm_recv)");
+ /* never reached */
+ }
+
+ pvm_bufinfo(rbufid, &nbytes, &opcode, &sender_id);
+
+ /* very low level debugging
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"== [%x] SysMan: Message received by SysMan: rbufid=%x, nbytes = %d, opcode = %x, sender_id = %x\n",
+ sysman_id, rbufid, nbytes, opcode, sender_id));
+ */
+
+ switch (opcode) {
+
+ case PP_NEWPE: /* a new PE is registering for work */
+ newPE(nbytes, opcode, sender_id);
+ break;
+
+ case PP_READY: /* startup complete; let PEs start working */
+ readyPE(nbytes, opcode, sender_id);
+ break;
+
+
+ case PP_GC_INIT: /* start global GC */
+ /* This Function not yet implemented for GUM */
+ fprintf(stderr,"==** Global GC requested by PE %x. Not yet implemented for GUM!\n",
+ sender_id);
+ break;
+
+ case PP_STATS_ON: /* enable statistics gathering */
+ fprintf(stderr,"==** PP_STATS_ON requested by %x. Not yet implemented for GUM!\n",
+ sender_id);
+ break;
+
+ case PP_STATS_OFF: /* disable statistics gathering */
+ fprintf(stderr,"==** PP_STATS_OFF requested by %x. Not yet implemented for GUM!\n",
+ sender_id);
+ break;
+
+ case PP_FINISH:
+ {
+ int exit_code = getExitCode(nbytes, &sender_id);
+ finishPE(nbytes, opcode, sender_id, exit_code);
+ break;
+
+ default:
+ {
+ /*
+ char *opname = GetOpName(opcode);
+ fprintf(stderr,"Sysman: Unrecognised opcode %s (%x)\n",
+ opname,opcode); */
+ fprintf(stderr,"==** Qagh: Sysman: Unrecognised opcode (%x)\n",
+ opcode);
+ }
+ break;
+ } /* switch */
+ } /* else */
+ } /* while 1 */
+ /* end of SysMan!! */
+ } else {
+ /* forked main thread begins here */
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr, "==== Main Thread PE has been forked; doing an execv(%s,...)\n",
+ pvmExecutable));
+ pvmendtask(); // Disconnect from PVM to avoid confusion:
+ // executable reconnects
+
+ // RFP: assumes that length(argv[0])>=9 !!!
+ sprintf(argv[0],"-%08X",sysman_id); /*flag that its the Main Thread PE and include sysman's id*/
+ execv(pvmExecutable,argv); /* Parent task becomes Main Thread PE */
+ } /* else */
+ } /* argc > 1 */
+} /* main */
+
+//@node Message handlers, Auxiliary fcts, Main fct, GUM System Manager Program
+//@subsection Message handlers
+
+/*
+ Received PP_NEWPE:
+ A new PE has been added to the configuration.
+*/
+void
+newPE(int nbytes, int opcode, int sender_id) {
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] SysMan detected a new host\n",
+ sysman_id));
+
+ /* Determine the new machine... assume it's the last on the config list? */
+ if (nSpawn < MAX_PES) {
+ int nArch,nHost;
+ struct pvmhostinfo *hostp;
+
+ /* get configuration of PVM machine */
+ pvm_config(&nHost,&nArch,&hostp);
+ nHost--;
+ checkComms(pvm_spawn(petask, pargv, spawn_flag+PvmTaskHost,
+ hostp[nHost].hi_name, 1, gtids+nSpawn),
+ "SysMan loop");
+ nSpawn++;
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr, "==== [%x] Spawned onto %s\n",
+ sysman_id, hostp[nHost].hi_name));
+ }
+}
+
+/*
+ Received PP_READY:
+ Let it be known that PE @sender_id@ participates in the computation.
+*/
+void
+readyPE(int nbytes, int opcode, int sender_id) {
+ int i = 0, flag = 1;
+ long isMain;
+ int nArch, nHost;
+ struct pvmhostinfo *hostp;
+
+ //ASSERT(opcode==PP_READY);
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] SysMan received PP_READY message from %x\n",
+ sysman_id, sender_id));
+
+ pvm_config(&nHost,&nArch,&hostp);
+
+ GetArg1(isMain);
+
+ //if ((isMain && (PEbuffer[0]==0)) || alreadySpawned(sender_id)) {
+ if (nPEs >= MAX_PES) {
+ fprintf(stderr,"==== [%x] SysMan doesn't need PE %d (max %d PEs allowed)\n",
+ sysman_id, sender_id, MAX_PES);
+ pvm_kill(sender_id);
+ } else {
+ if (isMain) {
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] SysMan found Main PE %x\n",
+ sysman_id, sender_id));
+ PEbuffer[0]=sender_id;
+ } else {
+ /* search for PE in list of PEs */
+ for(i=1; i<nPEs; i++)
+ if (PEbuffer[i]==sender_id) {
+ flag=0;
+ break;
+ }
+ /* it's a new PE: add it to the list of PEs */
+ if (flag)
+ PEbuffer[nextPE++] = sender_id;
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] SysMan: found PE %d as [%x] on host %s\n",
+ sysman_id, PEsArrived, sender_id, hostp[PEsArrived].hi_name));
+
+ PEbuffer[PEsArrived++] = sender_id;
+ }
+
+
+ /* enable better handling of unexpected terminations */
+ checkComms( pvm_notify(PvmTaskExit, PP_FINISH, 1, &sender_id),
+ "SysMan loop");
+
+ /* finished registration of all PEs => enable notification */
+ if ((PEsArrived==nPEs) && PEbuffer[0]) {
+ checkComms( pvm_notify(PvmHostAdd, PP_NEWPE, -1, 0),
+ "SysMan startup");
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] SysMan initialising notificaton for new hosts\n", sysman_id));
+ }
+
+ /* finished notification => send off the PE ids */
+ if ((PEsArrived>=nPEs) && PEbuffer[0]) {
+ if (PEsArrived>nPEs) {
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] Weird: %d PEs registered, but we only asked for %d\n", sysman_id, PEsArrived, nPEs));
+ // nPEs=PEsArrived;
+ }
+ broadcastPEtids();
+ }
+ }
+}
+
+/*
+ Received PP_FINISH:
+ Shut down the corresponding PE. Check whether it is a regular shutdown
+ or an uncontrolled termination.
+*/
+void
+finishPE(int nbytes, int opcode, int sender_id, int exitCode) {
+ int i;
+
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] SysMan received PP_FINISH message from %x (exit code: %d)\n",
+ sysman_id, sender_id, exitCode));
+
+ /* Is it relevant to us? Count the first message */
+ for (i=0; i<nPEs; i++)
+ if (PEbuffer[i] == sender_id) {
+ PEsTerminated++;
+ PEbuffer[i]=0;
+
+ /* handle exit code */
+ if (exitCode<0) { /* a task exit before a controlled finish? */
+ fprintf(stderr,"==== [%x] Termination at %x with exit(%d)\n",
+ sysman_id, sender_id, exitCode);
+ } else if (exitCode>0) { /* an abnormal exit code? */
+ fprintf(stderr,"==== [%x] Uncontrolled termination at %x with exit(%d)\n",
+ sysman_id, sender_id, exitCode);
+ } else if (!Finishing) { /* exitCode==0 which is good news */
+ if (i!=0) { /* someone other than main PE terminated first? */
+ fprintf(stderr,"==== [%x] Unexpected early termination at %x\n",
+ sysman_id, sender_id);
+ } else {
+ /* start shutdown by broadcasting FINISH to other PEs */
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] Initiating shutdown (requested by [%x] RIP) (exit code: %d)\n", sysman_id, sender_id, exitCode));
+ Finishing = rtsTrue;
+ broadcastFinish();
+ }
+ } else {
+ /* we are in a shutdown already */
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] Finish from %x during shutdown (%d PEs terminated so far; %d total)\n",
+ sysman_id, sender_id, PEsTerminated, nPEs));
+ }
+
+ if (PEsTerminated >= nPEs) {
+ IF_PAR_DEBUG(verbose,
+ fprintf(stderr,"==== [%x] Global Shutdown, Goodbye!! (SysMan has received FINISHes from all PEs)\n", sysman_id));
+ //broadcastFinish();
+ /* received finish from everybody; now, we can exit, too */
+ exit(EXIT_SUCCESS); /* Qapla'! */
+ }
+ }
+}
+
+//@node Auxiliary fcts, Index, Message handlers, GUM System Manager Program
+//@subsection Auxiliary fcts
+
+/* Needed here because its used in loads of places like LLComms etc */
+
+//@cindex stg_exit
+
+/*
+ * called from STG-land to exit the program
+ */
+
+void
+stg_exit(I_ n)
+{
+ fprintf(stderr, "==// [%x] %s in SysMan code; sending PP_FINISH to all PEs ...\n",
+ mytid,(n!=0)?"FAILURE":"FINISH");
+ broadcastFinish();
+ //broadcastFinish();
+ pvm_exit();
+ exit(n);
+}
+
+//@node Index, , Auxiliary fcts, GUM System Manager Program
+//@subsection Index
+
+//@index
+//* main:: @cindex\s-+main
+//* message handling loop:: @cindex\s-+message handling loop
+//* stgMallocBytes:: @cindex\s-+stgMallocBytes
+//* stg_exit:: @cindex\s-+stg_exit
+//@end index
diff --git a/rts/posix/GetTime.c b/rts/posix/GetTime.c
new file mode 100644
index 0000000000..3a0764cb91
--- /dev/null
+++ b/rts/posix/GetTime.c
@@ -0,0 +1,141 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2005
+ *
+ * Machine-dependent time measurement functions
+ *
+ * ---------------------------------------------------------------------------*/
+
+// Not POSIX, due to use of ru_majflt in getPageFaults()
+// #include "PosixSource.h"
+
+#include "Rts.h"
+#include "GetTime.h"
+
+#ifdef HAVE_TIME_H
+# include <time.h>
+#endif
+
+#ifdef HAVE_SYS_TIME_H
+# include <sys/time.h>
+#endif
+
+#if HAVE_SYS_RESOURCE_H
+# include <sys/resource.h>
+#endif
+
+#ifdef HAVE_UNISTD_H
+# include <unistd.h>
+#endif
+
+#ifdef HAVE_SYS_TIMES_H
+# include <sys/times.h>
+#endif
+
+#if ! ((defined(HAVE_GETRUSAGE) && !irix_HOST_OS) || defined(HAVE_TIMES))
+#error No implementation for getProcessCPUTime() available.
+#endif
+
+#if defined(HAVE_GETTIMEOFDAY) && defined(HAVE_GETRUSAGE) && !irix_HOST_OS
+// we'll implement getProcessCPUTime() and getProcessElapsedTime()
+// separately, using getrusage() and gettimeofday() respectively
+
+Ticks getProcessCPUTime(void)
+{
+ struct rusage t;
+ getrusage(RUSAGE_SELF, &t);
+ return (t.ru_utime.tv_sec * TICKS_PER_SECOND +
+ ((Ticks)t.ru_utime.tv_usec * TICKS_PER_SECOND)/1000000);
+}
+
+Ticks getProcessElapsedTime(void)
+{
+ struct timeval tv;
+ gettimeofday(&tv, (struct timezone *) NULL);
+ return (tv.tv_sec * TICKS_PER_SECOND +
+ ((Ticks)tv.tv_usec * TICKS_PER_SECOND)/1000000);
+}
+
+void getProcessTimes(Ticks *user, Ticks *elapsed)
+{
+ *user = getProcessCPUTime();
+ *elapsed = getProcessElapsedTime();
+}
+
+#elif defined(HAVE_TIMES)
+
+// we'll use the old times() API.
+
+Ticks getProcessCPUTime(void)
+{
+ Ticks user, elapsed;
+ getProcessTimes(&user,&elapsed);
+ return user;
+}
+
+Ticks getProcessElapsedTime(void)
+{
+ Ticks user, elapsed;
+ getProcessTimes(&user,&elapsed);
+ return elapsed;
+}
+
+void getProcessTimes(Ticks *user, Ticks *elapsed)
+{
+ static nat ClockFreq = 0;
+
+ if (ClockFreq == 0) {
+#if defined(HAVE_SYSCONF)
+ long ticks;
+ ticks = sysconf(_SC_CLK_TCK);
+ if ( ticks == -1 ) {
+ errorBelch("sysconf\n");
+ stg_exit(EXIT_FAILURE);
+ }
+ ClockFreq = ticks;
+#elif defined(CLK_TCK) /* defined by POSIX */
+ ClockFreq = CLK_TCK;
+#elif defined(HZ)
+ ClockFreq = HZ;
+#elif defined(CLOCKS_PER_SEC)
+ ClockFreq = CLOCKS_PER_SEC;
+#else
+ errorBelch("can't get clock resolution");
+ stg_exit(EXIT_FAILURE);
+#endif
+ }
+
+ struct tms t;
+ clock_t r = times(&t);
+ *user = (((Ticks)t.tms_utime * TICKS_PER_SECOND) / ClockFreq);
+ *elapsed = (((Ticks)r * TICKS_PER_SECOND) / ClockFreq);
+}
+
+#endif // HAVE_TIMES
+
+Ticks getThreadCPUTime(void)
+{
+#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_THREAD_CPUTIME_ID)
+ // clock_gettime() gives us per-thread CPU time. It isn't
+ // reliable on Linux, but it's the best we have.
+ struct timespec ts;
+ clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
+ return (ts.tv_sec * TICKS_PER_SECOND +
+ ((Ticks)ts.tv_nsec * TICKS_PER_SECOND) / 1000000000);
+#else
+ return getProcessCPUTime();
+#endif
+}
+
+nat
+getPageFaults(void)
+{
+#if !defined(HAVE_GETRUSAGE) || irix_HOST_OS
+ return 0;
+#else
+ struct rusage t;
+ getrusage(RUSAGE_SELF, &t);
+ return(t.ru_majflt);
+#endif
+}
+
diff --git a/rts/posix/Itimer.c b/rts/posix/Itimer.c
new file mode 100644
index 0000000000..83ed84d6ef
--- /dev/null
+++ b/rts/posix/Itimer.c
@@ -0,0 +1,226 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1995-1999
+ *
+ * Interval timer for profiling and pre-emptive scheduling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+/*
+ * The interval timer is used for profiling and for context switching in the
+ * threaded build. Though POSIX 1003.1b includes a standard interface for
+ * such things, no one really seems to be implementing them yet. Even
+ * Solaris 2.3 only seems to provide support for @CLOCK_REAL@, whereas we're
+ * keen on getting access to @CLOCK_VIRTUAL@.
+ *
+ * Hence, we use the old-fashioned @setitimer@ that just about everyone seems
+ * to support. So much for standards.
+ */
+#include "Rts.h"
+#include "RtsFlags.h"
+#include "Timer.h"
+#include "Ticker.h"
+#include "posix/Itimer.h"
+#include "Proftimer.h"
+#include "Schedule.h"
+#include "posix/Select.h"
+
+/* As recommended in the autoconf manual */
+# ifdef TIME_WITH_SYS_TIME
+# include <sys/time.h>
+# include <time.h>
+# else
+# ifdef HAVE_SYS_TIME_H
+# include <sys/time.h>
+# else
+# include <time.h>
+# endif
+# endif
+
+#ifdef HAVE_SIGNAL_H
+# include <signal.h>
+#endif
+
+/* Major bogosity:
+ *
+ * In the threaded RTS, we can't set the virtual timer because the
+ * thread which has the virtual timer might be sitting waiting for a
+ * capability, and the virtual timer only ticks in CPU time.
+ *
+ * So, possible solutions:
+ *
+ * (1) tick in realtime. Not very good, because this ticker is used for
+ * profiling, and this will give us unreliable time profiling
+ * results. Furthermore, this requires picking a single OS thread
+ * to be the timekeeper, which is a bad idea because the thread in
+ * question might just be making a temporary call into Haskell land.
+ *
+ * (2) save/restore the virtual timer around excursions into STG land.
+ * Sounds great, but I tried it and the resolution of the virtual timer
+ * isn't good enough (on Linux) - most of our excursions fall
+ * within the timer's resolution and we never make any progress.
+ *
+ * (3) have a virtual timer in every OS thread. Might be reasonable,
+ * because most of the time there is only ever one of these
+ * threads running, so it approximates a single virtual timer.
+ * But still quite bogus (and I got crashes when I tried this).
+ *
+ * For now, we're using (1), but this needs a better solution. --SDM
+ */
+#ifdef THREADED_RTS
+#define ITIMER_FLAVOUR ITIMER_REAL
+#define ITIMER_SIGNAL SIGALRM
+#else
+#define ITIMER_FLAVOUR ITIMER_VIRTUAL
+#define ITIMER_SIGNAL SIGVTALRM
+#endif
+
+static
+int
+install_vtalrm_handler(TickProc handle_tick)
+{
+ struct sigaction action;
+
+ action.sa_handler = handle_tick;
+
+ sigemptyset(&action.sa_mask);
+
+#ifdef SA_RESTART
+ // specify SA_RESTART. One consequence if we don't do this is
+ // that readline gets confused by the -threaded RTS. It seems
+ // that if a SIGALRM handler is installed without SA_RESTART,
+ // readline installs its own SIGALRM signal handler (see
+ // readline's signals.c), and this somehow causes readline to go
+ // wrong when the input exceeds a single line (try it).
+ action.sa_flags = SA_RESTART;
+#else
+ action.sa_flags = 0;
+#endif
+
+ return sigaction(ITIMER_SIGNAL, &action, NULL);
+}
+
+int
+startTicker(nat ms, TickProc handle_tick)
+{
+# ifndef HAVE_SETITIMER
+ /* debugBelch("No virtual timer on this system\n"); */
+ return -1;
+# else
+ struct itimerval it;
+
+ install_vtalrm_handler(handle_tick);
+
+#if !defined(THREADED_RTS)
+ timestamp = getourtimeofday();
+#endif
+
+ it.it_value.tv_sec = ms / 1000;
+ it.it_value.tv_usec = 1000 * (ms - (1000 * it.it_value.tv_sec));
+ it.it_interval = it.it_value;
+ return (setitimer(ITIMER_FLAVOUR, &it, NULL));
+# endif
+}
+
+int
+stopTicker()
+{
+# ifndef HAVE_SETITIMER
+ /* debugBelch("No virtual timer on this system\n"); */
+ return -1;
+# else
+ struct itimerval it;
+
+ it.it_value.tv_sec = 0;
+ it.it_value.tv_usec = 0;
+ it.it_interval = it.it_value;
+ return (setitimer(ITIMER_FLAVOUR, &it, NULL));
+# endif
+}
+
+# if 0
+/* This is a potential POSIX version */
+int
+startTicker(nat ms)
+{
+ struct sigevent se;
+ struct itimerspec it;
+ timer_t tid;
+
+#if !defined(THREADED_RTS)
+ timestamp = getourtimeofday();
+#endif
+
+ se.sigev_notify = SIGEV_SIGNAL;
+ se.sigev_signo = ITIMER_SIGNAL;
+ se.sigev_value.sival_int = ITIMER_SIGNAL;
+ if (timer_create(CLOCK_VIRTUAL, &se, &tid)) {
+ barf("can't create virtual timer");
+ }
+ it.it_value.tv_sec = ms / 1000;
+ it.it_value.tv_nsec = 1000000 * (ms - 1000 * it.it_value.tv_sec);
+ it.it_interval = it.it_value;
+ return timer_settime(tid, TIMER_RELTIME, &it, NULL);
+}
+
+int
+stopTicker()
+{
+ struct sigevent se;
+ struct itimerspec it;
+ timer_t tid;
+
+#if !defined(THREADED_RTS)
+ timestamp = getourtimeofday();
+#endif
+
+ se.sigev_notify = SIGEV_SIGNAL;
+ se.sigev_signo = ITIMER_SIGNAL;
+ se.sigev_value.sival_int = ITIMER_SIGNAL;
+ if (timer_create(CLOCK_VIRTUAL, &se, &tid)) {
+ barf("can't create virtual timer");
+ }
+ it.it_value.tv_sec = 0;
+ it.it_value.tv_nsec = 0;
+ it.it_interval = it.it_value;
+ return timer_settime(tid, TIMER_RELTIME, &it, NULL);
+}
+# endif
+
+#if 0
+/* Currently unused */
+void
+block_vtalrm_signal(void)
+{
+ sigset_t signals;
+
+ sigemptyset(&signals);
+ sigaddset(&signals, ITIMER_SIGNAL);
+
+ (void) sigprocmask(SIG_BLOCK, &signals, NULL);
+}
+
+void
+unblock_vtalrm_signal(void)
+{
+ sigset_t signals;
+
+ sigemptyset(&signals);
+ sigaddset(&signals, ITIMER_SIGNAL);
+
+ (void) sigprocmask(SIG_UNBLOCK, &signals, NULL);
+}
+#endif
+
+/* gettimeofday() takes around 1us on our 500MHz PIII. Since we're
+ * only calling it 50 times/s, it shouldn't have any great impact.
+ */
+lnat
+getourtimeofday(void)
+{
+ struct timeval tv;
+ gettimeofday(&tv, (struct timezone *) NULL);
+ // cast to lnat because nat may be 64 bit when int is only 32 bit
+ return ((lnat)tv.tv_sec * TICK_FREQUENCY +
+ (lnat)tv.tv_usec * TICK_FREQUENCY / 1000000);
+}
diff --git a/rts/posix/Itimer.h b/rts/posix/Itimer.h
new file mode 100644
index 0000000000..09d01bde54
--- /dev/null
+++ b/rts/posix/Itimer.h
@@ -0,0 +1,19 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-2005
+ *
+ * Interval timer for profiling and pre-emptive scheduling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef ITIMER_H
+#define ITIMER_H
+
+/* Current wall-clock time in RTS ticks (TICK_FREQUENCY per second). */
+extern lnat getourtimeofday ( void );
+#if 0
+/* unused: per-thread masking of the interval-timer signal */
+extern void block_vtalrm_signal ( void );
+extern void unblock_vtalrm_signal ( void );
+#endif
+
+#endif /* ITIMER_H */
diff --git a/rts/posix/OSThreads.c b/rts/posix/OSThreads.c
new file mode 100644
index 0000000000..07bd762130
--- /dev/null
+++ b/rts/posix/OSThreads.c
@@ -0,0 +1,166 @@
+/* ---------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001-2005
+ *
+ * Accessing OS threads functionality in a (mostly) OS-independent
+ * manner.
+ *
+ * --------------------------------------------------------------------------*/
+
+#if defined(DEBUG) && defined(__linux__)
+/* We want GNU extensions in DEBUG mode for mutex error checking */
+#define _GNU_SOURCE
+#endif
+
+#include "Rts.h"
+#if defined(THREADED_RTS)
+#include "OSThreads.h"
+#include "RtsUtils.h"
+
+#if HAVE_STRING_H
+#include <string.h>
+#endif
+
+#if !defined(HAVE_PTHREAD_H)
+#error pthreads.h is required for the threaded RTS on Posix platforms
+#endif
+
+/*
+ * This (allegedly) OS threads independent layer was initially
+ * abstracted away from code that used Pthreads, so the functions
+ * provided here are mostly just wrappers to the Pthreads API.
+ *
+ */
+
+/* Initialise a condition variable with default attributes.
+ * Return value of pthread_cond_init is ignored here.
+ */
+void
+initCondition( Condition* pCond )
+{
+ pthread_cond_init(pCond, NULL);
+ return;
+}
+
+/* Destroy a condition variable; the caller must ensure no thread is
+ * still waiting on it (POSIX requirement).
+ */
+void
+closeCondition( Condition* pCond )
+{
+ pthread_cond_destroy(pCond);
+ return;
+}
+
+/* Wake all waiters. Returns rtsTrue on success (pthread call == 0). */
+rtsBool
+broadcastCondition ( Condition* pCond )
+{
+ return (pthread_cond_broadcast(pCond) == 0);
+}
+
+/* Wake at least one waiter. Returns rtsTrue on success. */
+rtsBool
+signalCondition ( Condition* pCond )
+{
+ return (pthread_cond_signal(pCond) == 0);
+}
+
+/* Atomically release *pMut and wait on *pCond; pMut is re-acquired
+ * before returning. Returns rtsTrue on success.
+ */
+rtsBool
+waitCondition ( Condition* pCond, Mutex* pMut )
+{
+ return (pthread_cond_wait(pCond,pMut) == 0);
+}
+
+/* Voluntarily give up the CPU so other runnable OS threads can run. */
+void
+yieldThread()
+{
+ sched_yield();
+ return;
+}
+
+/* Terminate the calling OS thread (does not return). */
+void
+shutdownThread()
+{
+ pthread_exit(NULL);
+}
+
+/* Spawn a new OS thread running startProc(param), storing its id in
+ * *pId. The thread is immediately detached, so it is never joined and
+ * its resources are reclaimed automatically on exit.
+ * Returns 0 on success, otherwise the pthread_create error code.
+ */
+int
+createOSThread (OSThreadId* pId, OSThreadProc *startProc, void *param)
+{
+ int result = pthread_create(pId, NULL, (void *(*)(void *))startProc, param);
+ if(!result)
+ pthread_detach(*pId);
+ return result;
+}
+
+/* Identity of the calling OS thread. */
+OSThreadId
+osThreadId()
+{
+ return pthread_self();
+}
+
+/* Initialise a mutex. In DEBUG builds on Linux an error-checking
+ * mutex is used so that recursive locking / unlocking by a non-owner
+ * is reported instead of deadlocking.
+ * NOTE(review): the _GNU_SOURCE define at the top of this file is
+ * guarded by defined(__linux__) while this uses linux_HOST_OS —
+ * confirm the two conditions always agree.
+ */
+void
+initMutex(Mutex* pMut)
+{
+#if defined(DEBUG) && defined(linux_HOST_OS)
+ pthread_mutexattr_t attr;
+ pthread_mutexattr_init(&attr);
+ pthread_mutexattr_settype(&attr,PTHREAD_MUTEX_ERRORCHECK_NP);
+ pthread_mutex_init(pMut,&attr);
+#else
+ pthread_mutex_init(pMut,NULL);
+#endif
+ return;
+}
+
+/* Allocate a new thread-local storage key (no destructor).
+ * Aborts the RTS with the textual error if allocation fails.
+ */
+void
+newThreadLocalKey (ThreadLocalKey *key)
+{
+ int r;
+ if ((r = pthread_key_create(key, NULL)) != 0) {
+ barf("newThreadLocalKey: %s", strerror(r));
+ }
+}
+
+/* Read the calling thread's value for *key; NULL if never set. */
+void *
+getThreadLocalVar (ThreadLocalKey *key)
+{
+ return pthread_getspecific(*key);
+ // Note: a return value of NULL can indicate that either the key
+ // is not valid, or the key is valid and the data value has not
+ // yet been set. We need to use the latter case, so we cannot
+ // detect errors here.
+}
+
+/* Set the calling thread's value for *key; aborts on failure. */
+void
+setThreadLocalVar (ThreadLocalKey *key, void *value)
+{
+ int r;
+ if ((r = pthread_setspecific(*key,value)) != 0) {
+ barf("setThreadLocalVar: %s", strerror(r));
+ }
+}
+
+/* Thread entry point for forkOS: acquire a Capability, run the Haskell
+ * IO action referenced by the stable pointer 'entry', then release the
+ * Capability. The action's result is discarded.
+ */
+static void *
+forkOS_createThreadWrapper ( void * entry )
+{
+ Capability *cap;
+ cap = rts_lock();
+ cap = rts_evalStableIO(cap, (HsStablePtr) entry, NULL);
+ rts_unlock(cap);
+ return NULL;
+}
+
+/* Implements Control.Concurrent.forkOS: start a fresh (detached) OS
+ * thread that evaluates the given Haskell IO action.
+ * Returns 0 on success, otherwise the pthread_create error code.
+ */
+int
+forkOS_createThread ( HsStablePtr entry )
+{
+ pthread_t tid;
+ int result = pthread_create(&tid, NULL,
+ forkOS_createThreadWrapper, (void*)entry);
+ if(!result)
+ pthread_detach(tid);
+ return result;
+}
+
+#else /* !defined(THREADED_RTS) */
+
+/* Non-threaded RTS: forkOS is unsupported; always report failure (-1). */
+int
+forkOS_createThread ( HsStablePtr entry STG_UNUSED )
+{
+ return -1;
+}
+
+#endif /* !defined(THREADED_RTS) */
diff --git a/rts/posix/Select.c b/rts/posix/Select.c
new file mode 100644
index 0000000000..e21ced03ab
--- /dev/null
+++ b/rts/posix/Select.c
@@ -0,0 +1,279 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1995-2002
+ *
+ * Support for concurrent non-blocking I/O and thread waiting.
+ *
+ * ---------------------------------------------------------------------------*/
+
+/* we're outside the realms of POSIX here... */
+/* #include "PosixSource.h" */
+
+#include "Rts.h"
+#include "Schedule.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "Timer.h"
+#include "Itimer.h"
+#include "Signals.h"
+#include "Capability.h"
+#include "posix/Select.h"
+
+# ifdef HAVE_SYS_TYPES_H
+# include <sys/types.h>
+# endif
+
+# ifdef HAVE_SYS_TIME_H
+# include <sys/time.h>
+# endif
+
+#include <errno.h>
+#include <string.h>
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#if !defined(THREADED_RTS)
+/* last timestamp */
+lnat timestamp = 0;
+
+/*
+ * The threaded RTS uses an IO-manager thread in Haskell instead (see GHC.Conc)
+ */
+
+/* There's a clever trick here to avoid problems when the time wraps
+ * around. Since our maximum delay is smaller than 31 bits of ticks
+ * (it's actually 31 bits of microseconds), we can safely check
+ * whether a timer has expired even if our timer will wrap around
+ * before the target is reached, using the following formula:
+ *
+ * (int)((uint)current_time - (uint)target_time) < 0
+ *
+ * if this is true, then our time has expired.
+ * (idea due to Andy Gill).
+ */
+/* Pop every thread from the (time-ordered) sleeping_queue whose wakeup
+ * target has passed, and make it runnable again on MainCapability.
+ * 'ticks' is the current time from getourtimeofday(); the signed
+ * subtraction implements the wrap-around-safe comparison described in
+ * the comment above.
+ * Returns rtsTrue iff at least one thread was woken.
+ */
+static rtsBool
+wakeUpSleepingThreads(lnat ticks)
+{
+ StgTSO *tso;
+ rtsBool flag = rtsFalse;
+
+ while (sleeping_queue != END_TSO_QUEUE &&
+ (int)(ticks - sleeping_queue->block_info.target) > 0) {
+ tso = sleeping_queue;
+ sleeping_queue = tso->link;
+ tso->why_blocked = NotBlocked;
+ tso->link = END_TSO_QUEUE;
+ IF_DEBUG(scheduler,debugBelch("Waking up sleeping thread %d\n", tso->id));
+ // MainCapability: this code is !THREADED_RTS
+ pushOnRunQueue(&MainCapability,tso);
+ flag = rtsTrue;
+ }
+ return flag;
+}
+
+/* Argument 'wait' says whether to wait for I/O to become available,
+ * or whether to just check and return immediately. If there are
+ * other threads ready to run, we normally do the non-waiting variety,
+ * otherwise we wait (see Schedule.c).
+ *
+ * SMP note: must be called with sched_mutex locked.
+ *
+ * Windows: select only works on sockets, so this doesn't really work,
+ * though it makes things better than before. MsgWaitForMultipleObjects
+ * should really be used, though it only seems to work for read handles,
+ * not write handles.
+ *
+ */
+/* Single-threaded RTS event wait: block (or poll, if !wait) in select()
+ * on the file descriptors of all threads on blocked_queue, with a
+ * timeout derived from the earliest sleeper on sleeping_queue; wake
+ * whatever became ready. Returns as soon as some thread is runnable,
+ * a signal handler needs starting, or the scheduler is interrupting.
+ */
+void
+awaitEvent(rtsBool wait)
+{
+ StgTSO *tso, *prev, *next;
+ rtsBool ready;
+ fd_set rfd,wfd;
+ int numFound;
+ int maxfd = -1;
+ // NOTE(review): select_succeeded is initialised rtsTrue and never
+ // cleared anywhere in this function — confirm whether a
+ // "select_succeeded = rtsFalse" assignment was lost; as written the
+ // unblocking pass below always runs.
+ rtsBool select_succeeded = rtsTrue;
+ rtsBool unblock_all = rtsFalse;
+ struct timeval tv;
+ lnat min, ticks;
+
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+
+ IF_DEBUG(scheduler,
+ debugBelch("scheduler: checking for threads blocked on I/O");
+ if (wait) {
+ debugBelch(" (waiting)");
+ }
+ debugBelch("\n");
+ );
+
+ /* loop until we've woken up some threads. This loop is needed
+ * because the select timing isn't accurate, we sometimes sleep
+ * for a while but not long enough to wake up a thread in
+ * a threadDelay.
+ */
+ do {
+
+ ticks = timestamp = getourtimeofday();
+ if (wakeUpSleepingThreads(ticks)) {
+ return;
+ }
+
+ // Compute the select() timeout in microseconds: zero for a poll,
+ // else time until the earliest sleeper, else "forever".
+ if (!wait) {
+ min = 0;
+ } else if (sleeping_queue != END_TSO_QUEUE) {
+ min = (sleeping_queue->block_info.target - ticks)
+ * TICK_MILLISECS * 1000;
+ } else {
+ // NOTE(review): 0x7ffffff is 2^27-1, but the header comment
+ // above talks about "31 bits of microseconds" — possibly a
+ // typo for 0x7fffffff; confirm the intended maximum delay.
+ min = 0x7ffffff;
+ }
+
+ /*
+ * Collect all of the fd's that we're interested in
+ */
+ FD_ZERO(&rfd);
+ FD_ZERO(&wfd);
+
+ for(tso = blocked_queue_hd; tso != END_TSO_QUEUE; tso = next) {
+ next = tso->link;
+
+ switch (tso->why_blocked) {
+ case BlockedOnRead:
+ {
+ int fd = tso->block_info.fd;
+ if (fd >= FD_SETSIZE) {
+ barf("awaitEvent: descriptor out of range");
+ }
+ maxfd = (fd > maxfd) ? fd : maxfd;
+ FD_SET(fd, &rfd);
+ continue;
+ }
+
+ case BlockedOnWrite:
+ {
+ int fd = tso->block_info.fd;
+ if (fd >= FD_SETSIZE) {
+ barf("awaitEvent: descriptor out of range");
+ }
+ maxfd = (fd > maxfd) ? fd : maxfd;
+ FD_SET(fd, &wfd);
+ continue;
+ }
+
+ default:
+ barf("AwaitEvent");
+ }
+ }
+
+ /* Check for any interesting events */
+
+ tv.tv_sec = min / 1000000;
+ tv.tv_usec = min % 1000000;
+
+ // Retry select() while it fails with EINTR; other errors are
+ // handled below.
+ while ((numFound = select(maxfd+1, &rfd, &wfd, NULL, &tv)) < 0) {
+ if (errno != EINTR) {
+ /* Handle bad file descriptors by unblocking all the
+ waiting threads. Why? Because a thread might have been
+ a bit naughty and closed a file descriptor while another
+ was blocked waiting. This is less-than-good programming
+ practice, but having the RTS as a result fall over isn't
+ acceptable, so we simply unblock all the waiting threads
+ should we see a bad file descriptor & give the threads
+ a chance to clean up their act.
+
+ Note: assume here that threads becoming unblocked
+ will try to read/write the file descriptor before trying
+ to issue a threadWaitRead/threadWaitWrite again (==> an
+ IOError will result for the thread that's got the bad
+ file descriptor.) Hence, there's no danger of a bad
+ file descriptor being repeatedly select()'ed on, so
+ the RTS won't loop.
+ */
+ if ( errno == EBADF ) {
+ unblock_all = rtsTrue;
+ break;
+ } else {
+ perror("select");
+ barf("select failed");
+ }
+ }
+
+ /* We got a signal; could be one of ours. If so, we need
+ * to start up the signal handler straight away, otherwise
+ * we could block for a long time before the signal is
+ * serviced.
+ */
+#if defined(RTS_USER_SIGNALS)
+ if (signals_pending()) {
+ startSignalHandlers(&MainCapability);
+ return; /* still hold the lock */
+ }
+#endif
+
+ /* we were interrupted, return to the scheduler immediately.
+ */
+ if (sched_state >= SCHED_INTERRUPTING) {
+ return; /* still hold the lock */
+ }
+
+ /* check for threads that need waking up
+ */
+ wakeUpSleepingThreads(getourtimeofday());
+
+ /* If new runnable threads have arrived, stop waiting for
+ * I/O and run them.
+ */
+ if (!emptyRunQueue(&MainCapability)) {
+ return; /* still hold the lock */
+ }
+ }
+
+ /* Step through the waiting queue, unblocking every thread that now has
+ * a file descriptor in a ready state.
+ */
+
+ prev = NULL;
+ if (select_succeeded || unblock_all) {
+ for(tso = blocked_queue_hd; tso != END_TSO_QUEUE; tso = next) {
+ next = tso->link;
+ switch (tso->why_blocked) {
+ case BlockedOnRead:
+ ready = unblock_all || FD_ISSET(tso->block_info.fd, &rfd);
+ break;
+ case BlockedOnWrite:
+ ready = unblock_all || FD_ISSET(tso->block_info.fd, &wfd);
+ break;
+ default:
+ barf("awaitEvent");
+ }
+
+ if (ready) {
+ IF_DEBUG(scheduler,debugBelch("Waking up blocked thread %d\n", tso->id));
+ tso->why_blocked = NotBlocked;
+ tso->link = END_TSO_QUEUE;
+ pushOnRunQueue(&MainCapability,tso);
+ } else {
+ // Thread stays blocked: relink it onto the rebuilt queue.
+ if (prev == NULL)
+ blocked_queue_hd = tso;
+ else
+ prev->link = tso;
+ prev = tso;
+ }
+ }
+
+ // Terminate the rebuilt blocked queue (empty or at 'prev').
+ if (prev == NULL)
+ blocked_queue_hd = blocked_queue_tl = END_TSO_QUEUE;
+ else {
+ prev->link = END_TSO_QUEUE;
+ blocked_queue_tl = prev;
+ }
+ }
+
+ } while (wait && sched_state == SCHED_RUNNING
+ && emptyRunQueue(&MainCapability));
+}
+
+#endif /* THREADED_RTS */
diff --git a/rts/posix/Select.h b/rts/posix/Select.h
new file mode 100644
index 0000000000..8825562974
--- /dev/null
+++ b/rts/posix/Select.h
@@ -0,0 +1,26 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 1998-2005
+ *
+ * Prototypes for functions in Select.c
+ *
+ * -------------------------------------------------------------------------*/
+
+#ifndef SELECT_H
+#define SELECT_H
+
+#if !defined(THREADED_RTS)
+/* In Select.c */
+/* Last time awaitEvent() sampled the clock, in RTS ticks
+ * (see getourtimeofday() in Itimer.c). */
+extern lnat RTS_VAR(timestamp);
+
+/* awaitEvent(rtsBool wait)
+ *
+ * Checks for blocked threads that need to be woken.
+ *
+ * Called from STG : NO
+ * Locks assumed : sched_mutex
+ */
+void awaitEvent(rtsBool wait); /* In Select.c */
+#endif
+
+#endif /* SELECT_H */
diff --git a/rts/posix/Signals.c b/rts/posix/Signals.c
new file mode 100644
index 0000000000..5f5f77fd39
--- /dev/null
+++ b/rts/posix/Signals.c
@@ -0,0 +1,510 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Signal processing / handling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+/* This is non-Posix-compliant.
+ #include "PosixSource.h"
+*/
+#include "Rts.h"
+#include "SchedAPI.h"
+#include "Schedule.h"
+#include "RtsSignals.h"
+#include "posix/Signals.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+
+#ifdef alpha_HOST_ARCH
+# if defined(linux_HOST_OS)
+# include <asm/fpu.h>
+# else
+# include <machine/fpu.h>
+# endif
+#endif
+
+#ifdef HAVE_UNISTD_H
+# include <unistd.h>
+#endif
+
+#ifdef HAVE_SIGNAL_H
+# include <signal.h>
+#endif
+
+#include <stdlib.h>
+
+/* This curious flag is provided for the benefit of the Haskell binding
+ * to POSIX.1 to control whether or not to include SA_NOCLDSTOP when
+ * installing a SIGCHLD handler.
+ */
+StgInt nocldstop = 0;
+
+/* -----------------------------------------------------------------------------
+ * The table of signal handlers
+ * -------------------------------------------------------------------------- */
+
+#if defined(RTS_USER_SIGNALS)
+
+/* SUP: The type of handlers is a little bit, well, doubtful... */
+StgInt *signal_handlers = NULL; /* Dynamically grown array of signal handlers */
+static StgInt nHandlers = 0; /* Size of handlers array */
+
+static nat n_haskell_handlers = 0;
+
+/* -----------------------------------------------------------------------------
+ * Allocate/resize the table of signal handlers.
+ * -------------------------------------------------------------------------- */
+
+/* Grow the signal_handlers table so that index 'sig' is valid,
+ * initialising any newly-added slots to STG_SIG_DFL. No-op if the
+ * table is already big enough. Allocation failure is handled inside
+ * stgMallocBytes/stgReallocBytes.
+ */
+static void
+more_handlers(I_ sig)
+{
+ StgInt i;
+
+ if (sig < nHandlers)
+ return;
+
+ if (signal_handlers == NULL)
+ signal_handlers = (StgInt *)stgMallocBytes((sig + 1) * sizeof(StgInt), "more_handlers");
+ else
+ signal_handlers = (StgInt *)stgReallocBytes(signal_handlers, (sig + 1) * sizeof(StgInt), "more_handlers");
+
+ for(i = nHandlers; i <= sig; i++)
+ // Fill in the new slots with default actions
+ signal_handlers[i] = STG_SIG_DFL;
+
+ nHandlers = sig + 1;
+}
+
+/* -----------------------------------------------------------------------------
+ * Pending Handlers
+ *
+ * The mechanism for starting handlers differs between the threaded
+ * (THREADED_RTS) and non-threaded versions of the RTS.
+ *
+ * When the RTS is single-threaded, we just write the pending signal
+ * handlers into a buffer, and start a thread for each one in the
+ * scheduler loop.
+ *
+ * When THREADED_RTS, the problem is that signals might be
+ * delivered to multiple threads, so we would need to synchronise
+ * access to pending_handler_buf somehow. Using thread
+ * synchronisation from a signal handler isn't possible in general
+ * (some OSs support it, eg. MacOS X, but not all). So instead:
+ *
+ * - the signal handler writes the signal number into the pipe
+ * managed by the IO manager thread (see GHC.Conc).
+ * - the IO manager picks up the signal number and calls
+ * startSignalHandler() to start the thread.
+ *
+ * This also has the nice property that we don't need to arrange to
+ * wake up a worker task to start the signal handler: the IO manager
+ * wakes up when we write into the pipe.
+ *
+ * -------------------------------------------------------------------------- */
+
+// Here's the pipe into which we will send our signals
+static int io_manager_pipe = -1;
+
+/* Record the write end of the IO manager's pipe; generic_handler()
+ * writes signal numbers into it in the threaded RTS. A value of -1
+ * (the initial state) means "no IO manager yet" and signals are
+ * silently dropped in that case.
+ */
+void
+setIOManagerPipe (int fd)
+{
+ // only called when THREADED_RTS, but unconditionally
+ // compiled here because GHC.Conc depends on it.
+ io_manager_pipe = fd;
+}
+
+#if !defined(THREADED_RTS)
+
+#define N_PENDING_HANDLERS 16
+
+StgPtr pending_handler_buf[N_PENDING_HANDLERS];
+StgPtr *next_pending_handler = pending_handler_buf;
+
+#endif /* THREADED_RTS */
+
+/* -----------------------------------------------------------------------------
+ * SIGCONT handler
+ *
+ * It seems that shells tend to put stdin back into blocking mode
+ * following a suspend/resume of the process. Here we arrange to put
+ * it back into non-blocking mode. We don't do anything to
+ * stdout/stderr because these handles don't get put into non-blocking
+ * mode at all - see the comments on stdout/stderr in PrelHandle.hsc.
+ * -------------------------------------------------------------------------- */
+
+/* SIGCONT handler: restore non-blocking mode on stdin (fd 0), which
+ * shells tend to reset when the process is resumed (see comment above).
+ */
+static void
+cont_handler(int sig STG_UNUSED)
+{
+ setNonBlockingFd(0);
+}
+
+/* -----------------------------------------------------------------------------
+ * Low-level signal handler
+ *
+ * Places the requested handler on a stack of pending handlers to be
+ * started up at the next context switch.
+ * -------------------------------------------------------------------------- */
+
+/* The one low-level handler installed for every Haskell-registered
+ * signal. Threaded RTS: forward the signal number to the IO manager
+ * pipe. Non-threaded RTS: push the handler closure onto
+ * pending_handler_buf for the scheduler to start later. In both modes
+ * it then unblocks the signal, runs the SIGCONT fixup if applicable,
+ * and requests a context switch.
+ */
+static void
+generic_handler(int sig)
+{
+ sigset_t signals;
+
+#if defined(THREADED_RTS)
+
+ if (io_manager_pipe != -1)
+ {
+ // Write the signal number into the pipe as a single byte. We
+ // hope that signals fit into a byte...
+ StgWord8 csig = (StgWord8)sig;
+ // NOTE(review): the write() result is ignored; a full pipe
+ // would silently drop the signal — confirm this is acceptable.
+ write(io_manager_pipe, &csig, 1);
+ }
+ // If the IO manager hasn't told us what the FD of the write end
+ // of its pipe is, there's not much we can do here, so just ignore
+ // the signal..
+
+#else /* not THREADED_RTS */
+
+ /* Can't call allocate from here. Probably can't call malloc
+ either. However, we have to schedule a new thread somehow.
+
+ It's probably ok to request a context switch and allow the
+ scheduler to start the handler thread, but how do we
+ communicate this to the scheduler?
+
+ We need some kind of locking, but with low overhead (i.e. no
+ blocking signals every time around the scheduler).
+
+ Signal Handlers are atomic (i.e. they can't be interrupted), and
+ we can make use of this. We just need to make sure the
+ critical section of the scheduler can't be interrupted - the
+ only way to do this is to block signals. However, we can lower
+ the overhead by only blocking signals when there are any
+ handlers to run, i.e. the set of pending handlers is
+ non-empty.
+ */
+
+ /* We use a stack to store the pending signals. We can't
+ dynamically grow this since we can't allocate any memory from
+ within a signal handler.
+
+ Hence unfortunately we have to bomb out if the buffer
+ overflows. It might be acceptable to carry on in certain
+ circumstances, depending on the signal.
+ */
+
+ // Push the handler closure (from its stable pointer) onto the stack.
+ *next_pending_handler++ = deRefStablePtr((StgStablePtr)signal_handlers[sig]);
+
+ // stack full?
+ if (next_pending_handler == &pending_handler_buf[N_PENDING_HANDLERS]) {
+ errorBelch("too many pending signals");
+ stg_exit(EXIT_FAILURE);
+ }
+
+#endif /* THREADED_RTS */
+
+ // re-establish the signal handler, and carry on
+ sigemptyset(&signals);
+ sigaddset(&signals, sig);
+ sigprocmask(SIG_UNBLOCK, &signals, NULL);
+
+ // *always* do the SIGCONT handler, even if the user overrides it.
+ if (sig == SIGCONT) {
+ cont_handler(sig);
+ }
+
+ // Ask the scheduler to switch threads soon so the handler runs.
+ context_switch = 1;
+}
+
+/* -----------------------------------------------------------------------------
+ * Blocking/Unblocking of the user signals
+ * -------------------------------------------------------------------------- */
+
+/* Set of signals with Haskell handlers installed (see stg_sig_install). */
+static sigset_t userSignals;
+/* Mask saved by blockUserSignals(), restored by unblockUserSignals(). */
+static sigset_t savedSignals;
+
+/* Reset the set of Haskell-handled signals to empty (RTS startup). */
+void
+initUserSignals(void)
+{
+ sigemptyset(&userSignals);
+}
+
+/* Block every Haskell-handled signal, saving the previous mask.
+ * Must be paired with unblockUserSignals(); not re-entrant, since
+ * there is only one savedSignals slot.
+ */
+void
+blockUserSignals(void)
+{
+ sigprocmask(SIG_BLOCK, &userSignals, &savedSignals);
+}
+
+/* Restore the signal mask saved by the matching blockUserSignals(). */
+void
+unblockUserSignals(void)
+{
+ sigprocmask(SIG_SETMASK, &savedSignals, NULL);
+}
+
+/* rtsTrue iff at least one Haskell signal handler is installed. */
+rtsBool
+anyUserHandlers(void)
+{
+ return n_haskell_handlers != 0;
+}
+
+#if !defined(THREADED_RTS)
+/* Sleep (pause()) until a signal arrives or the scheduler starts
+ * shutting down; used when all Haskell threads are blocked on signals.
+ */
+void
+awaitUserSignals(void)
+{
+ while (!signals_pending() && sched_state == SCHED_RUNNING) {
+ pause();
+ }
+}
+#endif
+
+/* -----------------------------------------------------------------------------
+ * Install a Haskell signal handler.
+ * -------------------------------------------------------------------------- */
+
+/* Install, reset, or remove the Haskell handler for 'sig'.
+ * spi selects the action: STG_SIG_IGN / STG_SIG_DFL / STG_SIG_HAN /
+ * STG_SIG_RST (RST = one-shot via SA_RESETHAND). For HAN/RST,
+ * *handler is a stable pointer to the Haskell handler closure.
+ * 'mask' optionally points at a sigset_t to use as sa_mask.
+ * Returns the previous spi (or STG_SIG_HAN with *handler set to the
+ * previous stable pointer), or STG_SIG_ERR on failure.
+ */
+int
+stg_sig_install(int sig, int spi, StgStablePtr *handler, void *mask)
+{
+ sigset_t signals, osignals;
+ struct sigaction action;
+ StgInt previous_spi;
+
+ // Block the signal until we figure out what to do
+ // Count on this to fail if the signal number is invalid
+ if (sig < 0 || sigemptyset(&signals) ||
+ sigaddset(&signals, sig) || sigprocmask(SIG_BLOCK, &signals, &osignals)) {
+ return STG_SIG_ERR;
+ }
+
+ more_handlers(sig);
+
+ previous_spi = signal_handlers[sig];
+
+ action.sa_flags = 0;
+
+ switch(spi) {
+ case STG_SIG_IGN:
+ signal_handlers[sig] = STG_SIG_IGN;
+ sigdelset(&userSignals, sig);
+ action.sa_handler = SIG_IGN;
+ break;
+
+ case STG_SIG_DFL:
+ signal_handlers[sig] = STG_SIG_DFL;
+ sigdelset(&userSignals, sig);
+ action.sa_handler = SIG_DFL;
+ break;
+
+ case STG_SIG_HAN:
+ case STG_SIG_RST:
+ signal_handlers[sig] = (StgInt)*handler;
+ sigaddset(&userSignals, sig);
+ action.sa_handler = generic_handler;
+ if (spi == STG_SIG_RST) {
+ action.sa_flags = SA_RESETHAND;
+ }
+ // NOTE(review): the counter is incremented even when this
+ // replaces an existing Haskell handler, and is not decremented
+ // for IGN/DFL above — confirm whether n_haskell_handlers is
+ // meant to track installs rather than live handlers.
+ n_haskell_handlers++;
+ break;
+
+ default:
+ barf("stg_sig_install: bad spi");
+ }
+
+ if (mask != NULL)
+ action.sa_mask = *(sigset_t *)mask;
+ else
+ sigemptyset(&action.sa_mask);
+
+ // Honour the POSIX binding's nocldstop flag for SIGCHLD.
+ action.sa_flags |= sig == SIGCHLD && nocldstop ? SA_NOCLDSTOP : 0;
+
+ if (sigaction(sig, &action, NULL) ||
+ sigprocmask(SIG_SETMASK, &osignals, NULL))
+ {
+ // need to return an error code, so avoid a stable pointer leak
+ // by freeing the previous handler if there was one.
+ // (Stable pointers compare >= 0; the STG_SIG_* codes are the
+ // non-handler values — presumably negative; verify in Signals.h.)
+ if (previous_spi >= 0) {
+ freeStablePtr(stgCast(StgStablePtr,signal_handlers[sig]));
+ n_haskell_handlers--;
+ }
+ return STG_SIG_ERR;
+ }
+
+ if (previous_spi == STG_SIG_DFL || previous_spi == STG_SIG_IGN
+ || previous_spi == STG_SIG_ERR) {
+ return previous_spi;
+ } else {
+ // Hand the old stable pointer back so the caller can free it.
+ *handler = (StgStablePtr)previous_spi;
+ return STG_SIG_HAN;
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Creating new threads for signal handlers.
+ * -------------------------------------------------------------------------- */
+
+#if !defined(THREADED_RTS)
+/* Drain pending_handler_buf (filled by generic_handler), creating one
+ * fresh Haskell IO thread per pending handler closure and scheduling
+ * it on 'cap'. User signals are blocked for the duration so the
+ * buffer cannot be mutated underneath us by a signal handler.
+ */
+void
+startSignalHandlers(Capability *cap)
+{
+ blockUserSignals();
+
+ while (next_pending_handler != pending_handler_buf) {
+
+ next_pending_handler--;
+
+ scheduleThread (cap,
+ createIOThread(cap,
+ RtsFlags.GcFlags.initialStkSize,
+ (StgClosure *) *next_pending_handler));
+ }
+
+ unblockUserSignals();
+}
+
+/* ----------------------------------------------------------------------------
+ * Mark signal handlers during GC.
+ *
+ * We do this rather than trying to start all the signal handlers
+ * prior to GC, because that requires extra heap for the new threads.
+ * Signals must be blocked (see blockUserSignals() above) during GC to
+ * avoid race conditions.
+ * -------------------------------------------------------------------------- */
+
+#if !defined(THREADED_RTS)
+/* GC root marking: evacuate every handler closure currently queued in
+ * pending_handler_buf so it survives the collection (see the comment
+ * block above for why this is done instead of starting the handlers
+ * before GC).
+ */
+void
+markSignalHandlers (evac_fn evac)
+{
+ StgPtr *p;
+
+ p = next_pending_handler;
+ while (p != pending_handler_buf) {
+ p--;
+ evac((StgClosure **)p);
+ }
+}
+#else
+/* Threaded RTS: pending handlers live with the IO manager, not in a
+ * local buffer, so there is nothing to mark here.
+ */
+void
+markSignalHandlers (evac_fn evac STG_UNUSED)
+{
+}
+#endif
+
+#else /* !RTS_USER_SIGNALS */
+/* Platforms without RTS user-signal support: accept the call but do
+ * nothing, reporting that the previous disposition was the default.
+ */
+StgInt
+stg_sig_install(StgInt sig STG_UNUSED,
+ StgInt spi STG_UNUSED,
+ StgStablePtr* handler STG_UNUSED,
+ void* mask STG_UNUSED)
+{
+ //barf("User signals not supported");
+ return STG_SIG_DFL;
+}
+
+#endif
+
+#if defined(RTS_USER_SIGNALS)
+/* -----------------------------------------------------------------------------
+ * SIGINT handler.
+ *
+ * We like to shutdown nicely after receiving a SIGINT, write out the
+ * stats, write profiling info, close open files and flush buffers etc.
+ * -------------------------------------------------------------------------- */
+#ifdef SMP
+/* The thread that called startupHaskell(); SIGINT received by any
+ * other thread is re-routed to it (see shutdown_handler). */
+pthread_t startup_guy;
+#endif
+
+/* SIGINT handler: first ^C requests an orderly RTS shutdown via
+ * interruptStgRts(); a second ^C while shutdown is already in
+ * progress exits immediately with EXIT_INTERRUPTED.
+ */
+static void
+shutdown_handler(int sig STG_UNUSED)
+{
+#ifdef SMP
+ // if I'm a worker thread, send this signal to the guy who
+ // originally called startupHaskell(). Since we're handling
+ // the signal, it won't be a "send to all threads" type of signal
+ // (according to the POSIX threads spec).
+ if (pthread_self() != startup_guy) {
+ pthread_kill(startup_guy, sig);
+ return;
+ }
+#endif
+
+ // If we're already trying to interrupt the RTS, terminate with
+ // extreme prejudice. So the first ^C tries to exit the program
+ // cleanly, and the second one just kills it.
+ if (sched_state >= SCHED_INTERRUPTING) {
+ stg_exit(EXIT_INTERRUPTED);
+ } else {
+ interruptStgRts();
+ }
+}
+
+/* -----------------------------------------------------------------------------
+ * Install default signal handlers.
+ *
+ * The RTS installs a default signal handler for catching
+ * SIGINT, so that we can perform an orderly shutdown.
+ *
+ * Haskell code may install their own SIGINT handler, which is
+ * fine, provided they're so kind as to put back the old one
+ * when they de-install.
+ *
+ * In addition to handling SIGINT, the RTS also handles SIGFPE
+ * by ignoring it. Apparently IEEE requires floating-point
+ * exceptions to be ignored by default, but alpha-dec-osf3
+ * doesn't seem to do so.
+ * -------------------------------------------------------------------------- */
+/* Install the RTS's default signal handlers: shutdown_handler on
+ * SIGINT (orderly shutdown) and cont_handler on SIGCONT (restore
+ * non-blocking stdin). The SIGFPE-ignoring code is deliberately
+ * disabled — see the comment below. On SMP builds, also records the
+ * startup thread so worker threads can forward SIGINT to it.
+ */
+void
+initDefaultHandlers()
+{
+ struct sigaction action,oact;
+
+#ifdef SMP
+ startup_guy = pthread_self();
+#endif
+
+ // install the SIGINT handler
+ action.sa_handler = shutdown_handler;
+ sigemptyset(&action.sa_mask);
+ action.sa_flags = 0;
+ if (sigaction(SIGINT, &action, &oact) != 0) {
+ errorBelch("warning: failed to install SIGINT handler");
+ }
+
+#if defined(HAVE_SIGINTERRUPT)
+ // Ensure syscalls are interrupted (not restarted) by SIGINT.
+ siginterrupt(SIGINT, 1); // isn't this the default? --SDM
+#endif
+
+ // install the SIGCONT handler
+ action.sa_handler = cont_handler;
+ sigemptyset(&action.sa_mask);
+ action.sa_flags = 0;
+ if (sigaction(SIGCONT, &action, &oact) != 0) {
+ errorBelch("warning: failed to install SIGCONT handler");
+ }
+
+ // install the SIGFPE handler
+
+ // In addition to handling SIGINT, also handle SIGFPE by ignoring it.
+ // Apparently IEEE requires floating-point exceptions to be ignored by
+ // default, but alpha-dec-osf3 doesn't seem to do so.
+
+ // Commented out by SDM 2/7/2002: this causes an infinite loop on
+ // some architectures when an integer division by zero occurs: we
+ // don't recover from the floating point exception, and the
+ // program just generates another one immediately.
+#if 0
+ action.sa_handler = SIG_IGN;
+ sigemptyset(&action.sa_mask);
+ action.sa_flags = 0;
+ if (sigaction(SIGFPE, &action, &oact) != 0) {
+ errorBelch("warning: failed to install SIGFPE handler");
+ }
+#endif
+
+#ifdef alpha_HOST_ARCH
+ // Alpha: disable FP traps so IEEE exceptions are ignored in hardware.
+ ieee_set_fp_control(0);
+#endif
+}
+
+#endif /* RTS_USER_SIGNALS */
diff --git a/rts/posix/Signals.h b/rts/posix/Signals.h
new file mode 100644
index 0000000000..39477f8c6a
--- /dev/null
+++ b/rts/posix/Signals.h
@@ -0,0 +1,26 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2005
+ *
+ * Signal processing / handling.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef POSIX_SIGNALS_H
+#define POSIX_SIGNALS_H
+
+extern rtsBool anyUserHandlers(void);
+
+#if !defined(THREADED_RTS)
+
+extern StgPtr pending_handler_buf[];
+extern StgPtr *next_pending_handler;
+#define signals_pending() (next_pending_handler != pending_handler_buf)
+void startSignalHandlers(Capability *cap);
+
+#endif
+
+extern StgInt *signal_handlers;
+
+#endif /* POSIX_SIGNALS_H */
+
diff --git a/rts/win32/AsyncIO.c b/rts/win32/AsyncIO.c
new file mode 100644
index 0000000000..7bcf571cf8
--- /dev/null
+++ b/rts/win32/AsyncIO.c
@@ -0,0 +1,345 @@
+/* AsyncIO.c
+ *
+ * Integrating Win32 asynchronous I/O with the GHC RTS.
+ *
+ * (c) sof, 2002-2003.
+ */
+#include "Rts.h"
+#include "RtsUtils.h"
+#include <windows.h>
+#include <stdio.h>
+#include "Schedule.h"
+#include "RtsFlags.h"
+#include "Capability.h"
+#include "win32/AsyncIO.h"
+#include "win32/IOManager.h"
+
+/*
+ * Overview:
+ *
+ * Haskell code issues asynchronous I/O requests via the
+ * async{Read,Write,DoOp}# primops. These cause addIORequest()
+ * to be invoked, which forwards the request to the underlying
+ * asynchronous I/O subsystem. Each request is tagged with a unique
+ * ID.
+ *
+ * addIORequest() returns this ID, so that when the blocked CH
+ * thread is added onto blocked_queue, its TSO is annotated with
+ * it. Upon completion of an I/O request, the async I/O handling
+ * code makes a back-call to signal its completion; the local
+ * onIOComplete() routine. It adds the IO request ID (along with
+ * its result data) to a queue of completed requests before returning.
+ *
+ * The queue of completed IO requests is read by the thread operating
+ * the RTS scheduler. It de-queues the CH threads corresponding
+ * to the request IDs, making them runnable again.
+ *
+ */
+
+typedef struct CompletedReq {
+ unsigned int reqID;
+ int len;
+ int errCode;
+} CompletedReq;
+
+#define MAX_REQUESTS 200
+
+static CRITICAL_SECTION queue_lock;
+static HANDLE completed_req_event;
+static HANDLE abandon_req_wait;
+static HANDLE wait_handles[2];
+static CompletedReq completedTable[MAX_REQUESTS];
+static int completed_hw;
+static HANDLE completed_table_sema;
+static int issued_reqs;
+
+static void
+onIOComplete(unsigned int reqID,
+ int fd STG_UNUSED,
+ int len,
+ void* buf STG_UNUSED,
+ int errCode)
+{
+ DWORD dwRes;
+ /* Deposit result of request in queue/table..when there's room. */
+ dwRes = WaitForSingleObject(completed_table_sema, INFINITE);
+ switch (dwRes) {
+ case WAIT_OBJECT_0:
+ break;
+ default:
+ /* Not likely */
+ fprintf(stderr, "onIOComplete: failed to grab table semaphore, dropping request 0x%x\n", reqID);
+ fflush(stderr);
+ return;
+ }
+ EnterCriticalSection(&queue_lock);
+ if (completed_hw == MAX_REQUESTS) {
+ /* Shouldn't happen */
+ fprintf(stderr, "onIOComplete: ERROR -- Request table overflow (%d); dropping.\n", reqID);
+ fflush(stderr);
+ } else {
+#if 0
+ fprintf(stderr, "onCompl: %d %d %d %d %d\n",
+ reqID, len, errCode, issued_reqs, completed_hw);
+ fflush(stderr);
+#endif
+ completedTable[completed_hw].reqID = reqID;
+ completedTable[completed_hw].len = len;
+ completedTable[completed_hw].errCode = errCode;
+ completed_hw++;
+ issued_reqs--;
+ if (completed_hw == 1) {
+ /* The event is used to wake up the scheduler thread should it
+ * be blocked waiting for requests to complete. The event resets once
+ * that thread has cleared out the request queue/table.
+ */
+ SetEvent(completed_req_event);
+ }
+ }
+ LeaveCriticalSection(&queue_lock);
+}
+
+unsigned int
+addIORequest(int fd,
+ int forWriting,
+ int isSock,
+ int len,
+ char* buf)
+{
+ EnterCriticalSection(&queue_lock);
+ issued_reqs++;
+ LeaveCriticalSection(&queue_lock);
+#if 0
+ fprintf(stderr, "addIOReq: %d %d %d\n", fd, forWriting, len); fflush(stderr);
+#endif
+ return AddIORequest(fd,forWriting,isSock,len,buf,onIOComplete);
+}
+
+unsigned int
+addDelayRequest(int msecs)
+{
+ EnterCriticalSection(&queue_lock);
+ issued_reqs++;
+ LeaveCriticalSection(&queue_lock);
+#if 0
+ fprintf(stderr, "addDelayReq: %d\n", msecs); fflush(stderr);
+#endif
+ return AddDelayRequest(msecs,onIOComplete);
+}
+
+unsigned int
+addDoProcRequest(void* proc, void* param)
+{
+ EnterCriticalSection(&queue_lock);
+ issued_reqs++;
+ LeaveCriticalSection(&queue_lock);
+#if 0
+ fprintf(stderr, "addProcReq: %p %p\n", proc, param); fflush(stderr);
+#endif
+ return AddProcRequest(proc,param,onIOComplete);
+}
+
+
+int
+startupAsyncIO()
+{
+ if (!StartIOManager()) {
+ return 0;
+ }
+ InitializeCriticalSection(&queue_lock);
+ /* Create a pair of events:
+ *
+ * - completed_req_event -- signals the deposit of request result; manual reset.
+ * - abandon_req_wait -- external OS thread tells current RTS/Scheduler
+ * thread to abandon wait for IO request completion.
+ * Auto reset.
+ */
+ completed_req_event = CreateEvent (NULL, TRUE, FALSE, NULL);
+ abandon_req_wait = CreateEvent (NULL, FALSE, FALSE, NULL);
+ wait_handles[0] = completed_req_event;
+ wait_handles[1] = abandon_req_wait;
+ completed_hw = 0;
+ if ( !(completed_table_sema = CreateSemaphore (NULL, MAX_REQUESTS, MAX_REQUESTS, NULL)) ) {
+ DWORD rc = GetLastError();
+ fprintf(stderr, "startupAsyncIO: CreateSemaphore failed 0x%x\n", rc);
+ fflush(stderr);
+ }
+
+ return ( completed_req_event != INVALID_HANDLE_VALUE &&
+ abandon_req_wait != INVALID_HANDLE_VALUE &&
+ completed_table_sema != NULL );
+}
+
+void
+shutdownAsyncIO()
+{
+ CloseHandle(completed_req_event);
+ ShutdownIOManager();
+}
+
+/*
+ * Function: awaitRequests(wait)
+ *
+ * Check for the completion of external IO work requests. Worker
+ * threads signal completion of IO requests by depositing them
+ * in a table (completedTable). awaitRequests() matches up
+ * requests in that table with threads on the blocked_queue,
+ * making the threads whose IO requests have completed runnable
+ * again.
+ *
+ * awaitRequests() is called by the scheduler periodically _or_ if
+ * it is out of work, and needs to wait for the completion of IO
+ * requests to make further progress. In the latter scenario,
+ * awaitRequests() will simply block waiting for worker threads
+ * to complete if the 'completedTable' is empty.
+ */
+int
+awaitRequests(rtsBool wait)
+{
+#ifndef THREADED_RTS
+ // none of this is actually used in the threaded RTS
+
+start:
+#if 0
+ fprintf(stderr, "awaitRequests(): %d %d %d\n", issued_reqs, completed_hw, wait);
+ fflush(stderr);
+#endif
+ EnterCriticalSection(&queue_lock);
+ /* Nothing immediately available & we won't wait */
+ if ((!wait && completed_hw == 0)
+#if 0
+ // If we just return when wait==rtsFalse, we'll go into a busy
+ // wait loop, so I disabled this condition --SDM 18/12/2003
+ (issued_reqs == 0 && completed_hw == 0)
+#endif
+ ) {
+ LeaveCriticalSection(&queue_lock);
+ return 0;
+ }
+ if (completed_hw == 0) {
+ /* empty table, drop lock and wait */
+ LeaveCriticalSection(&queue_lock);
+ if ( wait && sched_state == SCHED_RUNNING ) {
+ DWORD dwRes = WaitForMultipleObjects(2, wait_handles, FALSE, INFINITE);
+ switch (dwRes) {
+ case WAIT_OBJECT_0:
+ /* a request was completed */
+ break;
+ case WAIT_OBJECT_0 + 1:
+ case WAIT_TIMEOUT:
+ /* timeout (unlikely) or told to abandon waiting */
+ return 0;
+ case WAIT_FAILED: {
+ DWORD dw = GetLastError();
+ fprintf(stderr, "awaitRequests: wait failed -- error code: %lu\n", dw); fflush(stderr);
+ return 0;
+ }
+ default:
+ fprintf(stderr, "awaitRequests: unexpected wait return code %lu\n", dwRes); fflush(stderr);
+ return 0;
+ }
+ } else {
+ return 0;
+ }
+ goto start;
+ } else {
+ int i;
+ StgTSO *tso, *prev;
+
+ for (i=0; i < completed_hw; i++) {
+ /* For each of the completed requests, match up their Ids
+ * with those of the threads on the blocked_queue. If the
+ * thread that made the IO request has been subsequently
+ * killed (and removed from blocked_queue), no match will
+ * be found for that request Id.
+ *
+ * i.e., killing a Haskell thread doesn't attempt to cancel
+ * the IO request it is blocked on.
+ *
+ */
+ unsigned int rID = completedTable[i].reqID;
+
+ prev = NULL;
+ for(tso = blocked_queue_hd ; tso != END_TSO_QUEUE; prev = tso, tso = tso->link) {
+
+ switch(tso->why_blocked) {
+ case BlockedOnRead:
+ case BlockedOnWrite:
+ case BlockedOnDoProc:
+ if (tso->block_info.async_result->reqID == rID) {
+ /* Found the thread blocked waiting on request; stodgily fill
+ * in its result block.
+ */
+ tso->block_info.async_result->len = completedTable[i].len;
+ tso->block_info.async_result->errCode = completedTable[i].errCode;
+
+ /* Drop the matched TSO from blocked_queue */
+ if (prev) {
+ prev->link = tso->link;
+ } else {
+ blocked_queue_hd = tso->link;
+ }
+ if (blocked_queue_tl == tso) {
+ blocked_queue_tl = prev ? prev : END_TSO_QUEUE;
+ }
+
+ /* Terminates the run queue + this inner for-loop. */
+ tso->link = END_TSO_QUEUE;
+ tso->why_blocked = NotBlocked;
+ pushOnRunQueue(&MainCapability, tso);
+ break;
+ }
+ break;
+ default:
+ if (tso->why_blocked != NotBlocked) {
+ barf("awaitRequests: odd thread state");
+ }
+ break;
+ }
+ }
+ /* Signal that there's completed table slots available */
+ if ( !ReleaseSemaphore(completed_table_sema, 1, NULL) ) {
+ DWORD dw = GetLastError();
+ fprintf(stderr, "awaitRequests: failed to signal semaphore (error code=0x%x)\n", dw);
+ fflush(stderr);
+ }
+ }
+ completed_hw = 0;
+ ResetEvent(completed_req_event);
+ LeaveCriticalSection(&queue_lock);
+ return 1;
+ }
+#endif /* !THREADED_RTS */
+}
+
+/*
+ * Function: abandonRequestWait()
+ *
+ * Wake up a thread that's blocked waiting for new IO requests
+ * to complete (via awaitRequests().)
+ */
+void
+abandonRequestWait( void )
+{
+ /* the event is auto-reset, but in case there's no thread
+ * already waiting on the event, we want to return it to
+ * a non-signalled state.
+ *
+ * Careful! There is no synchronisation between
+ * abandonRequestWait and awaitRequest, which means that
+ * abandonRequestWait might be called just before a thread
+ * goes into a wait, and we miss the abandon signal. So we
+ * must SetEvent() here rather than PulseEvent() to ensure
+ * that the event isn't lost. We can re-optimise by resetting
+ * the event somewhere safe if we know the event has been
+ * properly serviced (see resetAbandon() below). --SDM 18/12/2003
+ */
+ SetEvent(abandon_req_wait);
+}
+
+void
+resetAbandonRequestWait( void )
+{
+ ResetEvent(abandon_req_wait);
+}
+
diff --git a/rts/win32/AsyncIO.h b/rts/win32/AsyncIO.h
new file mode 100644
index 0000000000..2077ea0cf7
--- /dev/null
+++ b/rts/win32/AsyncIO.h
@@ -0,0 +1,25 @@
+/* AsyncIO.h
+ *
+ * Integrating Win32 asynchronous I/O with the GHC RTS.
+ *
+ * (c) sof, 2002-2003.
+ */
+#ifndef __ASYNCHIO_H__
+#define __ASYNCHIO_H__
+extern unsigned int
+addIORequest(int fd,
+ int forWriting,
+ int isSock,
+ int len,
+ char* buf);
+extern unsigned int addDelayRequest(int msecs);
+extern unsigned int addDoProcRequest(void* proc, void* param);
+extern int startupAsyncIO(void);
+extern void shutdownAsyncIO(void);
+
+extern int awaitRequests(rtsBool wait);
+
+extern void abandonRequestWait(void);
+extern void resetAbandonRequestWait(void);
+
+#endif /* __ASYNCHIO_H__ */
diff --git a/rts/win32/AwaitEvent.c b/rts/win32/AwaitEvent.c
new file mode 100644
index 0000000000..43e188fb34
--- /dev/null
+++ b/rts/win32/AwaitEvent.c
@@ -0,0 +1,51 @@
+#if !defined(THREADED_RTS) /* to the end */
+/*
+ * Wait/check for external events. Periodically, the
+ * Scheduler checks for the completion of external operations,
+ * like the expiration of timers, completion of I/O requests
+ * issued by Haskell threads.
+ *
+ * If the Scheduler is otherwise out of work, it'll block
+ * herein waiting for external events to occur.
+ *
+ * This file mirrors the select()-based functionality
+ * for POSIX / Unix platforms in rts/Select.c, but for
+ * Win32.
+ *
+ */
+#include "Rts.h"
+#include "Schedule.h"
+#include "AwaitEvent.h"
+#include <windows.h>
+#include "win32/AsyncIO.h"
+
+// Used to avoid calling abandonRequestWait() if we don't need to.
+// Protected by sched_mutex.
+static nat workerWaitingForRequests = 0;
+
+void
+awaitEvent(rtsBool wait)
+{
+ int ret;
+
+ do {
+ /* Try to de-queue completed IO requests
+ */
+ workerWaitingForRequests = 1;
+ ret = awaitRequests(wait);
+ workerWaitingForRequests = 0;
+ if (!ret) {
+ return; /* still hold the lock */
+ }
+
+ // Return to the scheduler if:
+ //
+ // - we were interrupted
+ // - new threads have arrived
+
+ } while (wait
+ && sched_state == SCHED_RUNNING
+ && emptyRunQueue(&MainCapability)
+ );
+}
+#endif
diff --git a/rts/win32/ConsoleHandler.c b/rts/win32/ConsoleHandler.c
new file mode 100644
index 0000000000..d7096db632
--- /dev/null
+++ b/rts/win32/ConsoleHandler.c
@@ -0,0 +1,313 @@
+/*
+ * Console control handler support.
+ *
+ */
+#include "Rts.h"
+#include <windows.h>
+#include "ConsoleHandler.h"
+#include "SchedAPI.h"
+#include "Schedule.h"
+#include "RtsUtils.h"
+#include "RtsFlags.h"
+#include "AsyncIO.h"
+#include "RtsSignals.h"
+
+extern int stg_InstallConsoleEvent(int action, StgStablePtr *handler);
+
+static BOOL WINAPI shutdown_handler(DWORD dwCtrlType);
+static BOOL WINAPI generic_handler(DWORD dwCtrlType);
+
+static rtsBool deliver_event = rtsTrue;
+static StgInt console_handler = STG_SIG_DFL;
+
+static HANDLE hConsoleEvent = INVALID_HANDLE_VALUE;
+
+#define N_PENDING_EVENTS 16
+StgInt stg_pending_events = 0; /* number of undelivered events */
+DWORD stg_pending_buf[N_PENDING_EVENTS]; /* their associated event numbers. */
+
+/*
+ * Function: initUserSignals()
+ *
+ * Initialize the console handling substrate.
+ */
+void
+initUserSignals(void)
+{
+ stg_pending_events = 0;
+ console_handler = STG_SIG_DFL;
+ if (hConsoleEvent == INVALID_HANDLE_VALUE) {
+ hConsoleEvent =
+ CreateEvent ( NULL, /* default security attributes */
+ TRUE, /* manual-reset event */
+ FALSE, /* initially non-signalled */
+ NULL); /* no name */
+ }
+ return;
+}
+
+/*
+ * Function: shutdown_handler()
+ *
+ * Local function that performs the default handling of Ctrl+C kind
+ * events; gently shutting down the RTS
+ *
+ * To repeat Signals.c remark -- user code may choose to override the
+ * default handler. Which is fine, assuming they put back the default
+ * handler when/if they de-install the custom handler.
+ *
+ */
+static BOOL WINAPI shutdown_handler(DWORD dwCtrlType)
+{
+ switch (dwCtrlType) {
+
+ case CTRL_CLOSE_EVENT:
+ /* see generic_handler() comment re: this event */
+ return FALSE;
+ case CTRL_C_EVENT:
+ case CTRL_BREAK_EVENT:
+
+ // If we're already trying to interrupt the RTS, terminate with
+ // extreme prejudice. So the first ^C tries to exit the program
+ // cleanly, and the second one just kills it.
+ if (sched_state >= SCHED_INTERRUPTING) {
+ stg_exit(EXIT_INTERRUPTED);
+ } else {
+ interruptStgRts();
+ /* Cheesy pulsing of an event to wake up a waiting RTS thread, if any */
+ abandonRequestWait();
+ resetAbandonRequestWait();
+ }
+ return TRUE;
+
+ /* shutdown + logoff events are not handled here. */
+ default:
+ return FALSE;
+ }
+}
+
+
+/*
+ * Function: initDefaultHandlers()
+ *
+ * Install any default signal/console handlers. Currently we install a
+ * Ctrl+C handler that shuts down the RTS in an orderly manner.
+ */
+void initDefaultHandlers(void)
+{
+ if ( !SetConsoleCtrlHandler(shutdown_handler, TRUE) ) {
+ errorBelch("warning: failed to install default console handler");
+ }
+}
+
+
+/*
+ * Function: blockUserSignals()
+ *
+ * Temporarily block the delivery of further console events. Needed to
+ * avoid race conditions when GCing the stack of outstanding handlers or
+ * when emptying the stack by running the handlers.
+ *
+ */
+void
+blockUserSignals(void)
+{
+ deliver_event = rtsFalse;
+}
+
+
+/*
+ * Function: unblockUserSignals()
+ *
+ * The inverse of blockUserSignals(); re-enable the delivery of console events.
+ */
+void
+unblockUserSignals(void)
+{
+ deliver_event = rtsTrue;
+}
+
+
+/*
+ * Function: awaitUserSignals()
+ *
+ * Wait for the next console event. Currently a NOP (returns immediately.)
+ */
+void awaitUserSignals(void)
+{
+ return;
+}
+
+
+/*
+ * Function: startSignalHandlers()
+ *
+ * Run the handlers associated with the stacked up console events. Console
+ * event delivery is blocked for the duration of this call.
+ */
+void startSignalHandlers(Capability *cap)
+{
+ StgStablePtr handler;
+
+ if (console_handler < 0) {
+ return;
+ }
+
+ blockUserSignals();
+ ACQUIRE_LOCK(&sched_mutex);
+
+ handler = deRefStablePtr((StgStablePtr)console_handler);
+ while (stg_pending_events > 0) {
+ stg_pending_events--;
+ scheduleThread(cap,
+ createIOThread(cap,
+ RtsFlags.GcFlags.initialStkSize,
+ rts_apply(cap,
+ (StgClosure *)handler,
+ rts_mkInt(cap,
+ stg_pending_buf[stg_pending_events]))));
+ }
+
+ RELEASE_LOCK(&sched_mutex);
+ unblockUserSignals();
+}
+
+/*
+ * Function: markSignalHandlers()
+ *
+ * Evacuate the handler stack. _Assumes_ that console event delivery
+ * has already been blocked.
+ */
+void markSignalHandlers (evac_fn evac)
+{
+ if (console_handler >= 0) {
+ StgPtr p = deRefStablePtr((StgStablePtr)console_handler);
+ evac((StgClosure**)(void *)&p);
+ }
+}
+
+
+/*
+ * Function: generic_handler()
+ *
+ * Local function which handles an incoming console event (done in a separate OS thread),
+ * recording the event in stg_pending_events.
+ */
+static BOOL WINAPI generic_handler(DWORD dwCtrlType)
+{
+ ACQUIRE_LOCK(&sched_mutex);
+
+ /* Ultra-simple -- up the counter + signal a switch. */
+ switch(dwCtrlType) {
+ case CTRL_CLOSE_EVENT:
+ /* Don't support the delivery of this event; if we
+ * indicate that we've handled it here and the Haskell handler
+ * doesn't take proper action (e.g., terminate the OS process),
+ * the user of the app will be unable to kill/close it. Not
+ * good, so disable the delivery for now.
+ */
+ return FALSE;
+ default:
+ if (!deliver_event) return TRUE;
+
+ if ( stg_pending_events < N_PENDING_EVENTS ) {
+ stg_pending_buf[stg_pending_events] = dwCtrlType;
+ stg_pending_events++;
+ }
+ /* Cheesy pulsing of an event to wake up a waiting RTS thread, if any */
+ abandonRequestWait();
+ resetAbandonRequestWait();
+ return TRUE;
+ }
+
+ RELEASE_LOCK(&sched_mutex);
+}
+
+
+/*
+ * Function: rts_InstallConsoleEvent()
+ *
+ * Install/remove a console event handler.
+ */
+int
+rts_InstallConsoleEvent(int action, StgStablePtr *handler)
+{
+ StgInt previous_hdlr = console_handler;
+
+ switch (action) {
+ case STG_SIG_IGN:
+ console_handler = STG_SIG_IGN;
+ if ( !SetConsoleCtrlHandler(NULL, TRUE) ) {
+ errorBelch("warning: unable to ignore console events");
+ }
+ break;
+ case STG_SIG_DFL:
+ console_handler = STG_SIG_IGN;
+ if ( !SetConsoleCtrlHandler(NULL, FALSE) ) {
+ errorBelch("warning: unable to restore default console event handling");
+ }
+ break;
+ case STG_SIG_HAN:
+ console_handler = (StgInt)*handler;
+ if ( previous_hdlr < 0 ) {
+ /* Only install generic_handler() once */
+ if ( !SetConsoleCtrlHandler(generic_handler, TRUE) ) {
+ errorBelch("warning: unable to install console event handler");
+ }
+ }
+ break;
+ }
+
+ if (previous_hdlr == STG_SIG_DFL ||
+ previous_hdlr == STG_SIG_IGN) {
+ return previous_hdlr;
+ } else {
+ *handler = (StgStablePtr)previous_hdlr;
+ return STG_SIG_HAN;
+ }
+}
+
+/*
+ * Function: rts_HandledConsoleEvent()
+ *
+ * Signal that a Haskell console event handler has completed its run.
+ * The explicit notification that a Haskell handler has completed is
+ * required to better handle the delivery of Ctrl-C/Break events whilst
+ * an async worker thread is handling a read request on stdin. The
+ * Win32 console implementation will abort such a read request when Ctrl-C
+ * is delivered. That leaves the worker thread in a bind: should it
+ * abandon the request (the Haskell thread reading from stdin has been
+ * thrown an exception to signal the delivery of Ctrl-C & hence have
+ * aborted the I/O request) or simply ignore the aborted read and retry?
+ * (the Haskell thread reading from stdin isn't concerned with the
+ * delivery and handling of Ctrl-C.) With both scenarios being
+ * possible, the worker thread needs to be told -- that is, did the
+ * console event handler cause the IO request to be abandoned?
+ *
+ */
+void
+rts_ConsoleHandlerDone(int ev)
+{
+ if ( (DWORD)ev == CTRL_BREAK_EVENT ||
+ (DWORD)ev == CTRL_C_EVENT ) {
+ /* only these two cause stdin system calls to abort.. */
+ SetEvent(hConsoleEvent); /* event is manual-reset */
+ Sleep(0); /* yield */
+ ResetEvent(hConsoleEvent); /* turn it back off again */
+ }
+}
+
+/*
+ * Function: rts_waitConsoleHandlerCompletion()
+ *
+ * Esoteric entry point used by worker thread that got woken
+ * up as part of Ctrl-C delivery.
+ */
+int
+rts_waitConsoleHandlerCompletion()
+{
+ /* As long as the worker doesn't need to do a multiple wait,
+ * let's keep this HANDLE private to this 'module'.
+ */
+ return (WaitForSingleObject(hConsoleEvent, INFINITE) == WAIT_OBJECT_0);
+}
diff --git a/rts/win32/ConsoleHandler.h b/rts/win32/ConsoleHandler.h
new file mode 100644
index 0000000000..b09adf71cb
--- /dev/null
+++ b/rts/win32/ConsoleHandler.h
@@ -0,0 +1,63 @@
+/*
+ * Console control handler support.
+ *
+ */
+#ifndef __CONSOLEHANDLER_H__
+#define __CONSOLEHANDLER_H__
+
+/*
+ * Console control handlers lets an application handle Ctrl+C, Ctrl+Break etc.
+ * in Haskell under Win32. Akin to the Unix signal SIGINT.
+ *
+ * The API offered by ConsoleHandler.h is identical to that of the signal handling
+ * code (which isn't supported under win32.) Unsurprisingly, the underlying impl
+ * is derived from the signal handling code also.
+ */
+
+/*
+ * Function: signals_pending()
+ *
+ * Used by the RTS to check whether new signals have been 'recently' reported.
+ * If so, the RTS arranges for the delivered signals to be handled by
+ * de-queueing them from their table, running the associated Haskell
+ * signal handler.
+ */
+extern StgInt stg_pending_events;
+
+#define signals_pending() ( stg_pending_events > 0)
+
+/*
+ * Function: anyUserHandlers()
+ *
+ * Used by the Scheduler to decide whether its worth its while to stick
+ * around waiting for an external signal when there are no threads
+ * runnable. A console handler is used to handle termination events (Ctrl+C)
+ * and isn't considered a 'user handler'.
+ */
+#define anyUserHandlers() (rtsFalse)
+
+/*
+ * Function: startSignalHandlers()
+ *
+ * Run the handlers associated with the queued up console events. Console
+ * event delivery is blocked for the duration of this call.
+ */
+extern void startSignalHandlers(Capability *cap);
+
+/*
+ * Function: handleSignalsInThisThread()
+ *
+ * Have current (OS) thread assume responsibility of handling console events/signals.
+ * Currently not used (by the console event handling code.)
+ */
+extern void handleSignalsInThisThread(void);
+
+/*
+ * Function: rts_waitConsoleHandlerCompletion()
+ *
+ * Esoteric entry point used by worker thread that got woken
+ * up as part Ctrl-C delivery.
+ */
+extern int rts_waitConsoleHandlerCompletion(void);
+
+#endif /* __CONSOLEHANDLER_H__ */
diff --git a/rts/win32/GetTime.c b/rts/win32/GetTime.c
new file mode 100644
index 0000000000..584b994d53
--- /dev/null
+++ b/rts/win32/GetTime.c
@@ -0,0 +1,101 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team 2005
+ *
+ * Machine-dependent time measurement functions
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+#include "GetTime.h"
+
+#include <windows.h>
+
+#ifdef HAVE_TIME_H
+# include <time.h>
+#endif
+
+#define HNS_PER_SEC 10000000LL /* FILETIMES are in units of 100ns */
+/* Convert FILETIMEs into secs */
+
+static INLINE_ME Ticks
+fileTimeToTicks(FILETIME ft)
+{
+ Ticks t;
+ t = ((Ticks)ft.dwHighDateTime << 32) | ft.dwLowDateTime;
+ t = (t * TICKS_PER_SECOND) / HNS_PER_SEC;
+ return t;
+}
+
+static int is_win9x = -1;
+
+static INLINE_ME rtsBool
+isWin9x(void)
+{
+ if (is_win9x < 0) {
+ /* figure out whether we're on a Win9x box or not. */
+ OSVERSIONINFO oi;
+ BOOL b;
+
+ /* Need to init the size field first.*/
+ oi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+ b = GetVersionEx(&oi);
+
+ is_win9x = ( (b && (oi.dwPlatformId & VER_PLATFORM_WIN32_WINDOWS)) ? 1 : 0);
+ }
+ return is_win9x;
+}
+
+
+void
+getProcessTimes(Ticks *user, Ticks *elapsed)
+{
+ *user = getProcessCPUTime();
+ *elapsed = getProcessElapsedTime();
+}
+
+Ticks
+getProcessCPUTime(void)
+{
+ FILETIME creationTime, exitTime, userTime, kernelTime = {0,0};
+
+ if (isWin9x()) return getProcessElapsedTime();
+
+ if (!GetProcessTimes(GetCurrentProcess(), &creationTime,
+ &exitTime, &kernelTime, &userTime)) {
+ return 0;
+ }
+
+ return fileTimeToTicks(userTime);
+}
+
+Ticks
+getProcessElapsedTime(void)
+{
+ FILETIME system_time;
+ GetSystemTimeAsFileTime(&system_time);
+ return fileTimeToTicks(system_time);
+}
+
+Ticks
+getThreadCPUTime(void)
+{
+ FILETIME creationTime, exitTime, userTime, kernelTime = {0,0};
+
+ if (isWin9x()) return getProcessCPUTime();
+
+ if (!GetThreadTimes(GetCurrentThread(), &creationTime,
+ &exitTime, &kernelTime, &userTime)) {
+ return 0;
+ }
+
+ return fileTimeToTicks(userTime);
+}
+
+nat
+getPageFaults(void)
+{
+ /* ToDo (on NT): better, get this via the performance data
+ that's stored in the registry. */
+ return 0;
+}
diff --git a/rts/win32/IOManager.c b/rts/win32/IOManager.c
new file mode 100644
index 0000000000..a67c3504c1
--- /dev/null
+++ b/rts/win32/IOManager.c
@@ -0,0 +1,510 @@
+/* IOManager.c
+ *
+ * Non-blocking / asynchronous I/O for Win32.
+ *
+ * (c) sof, 2002-2003.
+ */
+#include "Rts.h"
+#include "IOManager.h"
+#include "WorkQueue.h"
+#include "ConsoleHandler.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <io.h>
+#include <winsock.h>
+#include <process.h>
+
+/*
+ * Internal state maintained by the IO manager.
+ */
+typedef struct IOManagerState {
+ CritSection manLock;
+ WorkQueue* workQueue;
+ int queueSize;
+ int numWorkers;
+ int workersIdle;
+ HANDLE hExitEvent;
+ unsigned int requestID;
+ /* fields for keeping track of active WorkItems */
+ CritSection active_work_lock;
+ WorkItem* active_work_items;
+} IOManagerState;
+
+/* ToDo: wrap up this state via a IOManager handle instead? */
+static IOManagerState* ioMan;
+
+static void RegisterWorkItem ( IOManagerState* iom, WorkItem* wi);
+static void DeregisterWorkItem( IOManagerState* iom, WorkItem* wi);
+
+/*
+ * The routine executed by each worker thread.
+ */
+static
+unsigned
+WINAPI
+IOWorkerProc(PVOID param)
+{
+ HANDLE hWaits[2];
+ DWORD rc;
+ IOManagerState* iom = (IOManagerState*)param;
+ WorkQueue* pq = iom->workQueue;
+ WorkItem* work;
+ int len = 0, fd = 0;
+ DWORD errCode = 0;
+ void* complData;
+
+ hWaits[0] = (HANDLE)iom->hExitEvent;
+ hWaits[1] = GetWorkQueueHandle(pq);
+
+ while (1) {
+ /* The error code is communicated back on completion of request; reset. */
+ errCode = 0;
+
+ EnterCriticalSection(&iom->manLock);
+ /* Signal that the worker is idle.
+ *
+ * 'workersIdle' is used when determining whether or not to
+ * increase the worker thread pool when adding a new request.
+ * (see addIORequest().)
+ */
+ iom->workersIdle++;
+ LeaveCriticalSection(&iom->manLock);
+
+ /*
+ * A possible future refinement is to make long-term idle threads
+ * wake up and decide to shut down should the number of idle threads
+ * be above some threshold.
+ *
+ */
+ rc = WaitForMultipleObjects( 2, hWaits, FALSE, INFINITE );
+
+ if (rc == WAIT_OBJECT_0) {
+ // we received the exit event
+ return 0;
+ }
+
+ EnterCriticalSection(&iom->manLock);
+ /* Signal that the thread is 'non-idle' and about to consume
+ * a work item.
+ */
+ iom->workersIdle--;
+ iom->queueSize--;
+ LeaveCriticalSection(&iom->manLock);
+
+ if ( rc == (WAIT_OBJECT_0 + 1) ) {
+ /* work item available, fetch it. */
+ if (FetchWork(pq,(void**)&work)) {
+ work->abandonOp = 0;
+ RegisterWorkItem(iom,work);
+ if ( work->workKind & WORKER_READ ) {
+ if ( work->workKind & WORKER_FOR_SOCKET ) {
+ len = recv(work->workData.ioData.fd,
+ work->workData.ioData.buf,
+ work->workData.ioData.len,
+ 0);
+ if (len == SOCKET_ERROR) {
+ errCode = WSAGetLastError();
+ }
+ } else {
+ while (1) {
+ /* Do the read(), with extra-special handling for Ctrl+C */
+ len = read(work->workData.ioData.fd,
+ work->workData.ioData.buf,
+ work->workData.ioData.len);
+ if ( len == 0 && work->workData.ioData.len != 0 ) {
+ /* Given the following scenario:
+ * - a console handler has been registered that handles Ctrl+C
+ * events.
+ * - we've not tweaked the 'console mode' settings to turn on
+ * ENABLE_PROCESSED_INPUT.
+ * - we're blocked waiting on input from standard input.
+ * - the user hits Ctrl+C.
+ *
+ * The OS will invoke the console handler (in a separate OS thread),
+ * and the above read() (i.e., under the hood, a ReadFile() op) returns
+ * 0, with the error set to ERROR_OPERATION_ABORTED. We don't
+ * want to percolate this error condition back to the Haskell user.
+ * Do this by waiting for the completion of the Haskell console handler.
+ * If upon completion of the console handler routine, the Haskell thread
+ * that issued the request is found to have been thrown an exception,
+ * the worker abandons the request (since that's what the Haskell thread
+ * has done.) If the Haskell thread hasn't been interrupted, the worker
+ * retries the read request as if nothing happened.
+ */
+ if ( (GetLastError()) == ERROR_OPERATION_ABORTED ) {
+ /* For now, only abort when dealing with the standard input handle.
+ * i.e., for all others, an error is raised.
+ */
+ HANDLE h = (HANDLE)GetStdHandle(STD_INPUT_HANDLE);
+ if ( _get_osfhandle(work->workData.ioData.fd) == (long)h ) {
+ if (rts_waitConsoleHandlerCompletion()) {
+ /* If the Scheduler has set work->abandonOp, the Haskell thread has
+ * been thrown an exception (=> the worker must abandon this request.)
+ * We test for this below before invoking the on-completion routine.
+ */
+ if (work->abandonOp) {
+ break;
+ } else {
+ continue;
+ }
+ }
+ } else {
+ break; /* Treat it like an error */
+ }
+ } else {
+ break;
+ }
+ } else {
+ break;
+ }
+ }
+ if (len == -1) { errCode = errno; }
+ }
+ complData = work->workData.ioData.buf;
+ fd = work->workData.ioData.fd;
+ } else if ( work->workKind & WORKER_WRITE ) {
+ if ( work->workKind & WORKER_FOR_SOCKET ) {
+ len = send(work->workData.ioData.fd,
+ work->workData.ioData.buf,
+ work->workData.ioData.len,
+ 0);
+ if (len == SOCKET_ERROR) {
+ errCode = WSAGetLastError();
+ }
+ } else {
+ len = write(work->workData.ioData.fd,
+ work->workData.ioData.buf,
+ work->workData.ioData.len);
+ if (len == -1) { errCode = errno; }
+ }
+ complData = work->workData.ioData.buf;
+ fd = work->workData.ioData.fd;
+ } else if ( work->workKind & WORKER_DELAY ) {
+ /* Approximate implementation of threadDelay;
+ *
+ * Note: Sleep() is in milliseconds, not micros.
+ */
+ Sleep(work->workData.delayData.msecs / 1000);
+ len = work->workData.delayData.msecs;
+ complData = NULL;
+ fd = 0;
+ errCode = 0;
+ } else if ( work->workKind & WORKER_DO_PROC ) {
+ /* perform operation/proc on behalf of Haskell thread. */
+ if (work->workData.procData.proc) {
+ /* The procedure is assumed to encode result + success/failure
+ * via its param.
+ */
+ errCode=work->workData.procData.proc(work->workData.procData.param);
+ } else {
+ errCode=1;
+ }
+ complData = work->workData.procData.param;
+ } else {
+ fprintf(stderr, "unknown work request type (%d) , ignoring.\n", work->workKind);
+ fflush(stderr);
+ continue;
+ }
+ if (!work->abandonOp) {
+ work->onCompletion(work->requestID,
+ fd,
+ len,
+ complData,
+ errCode);
+ }
+ /* Free the WorkItem */
+ DeregisterWorkItem(iom,work);
+ free(work);
+ } else {
+ fprintf(stderr, "unable to fetch work; fatal.\n"); fflush(stderr);
+ return 1;
+ }
+ } else {
+ fprintf(stderr, "waiting failed (%lu); fatal.\n", rc); fflush(stderr);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static
+BOOL
+NewIOWorkerThread(IOManagerState* iom)
+{
+ unsigned threadId;
+ return ( 0 != _beginthreadex(NULL,
+ 0,
+ IOWorkerProc,
+ (LPVOID)iom,
+ 0,
+ &threadId) );
+}
+
+/* Initialise the global I/O manager state: its work queue, the
+ * manual-reset shutdown event, and both critical sections.
+ * Returns FALSE (with everything torn down again) on any failure.
+ */
+BOOL
+StartIOManager(void)
+{
+  WorkQueue* queue;
+  HANDLE     exitEvent;
+
+  queue = NewWorkQueue();
+  if (!queue) {
+    return FALSE;
+  }
+
+  ioMan = (IOManagerState*)malloc(sizeof(IOManagerState));
+  if (!ioMan) {
+    FreeWorkQueue(queue);
+    return FALSE;
+  }
+
+  /* A manual-reset event, signalled once at shutdown time. */
+  exitEvent = CreateEvent ( NULL, TRUE, FALSE, NULL );
+  if (!exitEvent) {
+    FreeWorkQueue(queue);
+    free(ioMan);
+    return FALSE;
+  }
+
+  ioMan->hExitEvent = exitEvent;
+  InitializeCriticalSection(&ioMan->manLock);
+  InitializeCriticalSection(&ioMan->active_work_lock);
+  ioMan->workQueue         = queue;
+  ioMan->numWorkers        = 0;
+  ioMan->workersIdle       = 0;
+  ioMan->queueSize         = 0;
+  ioMan->requestID         = 1;
+  ioMan->active_work_items = NULL;
+
+  return TRUE;
+}
+
+/*
+ * Function: depositWorkItem()
+ *
+ * Local function which deposits a WorkItem onto a work queue,
+ * deciding in the process whether or not the thread pool needs
+ * to be augmented with another thread to handle the new request.
+ *
+ * Returns the request ID on success, 0 if the queue submission failed.
+ * NOTE(review): assumes ioMan is non-NULL -- callers must check.
+ */
+static
+int
+depositWorkItem( unsigned int reqID,
+		 WorkItem* wItem )
+{
+  EnterCriticalSection(&ioMan->manLock);
+
+#if 0
+  fprintf(stderr, "depositWorkItem: %d/%d\n", ioMan->workersIdle, ioMan->numWorkers);
+  fflush(stderr);
+#endif
+  /* A new worker thread is created when there are fewer idle threads
+   * than non-consumed queue requests. This ensures that requests will
+   * be dealt with in a timely manner.
+   *
+   * [Long explanation of why the previous thread pool policy lead to
+   * trouble]
+   *
+   * Previously, the thread pool was augmented iff no idle worker threads
+   * were available. That strategy runs the risk of repeatedly adding to
+   * the request queue without expanding the thread pool to handle this
+   * sudden spike in queued requests.
+   * [How? Assume workersIdle is 1, and addIORequest() is called. No new
+   * thread is created and the request is simply queued. If addIORequest()
+   * is called again _before the OS schedules a worker thread to pull the
+   * request off the queue_, workersIdle is still 1 and another request is
+   * simply added to the queue. Once the worker thread is run, only one
+   * request is de-queued, leaving the 2nd request in the queue]
+   *
+   * Assuming none of the queued requests take an inordinate amount of time
+   * to complete, the request queue would eventually be drained. But if that's
+   * not the case, the later requests will end up languishing in the queue
+   * indefinitely. The non-timely handling of requests may cause Concurrent
+   * Haskell applications to misbehave / hang; bad.
+   *
+   */
+  ioMan->queueSize++;
+  if ( (ioMan->workersIdle < ioMan->queueSize) ) {
+      /* see if giving up our quantum ferrets out some idle threads.
+       */
+      LeaveCriticalSection(&ioMan->manLock);
+      Sleep(0);
+      EnterCriticalSection(&ioMan->manLock);
+      if ( (ioMan->workersIdle < ioMan->queueSize) ) {
+	  /* No, go ahead and create another. */
+	  ioMan->numWorkers++;
+	  /* Drop the lock before the (potentially slow) thread creation;
+	   * the counter bump above reserves our slot. */
+	  LeaveCriticalSection(&ioMan->manLock);
+	  NewIOWorkerThread(ioMan);
+      } else {
+	  LeaveCriticalSection(&ioMan->manLock);
+      }
+  } else {
+      LeaveCriticalSection(&ioMan->manLock);
+  }
+
+  if (SubmitWork(ioMan->workQueue,wItem)) {
+      /* Note: the work item has potentially been consumed by a worker thread
+       * (and freed) at this point, so we cannot use wItem's requestID.
+       */
+      return reqID;
+  } else {
+      return 0;
+  }
+}
+
+/*
+ * Function: AddIORequest()
+ *
+ * Conduit to underlying WorkQueue's SubmitWork(); adds IO
+ * request to work queue, deciding whether or not to augment
+ * the thread pool in the process.
+ *
+ * Returns the (non-zero) request ID on success, 0 on failure.
+ */
+int
+AddIORequest ( int   fd,
+	       BOOL  forWriting,
+	       BOOL  isSocket,
+	       int   len,
+	       char* buffer,
+	       CompletionProc onCompletion)
+{
+  WorkItem*    wItem;
+  unsigned int reqID;
+
+  /* Check ioMan *before* touching it -- the old code incremented
+   * ioMan->requestID ahead of the NULL test (and leaked wItem on the
+   * !ioMan path).
+   */
+  if (!ioMan) return 0;
+
+  wItem = (WorkItem*)malloc(sizeof(WorkItem));
+  if (!wItem) return 0;
+
+  reqID = ioMan->requestID++;
+
+  /* Fill in the blanks */
+  wItem->workKind = ( isSocket ? WORKER_FOR_SOCKET : 0 ) |
+		    ( forWriting ? WORKER_WRITE : WORKER_READ );
+  wItem->workData.ioData.fd  = fd;
+  wItem->workData.ioData.len = len;
+  wItem->workData.ioData.buf = buffer;
+  wItem->link = NULL;
+
+  wItem->onCompletion = onCompletion;
+  wItem->requestID    = reqID;
+  /* abandonOp was previously left uninitialised, so a worker could read
+   * garbage via work->abandonOp and silently drop the completion. */
+  wItem->abandonOp    = 0;
+
+  return depositWorkItem(reqID, wItem);
+}
+
+/*
+ * Function: AddDelayRequest()
+ *
+ * Like AddIORequest(), but this time adding a delay request to
+ * the request queue.  Returns the request ID, or FALSE (0) on failure.
+ */
+BOOL
+AddDelayRequest ( unsigned int msecs,
+		  CompletionProc onCompletion)
+{
+  WorkItem*    wItem;
+  unsigned int reqID;
+
+  /* Check ioMan *before* touching it -- the old code incremented
+   * ioMan->requestID ahead of the NULL test.
+   */
+  if (!ioMan) return FALSE;
+
+  wItem = (WorkItem*)malloc(sizeof(WorkItem));
+  if (!wItem) return FALSE;
+
+  reqID = ioMan->requestID++;
+
+  /* Fill in the blanks */
+  wItem->workKind = WORKER_DELAY;
+  wItem->workData.delayData.msecs = msecs;
+  wItem->onCompletion = onCompletion;
+  wItem->requestID    = reqID;
+  /* abandonOp was previously left uninitialised here. */
+  wItem->abandonOp    = 0;
+  wItem->link         = NULL;
+
+  return depositWorkItem(reqID, wItem);
+}
+
+/*
+ * Function: AddProcRequest()
+ *
+ * Add an asynchronous procedure request; 'proc' is run by a worker
+ * thread with 'param' as its argument.  Returns the request ID, or
+ * FALSE (0) on failure.
+ */
+BOOL
+AddProcRequest ( void* proc,
+		 void* param,
+		 CompletionProc onCompletion)
+{
+  WorkItem*    wItem;
+  unsigned int reqID;
+
+  /* Check ioMan *before* touching it -- the old code incremented
+   * ioMan->requestID ahead of the NULL test.
+   */
+  if (!ioMan) return FALSE;
+
+  wItem = (WorkItem*)malloc(sizeof(WorkItem));
+  if (!wItem) return FALSE;
+
+  reqID = ioMan->requestID++;
+
+  /* Fill in the blanks */
+  wItem->workKind = WORKER_DO_PROC;
+  wItem->workData.procData.proc  = proc;
+  wItem->workData.procData.param = param;
+  wItem->onCompletion = onCompletion;
+  wItem->requestID    = reqID;
+  wItem->abandonOp    = 0;
+  wItem->link         = NULL;
+
+  return depositWorkItem(reqID, wItem);
+}
+
+/* Signal the worker threads to exit.  Safe to call even if the manager
+ * was never (successfully) started.
+ */
+void ShutdownIOManager ( void )
+{
+  /* Previously this dereferenced ioMan unconditionally, crashing if
+   * StartIOManager() had failed or never run. */
+  if (!ioMan) return;
+
+  SetEvent(ioMan->hExitEvent);
+  // ToDo: we can't free this now, because the worker thread(s)
+  // haven't necessarily finished with it yet.  Perhaps it should
+  // have a reference count or something.
+  // free(ioMan);
+  // ioMan = NULL;
+}
+
+/* Keep track of WorkItems currently being serviced: push 'wi' onto
+ * the front of the manager's singly-linked active list.
+ * (Parameter renamed so it no longer shadows the global 'ioMan'.)
+ */
+static
+void
+RegisterWorkItem(IOManagerState* iom,
+		 WorkItem* wi)
+{
+  EnterCriticalSection(&iom->active_work_lock);
+  wi->link = iom->active_work_items;
+  iom->active_work_items = wi;
+  LeaveCriticalSection(&iom->active_work_lock);
+}
+
+/* Remove the entry whose requestID matches 'wi' from the active list;
+ * complains on stderr (but does not fail) if it isn't found.
+ * (Parameter renamed so it no longer shadows the global 'ioMan'.)
+ */
+static
+void
+DeregisterWorkItem(IOManagerState* iom,
+		   WorkItem* wi)
+{
+  WorkItem **link;
+
+  EnterCriticalSection(&iom->active_work_lock);
+  /* Walk via a pointer-to-link so unlinking needs no 'prev' bookkeeping. */
+  for (link = &iom->active_work_items; *link; link = &(*link)->link) {
+    if ((*link)->requestID == wi->requestID) {
+      *link = (*link)->link;
+      LeaveCriticalSection(&iom->active_work_lock);
+      return;
+    }
+  }
+  fprintf(stderr, "DeregisterWorkItem: unable to locate work item %d\n", wi->requestID);
+  LeaveCriticalSection(&iom->active_work_lock);
+}
+
+
+/*
+ * Function: abandonWorkRequest()
+ *
+ * Signal that a work request isn't of interest. Called by the Scheduler
+ * if a blocked Haskell thread has an exception thrown to it.
+ *
+ * Note: we're not aborting the system call that a worker might be blocked on
+ * here, just disabling the propagation of its result once its finished. We
+ * may have to go the whole hog here and switch to overlapped I/O so that we
+ * can abort blocked system calls.
+ */
+void
+abandonWorkRequest ( int reqID )
+{
+  WorkItem *ptr;
+  EnterCriticalSection(&ioMan->active_work_lock);
+  for(ptr=ioMan->active_work_items;ptr;ptr=ptr->link) {
+    if (ptr->requestID == (unsigned int)reqID ) {
+	/* Setting abandonOp suppresses the onCompletion callback when
+	 * the worker finishes (the worker tests !work->abandonOp). */
+	ptr->abandonOp = 1;
+	LeaveCriticalSection(&ioMan->active_work_lock);
+	return;
+    }
+  }
+  /* Note: if the request ID isn't present, the worker will have
+   * finished sometime since awaitRequests() last drained the completed
+   * request table; i.e., not an error.
+   */
+  LeaveCriticalSection(&ioMan->active_work_lock);
+}
diff --git a/rts/win32/IOManager.h b/rts/win32/IOManager.h
new file mode 100644
index 0000000000..4893e2387c
--- /dev/null
+++ b/rts/win32/IOManager.h
@@ -0,0 +1,110 @@
+/* IOManager.h
+ *
+ * Non-blocking / asynchronous I/O for Win32.
+ *
+ * (c) sof, 2002-2003
+ */
+#ifndef __IOMANAGER_H__
+#define __IOMANAGER_H__
+/* NOTE(review): double-underscore guard names are reserved identifiers
+ * in C; harmless in practice, but IOMANAGER_H would be cleaner. */
+/* On the yucky side..suppress -Wmissing-declarations warnings when
+ * including <windows.h>
+ */
+extern void* GetCurrentFiber ( void );
+extern void* GetFiberData ( void );
+#include <windows.h>
+
+/*
+  The IOManager subsystem provides a non-blocking view
+  of I/O operations. It lets one (or more) OS thread(s)
+  issue multiple I/O requests, which the IOManager then
+  handles independently of/concurrent to the thread(s)
+  that issued the request. Upon completion, the issuing
+  thread can inspect the result of the I/O operation &
+  take appropriate action.
+
+  The IOManager is intended used with the GHC RTS to
+  implement non-blocking I/O in Concurrent Haskell.
+ */
+
+/*
+ * Our WorkQueue holds WorkItems, encoding IO and
+ * delay requests.
+ *
+ * A CompletionProc is invoked by a worker thread once a request
+ * finishes; 'errCode' 0 means success.
+ */
+typedef void (*CompletionProc)(unsigned int requestID,
+			       int   fd,
+			       int   len,
+			       void* buf,
+			       int   errCode);
+
+/*
+ * Asynchronous procedure calls executed by a worker thread
+ * take a generic state argument pointer and return an int by
+ * default.
+ */
+typedef int (*DoProcProc)(void *param);
+
+/* Per-request payload; which member is valid depends on workKind. */
+typedef union workData {
+    struct {
+	int   fd;
+	int   len;
+	char *buf;
+    } ioData;
+    struct {
+	int   msecs;
+    } delayData;
+    struct {
+	DoProcProc proc;
+	void* param;
+    } procData;
+} WorkData;
+
+typedef struct WorkItem {
+  unsigned int     workKind;     /* bitmask of the WORKER_* flags below */
+  WorkData         workData;
+  unsigned int     requestID;    /* unique id handed back via CompletionProc */
+  CompletionProc   onCompletion;
+  unsigned int     abandonOp;    /* non-zero => suppress onCompletion */
+  struct WorkItem *link;         /* next item in the active-work list */
+} WorkItem;
+
+/* NOTE(review): no definition of 'onComplete' is visible in IOManager.c;
+ * confirm this extern is still referenced/defined elsewhere. */
+extern CompletionProc onComplete;
+
+/* the kind of operations supported; you could easily imagine
+ * that instead of passing a tag describing the work to be performed,
+ * a function pointer is passed instead. Maybe later.
+ */
+#define WORKER_READ       1
+#define WORKER_WRITE      2
+#define WORKER_DELAY      4
+#define WORKER_FOR_SOCKET 8
+#define WORKER_DO_PROC    16
+
+/*
+ * Starting up and shutting down.
+ */
+extern BOOL StartIOManager     ( void );
+extern void ShutdownIOManager  ( void );
+
+/*
+ * Adding I/O and delay requests. With each request a
+ * completion routine is supplied, which the worker thread
+ * will invoke upon completion.
+ *
+ * Each returns the (non-zero) request ID on success, 0 on failure.
+ * NOTE(review): the definitions in IOManager.c declare the Add*Request
+ * functions as returning BOOL; BOOL is typedef'ed to int on Win32, so
+ * the declarations here are link-compatible -- but confirm.
+ */
+extern int AddDelayRequest ( unsigned int msecs,
+			     CompletionProc onCompletion);
+
+extern int AddIORequest ( int fd,
+			  BOOL forWriting,
+			  BOOL isSocket,
+			  int len,
+			  char* buffer,
+			  CompletionProc onCompletion);
+
+extern int AddProcRequest ( void* proc,
+			    void* data,
+			    CompletionProc onCompletion);
+
+extern void abandonWorkRequest ( int reqID );
+
+#endif /* __IOMANAGER_H__ */
diff --git a/rts/win32/OSThreads.c b/rts/win32/OSThreads.c
new file mode 100644
index 0000000000..c772be38f4
--- /dev/null
+++ b/rts/win32/OSThreads.c
@@ -0,0 +1,199 @@
+/* ---------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001-2005
+ *
+ * Accessing OS threads functionality in a (mostly) OS-independent
+ * manner.
+ *
+ * --------------------------------------------------------------------------*/
+
+#include "Rts.h"
+#if defined(THREADED_RTS)
+#include "OSThreads.h"
+#include "RtsUtils.h"
+
+/* For reasons not yet clear, the entire contents of process.h is protected
+ * by __STRICT_ANSI__ not being defined.
+ */
+#undef __STRICT_ANSI__
+#include <process.h>
+
+/* Win32 threads and synchronisation objects */
+
+/* A Condition is represented by a Win32 Event object;
+ * a Mutex by a Mutex kernel object.
+ *
+ * ToDo: go through the defn and usage of these to
+ * make sure the semantics match up with that of
+ * the (assumed) pthreads behaviour. This is really
+ * just a first pass at getting something compilable.
+ */
+
+/* Create a Condition: an auto-reset, initially non-signalled,
+ * process-local Win32 Event object.
+ */
+void
+initCondition( Condition* pCond )
+{
+    *pCond = CreateEvent(NULL,
+			 FALSE, /* auto reset */
+			 FALSE, /* initially not signalled */
+			 NULL); /* unnamed => process-local. */
+    if (*pCond == NULL) {
+	errorBelch("initCondition: unable to create");
+    }
+}
+
+/* Dispose of a Condition's underlying event handle. */
+void
+closeCondition( Condition* pCond )
+{
+    if ( 0 == CloseHandle(*pCond) ) {
+	errorBelch("closeCondition: failed to close");
+    }
+}
+
+/* Wake all threads currently waiting on the condition.
+ *
+ * NOTE(review): PulseEvent releases only the threads already blocked in
+ * a wait at the instant of the call, and Microsoft's documentation
+ * flags it as unreliable for building condition variables (a waiter
+ * pre-empted between releasing the lock and entering the wait misses
+ * the pulse).  This matches the "first pass" caveat at the top of the
+ * file -- revisit if lost wakeups are observed.
+ */
+rtsBool
+broadcastCondition ( Condition* pCond )
+{
+  PulseEvent(*pCond);
+  return rtsTrue;
+}
+
+/* Wake one waiter (the event is auto-reset, so a single waiting
+ * thread is released).  Aborts the RTS if SetEvent fails.
+ */
+rtsBool
+signalCondition ( Condition* pCond )
+{
+    if ( 0 == SetEvent(*pCond) ) {
+	barf("SetEvent: %d", GetLastError());
+    }
+    return rtsTrue;
+}
+
+/* Wait on 'pCond', releasing 'pMut' while blocked and re-acquiring it
+ * before returning.
+ *
+ * NOTE(review): unlike pthread_cond_wait(), the unlock and the wait are
+ * not a single atomic step here; a signal landing between RELEASE_LOCK()
+ * and WaitForSingleObject() is only picked up because the (auto-reset)
+ * event stays signalled until some waiter consumes it.  Multi-waiter
+ * semantics deserve the scrutiny promised in the ToDo above.
+ */
+rtsBool
+waitCondition ( Condition* pCond, Mutex* pMut )
+{
+  RELEASE_LOCK(pMut);
+  WaitForSingleObject(*pCond, INFINITE);
+  /* Hmm..use WaitForMultipleObjects() ? */
+  ACQUIRE_LOCK(pMut);
+  return rtsTrue;
+}
+
+/* Surrender the remainder of this thread's time slice. */
+void
+yieldThread()
+{
+    Sleep(0);
+}
+
+/* Terminate the calling thread (does not return). */
+void
+shutdownThread()
+{
+  _endthreadex(0);
+}
+
+/* Create a new OS thread running startProc(param), storing its id in
+ * *pId.
+ *
+ * Returns 0 on success and non-zero on failure -- note the inverted
+ * sense: _beginthreadex() yields 0 when it fails.
+ */
+int
+createOSThread (OSThreadId* pId, OSThreadProc *startProc, void *param)
+{
+
+  return (_beginthreadex ( NULL,  /* default security attributes */
+			   0,
+			   (unsigned (__stdcall *)(void *)) startProc,
+			   param,
+			   0,
+			   (unsigned*)pId) == 0);
+}
+
+/* The calling thread's id; OSThreadIds are Win32 thread identifiers. */
+OSThreadId
+osThreadId()
+{
+  return GetCurrentThreadId();
+}
+
+#ifdef USE_CRITICAL_SECTIONS
+/* Mutexes as critical sections: cheaper than kernel Mutex objects but
+ * process-local only.  The spin count lets a contending thread spin
+ * briefly before blocking.
+ */
+void
+initMutex (Mutex* pMut)
+{
+  InitializeCriticalSectionAndSpinCount(pMut,4000);
+}
+#else
+/* Mutexes as Win32 Mutex kernel objects.
+ * NOTE(review): no failure check -- a NULL from CreateMutex is stored
+ * silently; confirm callers can tolerate that.
+ */
+void
+initMutex (Mutex* pMut)
+{
+  HANDLE h = CreateMutex ( NULL,  /* default sec. attributes */
+			   FALSE, /* not owned => initially signalled */
+			   NULL
+			   );
+  *pMut = h;
+  return;
+}
+#endif
+
+/* Allocate a fresh TLS slot, aborting the RTS if none remain. */
+void
+newThreadLocalKey (ThreadLocalKey *key)
+{
+    DWORD idx = TlsAlloc();
+    if (idx == TLS_OUT_OF_INDEXES) {
+	barf("newThreadLocalKey: out of keys");
+    }
+    *key = idx;
+}
+
+/* Read the calling thread's value for TLS slot 'key'.
+ *
+ * NULL is a legitimate stored value, so in DEBUG builds "stored NULL"
+ * is distinguished from "lookup failed" via GetLastError() (per MSDN,
+ * TlsGetValue clears the last-error code on success).
+ */
+void *
+getThreadLocalVar (ThreadLocalKey *key)
+{
+    void *r;
+    r = TlsGetValue(*key);
+#ifdef DEBUG
+    // r is allowed to be NULL - it can mean that either there was an
+    // error or the stored value is in fact NULL.
+    if (GetLastError() != NO_ERROR) {
+	barf("getThreadLocalVar: key not found");
+    }
+#endif
+    return r;
+}
+
+/* Store 'value' in this thread's slot for 'key'; aborts on failure. */
+void
+setThreadLocalVar (ThreadLocalKey *key, void *value)
+{
+    if (!TlsSetValue(*key, value)) {
+	barf("setThreadLocalVar: %d", GetLastError());
+    }
+}
+
+
+/* Thread entry point for forkOS_createThread: acquire a capability,
+ * run the Haskell IO action whose stable pointer was smuggled in via
+ * 'entry', and release the capability again.
+ */
+static unsigned __stdcall
+forkOS_createThreadWrapper ( void * entry )
+{
+    Capability *cap;
+    cap = rts_lock();
+    cap = rts_evalStableIO(cap, (HsStablePtr) entry, NULL);
+    rts_unlock(cap);
+    return 0;
+}
+
+/* Spawn a bound OS thread running the Haskell action 'entry'.
+ * Returns 0 on success, non-zero on failure (same inverted sense as
+ * createOSThread).
+ *
+ * NOTE(review): pId is an unsigned long written through an (unsigned*)
+ * cast -- benign on Win32 where both are 32 bits, but worth confirming
+ * for any 64-bit port.
+ */
+int
+forkOS_createThread ( HsStablePtr entry )
+{
+    unsigned long pId;
+    return (_beginthreadex ( NULL,  /* default security attributes */
+			     0,
+			     forkOS_createThreadWrapper,
+			     (void*)entry,
+			     0,
+			     (unsigned*)&pId) == 0);
+}
+
+#else /* !defined(THREADED_RTS) */
+
+/* Non-threaded RTS: bound threads are unsupported, so always fail. */
+int
+forkOS_createThread ( HsStablePtr entry STG_UNUSED )
+{
+    return -1;
+}
+
+#endif /* !defined(THREADED_RTS) */
diff --git a/rts/win32/Ticker.c b/rts/win32/Ticker.c
new file mode 100644
index 0000000000..ab791d8dc7
--- /dev/null
+++ b/rts/win32/Ticker.c
@@ -0,0 +1,124 @@
+/*
+ * RTS periodic timers.
+ *
+ */
+#include "Rts.h"
+#include "Timer.h"
+#include "Ticker.h"
+#include <windows.h>
+#include <stdio.h>
+#include <process.h>
+#include "OSThreads.h"
+
+/*
+ * Provide a timer service for the RTS, periodically
+ * notifying it that a number of 'ticks' has passed.
+ *
+ */
+
+/* To signal shutdown of the timer service, we use a local
+ * event which the timer thread listens to (and stopVirtTimer()
+ * signals.)
+ */
+static HANDLE hStopEvent = INVALID_HANDLE_VALUE;
+static HANDLE tickThread = INVALID_HANDLE_VALUE;
+
+static TickProc tickProc = NULL;
+
+/*
+ * Ticking is done by a separate thread which periodically
+ * wakes up to handle a tick.
+ *
+ * This is the portable way of providing a timer service under
+ * Win32; features like waitable timers or timer queues are only
+ * supported by a subset of the Win32 platforms (notably not
+ * under Win9x.)
+ *
+ */
+/* Body of the timer thread: sleeps for the tick interval by waiting
+ * (with timeout) on hStopEvent, calling tickProc on each timeout and
+ * exiting once the event becomes signalled.
+ *
+ * 'param' carries the tick interval in milliseconds, smuggled in as an
+ * integer cast to a pointer by startTicker(); the (int) cast back is
+ * benign only for small values -- NOTE(review): confirm on 64-bit.
+ */
+static
+unsigned
+WINAPI
+TimerProc(PVOID param)
+{
+  int ms = (int)param;
+  DWORD waitRes;
+
+  /* interpret a < 0 timeout period as 'instantaneous' */
+ if (ms < 0) ms = 0;
+
+  while (1) {
+    waitRes = WaitForSingleObject(hStopEvent, ms);
+
+    switch (waitRes) {
+    case WAIT_OBJECT_0:
+      /* event has become signalled: drop our callback reference and
+       * close the stop event before exiting the thread. */
+      tickProc = NULL;
+      CloseHandle(hStopEvent);
+      return 0;
+    case WAIT_TIMEOUT:
+      /* tick */
+      tickProc(0);
+      break;
+    case WAIT_FAILED: {
+	DWORD dw = GetLastError();
+	fprintf(stderr, "TimerProc: wait failed -- error code: %lu\n", dw); fflush(stderr);
+	break;
+    }
+    default:
+      fprintf(stderr, "TimerProc: unexpected result %lu\n", waitRes); fflush(stderr);
+      break;
+    }
+  }
+  /* not reached: the loop only exits via the return above */
+  return 0;
+}
+
+
+/* Start the periodic tick service: create the shutdown event and spawn
+ * the timer thread, which calls handle_tick every 'ms' milliseconds.
+ * Returns non-zero on success, 0 on failure.
+ */
+int
+startTicker(nat ms, TickProc handle_tick)
+{
+  unsigned threadId;
+  /* 'hStopEvent' is a manual-reset event that's signalled upon
+   * shutdown of timer service (=> timer thread.)
+   */
+  hStopEvent = CreateEvent ( NULL,
+			     TRUE,
+			     FALSE,
+			     NULL);
+  /* CreateEvent() reports failure with NULL, not INVALID_HANDLE_VALUE,
+   * so the old '== INVALID_HANDLE_VALUE' test could never fire.  Also
+   * restore the sentinel so stopTicker() won't later SetEvent() a NULL
+   * handle.
+   */
+  if (hStopEvent == NULL) {
+    hStopEvent = INVALID_HANDLE_VALUE;
+    return 0;
+  }
+  tickProc = handle_tick;
+  tickThread = (HANDLE)(long)_beginthreadex( NULL,
+			       0,
+			       TimerProc,
+			       (LPVOID)ms,
+			       0,
+			       &threadId);
+  return (tickThread != 0);
+}
+
+/* Stop the tick service, waiting for the timer thread to exit.
+ * Returns 0 on success, 1 if the thread's exit code couldn't be read.
+ */
+int
+stopTicker(void)
+{
+    // We must wait for the ticker thread to terminate, since if we
+    // are in a DLL that is about to be unloaded, the ticker thread
+    // cannot be allowed to return to a missing DLL.
+
+    if (hStopEvent != INVALID_HANDLE_VALUE &&
+	tickThread != INVALID_HANDLE_VALUE) {
+	DWORD exitCode;
+	SetEvent(hStopEvent);
+	while (1) {
+	    /* Give the thread 20ms to notice the event and exit... */
+	    WaitForSingleObject(tickThread, 20);
+	    if (!GetExitCodeThread(tickThread, &exitCode)) {
+		return 1;
+	    }
+	    if (exitCode != STILL_ACTIVE) {
+		tickThread = INVALID_HANDLE_VALUE;
+		return 0;
+	    }
+	    /* ...otherwise kill it outright and re-check.
+	     * NOTE(review): TerminateThread is a blunt instrument (no
+	     * cleanup runs in the victim) -- presumably acceptable here
+	     * because we're about to unload anyway; confirm. */
+	    TerminateThread(tickThread, 0);
+	}
+    }
+    return 0;
+}
diff --git a/rts/win32/WorkQueue.c b/rts/win32/WorkQueue.c
new file mode 100644
index 0000000000..85a23608be
--- /dev/null
+++ b/rts/win32/WorkQueue.c
@@ -0,0 +1,215 @@
+/*
+ * A fixed-size queue; MT-friendly.
+ *
+ * (c) sof, 2002-2003.
+ */
+#include "WorkQueue.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static void queue_error_rc( char* loc, DWORD err);
+static void queue_error( char* loc, char* reason);
+
+
+/* Wrapper around the OS call to create a (counting) semaphore.
+ * Returns NULL -- after logging the Win32 error -- on failure.
+ */
+static Semaphore
+newSemaphore(int initCount, int max)
+{
+  Semaphore s = CreateSemaphore(NULL,      /* default security attributes */
+				initCount, /* LONG lInitialCount */
+				max,       /* LONG lMaxCount */
+				NULL);     /* anonymous / no object name */
+  if (s == NULL) {
+    queue_error_rc("newSemaphore", GetLastError());
+  }
+  return s;
+}
+
+/*
+ * Function: NewWorkQueue
+ *
+ * The queue constructor: allocates the queue and initialises both
+ * semaphores to match the fixed queue capacity.  Returns NULL on any
+ * failure, with partial state torn down again.
+ */
+WorkQueue*
+NewWorkQueue()
+{
+  WorkQueue* q = (WorkQueue*)malloc(sizeof(WorkQueue));
+
+  if (!q) {
+    queue_error("NewWorkQueue", "malloc() failed");
+    return q;
+  }
+
+  q->head = 0;
+  q->tail = 0;
+  InitializeCriticalSection(&q->queueLock);
+  q->workAvailable = newSemaphore(0, WORKQUEUE_SIZE);
+  q->roomAvailable = newSemaphore(WORKQUEUE_SIZE, WORKQUEUE_SIZE);
+
+  if (q->workAvailable && q->roomAvailable) {
+    return q;
+  }
+
+  /* One of the sync objects failed to materialise; tear everything down
+   * (FreeWorkQueue only closes the handles that were created). */
+  FreeWorkQueue(q);
+  return NULL;
+}
+
+/* Dispose of a queue and its semaphores; any threads blocked waiting
+ * on either semaphore will as a result be woken up.
+ * Tolerates partially-constructed queues (NULL semaphore handles).
+ */
+void
+FreeWorkQueue ( WorkQueue* pq )
+{
+  if ( pq->workAvailable ) {
+    CloseHandle(pq->workAvailable);
+  }
+  if ( pq->roomAvailable ) {
+    /* Was CloseHandle(pq->workAvailable) -- a copy/paste slip that
+     * closed workAvailable twice and leaked roomAvailable. */
+    CloseHandle(pq->roomAvailable);
+  }
+  free(pq);
+  return;
+}
+
+/* Expose the 'work available' semaphore handle so callers can wait on
+ * it directly (e.g. alongside other handles); NULL if pq is NULL.
+ */
+HANDLE
+GetWorkQueueHandle ( WorkQueue* pq )
+{
+  return pq ? pq->workAvailable : NULL;
+}
+
+/*
+ * Function: GetWork
+ *
+ * Fetch a work item from the queue, blocking if none available.
+ * A return value of FALSE indicates an error / fatal condition.
+ */
+BOOL
+GetWork ( WorkQueue* pq, void** ppw )
+{
+  DWORD rc;
+
+  if (!pq) {
+    queue_error("GetWork", "NULL WorkQueue object");
+    return FALSE;
+  }
+  if (!ppw) {
+    queue_error("GetWork", "NULL WorkItem object");
+    return FALSE;
+  }
+
+  /* Block waiting for work item to become available */
+  if ( (rc = WaitForSingleObject( pq->workAvailable, INFINITE)) != WAIT_OBJECT_0 ) {
+    queue_error_rc("GetWork.WaitForSingleObject(workAvailable)",
+		   ( (WAIT_FAILED == rc) ? GetLastError() : rc));
+    return FALSE;
+  }
+
+  /* The semaphore unit we just consumed entitles us to one dequeue. */
+  return FetchWork(pq,ppw);
+}
+
+/*
+ * Function: FetchWork
+ *
+ * Dequeue the item at the head of the queue.  Unlike GetWork(), this
+ * does NOT block waiting for work: the caller must already have
+ * consumed a unit of 'workAvailable' (cf. GetWork()).
+ * A return value of FALSE indicates an error / fatal condition.
+ */
+BOOL
+FetchWork ( WorkQueue* pq, void** ppw )
+{
+  DWORD rc;
+
+  if (!pq) {
+    queue_error("FetchWork", "NULL WorkQueue object");
+    return FALSE;
+  }
+  if (!ppw) {
+    queue_error("FetchWork", "NULL WorkItem object");
+    return FALSE;
+  }
+
+  EnterCriticalSection(&pq->queueLock);
+  *ppw = pq->items[pq->head];
+  /* For sanity's sake, zero out the pointer. */
+  pq->items[pq->head] = NULL;
+  pq->head = (pq->head + 1) % WORKQUEUE_SIZE;
+  /* A slot has been vacated; let one blocked producer through. */
+  rc = ReleaseSemaphore(pq->roomAvailable,1, NULL);
+  LeaveCriticalSection(&pq->queueLock);
+  if ( 0 == rc ) {
+    queue_error_rc("FetchWork.ReleaseSemaphore()", GetLastError());
+    return FALSE;
+  }
+
+  return TRUE;
+}
+
+/*
+ * Function: SubmitWork
+ *
+ * Add a work item to the queue, blocking if no room available.
+ * A return value of FALSE indicates an error / fatal condition.
+ */
+BOOL
+SubmitWork ( WorkQueue* pq, void* pw )
+{
+  DWORD rc;
+
+  if (!pq) {
+    queue_error("SubmitWork", "NULL WorkQueue object");
+    return FALSE;
+  }
+  if (!pw) {
+    queue_error("SubmitWork", "NULL WorkItem object");
+    return FALSE;
+  }
+
+  /* Block until there is room in the queue */
+  if ( (rc = WaitForSingleObject( pq->roomAvailable, INFINITE)) != WAIT_OBJECT_0 ) {
+    /* The error message previously said "workAvailable", but it is
+     * roomAvailable being waited on here. */
+    queue_error_rc("SubmitWork.WaitForSingleObject(roomAvailable)",
+		   ( (WAIT_FAILED == rc) ? GetLastError() : rc));
+
+    return FALSE;
+  }
+
+  EnterCriticalSection(&pq->queueLock);
+  pq->items[pq->tail] = pw;
+  pq->tail = (pq->tail + 1) % WORKQUEUE_SIZE;
+  /* Wake one consumer blocked on GetWork(). */
+  rc = ReleaseSemaphore(pq->workAvailable,1, NULL);
+  LeaveCriticalSection(&pq->queueLock);
+  if ( 0 == rc ) {
+    queue_error_rc("SubmitWork.ReleaseSemaphore()", GetLastError());
+    return FALSE;
+  }
+
+  return TRUE;
+}
+
+/* Error handling */
+
+static void
+queue_error_rc( char* loc,
+ DWORD err)
+{
+ fprintf(stderr, "%s failed: return code = 0x%lx\n", loc, err);
+ fflush(stderr);
+ return;
+}
+
+
+/* Report a failure with a human-readable reason string. */
+static void
+queue_error( char* loc, char* reason)
+{
+  fprintf(stderr, "%s failed: %s\n", loc, reason);
+  fflush(stderr);
+}
+
diff --git a/rts/win32/WorkQueue.h b/rts/win32/WorkQueue.h
new file mode 100644
index 0000000000..bde82a3a77
--- /dev/null
+++ b/rts/win32/WorkQueue.h
@@ -0,0 +1,37 @@
+/* WorkQueue.h
+ *
+ * A fixed-size queue; MT-friendly.
+ *
+ * (c) sof, 2002-2003
+ *
+ */
+#ifndef __WORKQUEUE_H__
+#define __WORKQUEUE_H__
+#include <windows.h>
+
+/* This is a fixed-size queue. */
+#define WORKQUEUE_SIZE 16
+
+typedef HANDLE Semaphore;
+typedef CRITICAL_SECTION CritSection;
+
+typedef struct WorkQueue {
+    /* the master lock, need to be grabbed prior to
+       using any of the other elements of the struct. */
+  CritSection   queueLock;
+    /* consumers/workers block waiting for 'workAvailable' */
+  Semaphore     workAvailable;
+    /* producers block on 'roomAvailable' when the queue is full */
+  Semaphore     roomAvailable;
+  int           head;
+  int           tail;
+    /* Fixed-size ring buffer of queued items.  The element type was
+     * 'void**', but the queue stores and returns plain 'void*' payloads
+     * (cf. SubmitWork()/FetchWork() in WorkQueue.c), so 'void*' is the
+     * correct element type.  Same size/layout either way. */
+  void*         items[WORKQUEUE_SIZE];
+} WorkQueue;
+
+extern WorkQueue* NewWorkQueue ( void );
+extern void FreeWorkQueue ( WorkQueue* pq );
+extern HANDLE GetWorkQueueHandle ( WorkQueue* pq );
+extern BOOL GetWork ( WorkQueue* pq, void** ppw );
+extern BOOL FetchWork ( WorkQueue* pq, void** ppw );
+extern int SubmitWork ( WorkQueue* pq, void* pw );
+
+#endif /* __WORKQUEUE_H__ */