gc7.0alpha2 tarball import gc7_0alpha2

author: Ivan Maidanski <ivmai@mail.ru> 2011-07-26 20:18:03 +0400
committer: Ivan Maidanski <ivmai@mail.ru> 2011-07-26 20:18:03 +0400
commit: 64040040407b11d8740516fad2438109e7f22c02 (patch)
tree: 6d18a075ceb6f63855730afe630e50b0afa7ed32
parent: e35a4171fe47dfbf847e08988ea6cec4dfc8d124 (diff)
download: bdwgc-64040040407b11d8740516fad2438109e7f22c02.tar.gz
46 files changed, 1463 insertions, 1563 deletions
diff --git a/AmigaOS.c b/AmigaOS.c
index f4024a79..d1715039 100644
--- a/AmigaOS.c
+++ b/AmigaOS.c
@@ -40,7 +40,7 @@
    Find the base of the stack.
 ******************************************************************/
 
-ptr_t GC_get_stack_base()
+ptr_t GC_get_main_stack_base()
 {
     struct Process *proc = (struct Process*)SysBase->ThisTask;
  
diff --git a/Makefile b/Makefile
index 83d4d60e..faa9008f 100644
--- a/Makefile
+++ b/Makefile
@@ -31,10 +31,10 @@ srcdir= .
 VPATH= $(srcdir)
 
 # Atomic_ops installation directory.  If this doesn't exist, we create
-# it from the included atomic_ops distribution.
-AO_VERSION=0.6
-AO_SRC_DIR=$(srcdir)/atomic_ops-$(AO_VERSION)
-AO_INSTALL_DIR=$(AO_SRC_DIR)/installed
+# it from the included libatomic_ops distribution.
+AO_VERSION=1.0
+AO_SRC_DIR=$(srcdir)/libatomic_ops-$(AO_VERSION)
+AO_INSTALL_DIR=$(srcdir)/libatomic_ops-install
 
 CFLAGS= -O -I$(srcdir)/include -I$(AO_INSTALL_DIR)/include -DATOMIC_UNCOLLECTABLE -DNO_EXECUTE_PERMISSION -DALL_INTERIOR_POINTERS
 
@@ -42,7 +42,8 @@ CFLAGS= -O -I$(srcdir)/include -I$(AO_INSTALL_DIR)/include -DATOMIC_UNCOLLECTABL
 # -DGC_LINUX_THREADS -DPARALLEL_MARK -DTHREAD_LOCAL_ALLOC
 # To build the parallel collector in a static library on HP/UX,
 # add to the above:
-# -DGC_HPUX_THREADS -DPARALLEL_MARK -DTHREAD_LOCAL_ALLOC -D_POSIX_C_SOURCE=199506L
+# -DGC_HPUX_THREADS -DTHREAD_LOCAL_ALLOC -D_POSIX_C_SOURCE=199506L -mt
+# FIXME: PARALLEL_MARK currently broken on HP/UX.
 # To build the thread-safe collector on Tru64, add to the above:
 # -pthread -DGC_OSF1_THREADS
 
@@ -236,11 +237,12 @@ HOSTCFLAGS=$(CFLAGS)
 #   causes the collector some system and pthread calls in a more transparent
 #   fashion than the usual macro-based approach.  Requires GNU ld, and
 #   currently probably works only with Linux.
-# -DTHREAD_LOCAL_ALLOC defines GC_local_malloc(), GC_local_malloc_atomic()
-#   and GC_local_gcj_malloc().  Needed for gc_gcj.h interface.  These allocate
-#   in a way that usually does not involve acquisition of a global lock.
-#   Currently requires -DGC_LINUX_THREADS, but should be easy to port to
-#   other pthreads environments.  Recommended for multiprocessors.
+# -DTHREAD_LOCAL_ALLOC defines GC_malloc(), GC_malloc_atomic()
+#   and GC_gcj_malloc() to use a per-thread set of free-lists.
+#   These then allocate  in a way that usually does not involve
+#   acquisition of a global lock.  Currently supported only on platforms
+#   such as Linux that use pthread_support.c.  Recommended for multiprocessors.
+#   Requires explicit GC_INIT() call.
 # -DUSE_COMPILER_TLS causes thread local allocation to use compiler-supported
 #   "__thread" thread-local variables.  This is the default in HP/UX.  It
 #   may help performance on recent Linux installations.  (It failed for
@@ -304,7 +306,7 @@ SRCS= $(CSRCS) mips_sgi_mach_dep.s rs6000_mach_dep.s alpha_mach_dep.S \
     sparc_mach_dep.S include/gc.h include/gc_typed.h include/gc_tiny_fl.h \
     include/private/gc_hdrs.h include/private/gc_priv.h \
     include/private/gcconfig.h include/private/gc_pmark.h \
-    include/gc_inl.h include/gc_inline.h include/gc_mark.h \
+    include/gc_inline.h include/gc_mark.h \
     threadlibs.c if_mach.c if_not_there.c gc_cpp.cc include/gc_cpp.h \
     gcname.c include/weakpointer.h include/private/gc_locks.h \
     mips_ultrix_mach_dep.s \
@@ -312,7 +314,7 @@ SRCS= $(CSRCS) mips_sgi_mach_dep.s rs6000_mach_dep.s alpha_mach_dep.S \
     include/javaxfc.h sparc_sunos4_mach_dep.s sparc_netbsd_mach_dep.s \
     include/gc_backptr.h \
     hpux_test_and_clear.s include/gc_gcj.h \
-    include/gc_local_alloc.h include/private/dbg_mlc.h \
+    include/private/dbg_mlc.h \
     include/private/specific.h powerpc_darwin_mach_dep.s \
     include/leak_detector.h include/gc_amiga_redirects.h \
     include/gc_pthread_redirects.h ia64_save_regs_in_stack.s \
@@ -335,25 +337,27 @@ DOC_FILES= README.QUICK doc/README.Mac doc/README.MacOSX doc/README.OS2 \
 TESTS= tests/test.c tests/test_cpp.cc tests/trace_test.c \
 	tests/leak_test.c tests/thread_leak_test.c tests/middle.c
 
-GNU_BUILD_FILES= configure.in Makefile.am configure acinclude.m4 \
+GNU_BUILD_FILES= configure.ac Makefile.am configure acinclude.m4 \
 		 libtool.m4 install-sh configure.host Makefile.in \
 		 aclocal.m4 config.sub config.guess \
 		 include/Makefile.am include/Makefile.in \
 		 doc/Makefile.am doc/Makefile.in \
-		 ltmain.sh mkinstalldirs depcomp missing
+		 ltmain.sh mkinstalldirs depcomp missing \
+		 cord/Makefile.am tests/Makefile.am
 
 OTHER_MAKEFILES= OS2_MAKEFILE NT_MAKEFILE NT_THREADS_MAKEFILE gc.mak \
 		 BCC_MAKEFILE EMX_MAKEFILE WCC_MAKEFILE Makefile.dj \
 		 PCR-Makefile SMakefile.amiga Makefile.DLLs \
-		 digimars.mak Makefile.direct NT_STATIC_THREADS_MAKEFILE
+		 digimars.mak Makefile.direct NT_STATIC_THREADS_MAKEFILE \
+		 configure_atomic_ops.sh
 #	Makefile and Makefile.direct are copies of each other.
 
-OTHER_FILES= Makefile setjmp_t.c callprocs pc_excludes \
+OTHER_FILES= Makefile setjmp_t.c callprocs \
            MacProjects.sit.hqx MacOS.c \
            Mac_files/datastart.c Mac_files/dataend.c \
            Mac_files/MacOS_config.h Mac_files/MacOS_Test_config.h \
            add_gc_prefix.c gc_cpp.cpp \
-	   version.h AmigaOS.c atomic_ops-0.6.tar.gz \
+	   version.h AmigaOS.c \
 	   $(TESTS) $(GNU_BUILD_FILES) $(OTHER_MAKEFILES)
 
 CORD_INCLUDE_FILES= $(srcdir)/include/gc.h $(srcdir)/include/cord.h \
@@ -379,8 +383,9 @@ all: gc.a gctest
 
 # if AO_INSTALL_DIR doesn't exist, we assume that it is pointing to
 # the default location, and we need to build
-$(AO_INSTALL_DIR):
-	tar xvfz $(AO_SRC_DIR).tar.gz; cd $(AO_SRC_DIR); make CC=$(CC) install
+$(AO_INSTALL_DIR): 
+	CC=$(CC) $(srcdir)/configure_atomic_ops.sh
+	cd $(AO_SRC_DIR); make CC=$(CC) install
 
 LEAKFLAGS=$(CFLAGS) -DFIND_LEAK
 
@@ -416,9 +421,11 @@ $(OBJS) tests/test.o dyn_load.o dyn_load_sunos53.o: \
 # options affects the size of GC_arrays,
 # invalidating all .o files that rely on gc_priv.h
 
-mark.o typd_mlc.o finalize.o ptr_chck.o: $(srcdir)/include/gc_mark.h $(srcdir)/include/private/gc_pmark.h
+mark.o typd_mlc.o finalize.o ptr_chck.o: $(srcdir)/include/gc_mark.h \
+					 $(srcdir)/include/private/gc_pmark.h
 
-specific.o pthread_support.o: $(srcdir)/include/private/specific.h
+specific.o pthread_support.o: $(srcdir)/include/private/specific.h \
+			      $(srcdir)/include/gc_inline.h
 
 dbg_mlc.o gcj_mlc.o: $(srcdir)/include/private/dbg_mlc.h
 
@@ -617,35 +624,27 @@ add_gc_prefix: $(srcdir)/add_gc_prefix.c $(srcdir)/version.h
 gcname: $(srcdir)/gcname.c $(srcdir)/version.h
 	$(CC) -o gcname $(srcdir)/gcname.c
 
-gc.tar: $(SRCS) $(DOC_FILES) $(OTHER_FILES) add_gc_prefix gcname
+#We assume this is being done from source directory.
+dist gc.tar: $(SRCS) $(DOC_FILES) $(OTHER_FILES) add_gc_prefix gcname
 	cp Makefile Makefile.old
 	cp Makefile.direct Makefile
+	CC=$(CC) ./configure_atomic_ops.sh
+	cd $(AO_SRC_DIR); make dist
+	if test $(srcdir)/libatomic_ops-$(AO_VERSION) = $(AO_SRC_DIR); \
+	then \
+	  mv $(AO_SRC_DIR) $(AO_SRC_DIR).bak ; \
+	  tar xvfz $(AO_SRC_DIR).bak/libatomic_ops-$(AO_VERSION).tar.gz ; \
+	else \
+	  tar xvfz $(AO_SRC_DIR)/libatomic_ops-$(AO_VERSION).tar.gz ; \
+	fi
 	rm -f `./gcname`
 	ln -s . `./gcname`
-	./add_gc_prefix $(SRCS) $(DOC_FILES) $(OTHER_FILES) > /tmp/gc.tar-files
+	./add_gc_prefix $(SRCS) $(DOC_FILES) $(OTHER_FILES) libatomic_ops-$(AO_VERSION) > /tmp/gc.tar-files
 	tar cvfh gc.tar `cat /tmp/gc.tar-files`
 	cp gc.tar `./gcname`.tar
 	gzip `./gcname`.tar
 	rm `./gcname`
 
-pc_gc.tar: $(SRCS) $(OTHER_FILES)
-	tar cvfX pc_gc.tar pc_excludes $(SRCS) $(OTHER_FILES)
-
-floppy: pc_gc.tar
-	-mmd a:/cord
-	-mmd a:/cord/private
-	-mmd a:/include
-	-mmd a:/include/private
-	mkdir /tmp/pc_gc
-	cat pc_gc.tar | (cd /tmp/pc_gc; tar xvf -)
-	-mcopy -tmn /tmp/pc_gc/* a:
-	-mcopy -tmn /tmp/pc_gc/cord/* a:/cord
-	-mcopy -mn /tmp/pc_gc/cord/de_win.ICO a:/cord
-	-mcopy -tmn /tmp/pc_gc/cord/private/* a:/cord/private
-	-mcopy -tmn /tmp/pc_gc/include/* a:/include
-	-mcopy -tmn /tmp/pc_gc/include/private/* a:/include/private
-	rm -r /tmp/pc_gc
-
 gc.tar.Z: gc.tar
 	compress gc.tar
 
diff --git a/Makefile.am b/Makefile.am
index 6389d0b1..c1871fb4 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -18,9 +18,10 @@
 ## FIXME: `make distcheck' in this directory will not currently work.
 ##     This is most likely to the explicit flags passed to submakes.
 
+## FIXME: pass these options via the AM_INIT_AUTOMAKE(options) macro call in configure.ac instead.
 AUTOMAKE_OPTIONS = foreign
 
-SUBDIRS = doc include
+SUBDIRS = include cord . tests doc  
 
 EXTRA_DIST = 
     ## more items will be succesively added below
@@ -30,12 +31,13 @@ extra = libgccpp.la
 else
 extra = 
 endif
+
 lib_LTLIBRARIES = libgc.la $(extra) 
 
 include_HEADERS = include/gc.h include/gc_local_alloc.h \
-include/gc_pthread_redirects.h include/gc_config_macros.h \
-include/leak_detector.h include/gc_typed.h include/gc_tiny_fl.h \
-@addincludes@
+	include/gc_pthread_redirects.h include/gc_config_macros.h \
+	include/leak_detector.h include/gc_typed.h include/gc_tiny_fl.h \
+	@addincludes@
 
 EXTRA_HEADERS = include/gc_cpp.h include/gc_allocator.h
 
@@ -54,9 +56,9 @@ backgraph.c win32_threads.c \
 pthread_support.c pthread_stop_world.c darwin_stop_world.c \
 $(asm_libgc_sources)
 
-# Include THREADLIBS here to ensure that the correct versions of
+# Include THREADDLLIBS here to ensure that the correct versions of
 # linuxthread semaphore functions get linked:
-libgc_la_LIBADD = @addobjs@ $(THREADLIBS) $(UNWINDLIBS)
+libgc_la_LIBADD = @addobjs@ $(THREADDLLIBS) $(UNWINDLIBS)
 libgc_la_DEPENDENCIES = @addobjs@
 libgc_la_LDFLAGS = -version-info 1:2:0
 
@@ -65,40 +67,20 @@ EXTRA_libgc_la_SOURCES = alpha_mach_dep.S \
     rs6000_mach_dep.s sparc_mach_dep.S sparc_netbsd_mach_dep.s \
     sparc_sunos4_mach_dep.s ia64_save_regs_in_stack.s
 
+if CPLUSPLUS
 libgccpp_la_SOURCES = gc_cpp.cc
-libgccpp_la_LIBADD = $(THREADLIBS) $(UNWINDLIBS)
+libgccpp_la_LIBADD = $(THREADDLLIBS) $(UNWINDLIBS)
 libgccpp_la_LDFLAGS = -version-info 1:2:0
-
-EXTRA_DIST += alpha_mach_dep.S mips_sgi_mach_dep.s sparc_mach_dep.S
+endif
+## FIXME: these files are added to `make dist' automatically when listed in EXTRA_*_la_SOURCES
+#EXTRA_DIST += alpha_mach_dep.S mips_sgi_mach_dep.s sparc_mach_dep.S
 
 AM_CXXFLAGS = @GC_CFLAGS@
 AM_CFLAGS = @GC_CFLAGS@
 
-if CPLUSPLUS
-extra_checks = test_cpp
-else
-extra_checks = 
-endif
-
-check_PROGRAMS = gctest $(extra_checks)
-
-test.o:	$(srcdir)/tests/test.c
-	$(COMPILE) -c $(srcdir)/tests/test.c
-# 	Using $< in the above seems to fail with the HP/UX on Itanium make.
-test_cpp.o:	$(srcdir)/tests/test_cpp.cc
-	$(CXXCOMPILE) -c $(srcdir)/tests/test_cpp.cc
-
-## FIXME: this is probably the reason why some files from BUILT_SOURCES
-##     are included in the distribution
-# gctest_OBJECTS = test.o
-gctest_SOURCES = tests/test.c
-gctest_LDADD = ./libgc.la $(THREADLIBS) $(UNWINDLIBS) $(EXTRA_TEST_LIBS)
-test_cpp_SOURCES = tests/test_cpp.cc
-test_cpp_LDADD = ./libgc.la ./libgccpp.la $(THREADLIBS) $(UNWINDLIBS) $(EXTRA_TEST_LIBS)
-
-TESTS = gctest $(extra_checks)
 
 ## FIXME: relies on internal code generated by automake.
+## FIXME: ./configure --enable-dependency-tracking should be used 
 all_objs = @addobjs@ $(libgc_la_OBJECTS)
 $(all_objs) : include/private/gcconfig.h include/private/gc_priv.h \
 include/private/gc_hdrs.h include/gc.h include/gc_gcj.h \
@@ -158,11 +140,6 @@ EXTRA_DIST += add_gc_prefix.c gcname.c if_mach.c if_not_there.c \
 #
 EXTRA_DIST += gc_cpp.cc gc_cpp.cpp
 
-# tests not used by Makefile.am (:FIXME: why?)
-#
-EXTRA_DIST += tests/test_cpp.cc tests/trace_test.c \
-    tests/leak_test.c tests/thread_leak_test.c
-
 # cord package
 #
 EXTRA_DIST += cord/cordbscs.c cord/cordtest.c cord/de.c cord/de_win.c \
diff --git a/Makefile.direct b/Makefile.direct
index 83d4d60e..faa9008f 100644
--- a/Makefile.direct
+++ b/Makefile.direct
@@ -31,10 +31,10 @@ srcdir= .
 VPATH= $(srcdir)
 
 # Atomic_ops installation directory.  If this doesn't exist, we create
-# it from the included atomic_ops distribution.
-AO_VERSION=0.6
-AO_SRC_DIR=$(srcdir)/atomic_ops-$(AO_VERSION)
-AO_INSTALL_DIR=$(AO_SRC_DIR)/installed
+# it from the included libatomic_ops distribution.
+AO_VERSION=1.0
+AO_SRC_DIR=$(srcdir)/libatomic_ops-$(AO_VERSION)
+AO_INSTALL_DIR=$(srcdir)/libatomic_ops-install
 
 CFLAGS= -O -I$(srcdir)/include -I$(AO_INSTALL_DIR)/include -DATOMIC_UNCOLLECTABLE -DNO_EXECUTE_PERMISSION -DALL_INTERIOR_POINTERS
 
@@ -42,7 +42,8 @@ CFLAGS= -O -I$(srcdir)/include -I$(AO_INSTALL_DIR)/include -DATOMIC_UNCOLLECTABL
 # -DGC_LINUX_THREADS -DPARALLEL_MARK -DTHREAD_LOCAL_ALLOC
 # To build the parallel collector in a static library on HP/UX,
 # add to the above:
-# -DGC_HPUX_THREADS -DPARALLEL_MARK -DTHREAD_LOCAL_ALLOC -D_POSIX_C_SOURCE=199506L
+# -DGC_HPUX_THREADS -DTHREAD_LOCAL_ALLOC -D_POSIX_C_SOURCE=199506L -mt
+# FIXME: PARALLEL_MARK currently broken on HP/UX.
 # To build the thread-safe collector on Tru64, add to the above:
 # -pthread -DGC_OSF1_THREADS
 
@@ -236,11 +237,12 @@ HOSTCFLAGS=$(CFLAGS)
 #   causes the collector some system and pthread calls in a more transparent
 #   fashion than the usual macro-based approach.  Requires GNU ld, and
 #   currently probably works only with Linux.
-# -DTHREAD_LOCAL_ALLOC defines GC_local_malloc(), GC_local_malloc_atomic()
-#   and GC_local_gcj_malloc().  Needed for gc_gcj.h interface.  These allocate
-#   in a way that usually does not involve acquisition of a global lock.
-#   Currently requires -DGC_LINUX_THREADS, but should be easy to port to
-#   other pthreads environments.  Recommended for multiprocessors.
+# -DTHREAD_LOCAL_ALLOC defines GC_malloc(), GC_malloc_atomic()
+#   and GC_gcj_malloc() to use a per-thread set of free-lists.
+#   These then allocate  in a way that usually does not involve
+#   acquisition of a global lock.  Currently supported only on platforms
+#   such as Linux that use pthread_support.c.  Recommended for multiprocessors.
+#   Requires explicit GC_INIT() call.
 # -DUSE_COMPILER_TLS causes thread local allocation to use compiler-supported
 #   "__thread" thread-local variables.  This is the default in HP/UX.  It
 #   may help performance on recent Linux installations.  (It failed for
@@ -304,7 +306,7 @@ SRCS= $(CSRCS) mips_sgi_mach_dep.s rs6000_mach_dep.s alpha_mach_dep.S \
     sparc_mach_dep.S include/gc.h include/gc_typed.h include/gc_tiny_fl.h \
     include/private/gc_hdrs.h include/private/gc_priv.h \
     include/private/gcconfig.h include/private/gc_pmark.h \
-    include/gc_inl.h include/gc_inline.h include/gc_mark.h \
+    include/gc_inline.h include/gc_mark.h \
     threadlibs.c if_mach.c if_not_there.c gc_cpp.cc include/gc_cpp.h \
     gcname.c include/weakpointer.h include/private/gc_locks.h \
     mips_ultrix_mach_dep.s \
@@ -312,7 +314,7 @@ SRCS= $(CSRCS) mips_sgi_mach_dep.s rs6000_mach_dep.s alpha_mach_dep.S \
     include/javaxfc.h sparc_sunos4_mach_dep.s sparc_netbsd_mach_dep.s \
     include/gc_backptr.h \
     hpux_test_and_clear.s include/gc_gcj.h \
-    include/gc_local_alloc.h include/private/dbg_mlc.h \
+    include/private/dbg_mlc.h \
     include/private/specific.h powerpc_darwin_mach_dep.s \
     include/leak_detector.h include/gc_amiga_redirects.h \
     include/gc_pthread_redirects.h ia64_save_regs_in_stack.s \
@@ -335,25 +337,27 @@ DOC_FILES= README.QUICK doc/README.Mac doc/README.MacOSX doc/README.OS2 \
 TESTS= tests/test.c tests/test_cpp.cc tests/trace_test.c \
 	tests/leak_test.c tests/thread_leak_test.c tests/middle.c
 
-GNU_BUILD_FILES= configure.in Makefile.am configure acinclude.m4 \
+GNU_BUILD_FILES= configure.ac Makefile.am configure acinclude.m4 \
 		 libtool.m4 install-sh configure.host Makefile.in \
 		 aclocal.m4 config.sub config.guess \
 		 include/Makefile.am include/Makefile.in \
 		 doc/Makefile.am doc/Makefile.in \
-		 ltmain.sh mkinstalldirs depcomp missing
+		 ltmain.sh mkinstalldirs depcomp missing \
+		 cord/Makefile.am tests/Makefile.am
 
 OTHER_MAKEFILES= OS2_MAKEFILE NT_MAKEFILE NT_THREADS_MAKEFILE gc.mak \
 		 BCC_MAKEFILE EMX_MAKEFILE WCC_MAKEFILE Makefile.dj \
 		 PCR-Makefile SMakefile.amiga Makefile.DLLs \
-		 digimars.mak Makefile.direct NT_STATIC_THREADS_MAKEFILE
+		 digimars.mak Makefile.direct NT_STATIC_THREADS_MAKEFILE \
+		 configure_atomic_ops.sh
 #	Makefile and Makefile.direct are copies of each other.
 
-OTHER_FILES= Makefile setjmp_t.c callprocs pc_excludes \
+OTHER_FILES= Makefile setjmp_t.c callprocs \
            MacProjects.sit.hqx MacOS.c \
            Mac_files/datastart.c Mac_files/dataend.c \
            Mac_files/MacOS_config.h Mac_files/MacOS_Test_config.h \
            add_gc_prefix.c gc_cpp.cpp \
-	   version.h AmigaOS.c atomic_ops-0.6.tar.gz \
+	   version.h AmigaOS.c \
 	   $(TESTS) $(GNU_BUILD_FILES) $(OTHER_MAKEFILES)
 
 CORD_INCLUDE_FILES= $(srcdir)/include/gc.h $(srcdir)/include/cord.h \
@@ -379,8 +383,9 @@ all: gc.a gctest
 
 # if AO_INSTALL_DIR doesn't exist, we assume that it is pointing to
 # the default location, and we need to build
-$(AO_INSTALL_DIR):
-	tar xvfz $(AO_SRC_DIR).tar.gz; cd $(AO_SRC_DIR); make CC=$(CC) install
+$(AO_INSTALL_DIR): 
+	CC=$(CC) $(srcdir)/configure_atomic_ops.sh
+	cd $(AO_SRC_DIR); make CC=$(CC) install
 
 LEAKFLAGS=$(CFLAGS) -DFIND_LEAK
 
@@ -416,9 +421,11 @@ $(OBJS) tests/test.o dyn_load.o dyn_load_sunos53.o: \
 # options affects the size of GC_arrays,
 # invalidating all .o files that rely on gc_priv.h
 
-mark.o typd_mlc.o finalize.o ptr_chck.o: $(srcdir)/include/gc_mark.h $(srcdir)/include/private/gc_pmark.h
+mark.o typd_mlc.o finalize.o ptr_chck.o: $(srcdir)/include/gc_mark.h \
+					 $(srcdir)/include/private/gc_pmark.h
 
-specific.o pthread_support.o: $(srcdir)/include/private/specific.h
+specific.o pthread_support.o: $(srcdir)/include/private/specific.h \
+			      $(srcdir)/include/gc_inline.h
 
 dbg_mlc.o gcj_mlc.o: $(srcdir)/include/private/dbg_mlc.h
 
@@ -617,35 +624,27 @@ add_gc_prefix: $(srcdir)/add_gc_prefix.c $(srcdir)/version.h
 gcname: $(srcdir)/gcname.c $(srcdir)/version.h
 	$(CC) -o gcname $(srcdir)/gcname.c
 
-gc.tar: $(SRCS) $(DOC_FILES) $(OTHER_FILES) add_gc_prefix gcname
+#We assume this is being done from source directory.
+dist gc.tar: $(SRCS) $(DOC_FILES) $(OTHER_FILES) add_gc_prefix gcname
 	cp Makefile Makefile.old
 	cp Makefile.direct Makefile
+	CC=$(CC) ./configure_atomic_ops.sh
+	cd $(AO_SRC_DIR); make dist
+	if test $(srcdir)/libatomic_ops-$(AO_VERSION) = $(AO_SRC_DIR); \
+	then \
+	  mv $(AO_SRC_DIR) $(AO_SRC_DIR).bak ; \
+	  tar xvfz $(AO_SRC_DIR).bak/libatomic_ops-$(AO_VERSION).tar.gz ; \
+	else \
+	  tar xvfz $(AO_SRC_DIR)/libatomic_ops-$(AO_VERSION).tar.gz ; \
+	fi
 	rm -f `./gcname`
 	ln -s . `./gcname`
-	./add_gc_prefix $(SRCS) $(DOC_FILES) $(OTHER_FILES) > /tmp/gc.tar-files
+	./add_gc_prefix $(SRCS) $(DOC_FILES) $(OTHER_FILES) libatomic_ops-$(AO_VERSION) > /tmp/gc.tar-files
 	tar cvfh gc.tar `cat /tmp/gc.tar-files`
 	cp gc.tar `./gcname`.tar
 	gzip `./gcname`.tar
 	rm `./gcname`
 
-pc_gc.tar: $(SRCS) $(OTHER_FILES)
-	tar cvfX pc_gc.tar pc_excludes $(SRCS) $(OTHER_FILES)
-
-floppy: pc_gc.tar
-	-mmd a:/cord
-	-mmd a:/cord/private
-	-mmd a:/include
-	-mmd a:/include/private
-	mkdir /tmp/pc_gc
-	cat pc_gc.tar | (cd /tmp/pc_gc; tar xvf -)
-	-mcopy -tmn /tmp/pc_gc/* a:
-	-mcopy -tmn /tmp/pc_gc/cord/* a:/cord
-	-mcopy -mn /tmp/pc_gc/cord/de_win.ICO a:/cord
-	-mcopy -tmn /tmp/pc_gc/cord/private/* a:/cord/private
-	-mcopy -tmn /tmp/pc_gc/include/* a:/include
-	-mcopy -tmn /tmp/pc_gc/include/private/* a:/include/private
-	rm -r /tmp/pc_gc
-
 gc.tar.Z: gc.tar
 	compress gc.tar
 
diff --git a/NT_STATIC_THREADS_MAKEFILE b/NT_STATIC_THREADS_MAKEFILE
index cd951bf3..91fb7f6b 100644
--- a/NT_STATIC_THREADS_MAKEFILE
+++ b/NT_STATIC_THREADS_MAKEFILE
@@ -6,12 +6,20 @@ MY_CPU=X86
 CPU=$(MY_CPU)
 !include <ntwin32.mak>
 
+# Atomic_ops installation directory.  For win32, the source directory
+# should do, since we only need the headers.
+# We assume this was manually unpacked, since I'm not sure there is
+# a Windows standard command line tool to do this.
+AO_VERSION=0.6
+AO_SRC_DIR=$(srcdir)/atomic_ops-$(AO_VERSION)
+AO_INCLUDE_DIR=$(AO_SRC_DIR)
+
 OBJS= alloc.obj reclaim.obj allchblk.obj misc.obj mach_dep.obj os_dep.obj mark_rts.obj headers.obj mark.obj obj_map.obj blacklst.obj finalize.obj new_hblk.obj dbg_mlc.obj malloc.obj stubborn.obj dyn_load.obj typd_mlc.obj ptr_chck.obj gc_cpp.obj mallocx.obj win32_threads.obj
 
 all: gctest.exe cord\de.exe test_cpp.exe
 
 .c.obj:
-	$(cc) $(cdebug) $(cflags) $(cvars) -Iinclude -DALL_INTERIOR_POINTERS -D__STDC__ -DGC_NOT_DLL -DGC_WIN32_THREADS $*.c /Fo$*.obj
+	$(cc) $(cdebug) $(cflags) $(cvars) -Iinclude -I$(AO_INCLUDE_DIR) -DALL_INTERIOR_POINTERS -D__STDC__ -DGC_NOT_DLL -DGC_WIN32_THREADS $*.c /Fo$*.obj
 
 .cpp.obj:
 	$(cc) $(cdebug) $(cflags) $(cvars) -Iinclude -DALL_INTERIOR_POINTERS -DGC_NOT_DLL $*.CPP -DGC_WIN32_THREADS /Fo$*.obj
@@ -56,5 +64,6 @@ test_cpp.cpp: tests\test_cpp.cc
 test_cpp.exe: test_cpp.obj include\gc_cpp.h include\gc.h gc.lib
 	$(link) -debug:full -debugtype:cv $(guiflags) -stack:16384 -out:test_cpp.exe test_cpp.obj gc.lib $(guilibs)
 
-
+AO_SCR_DIR:
+	        tar xvfz $(AO_SRC_DIR).tar.gz;
 
diff --git a/allchblk.c b/allchblk.c
index 7ed647a7..44a7a11e 100644
--- a/allchblk.c
+++ b/allchblk.c
@@ -731,7 +731,6 @@ GC_allochblk_nth(word sz, int kind, unsigned char flags, int n)
 	              struct hblk * h;
 		      struct hblk * prev = hhdr -> hb_prev;
 	              
-		      GC_bytes_wasted += total_size;
 		      GC_large_free_bytes -= total_size;
 		      GC_remove_from_fl(hhdr, n);
 	              for (h = hbp; h < limit; h++) {
diff --git a/alloc.c b/alloc.c
index 834e8a0f..1be45164 100644
--- a/alloc.c
+++ b/alloc.c
@@ -192,10 +192,6 @@ word GC_adj_bytes_allocd(void)
     	/* had been reallocated this round. Finalization is user	*/
     	/* visible progress.  And if we don't count this, we have	*/
     	/* stability problems for programs that finalize all objects.	*/
-    result += GC_bytes_wasted;
-     	/* This doesn't reflect useful work.  But if there is lots of	*/
-     	/* new fragmentation, the same is probably true of the heap,	*/
-     	/* and the collection will be correspondingly cheaper.		*/
     if (result < (signed_word)(GC_bytes_allocd >> 3)) {
     	/* Always count at least 1/8 of the allocations.  We don't want	*/
     	/* to collect too infrequently, since that would inhibit	*/
@@ -459,9 +455,8 @@ GC_bool GC_stopped_mark(GC_stop_func stop_func)
     if (GC_print_stats) {
 	GC_log_printf("--> Marking for collection %lu ",
 		  (unsigned long)GC_gc_no + 1);
-	GC_log_printf("after %lu allocd bytes + %lu wasted bytes\n",
-	   	   (unsigned long) GC_bytes_allocd,
-	   	   (unsigned long) GC_bytes_wasted);
+	GC_log_printf("after %lu allocd bytes\n",
+	   	   (unsigned long) GC_bytes_allocd);
     }
 #   ifdef MAKE_BACK_GRAPH
       if (GC_print_back_height) {
@@ -706,7 +701,6 @@ void GC_finish_collection()
       GC_bytes_allocd_before_gc += GC_bytes_allocd;
       GC_non_gc_bytes_at_gc = GC_non_gc_bytes;
       GC_bytes_allocd = 0;
-      GC_bytes_wasted = 0;
       GC_bytes_freed = 0;
       GC_finalizer_bytes_freed = 0;
       
diff --git a/blacklst.c b/blacklst.c
index 686893d2..9811fb11 100644
--- a/blacklst.c
+++ b/blacklst.c
@@ -275,7 +275,7 @@ static word total_stack_black_listed(void)
     
     for (i = 0; i < GC_n_heap_sects; i++) {
     	struct hblk * start = (struct hblk *) GC_heap_sects[i].hs_start;
-    	word len = (word) GC_heap_sects[i].hs_bytes;
+    	size_t len = (word) GC_heap_sects[i].hs_bytes;
     	struct hblk * endp1 = start + len/HBLKSIZE;
     	
     	total += GC_number_stack_black_listed(start, endp1);
diff --git a/configure.in b/configure.ac
index 798a8359..4ff582d0 100644
--- a/configure.in
+++ b/configure.ac
@@ -17,7 +17,7 @@ dnl Process this file with autoconf to produce configure.
 # Initialization
 # ==============
 
-AC_INIT(gc,7.0alpha1,Hans.Boehm@hp.com) 
+AC_INIT(gc,7.0alpha2,Hans.Boehm@hp.com) 
     ## version must conform to [0-9]+[.][0-9]+(alpha[0-9]+)?
 AC_CONFIG_SRCDIR(gcj_mlc.c)
 AC_CANONICAL_TARGET 
@@ -25,6 +25,7 @@ AC_PREREQ(2.53)
 AC_REVISION($Revision: 1.2 $)
 GC_SET_VERSION
 AM_INIT_AUTOMAKE
+AM_MAINTAINER_MODE
 
 AC_SUBST(PACKAGE)
 AC_SUBST(GC_VERSION)
@@ -33,14 +34,13 @@ AC_PROG_CC
 AC_PROG_CXX
 
 AM_PROG_AS
+## FIXME: really needed? (AC_LIBTOOL already provides this)
 AC_CHECK_TOOL(AR, ar)
 AC_CHECK_TOOL(RANLIB, ranlib, :)  # :)
 
 AC_PROG_INSTALL
 
-AM_MAINTAINER_MODE
-
-. [$]{srcdir}/configure.host
+. ${srcdir}/configure.host
 
 GC_CFLAGS=${gc_cflags}
 AC_SUBST(GC_CFLAGS)
@@ -68,14 +68,15 @@ AC_ARG_ENABLE(cplusplus,
 )
 
 INCLUDES=-I${srcdir}/include
-THREADLIBS=
+THREADDLLIBS=
+## Libraries needed to support dynamic loading and/or threads.
 case "$THREADS" in
  no | none | single)
     THREADS=none
     ;;
  posix | pthreads)
     THREADS=posix
-    THREADLIBS=-lpthread
+    THREADDLLIBS=-lpthread
     case "$host" in
      x86-*-linux* | ia64-*-linux* | i586-*-linux* | i686-*-linux* | x86_64-*-linux* | alpha-*-linux*)
 	AC_DEFINE(GC_LINUX_THREADS)
@@ -101,13 +102,13 @@ case "$THREADS" in
 	  AC_DEFINE(PARALLEL_MARK)
 	fi
 	AC_DEFINE(THREAD_LOCAL_ALLOC)
-	THREADLIBS="-lpthread -lrt"
+	THREADDLLIBS="-lpthread -lrt"
 	;;
      *-*-freebsd*)
 	AC_MSG_WARN("FreeBSD does not yet fully support threads with Boehm GC.")
 	AC_DEFINE(GC_FREEBSD_THREADS)
 	INCLUDES="$INCLUDES -pthread"
-	THREADLIBS=-pthread
+	THREADDLLIBS=-pthread
       	;;
      *-*-solaris*)
 	AC_DEFINE(GC_SOLARIS_THREADS)
@@ -135,7 +136,10 @@ case "$THREADS" in
 	  # Measurements havent yet been done.
 	fi
 	INCLUDES="$INCLUDES -pthread"
-	THREADLIBS="-lpthread -lrt"
+	THREADDLLIBS="-lpthread -lrt"
+	;;
+      *)
+	AC_MSG_ERROR("Pthreads not supported by the GC on this platform.")
 	;;
     esac
     ;;
@@ -146,9 +150,9 @@ case "$THREADS" in
     ;;
  dgux386)
     THREADS=dgux386
-    AC_MSG_RESULT($THREADLIBS)
+    AC_MSG_RESULT($THREADDLLIBS)
     # Use pthread GCC  switch
-    THREADLIBS=-pthread
+    THREADDLLIBS=-pthread
     if test "${enable_parallel_mark}" = yes; then
         AC_DEFINE(PARALLEL_MARK)
     fi
@@ -160,7 +164,7 @@ case "$THREADS" in
     ;;
  aix)
     THREADS=posix
-    THREADLIBS=-lpthread
+    THREADDLLIBS=-lpthread
     AC_DEFINE(GC_AIX_THREADS)
     AC_DEFINE(_REENTRANT)
     ;;
@@ -171,7 +175,7 @@ case "$THREADS" in
     AC_MSG_ERROR($THREADS is an unknown thread package)
     ;;
 esac
-AC_SUBST(THREADLIBS)
+AC_SUBST(THREADDLLIBS)
 
 case "$host" in 
    powerpc-*-darwin*)
@@ -185,7 +189,7 @@ AM_CONDITIONAL(POWERPC_DARWIN,test x$powerpc_darwin = xtrue)
 case "$host" in
   *-*-darwin*) ;;
   *) 
-    AC_CHECK_LIB(dl, dlopen, EXTRA_TEST_LIBS="$EXTRA_TEST_LIBS -ldl")
+    AC_CHECK_LIB(dl, dlopen, THREADDLLIBS="$THREADDLLIBS -ldl")
     ;;
 esac
 
@@ -286,7 +290,7 @@ case "$host" in
     machdep="sparc_mach_dep.lo"
     AC_DEFINE(SUNOS53_SHARED_LIB)
     ;;
- sparc-sun-solaris2.*)
+ sparc*-sun-solaris2.*)
     machdep="sparc_mach_dep.lo"
     ;;
  ia64-*-*)
@@ -345,10 +349,10 @@ fi
 
 dnl As of 4.13a2, the collector will not properly work on Solaris when
 dnl built with gcc and -O.  So we remove -O in the appropriate case.
-dnl
+dnl Not needed anymore on Solaris.
 AC_MSG_CHECKING(whether Solaris gcc optimization fix is necessary)
 case "$host" in
- sparc-sun-solaris2*|*aix*)
+ *aix*)
     if test "$GCC" = yes; then
        AC_MSG_RESULT(yes)
        new_CFLAGS=
@@ -444,7 +448,8 @@ fi
 
 AM_CONDITIONAL(USE_LIBDIR, test -z "$with_cross_host")
 
-AC_OUTPUT([Makefile doc/Makefile include/Makefile],,
+AC_OUTPUT(
+[Makefile doc/Makefile include/Makefile tests/Makefile cord/Makefile],,
 srcdir=${srcdir}
 host=${host}
 CONFIG_SHELL=${CONFIG_SHELL-/bin/sh}
diff --git a/configure_atomic_ops.sh b/configure_atomic_ops.sh
new file mode 100755
index 00000000..6a0e31a7
--- /dev/null
+++ b/configure_atomic_ops.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+P=`pwd`/libatomic_ops-install
+cd libatomic_ops-*[0-9]
+./configure --prefix=$P
diff --git a/cord/Makefile.am b/cord/Makefile.am
new file mode 100644
index 00000000..253632f6
--- /dev/null
+++ b/cord/Makefile.am
@@ -0,0 +1,12 @@
+AM_CPPFLAGS=-I $(top_srcdir)/include -I$(top_srcdir)/include/private
+
+lib_LTLIBRARIES = libcord.la
+
+libcord_la_SOURCES = \
+	cordbscs.c				\
+	cordprnt.c				\
+	cordtest.c				\
+	cordxtra.c				
+
+
+EXTRA_DIST = de.c de_cmds.h de_win.ICO de_win.RC de_win.c de_win.h
diff --git a/dbg_mlc.c b/dbg_mlc.c
index 643f0e2d..881fa7fc 100644
--- a/dbg_mlc.c
+++ b/dbg_mlc.c
@@ -143,7 +143,7 @@ GC_bool GC_has_other_debug_info(ptr_t p)
         /* e.g. RAND_MAX = 1.5* GC_heapsize.  But for typical cases,	*/
         /* it's not too bad.						*/
     for (i = 0; i < GC_n_heap_sects; ++ i) {
-	int size = GC_heap_sects[i].hs_bytes;
+	size_t size = GC_heap_sects[i].hs_bytes;
 	if (heap_offset < size) {
 	    return GC_heap_sects[i].hs_start + heap_offset;
 	} else {
@@ -847,7 +847,7 @@ void GC_check_heap_block(struct hblk *hbp, word dummy)
     struct hblkhdr * hhdr = HDR(hbp);
     size_t sz = hhdr -> hb_sz;
     int bit_no;
-    unsigned char *p, *plim;
+    char *p, *plim;
     
     p = hbp->hb_body;
     bit_no = 0;
diff --git a/doc/README b/doc/README
index ff77113e..2d795d7d 100644
--- a/doc/README
+++ b/doc/README
@@ -28,7 +28,7 @@ are GPL'ed, but with an exception that should cover all uses in the
 collector.  (If you are concerned about such things, I recommend you look
 at the notice in config.guess or ltmain.sh.)
 
-This is version 6.3 of a conservative garbage collector for C and C++.
+This is version 7.0alpha2 of a conservative garbage collector for C and C++.
 
 You might find a more recent version of this at
 
diff --git a/doc/README.changes b/doc/README.changes
index 78a4e06c..70148a22 100644
--- a/doc/README.changes
+++ b/doc/README.changes
@@ -2108,7 +2108,7 @@ Since gc6.3:
  - Go ahead and split large blocks in GC_allochblk_nth if GC_dont_gc
    is set.  (Thanks to Alexander Petrossian.)
  - GC_PRINT_BACK_HEIGHT would deadlock with thread support.
- - Let in_progress_space in backgraph.s grow dyanmically.
+ - Let in_progress_space in backgraph.s grow dynamically.
  - Fix README.solaris2.  The GC_thr_init() hack doesn't work anymore.
  - Convert GC_finalizer_mem_freed to bytes in allchblk.c.
  - Add missing declaration for GC_generic_malloc_words_small_inner.
@@ -2128,9 +2128,66 @@ Since gc6.3:
    (Thanks to Peter Colson.)
  - Changed "int stack_size" declaration in pthread_support.c to use
    size_t.  (Only mattered with GC_ASSERTIONS enabled.)
-
-
+ - Added CRIS (etrax) support.  (Thanks to Simon Posnjak and
+   Hans-Peter Nilsson.)
+ - Removed GC_IGNORE_FB frame buffer recognition, and replaced
+   it with a check that the mapping type is MEM_IMAGE.
+   In theory, this should work much better, but it is a high
+   risk change for win32.  (Thanks to Ashley Bone for the crucial
+   experimental data behind this, and to Rutger Ovidus for
+   some further experiments.)
+ - GC_allochblk_nth incremented GC_words_wasted by bytes rather than
+   words.
+ - Consider GC_words_wasted in GC_adj_words_allocd only if it is within
+   reason.  (A hack to avoid some extremely unlikely scenarios in which
+   we manage to allocate only "wasted" space.  7.0 has a better fix.)
+ - Changed PowerPC GC_clear implementation to use lwsync instead of
+   eieio, since the documentation recommends against eieio, and
+   it seems to be incorrect if the preceding memory op is a load.
+ - Fixed print_block_list to print the correct kind number for
+   STUBBORN.  (Thanks to Rutger Ovidus.)
+ - Have configure.in generate an error if it is asked to support
+   pthreads, but doesn't know how to.
+ - Added Kazuhiro Inaoka's patch for Renesas M32R support.
+ - Have the GNU build mechanism link with -ldl.  Rename THREADLIBS
+   to THREADDLLIBS to reflect this.  (Thanks to Sven Verdoolaege.)
+ - Added Hannes Mehnert's patch for FreeBSD/SPARC support.
+ - Merged some FreeBSD specific patches to threadlibs.c and dyn_load.c.
+   (Thanks to John Merryweather Cooper.)
+ - Define MPROTECT_VDB on MACOSX only if threads are being used, since the
+   dirty page tracking mechanism uses threads.  (This avoids an undefined
+   reference to _GC_darwin_register_mach_handler_thread.)
+ - By popular demand, use __libc symbols only if we are built with
+   USE_LIBC_PRIVATES, which is off by default, and not otherwise documented.
+ - Ignore GC_enable_incremental() requests when KEEP_BACK_PTRS is set.
+   The GC itself will dirty lots of pages in this case, probably making
+   it counterproductive on all platforms.  And the DARWIN port crashes.
+ 
 Since gc6.4:
+ - Integrated Paolo Molaro's patch to deal with EINTR in sem_wait.
+ - Make GC_approx_sp() write to dummy location to ensure that stack
+   is grown here, when sp looks reasonable, rather than later, when
+   it might look like a bad memory reference.  (Problem was never
+   observed that I know of.  But on rereading the code it seemed
+   dubious.)
+ - Separate out GC_with_callee_saves_pushed and sometimes call
+   it from GC_suspend_handler in pthread_stop_world.c.  Callee-save
+   register values sometimes failed to get traced under HP/UX on
+   PA-RISC.  Linux/IA64 had the same problem, though non-stacked
+   callee-save registers seem to be so rarely used there that nobody
+   ever noticed.
+ - Integrated an ancient Darwin powerpc_darwin_machine_dep.s patch
+   from Andreas Tobler, which I had lost.
+ - Fix compare_and_exchange implementation for gcc/IA64 to deal with
+   pickier compiler versions.
+ - Fixed Itanium 32-bit ABI support (HP/UX).  In particular, the
+   compare_and_exchange implementation didn't consider that possibility.
+ - Undefine GC_pthread_detach in win32_threads.c.  (Thanks to
+   Tagliapietra Tommaso.)
+ - Fixed inclusion of frame.h for NETBSD in os_dep.c.
+ - Applied Dan Bonachea's patch to use mmap on AIX.
+
+Since gc6.5:
  - Remove GC_PROTO, VOLATILE, GC_PTR, and GC_CONST.  Assume ANSI C compiler
    and use ANSI constructs unconditionally.
  - Introduce #elif and #error in some of the appropriate places.
@@ -2206,13 +2263,58 @@ Since gc6.4:
    in anticipation of a merge with the inline allocation code.
  - Removed ALIGN_DOUBLE.  This is mostly handled by GRANULE_BYTES.
  - Make locking on most platforms conditional on GC_need_to_lock.
+
+Since gc7.0alpha1:
+ - GC_bytes_allocd was incremented by a possibly uninitialized variable
+   in GC_generic_malloc_inner.  (Bug introduced in gc7.0alpha1.  Thanks
+   to Ben Hutchings for tracking it down.)
+ - Win32 fixes.  (Thanks to Ben Hutchings and Maurizio Vairani.)
+ - Integrated Ben Hutchings' GetWriteWatch-based virtual dirty bit
+   implementation for win32.
+ - Removed pc_gc.tar and floppy targets in Makefile.direct.  Removed
+   pc_excludes file.
+ - No longer include GC_bytes_wasted when evaluating allocation progress.
+   Since we are now counting live memory, it no longer makes sense.
+ - Applied Davide Angeloca's configury patch.  There are now separate
+   Makefile.am's in the cord and tests subdirectories, more tests, etc.
+ - Renamed configure.in to configure.ac.
+ - Merged a very small number of Nathanael Nerode's configure.ac
+   cleanups from the gcc tree.  Unfortunately, that file is a bit
+   different from ours.
+ - Changed EINTR handling in sem_wait slightly.
+ - Restructure the root marking code.  Remove all traces of
+   USE_GENERIC_PUSH_REGS, and effectively make it the default.
+   Make it easier to pass a context pointer to the mark routine, in
+   case we ever want to do precise stack marking.
+ - Replace GC_start_blocking() and GC_end_blocking() with GC_do_blocking().
+   This remains undocumented, and only implemented for pthreads.  But it
+   removes an otherwise unavoidable race with stores of callee-save
+   registers.
+ - Fix GC_n_mark_bits for the default MARK_BIT_PER_GRANULE case.  This
+   resulted in bogus complaints in heap dumps.
+ - Upgrade to libatomic_ops-1.0, and update build structure to match.
+ - Remove SRC_M3 support. Clean up lock initialization code in misc.c.
+ - Removed gc_local_alloc.h.  If THREAD_LOCAL_ALLOC is defined, the
+   thread local allocation routines are now called automatically.
+ - Renamed gc_inl.h back to gc_inline.h.  Changed the interface appreciably
+   since locking has turned into a dominant issue, and in-line allocation
+   only makes sense if it's no worse than thread-local allocation.
+   Gc_inline.h is now also used to implement thread-local allocation.
+ - Finished replacing stubborn allocation with manual write barrier.
+   Untested.
+ - Use thread-local allocation code by default.
+ - Added GC_register_my_thread and friends for Posix and win32.
+ - Patch for GWW_VDB from Ben Hutchings.
+ - Removed explicit THREAD_LOCAL_ALLOC tests, since that now always
+   redefines GC_malloc.
+ - Removed now unused AIX memory allocation code.
+ - Various minor fixes for bugs introduced in 7.0alpha1.
   
 To do:
- - Use thread-local allocation code by default.
  - Fix USE_MARK_BITS.
- - Fix stubborn allocation.
- - function wrapping, conditional locking??
- - Finish replacing stubborn allocation with manual write barrier??
+ - Clone marker inner loop to support arch-dependent prefetching,
+   and counting of objects marked for finalization.
+ - function wrapping??
  - The USE_MUNMAP code should really use a separate data structure
    indexed by physical page to keep track of time since last use of
    a page.  Using hblk headers means we lose track of ages when
diff --git a/doc/README.environment b/doc/README.environment
index f2ec428e..d50d3709 100644
--- a/doc/README.environment
+++ b/doc/README.environment
@@ -94,28 +94,6 @@ GC_RETRY_SIGNALS, GC_NO_RETRY_SIGNALS - Try to compensate for lost
 		     was turned into a runtime flag to enable last-minute
 		     work-arounds.
 
-GC_IGNORE_FB[=<n>] -  (Win32 only.) Try to avoid treating a mapped
-		frame buffer as part of the root set.  Certain (higher end?)
-		graphics cards seems to result in the graphics memory mapped
-		into the user address space as writable memory.
-		Unfortunately, there seems to be no systematic way to
-		identify such memory.  Setting the environment variable to n
-		causes the collector to ignore mappings longer than n MB.
-		The default value of n is currently 15.  (This should cover
-		a 16 MB graphics card, since the mapping appears to be slightly
-		shorter than all of graphics memory.  It will fail if a dll
-		writes pointers to collectable objects into a data segment
-		whose length is >= 15MB.  Empirically that's rare, but
-		certainly possible.)  WARNING: Security sensitive applications
-		should probably disable this feature by setting
-		GC_disallow_ignore_fb, or by building with -DNO_GETENV,
-		since small values could force collection of reachable
-		objects, which is conceivably a	(difficult to exploit)
-		security hole.  GC_IGNORE_FB values less than 3 MB
-		are never honored, eliminating this risk for most,
-		but not all, applications.  This feature is likely to disappear
-		if/when we find a less disgusting "solution".
-
 The following turn on runtime flags that are also program settable.  Checked
 only during initialization.  We expect that they will usually be set through
 other means, but this may help with debugging and testing:
diff --git a/dyn_load.c b/dyn_load.c
index a35928d4..fa9aa2b4 100644
--- a/dyn_load.c
+++ b/dyn_load.c
@@ -87,7 +87,13 @@
 /* Newer versions of GNU/Linux define this macro.  We
  * define it similarly for any ELF systems that don't.  */
 #  ifndef ElfW
-#    ifdef NETBSD
+#    if defined(FREEBSD)
+#      if __ELF_WORD_SIZE == 32
+#        define ElfW(type) Elf32_##type
+#      else
+#        define ElfW(type) Elf64_##type
+#      endif
+#    elif defined(NETBSD)
 #      if ELFSIZE == 32
 #        define ElfW(type) Elf32_##type
 #      else
@@ -504,10 +510,10 @@ void GC_register_dynamic_libraries()
     static prmap_t * addr_map = 0;
     static int current_sz = 0;	/* Number of records currently in addr_map */
     static int needed_sz;	/* Required size of addr_map		*/
-    register int i;
-    register long flags;
-    register ptr_t start;
-    register ptr_t limit;
+    int i;
+    long flags;
+    ptr_t start;
+    ptr_t limit;
     ptr_t heap_start = (ptr_t)HEAP_START;
     ptr_t heap_end = heap_start;
 
@@ -668,77 +674,6 @@ void GC_register_dynamic_libraries()
   
 # define HAVE_REGISTER_MAIN_STATIC_DATA
 
-  GC_bool GC_warn_fb = TRUE;	/* Warn about traced likely 	*/
-  				/* graphics memory.		*/
-  GC_bool GC_disallow_ignore_fb = FALSE;
-  int GC_ignore_fb_mb;	/* Ignore mappings bigger than the 	*/
-  			/* specified number of MB.		*/
-  GC_bool GC_ignore_fb = FALSE; /* Enable frame buffer 	*/
-  				/* checking.		*/
-  
-  /* Issue warning if tracing apparent framebuffer. 		*/
-  /* This limits us to one warning, and it's a back door to	*/
-  /* disable that.						*/
- 
-  /* Should [start, start+len) be treated as a frame buffer	*/
-  /* and ignored?						*/
-  /* Unfortunately, we currently have no real way to tell	*/
-  /* automatically, and rely largely on user input.		*/
-  /* FIXME: If we had more data on this phenomenon (e.g.	*/
-  /* is start aligned to a MB multiple?) we should be able to	*/
-  /* do better.							*/
-  /* Based on a very limited sample, it appears that:		*/
-  /* 	- Frame buffer mappings appear as mappings of length	*/
-  /* 	  2**n MB - 192K.  (We guess the 192K can vary a bit.)	*/
-  /*	- Have a stating address at best 64K aligned.		*/
-  /* I'd love more information about the mapping, since I	*/
-  /* can't reproduce the problem.				*/
-  static GC_bool is_frame_buffer(ptr_t start, size_t len)
-  {
-    static GC_bool initialized = FALSE;
-#   define MB (1024*1024)
-#   define DEFAULT_FB_MB 15
-#   define MIN_FB_MB 3
-
-    if (GC_disallow_ignore_fb) return FALSE;
-    if (!initialized) {
-      char * ignore_fb_string =  GETENV("GC_IGNORE_FB");
-
-      if (0 != ignore_fb_string) {
-	while (*ignore_fb_string == ' ' || *ignore_fb_string == '\t')
-	  ++ignore_fb_string;
-	if (*ignore_fb_string == '\0') {
-	  GC_ignore_fb_mb = DEFAULT_FB_MB;
-	} else {
-	  GC_ignore_fb_mb = atoi(ignore_fb_string);
-	  if (GC_ignore_fb_mb < MIN_FB_MB) {
-	    WARN("Bad GC_IGNORE_FB value.  Using %ld\n", DEFAULT_FB_MB);
-	    GC_ignore_fb_mb = DEFAULT_FB_MB;
-	  }
-	}
-	GC_ignore_fb = TRUE;
-      } else {
-	GC_ignore_fb_mb = DEFAULT_FB_MB;  /* For warning */
-      }
-      initialized = TRUE;
-    }
-    if (len >= ((size_t)GC_ignore_fb_mb << 20)) {
-      if (GC_ignore_fb) {
-	return TRUE;
-      } else {
-	if (GC_warn_fb) {
-	  WARN("Possible frame buffer mapping at 0x%lx: \n"
-	       "\tConsider setting GC_IGNORE_FB to improve performance.\n",
-	       start);
-	  GC_warn_fb = FALSE;
-	}
-	return FALSE;
-      }
-    } else {
-      return FALSE;
-    }
-  }
-
 # ifdef DEBUG_VIRTUALQUERY
   void GC_dump_meminfo(MEMORY_BASIC_INFORMATION *buf)
   {
@@ -789,7 +724,7 @@ void GC_register_dynamic_libraries()
 		&& (protect == PAGE_EXECUTE_READWRITE
 		    || protect == PAGE_READWRITE)
 		&& !GC_is_heap_base(buf.AllocationBase)
-		&& !is_frame_buffer(p, buf.RegionSize)) {  
+		&& buf.Type == MEM_IMAGE) {  
 #	        ifdef DEBUG_VIRTUALQUERY
 	          GC_dump_meminfo(&buf);
 #	        endif
@@ -869,7 +804,7 @@ void GC_register_dynamic_libraries()
           if (moduleinfo.lmi_flags & LDR_MAIN)
               continue;    /* skip the main module */
 
-#     ifdef VERBOSE
+#     ifdef DL_VERBOSE
           GC_printf("---Module---\n");
           GC_printf("Module ID            = %16ld\n", moduleinfo.lmi_modid);
           GC_printf("Count of regions     = %16d\n", moduleinfo.lmi_nregion);
@@ -890,7 +825,7 @@ void GC_register_dynamic_libraries()
             if (! (regioninfo.lri_prot & LDR_W))
                 continue;
 
-#         ifdef VERBOSE
+#         ifdef DL_VERBOSE
               GC_printf("--- Region ---\n");
               GC_printf("Region number    = %16ld\n",
               	        regioninfo.lri_region_no);
@@ -917,7 +852,6 @@ void GC_register_dynamic_libraries()
 #include <errno.h>
 #include <dl.h>
 
-extern int errno;
 extern char *sys_errlist[];
 extern int sys_nerr;
 
@@ -953,7 +887,7 @@ void GC_register_dynamic_libraries()
 #	 endif
         }
 
-#     ifdef VERBOSE
+#     ifdef DL_VERBOSE
           GC_printf("---Shared library---\n");
           GC_printf("\tfilename        = \"%s\"\n", shl_desc->filename);
           GC_printf("\tindex           = %d\n", index);
diff --git a/gcj_mlc.c b/gcj_mlc.c
index 31aed25e..9aecca3b 100644
--- a/gcj_mlc.c
+++ b/gcj_mlc.c
@@ -131,7 +131,11 @@ static void maybe_finalize()
 /* Allocate an object, clear it, and store the pointer to the	*/
 /* type structure (vtable in gcj).				*/
 /* This adds a byte at the end of the object if GC_malloc would.*/
-void * GC_gcj_malloc(size_t lb, void * ptr_to_struct_containing_descr)
+#ifdef THREAD_LOCAL_ALLOC
+  void * GC_core_gcj_malloc(size_t lb, void * ptr_to_struct_containing_descr)
+#else
+  void * GC_gcj_malloc(size_t lb, void * ptr_to_struct_containing_descr)
+#endif
 {
     ptr_t op;
     ptr_t * opp;
diff --git a/include/gc.h b/include/gc.h
index 064b0186..8637735c 100644
--- a/include/gc.h
+++ b/include/gc.h
@@ -757,6 +757,63 @@ GC_API GC_word GC_set_free_space_divisor(GC_word value);
 typedef void * (*GC_fn_type) (void * client_data);
 GC_API void * GC_call_with_alloc_lock (GC_fn_type fn, void * client_data);
 
+/* These routines are intended to explicitly notify the collector	*/
+/* of new threads.  Often this is unnecessary because thread creation	*/
+/* is implicitly intercepted by the collector, using header-file	*/
+/* defines, or linker-based interception.  In the long run the intent	*/
+/* is to always make redundant registration safe.  In the short run,	*/
+/* this is being implemented a platform at a time.			*/
+/* The interface is complicated by the fact that we probably will not 	*/
+/* ever be able to automatically determine the stack base for thread	*/
+/* stacks on all platforms.						*/
+
+/* Structure representing the base of a thread stack.  On most		*/
+/* platforms this contains just a single address.			*/
+struct GC_stack_base {
+	void * mem_base;	/* Base of memory stack.	*/
+#	if defined(__ia64) || defined(__ia64__)
+	  void * reg_base;	/* Base of separate register stack.	*/
+#	endif
+};
+
+typedef void * (*GC_stack_base_func)(struct GC_stack_base *sb, void *arg);
+
+/* Call a function with a stack base structure corresponding to		*/
+/* somewhere in the GC_call_with_stack_base frame.  This often can	*/
+/* be used to provide a sufficiently accurate stack base.  And we 	*/
+/* implement it everywhere.						*/
+GC_API void * GC_call_with_stack_base(GC_stack_base_func fn, void *arg);
+
+/* Register the current thread, with the indicated stack base, as	*/
+/* a new thread whose stack(s) should be traced by the GC.  If a 	*/
+/* platform does not implicitly do so, this must be called before a	*/
+/* thread can allocate garbage collected memory, or assign pointers	*/
+/* to the garbage collected heap.  Once registered, a thread will be	*/
+/* stopped during garbage collections.					*/
+/* Return codes:	*/
+#define GC_SUCCESS 0
+#define GC_DUPLICATE 1	/* Was already registered.	*/
+#define GC_NO_THREADS 2	/* No thread support in GC.  	*/
+#define GC_UNIMPLEMENTED 3	/* Not yet implemented on this platform. */
+GC_API int GC_register_my_thread(struct GC_stack_base *);
+
+/* Unregister the current thread.  The thread may no longer allocate	*/
+/* garbage collected memory or manipulate pointers to the		*/
+/* garbage collected heap after making this call.			*/
+/* Specifically, if it wants to return or otherwise communicate a 	*/
+/* pointer to the garbage-collected heap to another thread, it must	*/
+/* do this before calling GC_unregister_my_thread, most probably	*/
+/* by saving it in a global data structure.				*/
+GC_API int GC_unregister_my_thread(void);
+
+/* Attempt to fill in the GC_stack_base structure with the stack base	*/
+/* for this thread.  This appears to be required to implement anything	*/
+/* like the JNI AttachCurrentThread in an environment in which new	*/
+/* threads are not automatically registered with the collector.		*/
+/* It is also unfortunately hard to implement well on many platforms.	*/
+/* Returns GC_SUCCESS or GC_UNIMPLEMENTED.				*/
+GC_API int GC_get_stack_base(struct GC_stack_base *);
+
 /* The following routines are primarily intended for use with a 	*/
 /* preprocessor which inserts calls to check C pointer arithmetic.	*/
 /* They indicate failure by invoking the corresponding _print_proc.	*/
@@ -859,7 +916,7 @@ GC_API void (*GC_is_visible_print_proc) (void * p);
 void * GC_malloc_many(size_t lb);
 #define GC_NEXT(p) (*(void * *)(p)) 	/* Retrieve the next element	*/
 					/* in returned list.		*/
-extern void GC_thr_init();	/* Needed for Solaris/X86	*/
+extern void GC_thr_init(void);	/* Needed for Solaris/X86	*/
 
 #endif /* THREADS && !SRC_M3 */
 
@@ -902,6 +959,8 @@ extern void GC_thr_init();	/* Needed for Solaris/X86	*/
   * before making any other GC_ calls.  On most platforms this is a
   * no-op and the collector self-initializes.  But a number of platforms
   * make that too hard.
+  * A GC_INIT call is required if the collector is built with THREAD_LOCAL_ALLOC
+  * defined and the initial allocation call is not to GC_malloc().
   */
 #if (defined(sparc) || defined(__sparc)) && defined(sun)
     /*
@@ -911,22 +970,17 @@ extern void GC_thr_init();	/* Needed for Solaris/X86	*/
      * This circumvents a Solaris 2.X (X<=4) linker bug.
      */
 #   define GC_INIT() { extern end, etext; \
-		       GC_noop(&end, &etext); }
-#else
-# if defined(__CYGWIN32__) && defined(GC_DLL) || defined (_AIX)
+		       GC_noop(&end, &etext); \
+		       GC_init();}
+#elif defined(__CYGWIN32__) && defined(GC_DLL) || defined (_AIX)
     /*
      * Similarly gnu-win32 DLLs need explicit initialization from
      * the main program, as does AIX.
      */
-#   define GC_INIT() { GC_add_roots(DATASTART, DATAEND); }
-# else
-#  if defined(__APPLE__) && defined(__MACH__) || defined(GC_WIN32_THREADS)
+#   define GC_INIT() { GC_add_roots(DATASTART, DATAEND); GC_init(); }
+#else
 #   define GC_INIT() { GC_init(); }
-#  else
-#   define GC_INIT()
-#  endif /* !__MACH && !GC_WIN32_THREADS */
-# endif /* !AIX && !cygwin */
-#endif /* !sparc */
+#endif
 
 #if !defined(_WIN32_WCE) \
     && ((defined(_MSDOS) || defined(_MSC_VER)) && (_M_IX86 >= 300) \
diff --git a/include/gc_gcj.h b/include/gc_gcj.h
index df5c6c71..a4ed4e4a 100644
--- a/include/gc_gcj.h
+++ b/include/gc_gcj.h
@@ -82,11 +82,6 @@ extern int GC_gcj_kind;
 
 extern int GC_gcj_debug_kind;
 
-# if defined(GC_LOCAL_ALLOC_H) && defined(GC_REDIRECT_TO_LOCAL)
-    --> gc_local_alloc.h should be included after this.  Otherwise
-    --> we undo the redirection.
-# endif
-
 # ifdef GC_DEBUG
 #   define GC_GCJ_MALLOC(s,d) GC_debug_gcj_malloc(s,d,GC_EXTRAS)
 #   define GC_GCJ_FAST_MALLOC(s,d) GC_debug_gcj_fast_malloc(s,d,GC_EXTRAS)
diff --git a/include/gc_inl.h b/include/gc_inl.h
deleted file mode 100644
index 73d63ec2..00000000
--- a/include/gc_inl.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/* 
- * Copyright 1988, 1989 Hans-J. Boehm, Alan J. Demers
- * Copyright (c) 1991-1995 by Xerox Corporation.  All rights reserved.
- *
- * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
- * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- *
- * Permission is hereby granted to use or copy this program
- * for any purpose,  provided the above notices are retained on all copies.
- * Permission to modify the code and to distribute modified code is granted,
- * provided the above notices are retained, and a notice that the code was
- * modified is included with the above copyright notice.
- */
-/* Boehm, October 3, 1995 2:07 pm PDT */
- 
-#error FIXME: This needs to be updated.
-
-# ifndef GC_PRIVATE_H
-#   include "private/gc_priv.h"
-# endif
-
-/* USE OF THIS FILE IS NOT RECOMMENDED unless GC_all_interior_pointers	*/
-/* is always set, or the collector has been built with			*/
-/* -DDONT_ADD_BYTE_AT_END, or the specified size includes a pointerfree	*/
-/* word at the end.  In the standard collector configuration,		*/
-/* the final word of each object may not be scanned.			*/
-/* This iinterface is most useful for compilers that generate C.	*/
-/* Manual use is hereby discouraged.					*/
-
-/* Allocate n words (NOT BYTES).  X is made to point to the result.	*/
-/* It is assumed that n < MAXOBJSZ, and					*/
-/* that n > 0.  On machines requiring double word alignment of some	*/
-/* data, we also assume that n is 1 or even.				*/
-/* If the collector is built with -DUSE_MARK_BYTES or -DPARALLEL_MARK,	*/
-/* the n = 1 case is also disallowed.					*/
-/* Effectively this means that portable code should make sure n is even.*/
-/* This bypasses the							*/
-/* MERGE_SIZES mechanism.  In order to minimize the number of distinct	*/
-/* free lists that are maintained, the caller should ensure that a 	*/
-/* small number of distinct values of n are used.  (The MERGE_SIZES	*/
-/* mechanism normally does this by ensuring that only the leading three	*/
-/* bits of n may be nonzero.  See misc.c for details.)  We really 	*/
-/* recommend this only in cases in which n is a constant, and no	*/
-/* locking is required.							*/
-/* In that case it may allow the compiler to perform substantial	*/
-/* additional optimizations.						*/
-# define GC_MALLOC_WORDS(result,n) \
-{	\
-    register ptr_t op;	\
-    register ptr_t *opp;	\
-    DCL_LOCK_STATE;	\
-	\
-    opp = &(GC_objfreelist[n]);	\
-    FASTLOCK();	\
-    if( !FASTLOCK_SUCCEEDED() || (op = *opp) == 0 ) {	\
-        FASTUNLOCK();	\
-        (result) = GC_generic_malloc_words_small((n), NORMAL);	\
-    } else { 	\
-        *opp = obj_link(op);	\
-        obj_link(op) = 0;	\
-        GC_words_allocd += (n);	\
-        FASTUNLOCK();	\
-        (result) = (void *) op;	\
-    }	\
-}
-
-
-/* The same for atomic objects:	*/
-# define GC_MALLOC_ATOMIC_WORDS(result,n) \
-{	\
-    register ptr_t op;	\
-    register ptr_t *opp;	\
-    DCL_LOCK_STATE;	\
-	\
-    opp = &(GC_aobjfreelist[n]);	\
-    FASTLOCK();	\
-    if( !FASTLOCK_SUCCEEDED() || (op = *opp) == 0 ) {	\
-        FASTUNLOCK();	\
-        (result) = GC_generic_malloc_words_small((n), PTRFREE);	\
-    } else { 	\
-        *opp = obj_link(op);	\
-        obj_link(op) = 0;	\
-        GC_words_allocd += (n);	\
-        FASTUNLOCK();	\
-        (result) = (void *) op;	\
-    }	\
-}
-
-/* And once more for two word initialized objects: */
-# define GC_CONS(result, first, second) \
-{	\
-    register ptr_t op;	\
-    register ptr_t *opp;	\
-    DCL_LOCK_STATE;	\
-	\
-    opp = &(GC_objfreelist[2]);	\
-    FASTLOCK();	\
-    if( !FASTLOCK_SUCCEEDED() || (op = *opp) == 0 ) {	\
-        FASTUNLOCK();	\
-        op = GC_generic_malloc_words_small(2, NORMAL);	\
-    } else {	\
-        *opp = obj_link(op);	\
-        GC_words_allocd += 2;	\
-        FASTUNLOCK();	\
-    } \
-    ((word *)op)[0] = (word)(first);	\
-    ((word *)op)[1] = (word)(second);	\
-    (result) = (void *) op;	\
-}
diff --git a/include/gc_inline.h b/include/gc_inline.h
index db62d1d5..d2008cf6 100644
--- a/include/gc_inline.h
+++ b/include/gc_inline.h
@@ -1 +1,124 @@
-# include "gc_inl.h"
+/* 
+ * Copyright 1988, 1989 Hans-J. Boehm, Alan J. Demers
+ * Copyright (c) 1991-1995 by Xerox Corporation.  All rights reserved.
+ * Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
+ *
+ * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
+ * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
+ *
+ * Permission is hereby granted to use or copy this program
+ * for any purpose,  provided the above notices are retained on all copies.
+ * Permission to modify the code and to distribute modified code is granted,
+ * provided the above notices are retained, and a notice that the code was
+ * modified is included with the above copyright notice.
+ */
+ 
+/* USE OF THIS FILE IS NOT RECOMMENDED unless GC_all_interior_pointers	*/
+/* is not set, or the collector has been built with			*/
+/* -DDONT_ADD_BYTE_AT_END, or the specified size includes a pointerfree	*/
+/* word at the end.  In the standard collector configuration,		*/
+/* the final word of each object may not be scanned.			*/
+/* This interface is most useful for compilers that generate C.		*/
+/* It is also used internally for thread-local allocation, in which	*/
+/* case, the size is suitably adjusted by the caller.			*/
+/* Manual use is hereby discouraged.					*/
+
+#include "gc.h"
+#include "gc_tiny_fl.h"
+
+#ifndef __GNUC__
+#  define __builtin_expect(x, y) (x)
+#endif
+
+/* The ultimately general inline allocation macro.  Allocate an object	*/
+/* of size granules, putting the resulting pointer in result.  Tiny_fl	*/
+/* is a "tiny" free list array, which will be used first, if the size	*/
+/* is appropriate.  If granules is too large, we allocate with 		*/
+/* default_expr instead.  If we need to refill the free list, we use	*/
+/* GC_generic_malloc_many with the indicated kind.			*/
+/* Tiny_fl should be an array of GC_TINY_FREELISTS void * pointers.	*/
+/* If num_direct is nonzero, and the individual free list pointers	*/
+/* are initialized to (void *)1, then we allocate num_direct granules	*/
+/* directly using default_expr before putting multiple objects into	*/
+/* the tiny_fl entry.  If num_direct is zero, then the free lists may	*/
+/* also be initialized to (void *)0.					*/
+/* We rely on much of this hopefully getting optimized away in the	*/
+/* num_direct = 0 case.  Particularly if granules is constant, this	*/
+/* should generate a small amount of code.  Sizes passed to GC_oom_fn	*/
+/* and to the size assertion are computed from granules.		*/
+# define GC_FAST_MALLOC_GRANS(result,granules,tiny_fl,num_direct,\
+			      kind,default_expr,init) \
+{ \
+    if (__builtin_expect(granules >= GC_TINY_FREELISTS,0)) { \
+        result = default_expr; \
+    } else { \
+	void **my_fl = tiny_fl + granules; \
+        void *my_entry=*my_fl; \
+	void *next; \
+ \
+	while (__builtin_expect((word)my_entry \
+				<= num_direct + GC_TINY_FREELISTS + 1, 0)) { \
+	    /* Entry contains counter or NULL */ \
+	    if ((word)my_entry - 1 < num_direct) { \
+		/* Small counter value, not NULL */ \
+                *my_fl = (ptr_t)my_entry + granules + 1; \
+                result = default_expr; \
+		goto out; \
+            } else { \
+		/* Large counter or NULL */ \
+                GC_generic_malloc_many(RAW_BYTES_FROM_INDEX(granules), \
+				       kind, my_fl); \
+		my_entry = *my_fl; \
+                if (my_entry == 0) { \
+		    result = GC_oom_fn(RAW_BYTES_FROM_INDEX(granules)); \
+		    goto out; \
+		} \
+	    } \
+        } \
+        next = obj_link(my_entry); \
+        result = (void *)my_entry; \
+        *my_fl = next; \
+	init; \
+        PREFETCH_FOR_WRITE(next); \
+        GC_ASSERT(GC_size(result) >= RAW_BYTES_FROM_INDEX(granules)); \
+        GC_ASSERT(((word *)result)[1] == 0); \
+      out: ; \
+   } \
+}
+
+# define GC_WORDS_TO_WHOLE_GRANULES(n) \
+	GC_WORDS_TO_GRANULES((n) + GC_GRANULE_WORDS - 1)
+
+/* Allocate n words (NOT BYTES); result is made to point to the object.	*/
+/* This should really only be used if GC_all_interior_pointers is	*/
+/* not set, or DONT_ADD_BYTE_AT_END is set.  See above.			*/
+/* The semantics changed in version 7.0; we no longer lock, and		*/
+/* the caller is responsible for supplying a cleared tiny_fl		*/
+/* free list array.  For single-threaded applications, this may be	*/
+/* a global array.							*/
+# define GC_MALLOC_WORDS(result,n,tiny_fl) \
+{	\
+    size_t grans = GC_WORDS_TO_WHOLE_GRANULES(n); \
+    GC_FAST_MALLOC_GRANS(result, grans, tiny_fl, 0, \
+			 NORMAL, GC_malloc(grans*GRANULE_BYTES), \
+			 *(void **)result = 0); \
+}
+
+# define GC_MALLOC_ATOMIC_WORDS(result,n,tiny_fl) \
+{	\
+    size_t grans = GC_WORDS_TO_WHOLE_GRANULES(n); \
+    GC_FAST_MALLOC_GRANS(result, grans, tiny_fl, 0, \
+			 PTRFREE, GC_malloc_atomic(grans*GRANULE_BYTES), \
+			 /* no initialization */); \
+}
+
+
+/* And once more for two word initialized objects: */
+# define GC_CONS(result, first, second, tiny_fl) \
+{	\
+    size_t grans = GC_WORDS_TO_WHOLE_GRANULES(2); \
+    GC_FAST_MALLOC_GRANS(result, grans, tiny_fl, 0, \
+			 NORMAL, GC_malloc(grans*GRANULE_BYTES), \
+			 *(void **)result = (void *)(first)); \
+    ((void **)(result))[1] = (void *)(second);	\
+}
diff --git a/include/gc_local_alloc.h b/include/gc_local_alloc.h
deleted file mode 100644
index 7a7b7bb7..00000000
--- a/include/gc_local_alloc.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* 
- * Copyright (c) 2000 by Hewlett-Packard Company.  All rights reserved.
- *
- * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
- * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
- *
- * Permission is hereby granted to use or copy this program
- * for any purpose,  provided the above notices are retained on all copies.
- * Permission to modify the code and to distribute modified code is granted,
- * provided the above notices are retained, and a notice that the code was
- * modified is included with the above copyright notice.
- */
-
-/*
- * Interface for thread local allocation.  Memory obtained
- * this way can be used by all threads, as though it were obtained
- * from an allocator like GC_malloc.  The difference is that GC_local_malloc
- * counts the number of allocations of a given size from the current thread,
- * and uses GC_malloc_many to perform the allocations once a threashold
- * is exceeded.  Thus far less synchronization may be needed.
- * Allocation of known large objects should not use this interface.
- * This interface is designed primarily for fast allocation of small
- * objects on multiprocessors, e.g. for a JVM running on an MP server.
- *
- * If this file is included with GC_GCJ_SUPPORT defined, GCJ-style
- * bitmap allocation primitives will also be included.
- *
- * If this file is included with GC_REDIRECT_TO_LOCAL defined, then
- * GC_MALLOC, GC_MALLOC_ATOMIC, and possibly GC_GCJ_MALLOC will
- * be redefined to use the thread local allocatoor.
- *
- * The interface is available only if the collector is built with
- * -DTHREAD_LOCAL_ALLOC, which is currently supported only on Linux.
- *
- * The debugging allocators use standard, not thread-local allocation.
- *
- * These routines normally require an explicit call to GC_init(), though
- * that may be done from a constructor function.
- */
-
-#ifndef GC_LOCAL_ALLOC_H
-#define GC_LOCAL_ALLOC_H
-
-#ifndef _GC_H
-#   include "gc.h"
-#endif
-
-#if defined(GC_GCJ_SUPPORT) && !defined(GC_GCJ_H)
-#   include "gc_gcj.h"
-#endif
-
-void * GC_local_malloc(size_t bytes);
-
-void * GC_local_malloc_atomic(size_t bytes);
-
-#if defined(GC_GCJ_SUPPORT)
-  void * GC_local_gcj_malloc(size_t bytes,
-			     void * ptr_to_struct_containing_descr);
-#endif
-
-# ifdef GC_DEBUG
-    /* We don't really use local allocation in this case.	*/
-#   define GC_LOCAL_MALLOC(s) GC_debug_malloc(s,GC_EXTRAS)
-#   define GC_LOCAL_MALLOC_ATOMIC(s) GC_debug_malloc_atomic(s,GC_EXTRAS)
-#   ifdef GC_GCJ_SUPPORT
-#	define GC_LOCAL_GCJ_MALLOC(s,d) GC_debug_gcj_malloc(s,d,GC_EXTRAS)
-#   endif
-# else
-#   define GC_LOCAL_MALLOC(s) GC_local_malloc(s)
-#   define GC_LOCAL_MALLOC_ATOMIC(s) GC_local_malloc_atomic(s)
-#   ifdef GC_GCJ_SUPPORT
-#	define GC_LOCAL_GCJ_MALLOC(s,d) GC_local_gcj_malloc(s,d)
-#   endif
-# endif
-
-# ifdef GC_REDIRECT_TO_LOCAL
-#   undef GC_MALLOC
-#   define GC_MALLOC(s) GC_LOCAL_MALLOC(s)
-#   undef GC_MALLOC_ATOMIC
-#   define GC_MALLOC_ATOMIC(s) GC_LOCAL_MALLOC_ATOMIC(s)
-#   ifdef GC_GCJ_SUPPORT
-#	undef GC_GCJ_MALLOC
-# 	define GC_GCJ_MALLOC(s,d) GC_LOCAL_GCJ_MALLOC(s,d)
-#   endif
-# endif
-
-#endif /* GC_LOCAL_ALLOC_H */
diff --git a/include/gc_tiny_fl.h b/include/gc_tiny_fl.h
index 537bc343..46c8a11d 100644
--- a/include/gc_tiny_fl.h
+++ b/include/gc_tiny_fl.h
@@ -1,5 +1,5 @@
 /* 
- * Copyright (c) 1999-2004 Hewlett-Packard Development Company, L.P.
+ * Copyright (c) 1999-2005 Hewlett-Packard Development Company, L.P.
  *
  * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
  * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
@@ -11,8 +11,8 @@
  * modified is included with the above copyright notice.
  */
 
-#ifndef TINY_FL_H
-#define TINY_FL_H
+#ifndef GC_TINY_FL_H
+#define GC_TINY_FL_H
 /*
  * Constants and data structures for "tiny" free lists.
  * These are used for thread-local allocation or in-lined allocators.
@@ -34,7 +34,7 @@
  * and space usage for mark bits (usually mark bytes).
  * On many 64-bit architectures some memory references require 16-byte
  * alignment, making this necessary anyway.
- * For a few 32-bit architecture (e.g. 86), we may also need 16-byte alignment
+ * For a few 32-bit architecture (e.g. x86), we may also need 16-byte alignment
  * for certain memory references.  But currently that does not seem to be the
  * default for all conventional malloc implementations, so we ignore that
  * problem.
@@ -43,15 +43,24 @@
  * space, so we no longer do so.
  */
 #ifndef GC_GRANULE_BYTES
-# if defined(__LP64__) || defined (_LP64) || defined(_WIN64) || defined(__s390x__) \
-	|| defined(__x86_64__) || defined(__alpha__) || defined(__powerpc64__) \
+# if defined(__LP64__) || defined (_LP64) || defined(_WIN64) \
+	|| defined(__s390x__) || defined(__x86_64__) \
+	|| defined(__alpha__) || defined(__powerpc64__) \
 	|| defined(__arch64__)
 #  define GC_GRANULE_BYTES 16
+#  define GC_GRANULE_WORDS 2
 # else
 #  define GC_GRANULE_BYTES 8
+#  define GC_GRANULE_WORDS 2
 # endif
 #endif /* !GC_GRANULE_BYTES */
 
+#if GC_GRANULE_WORDS == 2	/* Words -> granules (rounding down): halve.	*/
+#  define GC_WORDS_TO_GRANULES(n) ((n)>>1)
+#else	/* General case, e.g. a client-supplied GC_GRANULE_BYTES.	*/
+#  define GC_WORDS_TO_GRANULES(n) ((n)*sizeof(void *)/GC_GRANULE_BYTES)
+#endif
+
 /* A "tiny" free list header contains TINY_FREELISTS pointers to 	*/
 /* singly linked lists of objects of different sizes, the ith one	*/
 /* containing objects i granules in size.  Note that there is a list	*/
@@ -64,4 +73,14 @@
 # endif
 #endif /* !GC_TINY_FREELISTS */
 
-#endif /* TINY_FL_H */
+/* The ith free list corresponds to size i*GC_GRANULE_BYTES	*/
+/* Internally to the collector, the index can be computed with	*/
+/* ROUNDED_UP_GRANULES.  Externally, we don't know whether	*/
+/* DONT_ADD_BYTE_AT_END is set, but the client should know.	*/
+
+/* Convert a free list index to the actual size of objects	*/
+/* on that list, including extra space we added.  Not an	*/
+/* inverse of the above.					*/
+#define RAW_BYTES_FROM_INDEX(i) ((i) * GC_GRANULE_BYTES)
+
+#endif /* GC_TINY_FL_H */
diff --git a/include/private/gc_locks.h b/include/private/gc_locks.h
index b69d2b87..9184e3a6 100644
--- a/include/private/gc_locks.h
+++ b/include/private/gc_locks.h
@@ -72,11 +72,6 @@
 #    define FASTUNLOCK()  {\
         if( FASTLOCK_SUCCEEDED() ) PCR_Th_ML_Release(&GC_allocate_ml); }
 #  endif
-#  ifdef SRC_M3
-     extern GC_word RT0u__inCritical;
-#    define LOCK() RT0u__inCritical++
-#    define UNLOCK() RT0u__inCritical--
-#  endif
 
 #  if !defined(AO_have_test_and_set_acquire)
 #    define USE_PTHREAD_LOCKS
@@ -205,7 +200,7 @@
 # endif /* !THREADS */
 
 #if defined(UNCOND_LOCK) && !defined(LOCK) 
-     extern GC_bool GC_need_to_lock;
+     GC_API GC_bool GC_need_to_lock;
      		/* At least two thread running; need to lock.	*/
 #    define LOCK() if (GC_need_to_lock) { UNCOND_LOCK(); }
 #    define UNLOCK() if (GC_need_to_lock) { UNCOND_UNLOCK(); }
diff --git a/include/private/gc_pmark.h b/include/private/gc_pmark.h
index 03d3af16..8a79b9df 100644
--- a/include/private/gc_pmark.h
+++ b/include/private/gc_pmark.h
@@ -423,7 +423,11 @@ mse * GC_mark_from(mse * top, mse * bottom, mse *limit);
 /*
  * Mark from one finalizable object using the specified
  * mark proc. May not mark the object pointed to by 
- * real_ptr. That is the job of the caller, if appropriate
+ * real_ptr. That is the job of the caller, if appropriate.
+ * Note that this is called with the mutator running, but
+ * with us holding the allocation lock.  This is safe only if the
+ * mutator needs the allocation lock to reveal hidden pointers.
+ * FIXME: Why do we need the GC_mark_state test below?
  */
 # define GC_MARK_FO(real_ptr, mark_proc) \
 { \
diff --git a/include/private/gc_priv.h b/include/private/gc_priv.h
index 06d8a15a..6fdf3786 100644
--- a/include/private/gc_priv.h
+++ b/include/private/gc_priv.h
@@ -19,6 +19,11 @@
 # ifndef GC_PRIVATE_H
 # define GC_PRIVATE_H
 
+# include <stdlib.h>
+# if !(defined( sony_news ) )
+#   include <stddef.h>
+# endif
+
 #ifdef DGUX
 #   include <sys/types.h>
 #   include <sys/time.h>
@@ -31,13 +36,17 @@
 #   include <sys/resource.h>
 #endif /* BSD_TIME */
 
-# ifndef _GC_H
+#ifndef _GC_H
 #   include "../gc.h"
-# endif
+#endif
 
-# ifndef GC_MARK_H
+#ifndef GC_TINY_FL_H
+#   include "../gc_tiny_fl.h"
+#endif
+
+#ifndef GC_MARK_H
 #   include "../gc_mark.h"
-# endif
+#endif
 
 typedef GC_word word;
 typedef GC_signed_word signed_word;
@@ -51,7 +60,7 @@ typedef char * ptr_t;	/* A generic pointer to which we can add	*/
 			/* byte displacements.				*/
 			/* Preferably identical to caddr_t, if it 	*/
 			/* exists.					*/
-			
+
 # ifndef GCCONFIG_H
 #   include "gcconfig.h"
 #   ifndef USE_MARK_BYTES
@@ -63,11 +72,6 @@ typedef char * ptr_t;	/* A generic pointer to which we can add	*/
 #   include "gc_hdrs.h"
 # endif
 
-# include <stdlib.h>
-# if !(defined( sony_news ) )
-#   include <stddef.h>
-# endif
-
 #if __GNUC__ >= 3
 # define EXPECT(expr, outcome) __builtin_expect(expr,outcome)
 # define INLINE inline
@@ -798,10 +802,6 @@ struct _GC_arrays {
     word _bytes_allocd;
   	/* Number of words allocated during this collection cycle */
 # endif
-  word _bytes_wasted;
-  	/* Number of words wasted due to internal fragmentation	*/
-  	/* in large objects, or due to dropping blacklisted     */
-	/* blocks, since last gc.  Approximate.                 */
   word _bytes_finalized;
   	/* Approximate number of bytes in objects (and headers)	*/
   	/* That became ready for finalization in the last 	*/
@@ -889,15 +889,16 @@ struct _GC_arrays {
         /* Stubborn object pages that were changes before last call to	*/
 	/* GC_read_changed.						*/
 # endif
-# if defined(PROC_VDB) || defined(MPROTECT_VDB)
+# if defined(PROC_VDB) || defined(MPROTECT_VDB) || \
+     defined(GWW_VDB) || defined(MANUAL_VDB)
     page_hash_table _grungy_pages; /* Pages that were dirty at last 	   */
 				     /* GC_read_dirty.			   */
 # endif
-# ifdef MPROTECT_VDB
+# if defined(MPROTECT_VDB) || defined(MANUAL_VDB)
     volatile page_hash_table _dirty_pages;	
 			/* Pages dirtied since last GC_read_dirty. */
 # endif
-# ifdef PROC_VDB
+# if defined(PROC_VDB) || defined(GWW_VDB)
     page_hash_table _written_pages;	/* Pages ever dirtied	*/
 # endif
 # ifdef LARGE_CONFIG
@@ -914,7 +915,7 @@ struct _GC_arrays {
 #   endif
 # endif
   struct HeapSect {
-      ptr_t hs_start; word hs_bytes;
+      ptr_t hs_start; size_t hs_bytes;
   } _heap_sects[MAX_HEAP_SECTS];
 # if defined(MSWIN32) || defined(MSWINCE)
     ptr_t _heap_bases[MAX_HEAP_SECTS];
@@ -968,7 +969,6 @@ GC_API GC_FAR struct _GC_arrays GC_arrays;
 # endif
 # define GC_last_heap_addr GC_arrays._last_heap_addr
 # define GC_prev_heap_addr GC_arrays._prev_heap_addr
-# define GC_bytes_wasted GC_arrays._bytes_wasted
 # define GC_large_free_bytes GC_arrays._large_free_bytes
 # define GC_large_allocd_bytes GC_arrays._large_allocd_bytes
 # define GC_max_large_allocd_bytes GC_arrays._max_large_allocd_bytes
@@ -1002,13 +1002,14 @@ GC_API GC_FAR struct _GC_arrays GC_arrays;
 # define GC_excl_table GC_arrays._excl_table
 # define GC_all_nils GC_arrays._all_nils
 # define GC_top_index GC_arrays._top_index
-# if defined(PROC_VDB) || defined(MPROTECT_VDB)
+# if defined(PROC_VDB) || defined(MPROTECT_VDB) || \
+     defined(GWW_VDB) || defined(MANUAL_VDB)
 #   define GC_grungy_pages GC_arrays._grungy_pages
 # endif
-# ifdef MPROTECT_VDB
+# if defined(MPROTECT_VDB) || defined(MANUAL_VDB)
 #   define GC_dirty_pages GC_arrays._dirty_pages
 # endif
-# ifdef PROC_VDB
+# if defined(PROC_VDB) || defined(GWW_VDB)
 #   define GC_written_pages GC_arrays._written_pages
 # endif
 # define GC_composite_in_use GC_arrays._composite_in_use
@@ -1212,7 +1213,7 @@ extern long GC_large_alloc_warn_suppressed;
 #  define FINAL_MARK_BIT(sz) ((sz) > MAXOBJBYTES? 1 : HBLK_OBJS(sz))
 	/* Position of final, always set, mark bit.			*/
 #else /* MARK_BIT_PER_GRANULE */
-#  define MARK_BIT_NO(offset, sz) BYTES_TO_GRANULES((offset))
+#  define MARK_BIT_NO(offset, sz) BYTES_TO_GRANULES(offset)
 #  define MARK_BIT_OFFSET(sz) BYTES_TO_GRANULES(sz)
 #  define IF_PER_OBJ(x)
 #  define FINAL_MARK_BIT(sz) \
@@ -1292,12 +1293,16 @@ void GC_push_all_eager (ptr_t b, ptr_t t);
   /* stacks are scheduled for scanning in *GC_push_other_roots, which	*/
   /* is thread-package-specific.					*/
 #endif
-void GC_push_current_stack(ptr_t cold_gc_frame);
+void GC_push_current_stack(ptr_t cold_gc_frame, void *context);
   			/* Push enough of the current stack eagerly to	*/
   			/* ensure that callee-save registers saved in	*/
   			/* GC frames are scanned.			*/
   			/* In the non-threads case, schedule entire	*/
   			/* stack for scanning.				*/
+			/* The second argument is a pointer to the 	*/
+			/* (possibly null) thread context, for		*/
+			/* (currently hypothetical) more precise	*/
+			/* stack scanning.				*/
 void GC_push_roots(GC_bool all, ptr_t cold_gc_frame);
   			/* Push all or dirty roots.	*/
 extern void (*GC_push_other_roots)(void);
@@ -1323,11 +1328,13 @@ extern void (*GC_start_call_back) (void);
   			/* Not called if 0.  Called with allocation 	*/
   			/* lock held.					*/
   			/* 0 by default.				*/
-# if defined(USE_GENERIC_PUSH_REGS)
-  void GC_generic_push_regs(ptr_t cold_gc_frame);
-# else
-  void GC_push_regs(void);
-# endif
+void GC_push_regs_and_stack(ptr_t cold_gc_frame);
+
+void GC_push_regs(void);
+
+void GC_with_callee_saves_pushed(void (*fn)(ptr_t, void *),
+				 ptr_t arg);
+
 # if defined(SPARC) || defined(IA64)
   /* Cause all stacked registers to be saved in memory.  Return a	*/
   /* pointer to the top of the corresponding memory stack.		*/
@@ -1409,7 +1416,7 @@ GC_bool GC_register_main_static_data(void);
 		/* dynamic library registration.			*/
   
 /* Machine dependent startup routines */
-ptr_t GC_get_stack_base(void);	/* Cold end of stack */
+ptr_t GC_get_main_stack_base(void);	/* Cold end of stack */
 #ifdef IA64
   ptr_t GC_get_register_stack_base(void);
   					/* Cold end of register stack.	*/
@@ -1611,6 +1618,16 @@ ptr_t GC_allocobj(size_t sz, int kind);
   				/* free list nonempty, and return its	*/
   				/* head.  Sz is in granules.		*/
 
+/* Allocation routines that bypass the thread local cache.	*/
+/* Used internally.						*/
+#ifdef THREAD_LOCAL_ALLOC
+  void * GC_core_malloc(size_t);
+  void * GC_core_malloc_atomic(size_t);
+# ifdef GC_GCJ_SUPPORT
+    void *GC_core_gcj_malloc(size_t, void *); 
+# endif
+#endif /* THREAD_LOCAL_ALLOC */
+
 void GC_free_inner(void * p);
 void GC_debug_free_inner(void * p);
   
diff --git a/include/private/gcconfig.h b/include/private/gcconfig.h
index 0760e9f6..e71041b2 100644
--- a/include/private/gcconfig.h
+++ b/include/private/gcconfig.h
@@ -30,6 +30,7 @@
     /* Fake ptr_t declaration, just to avoid compilation errors.	*/
     /* This avoids many instances if "ifndef GC_PRIVATE_H" below.	*/
     typedef struct GC_undefined_struct * ptr_t;
+#   include <stddef.h>	/* For size_t etc. */
 # endif
 
 /* Machine dependent parameters.  Some tuning parameters can be found	*/
@@ -160,7 +161,7 @@
 #   define mach_type_known
 # endif
 # if defined(sparc) && defined(unix) && !defined(sun) && !defined(linux) \
-     && !defined(__OpenBSD__) && !(__NetBSD__)
+     && !defined(__OpenBSD__) && !defined(__NetBSD__) && !defined(__FreeBSD__)
 #   define SPARC
 #   define DRSNX
 #   define mach_type_known
@@ -220,6 +221,12 @@
 #    define ARM32
 #    define mach_type_known
 # endif
+# if defined(LINUX) && defined(__cris__)
+#    ifndef CRIS
+#	define CRIS
+#    endif
+#    define mach_type_known
+# endif
 # if defined(LINUX) && (defined(powerpc) || defined(__powerpc__) || defined(powerpc64) || defined(__powerpc64__))
 #    define POWERPC
 #    define mach_type_known
@@ -240,6 +247,10 @@
 #    define SH
 #    define mach_type_known
 # endif
+# if defined(LINUX) && defined(__m32r__)
+#    define M32R
+#    define mach_type_known
+# endif
 # if defined(__alpha) || defined(__alpha__)
 #   define ALPHA
 #   if !defined(LINUX) && !defined(NETBSD) && !defined(OPENBSD) && !defined(FREEBSD)
@@ -302,6 +313,10 @@
 #    define X86_64
 #    define mach_type_known
 # endif
+# if defined(FREEBSD) && defined(__sparc__)
+#    define SPARC
+#    define mach_type_known
+#endif
 # if defined(bsdi) && (defined(i386) || defined(__i386__))
 #    define I386
 #    define BSDI
@@ -376,8 +391,9 @@
 #   define mach_type_known
 # endif
 # if defined(__pj__)
-#   define PJ
-#   define mach_type_known
+#   error PicoJava no longer supported
+    /* The implementation had problems, and I haven't heard of users	*/
+    /* in ages.  If you want it resurrected, let me know.		*/
 # endif
 # if defined(__embedded__) && defined(PPC)
 #   define POWERPC
@@ -459,17 +475,19 @@
 		    /*		   POWERPC    ==> IBM/Apple PowerPC	*/
 		    /*			(MACOS(<=9),DARWIN(incl.MACOSX),*/
 		    /*			 LINUX, NETBSD, NOSYS variants)	*/
+		    /*		   CRIS       ==> Axis Etrax		*/
+		    /*		   M32R	      ==> Renesas M32R		*/
 
 
 /*
  * For each architecture and OS, the following need to be defined:
  *
- * CPP_WORD_SZ is a simple integer constant representing the word size.
+ * CPP_WORDSZ is a simple integer constant representing the word size
  * in bits.  We assume byte addressibility, where a byte has 8 bits.
- * We also assume CPP_WORD_SZ is either 32 or 64.
+ * We also assume CPP_WORDSZ is either 32 or 64.
  * (We care about the length of pointers, not hardware
  * bus widths.  Thus a 64 bit processor with a C compiler that uses
- * 32 bit pointers should use CPP_WORD_SZ of 32, not 64. Default is 32.)
+ * 32 bit pointers should use CPP_WORDSZ of 32, not 64. Default is 32.)
  *
  * MACH_TYPE is a string representation of the machine type.
  * OS_TYPE is analogous for the OS.
@@ -578,8 +596,7 @@
  */
 
 /* If we are using a recent version of gcc, we can use __builtin_unwind_init()
- * to push the relevant registers onto the stack.  This generally makes
- * USE_GENERIC_PUSH_REGS the preferred approach for marking from registers.
+ * to push the relevant registers onto the stack.
  */
 # if defined(__GNUC__) && ((__GNUC__ >= 3) || \
 			   (__GNUC__ == 2 && __GNUC_MINOR__ >= 8)) \
@@ -601,7 +618,6 @@
 	  extern char etext[];
 #	  define DATASTART ((ptr_t)(etext))
 #       endif
-#       define USE_GENERIC_PUSH_REGS
 #   endif
 #   ifdef NETBSD
 #	define OS_TYPE "NETBSD"
@@ -613,13 +629,10 @@
 	  extern char etext[];
 #	  define DATASTART ((ptr_t)(etext))
 #       endif
-#	define USE_GENERIC_PUSH_REGS
 #   endif
 #   ifdef LINUX
 #       define OS_TYPE "LINUX"
 #       define STACKBOTTOM ((ptr_t)0xf0000000)
-#       define USE_GENERIC_PUSH_REGS
-		/* We never got around to the assembly version. */
 /* #       define MPROTECT_VDB - Reported to not work  9/17/01 */
 #       ifdef __ELF__
 #            define DYNAMIC_LOADING
@@ -711,8 +724,10 @@
 #     define USE_MMAP_ANON
 #     define USE_ASM_PUSH_REGS
       /* This is potentially buggy. It needs more testing. See the comments in
-         os_dep.c */
-#     define MPROTECT_VDB
+         os_dep.c.  It relies on threads to track writes. */
+#     ifdef GC_DARWIN_THREADS
+#       define MPROTECT_VDB
+#     endif
 #     include <unistd.h>
 #     define GETPAGESIZE() getpagesize()
 #     if defined(USE_PPC_PREFETCH) && defined(__GNUC__)
@@ -858,6 +873,23 @@
 #	define DATASTART ((ptr_t)(etext))
 #     endif
 #   endif
+#   ifdef FREEBSD
+#	define OS_TYPE "FREEBSD"
+#	define SIG_SUSPEND SIGUSR1
+#	define SIG_THR_RESTART SIGUSR2
+#	define FREEBSD_STACKBOTTOM
+#	ifdef __ELF__
+#	    define DYNAMIC_LOADING
+#	endif
+	extern char etext[];
+	extern char edata[];
+	extern char end[];
+#	define NEED_FIND_LIMIT
+#	define DATASTART ((ptr_t)(&etext))
+#	define DATAEND (GC_find_limit (DATASTART, TRUE))
+#	define DATASTART2 ((ptr_t)(&edata))
+#	define DATAEND2 ((ptr_t)(&end))
+#   endif
 # endif
 
 # ifdef I386
@@ -873,9 +905,6 @@
 			/* Borland.					*/
                         /* Ivan Demakov: For Watcom the option is -zp4. */
 #   endif
-#   ifdef HAVE_BUILTIN_UNWIND_INIT
-#	define USE_GENERIC_PUSH_REGS
-#   endif
 #   ifdef SEQUENT
 #	define OS_TYPE "SEQUENT"
 	extern int etext[];
@@ -960,10 +989,6 @@
 #   endif /* DGUX */
 
 #   ifdef LINUX
-#	ifndef __GNUC__
-	  /* The Intel compiler doesn't like inline assembly */
-#	  define USE_GENERIC_PUSH_REGS
-# 	endif
 #	define OS_TYPE "LINUX"
 #       define LINUX_STACKBOTTOM
 #	if 0
@@ -1067,15 +1092,15 @@
 		/* os_dep.c. OS2 actually has the right			*/
 		/* system call!						*/
 #	define DATAEND	/* not needed */
-#	define USE_GENERIC_PUSH_REGS
 #   endif
 #   ifdef MSWIN32
 #	define OS_TYPE "MSWIN32"
 		/* STACKBOTTOM and DATASTART are handled specially in 	*/
 		/* os_dep.c.						*/
-#       ifndef __WATCOMC__
+#       if !defined(__WATCOMC__) && !defined(GC_WIN32_THREADS)
 #	  define MPROTECT_VDB
 #	endif
+#       define GWW_VDB
 #       define DATAEND  /* not needed */
 #   endif
 #   ifdef MSWINCE
@@ -1188,7 +1213,6 @@
       extern int __data_start[];
 #     define DATASTART ((ptr_t)(__data_start))
 #     define ALIGNMENT 4
-#     define USE_GENERIC_PUSH_REGS
 #     if __GLIBC__ == 2 && __GLIBC_MINOR__ >= 2 || __GLIBC__ > 2
 #        define LINUX_STACKBOTTOM
 #     else
@@ -1216,7 +1240,6 @@
 #        define ALIGNMENT 4
 #      endif
 #      define OS_TYPE "EWS4800"
-#      define USE_GENERIC_PUSH_REGS 1
 #   endif
 #   ifdef ULTRIX
 #	define HEURISTIC2
@@ -1260,7 +1283,6 @@
 #     define ALIGNMENT 4
 #     define OS_TYPE "NETBSD"
 #     define HEURISTIC2
-#     define USE_GENERIC_PUSH_REGS
 #     ifdef __ELF__
         extern int etext[];
 #       define DATASTART GC_data_start
@@ -1290,6 +1312,8 @@
 #     define CPP_WORDSZ 32
 #     define STACKBOTTOM ((ptr_t)((ulong)&errno))
 #   endif
+#   define USE_MMAP
+#   define USE_MMAP_ANON
  /* From AIX linker man page:
  _text Specifies the first location of the program.
  _etext Specifies the first location after the program.
@@ -1301,7 +1325,6 @@
 #   define DATASTART ((ptr_t)((ulong)_data))
 #   define DATAEND ((ptr_t)((ulong)_end))
     extern int errno;
-#   define USE_GENERIC_PUSH_REGS
 #   define DYNAMIC_LOADING
 	/* For really old versions of AIX, this may have to be removed. */
 # endif
@@ -1373,13 +1396,6 @@
 #   define MACH_TYPE "ALPHA"
 #   define ALIGNMENT 8
 #   define CPP_WORDSZ 64
-#   ifndef LINUX
-#     define USE_GENERIC_PUSH_REGS
-      /* Gcc and probably the DEC/Compaq compiler spill pointers to preserved */
-      /* fp registers in some cases when the target is a 21264.  The assembly */
-      /* code doesn't handle that yet, and version dependencies make that a   */
-      /* bit tricky.  Do the easy thing for now.				    */
-#   endif
 #   ifdef NETBSD
 #	define OS_TYPE "NETBSD"
 #	define HEURISTIC2
@@ -1462,7 +1478,6 @@
 
 # ifdef IA64
 #   define MACH_TYPE "IA64"
-#   define USE_GENERIC_PUSH_REGS
 	/* We need to get preserved registers in addition to register   */
 	/* windows.   That's easiest to do with setjmp.			*/
 #   ifdef PARALLEL_MARK
@@ -1589,7 +1604,6 @@
     /* be moved to the S390 category.					*/
 #   define MACH_TYPE "S370"
 #   define ALIGNMENT 4	/* Required by hardware	*/
-#   define USE_GENERIC_PUSH_REGS
 #   ifdef UTS4
 #       define OS_TYPE "UTS4"
 	extern int etext[];
@@ -1604,7 +1618,6 @@
 
 # ifdef S390
 #   define MACH_TYPE "S390"
-#   define USE_GENERIC_PUSH_REGS
 #   ifndef __s390x__
 #   define ALIGNMENT 4
 #   define CPP_WORDSZ 32
@@ -1626,13 +1639,6 @@
 #   endif
 # endif
 
-# if defined(PJ)
-#   define ALIGNMENT 4
-    extern int _etext[];
-#   define DATASTART ((ptr_t)(_etext))
-#   define HEURISTIC1
-# endif
-
 # ifdef ARM32
 #   define CPP_WORDSZ 32
 #   define MACH_TYPE "ARM32"
@@ -1647,14 +1653,12 @@
            extern char etext[];
 #          define DATASTART ((ptr_t)(etext))
 #	endif
-#       define USE_GENERIC_PUSH_REGS
 #   endif
 #   ifdef LINUX
 #       define OS_TYPE "LINUX"
 #       define HEURISTIC1
 #       undef STACK_GRAN
 #       define STACK_GRAN 0x10000000
-#       define USE_GENERIC_PUSH_REGS
 #       ifdef __ELF__
 #            define DYNAMIC_LOADING
 #	     include <features.h>
@@ -1687,13 +1691,24 @@
       /* __data_start is usually defined in the target linker script.  */
       extern int __data_start[];
 #     define DATASTART (ptr_t)(__data_start)
-#     define USE_GENERIC_PUSH_REGS
       /* __stack_base__ is set in newlib/libc/sys/arm/crt0.S  */
       extern void *__stack_base__;
 #     define STACKBOTTOM ((ptr_t) (__stack_base__))
 #   endif
 #endif
 
+# ifdef CRIS
+#   define MACH_TYPE "CRIS"
+#   define CPP_WORDSZ 32
+#   define ALIGNMENT 1
+#   define OS_TYPE "LINUX"
+#   define DYNAMIC_LOADING
+#   define LINUX_STACKBOTTOM
+#   define SEARCH_FOR_DATA_START
+      extern int _end[];
+#   define DATAEND (_end)
+# endif
+
 # ifdef SH
 #   define MACH_TYPE "SH"
 #   define ALIGNMENT 4
@@ -1704,7 +1719,6 @@
 #   ifdef LINUX
 #     define OS_TYPE "LINUX"
 #     define STACKBOTTOM ((ptr_t) 0x7c000000)
-#     define USE_GENERIC_PUSH_REGS
 #     define DYNAMIC_LOADING
 #     define SEARCH_FOR_DATA_START
       extern int _end[];
@@ -1714,7 +1728,6 @@
 #      define OS_TYPE "NETBSD"
 #      define HEURISTIC2
 #      define DATASTART GC_data_start
-#       define USE_GENERIC_PUSH_REGS
 #      define DYNAMIC_LOADING
 #   endif
 # endif
@@ -1726,6 +1739,22 @@
 #   define DATAEND /* not needed */
 # endif
 
+# ifdef M32R
+#   define CPP_WORDSZ 32
+#   define MACH_TYPE "M32R"
+#   define ALIGNMENT 4
+#   ifdef LINUX
+#     define OS_TYPE "LINUX"
+#     define LINUX_STACKBOTTOM
+#     undef STACK_GRAN
+#     define STACK_GRAN 0x10000000
+#     define DYNAMIC_LOADING
+#     define SEARCH_FOR_DATA_START
+      extern int _end[];
+#     define DATAEND (_end)
+#   endif
+# endif
+
 # ifdef X86_64
 #   define MACH_TYPE "X86_64"
 #   define ALIGNMENT 8
@@ -1734,7 +1763,6 @@
 #     define HBLKSIZE 4096
 #   endif
 #   define CACHE_LINE_SIZE 64
-#   define USE_GENERIC_PUSH_REGS
 #   ifdef LINUX
 #	define OS_TYPE "LINUX"
 #       define LINUX_STACKBOTTOM
@@ -1881,7 +1909,8 @@
 #   undef MPROTECT_VDB  /* For now.	*/
 # endif
 
-# if !defined(PCR_VDB) && !defined(PROC_VDB) && !defined(MPROTECT_VDB)
+# if !defined(PCR_VDB) && !defined(PROC_VDB) && !defined(MPROTECT_VDB) \
+    && !defined(GWW_VDB)
 #   define DEFAULT_VDB
 # endif
 
@@ -1949,15 +1978,6 @@
 #   define THREADS
 # endif
 
-# if defined(HP_PA) || defined(M88K) || defined(POWERPC) && !defined(DARWIN) \
-	     || defined(LINT) || defined(MSWINCE) || defined(ARM32) \
-	     || (defined(I386) && defined(__LCC__))
-	/* Use setjmp based hack to mark from callee-save registers.    */
-	/* The define should move to the individual platform 		*/
-	/* descriptions.						*/
-#	define USE_GENERIC_PUSH_REGS
-# endif
-
 # if defined(MSWINCE)
 #   define NO_GETENV
 # endif
diff --git a/include/private/pthread_support.h b/include/private/pthread_support.h
index b1733824..8548f4af 100644
--- a/include/private/pthread_support.h
+++ b/include/private/pthread_support.h
@@ -31,7 +31,13 @@ typedef struct GC_Thread_Rep {
     
     short flags;
 #	define FINISHED 1   	/* Thread has exited.	*/
-#	define DETACHED 2	/* Thread is intended to be detached.	*/
+#	define DETACHED 2	/* Thread is treated as detached.	*/
+    				/* Thread may really be detached, or	*/
+    				/* it may have been explicitly		*/
+    				/* registered, in which case we can	*/
+    				/* deallocate its GC_Thread_Rep once	*/
+    				/* it unregisters itself, since it	*/
+    				/* may not return a GC pointer.		*/
 #	define MAIN_THREAD 4	/* True for the original thread only.	*/
     short thread_blocked;	/* Protected by GC lock.		*/
     				/* Treated as a boolean value.  If set,	*/
@@ -49,18 +55,10 @@ typedef struct GC_Thread_Rep {
     				/* Used only to avoid premature 	*/
 				/* reclamation of any data it might 	*/
 				/* reference.				*/
+    				/* This is unfortunately also the	*/
+    				/* reason we need to intercept join	*/
+    				/* and detach.				*/
 #   ifdef THREAD_LOCAL_ALLOC
-	/* The ith free list corresponds to size i*GRANULE_BYTES	*/
-        /* Convert the number of requested bytes to a suitable free	*/
-    	/* list index, adding EXTRA_BYTES if appripriate.		*/
-#	define INDEX_FROM_REQUESTED_BYTES(n) \
-    			((ADD_SLOP(n) + GRANULE_BYTES - 1)/GRANULE_BYTES)
-        /* Convert a free list index to the actual size of objects	*/
-    	/* on that list, including extra space we added.  Not an	*/
-    	/* inverse of the above.					*/
-#	define RAW_BYTES_FROM_INDEX(i) ((i) * GRANULE_BYTES)
-#	define SMALL_ENOUGH(bytes) (ADD_SLOP(bytes) <= \
-				    (TINY_FREELISTS-1)*GRANULE_BYTES)
 	void * ptrfree_freelists[TINY_FREELISTS];
 	void * normal_freelists[TINY_FREELISTS];
 #	ifdef GC_GCJ_SUPPORT
@@ -80,6 +78,7 @@ typedef struct GC_Thread_Rep {
 #	define DIRECT_GRANULES (HBLKSIZE/GRANULE_BYTES)
 		/* Don't use local free lists for up to this much 	*/
 		/* allocation.						*/
+
 #   endif
 } * GC_thread;
 
diff --git a/mach_dep.c b/mach_dep.c
index b11ad6a6..7fc025b1 100644
--- a/mach_dep.c
+++ b/mach_dep.c
@@ -69,78 +69,17 @@ asm static void PushMacRegisters()
 # endif
 
 /* Routine to mark from registers that are preserved by the C compiler. */
-/* This must be ported to every new architecture.  There is a generic   */
-/* version at the end, that is likely, but not guaranteed to work       */
-/* on your architecture.  Run the test_setjmp program to see whether    */
-/* there is any chance it will work.                                    */
+/* This must be ported to every new architecture.  It is now optional,	*/
+/* and should not be used on platforms that are either UNIX-like, or	*/
+/* require thread support.						*/
 
-#if !defined(USE_GENERIC_PUSH_REGS) && !defined(USE_ASM_PUSH_REGS)
 #undef HAVE_PUSH_REGS
+
+#if defined(USE_ASM_PUSH_REGS)
+#  define HAVE_PUSH_REGS
+#else  /* No asm implementation */
 void GC_push_regs()
 {
-#       ifdef RT
-	  register long TMP_SP; /* must be bound to r11 */
-#       endif
-
-#       ifdef VAX
-	  /* VAX - generic code below does not work under 4.2 */
-	  /* r1 through r5 are caller save, and therefore     */
-	  /* on the stack or dead.                            */
-	  asm("pushl r11");     asm("calls $1,_GC_push_one");
-	  asm("pushl r10"); 	asm("calls $1,_GC_push_one");
-	  asm("pushl r9");	asm("calls $1,_GC_push_one");
-	  asm("pushl r8");	asm("calls $1,_GC_push_one");
-	  asm("pushl r7");	asm("calls $1,_GC_push_one");
-	  asm("pushl r6");	asm("calls $1,_GC_push_one");
-#	  define HAVE_PUSH_REGS
-#       endif
-#       if defined(M68K) && defined(NEXT)
-	/*  M68K SUNOS - could be replaced by generic code */
-	  /* a0, a1 and d1 are caller save          */
-	  /*  and therefore are on stack or dead.   */
-	
-	  asm("subqw #0x4,sp");		/* allocate word on top of stack */
-
-	  asm("movl a2,sp@");	asm("jbsr _GC_push_one");
-	  asm("movl a3,sp@");	asm("jbsr _GC_push_one");
-	  asm("movl a4,sp@");	asm("jbsr _GC_push_one");
-	  asm("movl a5,sp@");	asm("jbsr _GC_push_one");
-	  /* Skip frame pointer and stack pointer */
-	  asm("movl d1,sp@");	asm("jbsr _GC_push_one");
-	  asm("movl d2,sp@");	asm("jbsr _GC_push_one");
-	  asm("movl d3,sp@");	asm("jbsr _GC_push_one");
-	  asm("movl d4,sp@");	asm("jbsr _GC_push_one");
-	  asm("movl d5,sp@");	asm("jbsr _GC_push_one");
-	  asm("movl d6,sp@");	asm("jbsr _GC_push_one");
-	  asm("movl d7,sp@");	asm("jbsr _GC_push_one");
-
-	  asm("addqw #0x4,sp");		/* put stack back where it was	*/
-#	  define HAVE_PUSH_REGS
-#       endif
-
-#       if defined(M68K) && defined(HP)
-	/*  M68K HP - could be replaced by generic code */
-	  /* a0, a1 and d1 are caller save.  */
-	
-	  asm("subq.w &0x4,%sp");	/* allocate word on top of stack */
-
-	  asm("mov.l %a2,(%sp)"); asm("jsr _GC_push_one");
-	  asm("mov.l %a3,(%sp)"); asm("jsr _GC_push_one");
-	  asm("mov.l %a4,(%sp)"); asm("jsr _GC_push_one");
-	  asm("mov.l %a5,(%sp)"); asm("jsr _GC_push_one");
-	  /* Skip frame pointer and stack pointer */
-	  asm("mov.l %d1,(%sp)"); asm("jsr _GC_push_one");
-	  asm("mov.l %d2,(%sp)"); asm("jsr _GC_push_one");
-	  asm("mov.l %d3,(%sp)"); asm("jsr _GC_push_one");
-	  asm("mov.l %d4,(%sp)"); asm("jsr _GC_push_one");
-	  asm("mov.l %d5,(%sp)"); asm("jsr _GC_push_one");
-	  asm("mov.l %d6,(%sp)"); asm("jsr _GC_push_one");
-	  asm("mov.l %d7,(%sp)"); asm("jsr _GC_push_one");
-
-	  asm("addq.w &0x4,%sp");	/* put stack back where it was	*/
-#	  define HAVE_PUSH_REGS
-#       endif /* M68K HP */
-
 #	if defined(M68K) && defined(AMIGA)
  	 /*  AMIGA - could be replaced by generic code 			*/
  	 /* a0, a1, d0 and d1 are caller save */
@@ -211,18 +150,7 @@ void GC_push_regs()
 #	endif	/* __MWERKS__ */
 #   endif	/* MACOS */
 
-#       if defined(I386) &&!defined(OS2) &&!defined(SVR4) \
-	&& (defined(__MINGW32__) || !defined(MSWIN32)) \
-	&& !defined(SCO) && !defined(SCO_ELF) \
- 	&& !(defined(LINUX) && defined(__ELF__)) \
-	&& !(defined(FREEBSD) && defined(__ELF__)) \
-	&& !(defined(NETBSD) && defined(__ELF__)) \
-	&& !(defined(OPENBSD) && defined(__ELF__)) \
-	&& !(defined(BEOS) && defined(__ELF__)) \
-	&& !defined(DOS4GW) && !defined(HURD)
-	/* I386 code, generic code does not appear to work */
-	/* It does appear to work under OS2, and asms dont */
-	/* This is used for some 38g UNIX variants and for CYGWIN32 */
+#       if defined(I386) && (defined(__MINGW32__) || defined(CYGWIN32))
 	  asm("pushl %eax");  asm("call _GC_push_one"); asm("addl $4,%esp");
 	  asm("pushl %ecx");  asm("call _GC_push_one"); asm("addl $4,%esp");
 	  asm("pushl %edx");  asm("call _GC_push_one"); asm("addl $4,%esp");
@@ -233,43 +161,8 @@ void GC_push_regs()
 #	  define HAVE_PUSH_REGS
 #       endif
 
-#	if ( defined(I386) && defined(LINUX) && defined(__ELF__) ) \
-	|| ( defined(I386) && defined(FREEBSD) && defined(__ELF__) ) \
-	|| ( defined(I386) && defined(NETBSD) && defined(__ELF__) ) \
-	|| ( defined(I386) && defined(OPENBSD) && defined(__ELF__) ) \
-	|| ( defined(I386) && defined(HURD) && defined(__ELF__) ) \
-	|| ( defined(I386) && defined(DGUX) )
-
-	/* This is modified for Linux with ELF (Note: _ELF_ only) */
-	/* This section handles FreeBSD with ELF. */
-	/* Eax is caller-save and dead here.  Other caller-save 	*/
-	/* registers could also be skipped.  We assume there are no	*/
-	/* pointers in MMX registers, etc.				*/
-	/* We combine instructions in a single asm to prevent gcc from 	*/
-	/* inserting code in the middle.				*/
-	  asm("pushl %ecx; call GC_push_one; addl $4,%esp");
-	  asm("pushl %edx; call GC_push_one; addl $4,%esp");
-	  asm("pushl %ebp; call GC_push_one; addl $4,%esp");
-	  asm("pushl %esi; call GC_push_one; addl $4,%esp");
-	  asm("pushl %edi; call GC_push_one; addl $4,%esp");
-	  asm("pushl %ebx; call GC_push_one; addl $4,%esp");
-#	  define HAVE_PUSH_REGS
-#	endif
-
-#	if ( defined(I386) && defined(BEOS) && defined(__ELF__) )
-	/* As far as I can understand from				*/
-	/* http://www.beunited.org/articles/jbq/nasm.shtml,		*/
-	/* only ebp, esi, edi and ebx are not scratch. How MMX 		*/
-	/* etc. registers should be treated, I have no idea. 		*/
-	  asm("pushl %ebp; call GC_push_one; addl $4,%esp");
-	  asm("pushl %esi; call GC_push_one; addl $4,%esp");
-	  asm("pushl %edi; call GC_push_one; addl $4,%esp");
-	  asm("pushl %ebx; call GC_push_one; addl $4,%esp");
-#	  define HAVE_PUSH_REGS
-#       endif
-
 #       if defined(I386) && defined(MSWIN32) && !defined(__MINGW32__) \
-	   && !defined(USE_GENERIC)
+	   && !defined(CYGWIN32)
 	/* I386 code, Microsoft variant		*/
 	  __asm  push eax
 	  __asm  call GC_push_one
@@ -294,178 +187,89 @@ void GC_push_regs()
 	  __asm  add esp,4
 #	  define HAVE_PUSH_REGS
 #       endif
+}
+#endif /* !USE_ASM_PUSH_REGS */
 
-#       if defined(I386) && (defined(SVR4) || defined(SCO) || defined(SCO_ELF))
-	/* I386 code, SVR4 variant, generic code does not appear to work */
-	  asm("pushl %eax");  asm("call GC_push_one"); asm("addl $4,%esp");
-	  asm("pushl %ebx");  asm("call GC_push_one"); asm("addl $4,%esp");
-	  asm("pushl %ecx");  asm("call GC_push_one"); asm("addl $4,%esp");
-	  asm("pushl %edx");  asm("call GC_push_one"); asm("addl $4,%esp");
-	  asm("pushl %ebp");  asm("call GC_push_one"); asm("addl $4,%esp");
-	  asm("pushl %esi");  asm("call GC_push_one"); asm("addl $4,%esp");
-	  asm("pushl %edi");  asm("call GC_push_one"); asm("addl $4,%esp");
-#	  define HAVE_PUSH_REGS
-#       endif
-
-#       ifdef NS32K
-	  asm ("movd r3, tos"); asm ("bsr ?_GC_push_one"); asm ("adjspb $-4");
-	  asm ("movd r4, tos"); asm ("bsr ?_GC_push_one"); asm ("adjspb $-4");
-	  asm ("movd r5, tos"); asm ("bsr ?_GC_push_one"); asm ("adjspb $-4");
-	  asm ("movd r6, tos"); asm ("bsr ?_GC_push_one"); asm ("adjspb $-4");
-	  asm ("movd r7, tos"); asm ("bsr ?_GC_push_one"); asm ("adjspb $-4");
-#	  define HAVE_PUSH_REGS
-#       endif
-
-#       if defined(SPARC)
-	  GC_save_regs_ret_val = GC_save_regs_in_stack();
-#	  define HAVE_PUSH_REGS
-#       endif
-
-#	ifdef RT
-	    GC_push_one(TMP_SP);    /* GC_push_one from r11 */
-
-	    asm("cas r11, r6, r0"); GC_push_one(TMP_SP);	/* r6 */
-	    asm("cas r11, r7, r0"); GC_push_one(TMP_SP);	/* through */
-	    asm("cas r11, r8, r0"); GC_push_one(TMP_SP);	/* r10 */
-	    asm("cas r11, r9, r0"); GC_push_one(TMP_SP);
-	    asm("cas r11, r10, r0"); GC_push_one(TMP_SP);
-
-	    asm("cas r11, r12, r0"); GC_push_one(TMP_SP); /* r12 */
-	    asm("cas r11, r13, r0"); GC_push_one(TMP_SP); /* through */
-	    asm("cas r11, r14, r0"); GC_push_one(TMP_SP); /* r15 */
-	    asm("cas r11, r15, r0"); GC_push_one(TMP_SP);
-#	    define HAVE_PUSH_REGS
-#       endif
-
-#       if defined(M68K) && defined(SYSV)
-  	/*  Once again similar to SUN and HP, though setjmp appears to work.
-  		--Parag
-  	 */
-#        ifdef __GNUC__
-  	  asm("subqw #0x4,%sp");	/* allocate word on top of stack */
-  
-  	  asm("movl %a2,%sp@");	asm("jbsr GC_push_one");
-  	  asm("movl %a3,%sp@");	asm("jbsr GC_push_one");
-  	  asm("movl %a4,%sp@");	asm("jbsr GC_push_one");
-  	  asm("movl %a5,%sp@");	asm("jbsr GC_push_one");
-  	  /* Skip frame pointer and stack pointer */
-  	  asm("movl %d1,%sp@");	asm("jbsr GC_push_one");
-  	  asm("movl %d2,%sp@");	asm("jbsr GC_push_one");
-  	  asm("movl %d3,%sp@");	asm("jbsr GC_push_one");
-  	  asm("movl %d4,%sp@");	asm("jbsr GC_push_one");
-  	  asm("movl %d5,%sp@");	asm("jbsr GC_push_one");
-  	  asm("movl %d6,%sp@");	asm("jbsr GC_push_one");
-  	  asm("movl %d7,%sp@");	asm("jbsr GC_push_one");
-  
-  	  asm("addqw #0x4,%sp");	/* put stack back where it was	*/
-#	  define HAVE_PUSH_REGS
-#        else /* !__GNUC__*/
-  	  asm("subq.w &0x4,%sp");	/* allocate word on top of stack */
-  
-  	  asm("mov.l %a2,(%sp)"); asm("jsr GC_push_one");
-  	  asm("mov.l %a3,(%sp)"); asm("jsr GC_push_one");
-  	  asm("mov.l %a4,(%sp)"); asm("jsr GC_push_one");
-  	  asm("mov.l %a5,(%sp)"); asm("jsr GC_push_one");
-  	  /* Skip frame pointer and stack pointer */
-  	  asm("mov.l %d1,(%sp)"); asm("jsr GC_push_one");
-  	  asm("mov.l %d2,(%sp)"); asm("jsr GC_push_one");
-  	  asm("mov.l %d3,(%sp)"); asm("jsr GC_push_one");
-  	  asm("mov.l %d4,(%sp)"); asm("jsr GC_push_one");
-  	  asm("mov.l %d5,(%sp)"); asm("jsr GC_push_one");
-   	  asm("mov.l %d6,(%sp)"); asm("jsr GC_push_one");
-  	  asm("mov.l %d7,(%sp)"); asm("jsr GC_push_one");
-  
-  	  asm("addq.w &0x4,%sp");	/* put stack back where it was	*/
-#	  define HAVE_PUSH_REGS
-#        endif /* !__GNUC__ */
-#       endif /* M68K/SYSV */
+#if defined(HAVE_PUSH_REGS) && defined(THREADS)
+# error GC_push_regs cannot be used with threads
+# undef HAVE_PUSH_REGS
+#endif
 
-#     if defined(PJ)
-	{
-	    register int * sp asm ("optop");
-	    extern int *__libc_stack_end;
+#if !defined(HAVE_PUSH_REGS) && defined(UNIX_LIKE)
+# include <ucontext.h>
+#endif
 
-	    GC_push_all_stack (sp, __libc_stack_end);
-#	    define HAVE_PUSH_REGS
-	    /* Isn't this redundant with the code to push the stack? */
+/* Ensure that either registers are pushed, or callee-save registers	*/
+/* are somewhere on the stack, and then call fn(arg, ctxt).		*/
+/* ctxt is either a pointer to a ucontext_t we generated, or NULL.	*/
+void GC_with_callee_saves_pushed(void (*fn)(ptr_t, void *),
+				 ptr_t arg)
+{
+    word dummy;
+    void * context = 0;
+
+#   if defined(HAVE_PUSH_REGS)
+      GC_push_regs();
+#   elif defined(UNIX_LIKE)
+      ucontext_t ctxt;
+      getcontext(&ctxt);
+      context = &ctxt;
+#     if defined(SPARC) || defined(IA64)
+        /* On a register window machine, we need to save register	*/
+        /* contents on the stack for this to work.  This may already be	*/
+        /* subsumed by the getcontext() call.				*/
+        {
+          GC_save_regs_ret_val = GC_save_regs_in_stack();
         }
-#     endif
-
-      /* other machines... */
-#       if !defined(HAVE_PUSH_REGS)
-	    --> We just generated an empty GC_push_regs, which
-	    --> is almost certainly broken.  Try defining
-	    --> USE_GENERIC_PUSH_REGS instead.
+#     endif /* register windows. */
+#   elif defined(HAVE_BUILTIN_UNWIND_INIT)
+      /* This was suggested by Richard Henderson as the way to	*/
+      /* force callee-save registers and register windows onto	*/
+      /* the stack.						*/
+      __builtin_unwind_init();
+#   else /* !HAVE_BUILTIN_UNWIND_INIT && !UNIX_LIKE  */
+         /* && !HAVE_PUSH_REGS			     */
+        /* Generic code                          */
+        /* The idea is due to Parag Patel at HP. */
+        /* We're not sure whether he would like  */
+        /* to be acknowledged for it or not.     */
+        jmp_buf regs;
+        register word * i = (word *) regs;
+        register ptr_t lim = (ptr_t)(regs) + (sizeof regs);
+  
+        /* Setjmp doesn't always clear all of the buffer.		*/
+        /* That tends to preserve garbage.  Clear it.   		*/
+  	for (; (char *)i < lim; i++) {
+  	    *i = 0;
+  	}
+#       if defined(POWERPC) || defined(MSWIN32) || defined(MSWINCE) \
+                  || defined(UTS4) || defined(LINUX) || defined(EWS4800)
+  	  (void) setjmp(regs);
+#       else
+          (void) _setjmp(regs);
+  	  /* We don't want to mess with signals. According to	*/
+  	  /* SUSV3, setjmp() may or may not save signal mask.	*/
+  	  /* _setjmp won't, but is less portable.		*/
 #       endif
+#   endif /* !HAVE_PUSH_REGS ... */
+    fn(arg, context);
+    /* Strongly discourage the compiler from treating the above	*/
+    /* as a tail-call, since that would pop the register 	*/
+    /* contents before we get a chance to look at them.		*/
+    GC_noop1((word)(&dummy));
 }
-#endif /* !USE_GENERIC_PUSH_REGS && !USE_ASM_PUSH_REGS */
 
-#if defined(USE_GENERIC_PUSH_REGS)
-void GC_generic_push_regs(cold_gc_frame)
+void GC_push_regs_and_stack(cold_gc_frame)
 ptr_t cold_gc_frame;
 {
-	{
-	    word dummy;
-
-#	    ifdef HAVE_BUILTIN_UNWIND_INIT
-	      /* This was suggested by Richard Henderson as the way to	*/
-	      /* force callee-save registers and register windows onto	*/
-	      /* the stack.						*/
-	      __builtin_unwind_init();
-#	    else /* !HAVE_BUILTIN_UNWIND_INIT */
-	      /* Generic code                          */
-	      /* The idea is due to Parag Patel at HP. */
-	      /* We're not sure whether he would like  */
-	      /* to be he acknowledged for it or not.  */
-	      jmp_buf regs;
-	      register word * i = (word *) regs;
-	      register ptr_t lim = (ptr_t)(regs) + (sizeof regs);
-
-	      /* Setjmp doesn't always clear all of the buffer.		*/
-	      /* That tends to preserve garbage.  Clear it.   		*/
-		for (; (char *)i < lim; i++) {
-		    *i = 0;
-		}
-#	      if defined(POWERPC) || defined(MSWIN32) || defined(MSWINCE) \
-                || defined(UTS4) || defined(LINUX) || defined(EWS4800)
-		  (void) setjmp(regs);
-#	      else
-	          (void) _setjmp(regs);
-		  /* We don't want to mess with signals. According to	*/
-		  /* SUSV3, setjmp() may or may not save signal mask.	*/
-		  /* _setjmp won't, but is less portable.		*/
-#	      endif
-#	    endif /* !HAVE_BUILTIN_UNWIND_INIT */
-#           if (defined(SPARC) && !defined(HAVE_BUILTIN_UNWIND_INIT)) \
-		|| defined(IA64)
-	      /* On a register window machine, we need to save register	*/
-	      /* contents on the stack for this to work.  The setjmp	*/
-	      /* is probably not needed on SPARC, since pointers are	*/
-	      /* only stored in windowed or scratch registers.  It is	*/
-	      /* needed on IA64, since some non-windowed registers are	*/
-	      /* preserved.						*/
-	      {
-	        GC_save_regs_ret_val = GC_save_regs_in_stack();
-		/* On IA64 gcc, could use __builtin_ia64_flushrs() and	*/
-		/* __builtin_ia64_flushrs().  The latter will be done	*/
-		/* implicitly by __builtin_unwind_init() for gcc3.0.1	*/
-		/* and later.						*/
-	      }
-#           endif
-	    GC_push_current_stack(cold_gc_frame);
-	    /* Strongly discourage the compiler from treating the above	*/
-	    /* as a tail-call, since that would pop the register 	*/
-	    /* contents before we get a chance to look at them.		*/
-	    GC_noop1((word)(&dummy));
-	}
+    GC_with_callee_saves_pushed(GC_push_current_stack, cold_gc_frame);
 }
-#endif /* USE_GENERIC_PUSH_REGS */
 
 /* On register window machines, we need a way to force registers into 	*/
 /* the stack.	Return sp.						*/
 # ifdef SPARC
     asm("	.seg 	\"text\"");
-#   if defined(SVR4) || defined(NETBSD)
+#   if defined(SVR4) || defined(NETBSD) || defined(FREEBSD)
       asm("	.globl	GC_save_regs_in_stack");
       asm("GC_save_regs_in_stack:");
       asm("	.type GC_save_regs_in_stack,#function");
@@ -492,53 +296,6 @@ ptr_t cold_gc_frame;
 #   endif
 # endif
 
-/* On IA64, we also need to flush register windows.  But they end	*/
-/* up on the other side of the stack segment.				*/
-/* Returns the backing store pointer for the register stack.		*/
-/* We now implement this as a separate assembly file, since inline	*/
-/* assembly code here doesn't work with either the Intel or HP 		*/
-/* compilers.								*/
-# if 0
-#   ifdef LINUX
-	asm("        .text");
-	asm("        .psr abi64");
-	asm("        .psr lsb");
-	asm("        .lsb");
-	asm("");
-	asm("        .text");
-	asm("        .align 16");
-	asm("        .global GC_save_regs_in_stack");
-	asm("        .proc GC_save_regs_in_stack");
-	asm("GC_save_regs_in_stack:");
-	asm("        .body");
-	asm("        flushrs");
-	asm("        ;;");
-	asm("        mov r8=ar.bsp");
-	asm("        br.ret.sptk.few rp");
-	asm("        .endp GC_save_regs_in_stack");
-#   endif /* LINUX */
-#   if 0 /* Other alternatives that don't work on HP/UX */
-	word GC_save_regs_in_stack() {
-#	  if USE_BUILTINS
-	    __builtin_ia64_flushrs();
-	    return __builtin_ia64_bsp();
-#	  else
-#	    ifdef HPUX
-	      _asm("        flushrs");
-	      _asm("        ;;");
-	      _asm("        mov r8=ar.bsp");
-	      _asm("        br.ret.sptk.few rp");
-#	    else
-	      asm("        flushrs");
-	      asm("        ;;");
-	      asm("        mov r8=ar.bsp");
-	      asm("        br.ret.sptk.few rp");
-#	    endif
-#	  endif
-	}
-#   endif
-# endif
-
 /* GC_clear_stack_inner(arg, limit) clears stack area up to limit and	*/
 /* returns arg.  Stack clearing is crucial on SPARC, so we supply	*/
 /* an assembly version that's more careful.  Assumes limit is hotter	*/
diff --git a/malloc.c b/malloc.c
index 879aa945..a36956ab 100644
--- a/malloc.c
+++ b/malloc.c
@@ -69,7 +69,6 @@ ptr_t GC_alloc_large(size_t lb, int k, unsigned flags)
 	        GC_max_large_allocd_bytes = GC_large_allocd_bytes;
 	}
 	result = h -> hb_body;
-	GC_bytes_wasted += total_bytes - lb;
     }
     return result;
 }
@@ -99,14 +98,13 @@ ptr_t GC_alloc_large_and_clear(size_t lb, int k, unsigned flags)
 /* hold lock:					*/
 void * GC_generic_malloc_inner(size_t lb, int k)
 {
-size_t lg;
-void *op;
-void **opp;
+    void *op;
 
     if(SMALL_OBJ(lb)) {
-        register struct obj_kind * kind = GC_obj_kinds + k;
-	lg = GC_size_map[lb];
-	opp = &(kind -> ok_freelist[lg]);
+        struct obj_kind * kind = GC_obj_kinds + k;
+	size_t lg = GC_size_map[lb];
+	void ** opp = &(kind -> ok_freelist[lg]);
+
         if( (op = *opp) == 0 ) {
 	    if (GC_size_map[lb] == 0) {
 	      if (!GC_is_initialized)  GC_init_inner();
@@ -121,10 +119,11 @@ void **opp;
         }
         *opp = obj_link(op);
         obj_link(op) = 0;
+        GC_bytes_allocd += GRANULES_TO_BYTES(lg);
     } else {
 	op = (ptr_t)GC_alloc_large_and_clear(ADD_SLOP(lb), k, 0);
+        GC_bytes_allocd += lb;
     }
-    GC_bytes_allocd += GRANULES_TO_BYTES(lg);
     
 out:
     return op;
@@ -202,7 +201,11 @@ void * GC_generic_malloc(size_t lb, int k)
 /* the stack.								*/
 
 /* Allocate lb bytes of atomic (pointerfree) data */
-void * GC_malloc_atomic(size_t lb)
+#ifdef THREAD_LOCAL_ALLOC
+  void * GC_core_malloc_atomic(size_t lb)
+#else
+  void * GC_malloc_atomic(size_t lb)
+#endif
 {
     void *op;
     void ** opp;
@@ -227,7 +230,11 @@ void * GC_malloc_atomic(size_t lb)
 }
 
 /* Allocate lb bytes of composite (pointerful) data */
-void * GC_malloc(size_t lb)
+#ifdef THREAD_LOCAL_ALLOC
+  void * GC_core_malloc(size_t lb)
+#else
+  void * GC_malloc(size_t lb)
+#endif
 {
     void *op;
     void **opp;
diff --git a/mallocx.c b/mallocx.c
index 11567ff3..761514da 100644
--- a/mallocx.c
+++ b/mallocx.c
@@ -163,7 +163,8 @@ void * realloc(void * p, size_t lb)
 # endif /* REDIRECT_REALLOC */
 
 
-/* The same thing, except caller does not hold allocation lock.	*/
+/* Allocate memory such that only pointers to near the          */
+/* beginning of the object are considered.                      */
 /* We avoid holding allocation lock while we clear memory.	*/
 void * GC_generic_malloc_ignore_off_page(size_t lb, int k)
 {
@@ -233,7 +234,7 @@ void GC_incr_bytes_freed(size_t n)
     GC_bytes_freed += n;
 }
 
-#if defined(THREADS) && !defined(SRC_M3)
+#if defined(THREADS)
 
 extern signed_word GC_bytes_found;   /* Protected by GC lock.  */
 
diff --git a/mark.c b/mark.c
index 2197474d..ea255bf8 100644
--- a/mark.c
+++ b/mark.c
@@ -103,7 +103,7 @@ word GC_n_rescuing_pages;	/* Number of dirty pages we marked from */
 
 mse * GC_mark_stack;
 
-mse * GC_mark_stack_limit;;
+mse * GC_mark_stack_limit;
 
 size_t GC_mark_stack_size = 0;
  
@@ -1371,6 +1371,11 @@ struct GC_ms_entry *GC_mark_and_push(void *obj,
 	GC_ADD_TO_BLACK_LIST_NORMAL(p, src);
 	return;
     }
+#   if defined(MANUAL_VDB) && defined(THREADS)
+      /* Pointer is on the stack.  We may have dirtied the object	*/
+      /* it points to, but not yet have called GC_dirty();	*/
+      GC_dirty(p);	/* Implicitly affects entire object.	*/
+#   endif
     PUSH_CONTENTS_HDR(r, GC_mark_stack_top, GC_mark_stack_limit,
 		      source, mark_and_push_exit, hhdr, FALSE);
   mark_and_push_exit: ;
@@ -1461,6 +1466,10 @@ void GC_push_all_eager(ptr_t bottom, ptr_t top)
  * register values are not lost.
  * Cold_gc_frame delimits the stack section that must be scanned
  * eagerly.  A zero value indicates that no eager scanning is needed.
+ * We don't need to worry about the MANUAL_VDB case here, since this
+ * is only called in the single-threaded case.  We assume that we
+ * cannot collect between an assignment and the corresponding
+ * GC_dirty() call.
  */
 void GC_push_all_stack_partially_eager(ptr_t bottom, ptr_t top,
 				       ptr_t cold_gc_frame)
@@ -1493,11 +1502,15 @@ void GC_push_all_stack_partially_eager(ptr_t bottom, ptr_t top,
 
 void GC_push_all_stack(ptr_t bottom, ptr_t top)
 {
-  if (!NEED_FIXUP_POINTER && GC_all_interior_pointers) {
-    GC_push_all(bottom, top);
-  } else {
+# if defined(THREADS) && defined(MPROTECT_VDB)
     GC_push_all_eager(bottom, top);
-  }
+# else
+    if (!NEED_FIXUP_POINTER && GC_all_interior_pointers) {
+      GC_push_all(bottom, top);
+    } else {
+      GC_push_all_eager(bottom, top);
+    }
+# endif
 }
 
 #if !defined(SMALL_CONFIG) && !defined(USE_MARK_BYTES)
diff --git a/mark_rts.c b/mark_rts.c
index 50ac09ac..19ea80a9 100644
--- a/mark_rts.c
+++ b/mark_rts.c
@@ -352,8 +352,11 @@ GC_bool GC_is_tmp_root(ptr_t p)
 
 ptr_t GC_approx_sp(void)
 {
-    word dummy;
+    volatile word dummy;
 
+    dummy = 42;	/* Force stack to grow if necessary.	Otherwise the	*/
+    		/* later accesses might cause the kernel to think we're	*/
+    		/* doing something wrong.				*/
 #   ifdef _MSC_VER
 #     pragma warning(disable:4172)
 #   endif
@@ -456,8 +459,9 @@ void GC_push_conditional_with_exclusions(ptr_t bottom, ptr_t top, GC_bool all)
  * In the presence of threads, push enough of the current stack
  * to ensure that callee-save registers saved in collector frames have been
  * seen.
+ * FIXME: Merge with per-thread stuff.
  */
-void GC_push_current_stack(ptr_t cold_gc_frame)
+void GC_push_current_stack(ptr_t cold_gc_frame, void * context)
 {
 #   if defined(THREADS)
 	if (0 == cold_gc_frame) return;
@@ -523,8 +527,8 @@ void GC_push_gc_structures(void)
 
 void GC_cond_register_dynamic_libraries(void)
 {
-# if (defined(DYNAMIC_LOADING) || defined(MSWIN32) || defined(MSWINCE) \
-     || defined(PCR)) && !defined(SRC_M3)
+# if defined(DYNAMIC_LOADING) || defined(MSWIN32) || defined(MSWINCE) \
+     || defined(PCR)
     GC_remove_tmp_roots();
     if (!GC_no_dls) GC_register_dynamic_libraries();
 # else
@@ -597,23 +601,11 @@ void GC_push_roots(GC_bool all, ptr_t cold_gc_frame)
      * Now traverse stacks, and mark from register contents.
      * These must be done last, since they can legitimately overflow
      * the mark stack.
+     * This is usually done by saving the current context on the
+     * stack, and then just tracing from the stack.
      */
-#   ifdef USE_GENERIC_PUSH_REGS
-	GC_generic_push_regs(cold_gc_frame);
-	/* Also pushes stack, so that we catch callee-save registers	*/
-	/* saved inside the GC_push_regs frame.				*/
-#   else
-       /*
-        * push registers - i.e., call GC_push_one(r) for each
-        * register contents r.
-        */
-        GC_push_regs(); /* usually defined in machine_dep.c */
-	GC_push_current_stack(cold_gc_frame);
-	/* In the threads case, this only pushes collector frames.      */
-	/* In the case of linux threads on IA64, the hot section of	*/
-	/* the main stack is marked here, but the register stack	*/
-	/* backing store is handled in the threads-specific code.	*/
-#   endif
+      GC_push_regs_and_stack(cold_gc_frame);
+
     if (GC_push_other_roots != 0) (*GC_push_other_roots)();
     	/* In the threads case, this also pushes thread stacks.	*/
         /* Note that without interior pointer recognition lots	*/
diff --git a/misc.c b/misc.c
index b286026d..fe082f86 100644
--- a/misc.c
+++ b/misc.c
@@ -47,38 +47,25 @@
 #   ifdef PCR
 #     include "il/PCR_IL.h"
       PCR_Th_ML GC_allocate_ml;
-#   else
-#     ifdef SRC_M3
-	/* Critical section counter is defined in the M3 runtime 	*/
-	/* That's all we use.						*/
+#   elif defined(GC_WIN32_THREADS) 
+#     if defined(GC_PTHREADS)
+	pthread_mutex_t GC_allocate_ml = PTHREAD_MUTEX_INITIALIZER;
+#     elif defined(GC_DLL)
+	 __declspec(dllexport) CRITICAL_SECTION GC_allocate_ml;
 #     else
-#	ifdef GC_SOLARIS_THREADS
-	  mutex_t GC_allocate_ml;	/* Implicitly initialized.	*/
-#	else
-#          if defined(GC_WIN32_THREADS) 
-#             if defined(GC_PTHREADS)
-		  pthread_mutex_t GC_allocate_ml = PTHREAD_MUTEX_INITIALIZER;
-#	      elif defined(GC_DLL)
-		 __declspec(dllexport) CRITICAL_SECTION GC_allocate_ml;
-#	      else
-		 CRITICAL_SECTION GC_allocate_ml;
-#	      endif
-#          else
-#             if defined(GC_PTHREADS) && !defined(GC_SOLARIS_THREADS)
-#		if defined(USE_SPIN_LOCK)
-	          pthread_t GC_lock_holder = NO_THREAD;
-#	        else
-		  pthread_mutex_t GC_allocate_ml = PTHREAD_MUTEX_INITIALIZER;
-	          pthread_t GC_lock_holder = NO_THREAD;
-			/* Used only for assertions, and to prevent	 */
-			/* recursive reentry in the system call wrapper. */
-#		endif 
-#    	      else
-	          --> declare allocator lock here
-#	      endif
-#	   endif
-#	endif
+	 CRITICAL_SECTION GC_allocate_ml;
 #     endif
+#   elif defined(GC_PTHREADS)
+#     if defined(USE_SPIN_LOCK)
+        pthread_t GC_lock_holder = NO_THREAD;
+#     else
+	pthread_mutex_t GC_allocate_ml = PTHREAD_MUTEX_INITIALIZER;
+	pthread_t GC_lock_holder = NO_THREAD;
+		/* Used only for assertions, and to prevent	 */
+		/* recursive reentry in the system call wrapper. */
+#     endif 
+#   else
+       --> declare allocator lock here
 #   endif
 # endif
 
@@ -661,7 +648,7 @@ void GC_init_inner()
 #   if !defined(THREADS) || defined(GC_PTHREADS) || defined(GC_WIN32_THREADS) \
 	|| defined(GC_SOLARIS_THREADS)
       if (GC_stackbottom == 0) {
-	GC_stackbottom = GC_get_stack_base();
+	GC_stackbottom = GC_get_main_stack_base();
 #       if (defined(LINUX) || defined(HPUX)) && defined(IA64)
 	  GC_register_stackbottom = GC_get_register_stack_base();
 #       endif
@@ -786,7 +773,10 @@ void GC_init_inner()
 
 void GC_enable_incremental(void)
 {
-# if !defined(SMALL_CONFIG)
+# if !defined(SMALL_CONFIG) && !defined(KEEP_BACK_PTRS)
+  /* If we are keeping back pointers, the GC itself dirties all	*/
+  /* pages on which objects have been marked, making 		*/
+  /* incremental GC pointless.					*/
   if (!GC_find_leak) {
     DCL_LOCK_STATE;
     
@@ -797,6 +787,7 @@ void GC_enable_incremental(void)
 #   ifndef GC_SOLARIS_THREADS 
       maybe_install_looping_handler();  /* Before write fault handler! */
       GC_dirty_init();
+      if (!GC_dirty_maintained) goto out;
 #   endif
     if (!GC_is_initialized) {
         GC_init_inner();
@@ -941,6 +932,9 @@ int GC_write(fd, buf, len)
 #endif
 
 #define BUFSZ 1024
+#ifdef _MSC_VER
+# define vsnprintf _vsnprintf
+#endif
 /* A version of printf that is unlikely to call malloc, and is thus safer */
 /* to call from the collector in case malloc has been bound to GC_malloc. */
 /* Floating point arguments ans formats should be avoided, since fp	  */
@@ -1126,6 +1120,19 @@ int GC_new_proc(GC_mark_proc proc)
     return result;
 }
 
+/* Call fn(&sb, arg), with sb.mem_base pointing into this frame.	*/
+void * GC_call_with_stack_base(GC_stack_base_func fn, void *arg)
+{
+    int frame_marker;
+    struct GC_stack_base sb;
+    sb.mem_base = (void *)&frame_marker;
+#   ifdef IA64
+      /* Needlessly flushes the register stack; probably harmless.	*/
+      sb.reg_base = GC_save_regs_in_stack();
+#   endif
+
+    return fn(&sb, arg);
+}
 
 #if !defined(NO_DEBUGGING)
 
diff --git a/os_dep.c b/os_dep.c
index 45037b14..b83e2a2d 100644
--- a/os_dep.c
+++ b/os_dep.c
@@ -641,22 +641,33 @@ word GC_get_writable_length(ptr_t p, ptr_t *base)
     return(buf.RegionSize);
 }
 
-ptr_t GC_get_stack_base(void)
+int GC_get_stack_base(struct GC_stack_base *sb)
 {
     int dummy;
     ptr_t sp = (ptr_t)(&dummy);
     ptr_t trunc_sp = (ptr_t)((word)sp & ~(GC_page_size - 1));
     word size = GC_get_writable_length(trunc_sp, 0);
    
-    return(trunc_sp + size);
+    sb -> mem_base = trunc_sp + size;
+    return GC_SUCCESS;
 }
 
+#define HAVE_GET_STACK_BASE
+
+/* Only ever invoked from the main thread: returns its stack base.	*/
+ptr_t GC_get_main_stack_base(void)
+{
+    struct GC_stack_base base;
+
+    (void)GC_get_stack_base(&base);
+    return (ptr_t)(base.mem_base);
+}
 
 # endif /* MS Windows */
 
 # ifdef BEOS
 # include <kernel/OS.h>
-ptr_t GC_get_stack_base(void){
+ptr_t GC_get_main_stack_base(void){
 	thread_info th;
 	get_thread_info(find_thread(NULL),&th);
 	return th.stack_end;
@@ -666,7 +677,7 @@ ptr_t GC_get_stack_base(void){
 
 # ifdef OS2
 
-ptr_t GC_get_stack_base(void)
+ptr_t GC_get_main_stack_base(void)
 {
     PTIB ptib;
     PPIB ppib;
@@ -807,7 +818,7 @@ ptr_t GC_get_stack_base(void)
 # endif
 
 #if defined(ECOS) || defined(NOSYS)
-  ptr_t GC_get_stack_base(void)
+  ptr_t GC_get_main_stack_base(void)
   {
     return STACKBOTTOM;
   }
@@ -845,8 +856,10 @@ ptr_t GC_get_stack_base(void)
 # define STAT_SKIP 27   /* Number of fields preceding startstack	*/
 			/* field in /proc/self/stat			*/
 
+#ifdef USE_LIBC_PRIVATES
 # pragma weak __libc_stack_end
   extern ptr_t __libc_stack_end;
+#endif
 
 # ifdef IA64
     /* Try to read the backing store base from /proc/self/maps.	*/
@@ -877,30 +890,33 @@ ptr_t GC_get_stack_base(void)
         return GC_apply_to_maps(backing_store_base_from_maps);
     }
 
-#   pragma weak __libc_ia64_register_backing_store_base
-    extern ptr_t __libc_ia64_register_backing_store_base;
+#   ifdef USE_LIBC_PRIVATES
+#     pragma weak __libc_ia64_register_backing_store_base
+      extern ptr_t __libc_ia64_register_backing_store_base;
+#   endif
 
     ptr_t GC_get_register_stack_base(void)
     {
-      if (0 != &__libc_ia64_register_backing_store_base
-	  && 0 != __libc_ia64_register_backing_store_base) {
-	/* Glibc 2.2.4 has a bug such that for dynamically linked	*/
-	/* executables __libc_ia64_register_backing_store_base is 	*/
-	/* defined but uninitialized during constructor calls.  	*/
-	/* Hence we check for both nonzero address and value.		*/
-	return __libc_ia64_register_backing_store_base;
-      } else {
-	word result = backing_store_base_from_proc();
-	if (0 == result) {
+#     ifdef USE_LIBC_PRIVATES
+        if (0 != &__libc_ia64_register_backing_store_base
+	    && 0 != __libc_ia64_register_backing_store_base) {
+	  /* Glibc 2.2.4 has a bug such that for dynamically linked	*/
+	  /* executables __libc_ia64_register_backing_store_base is 	*/
+	  /* defined but uninitialized during constructor calls.  	*/
+	  /* Hence we check for both nonzero address and value.		*/
+	  return __libc_ia64_register_backing_store_base;
+        }
+#     endif
+      word result = backing_store_base_from_proc();
+      if (0 == result) {
 	  /* Use dumb heuristics.  Works only for default configuration. */
 	  result = (word)GC_stackbottom - BACKING_STORE_DISPLACEMENT;
 	  result += BACKING_STORE_ALIGNMENT - 1;
 	  result &= ~(BACKING_STORE_ALIGNMENT - 1);
 	  /* Verify that it's at least readable.  If not, we goofed. */
 	  GC_noop1(*(word *)result); 
-	}
-	return (ptr_t)result;
       }
+      return (ptr_t)result;
     }
 # endif
 
@@ -923,6 +939,7 @@ ptr_t GC_get_stack_base(void)
     /* since the correct value of __libc_stack_end never	*/
     /* becomes visible to us.  The second test works around 	*/
     /* this.							*/  
+#   ifdef USE_LIBC_PRIVATES
       if (0 != &__libc_stack_end && 0 != __libc_stack_end ) {
 #       ifdef IA64
 	  /* Some versions of glibc set the address 16 bytes too	*/
@@ -935,6 +952,7 @@ ptr_t GC_get_stack_base(void)
 	  return __libc_stack_end;
 #	endif
       }
+#   endif
     f = open("/proc/self/stat", O_RDONLY);
     if (f < 0 || STAT_READ(f, stat_buf, STAT_BUF_SIZE) < 2 * STAT_SKIP) {
 	ABORT("Couldn't read /proc/self/stat");
@@ -985,7 +1003,7 @@ ptr_t GC_get_stack_base(void)
 #if !defined(BEOS) && !defined(AMIGA) && !defined(MSWIN32) \
     && !defined(MSWINCE) && !defined(OS2) && !defined(NOSYS) && !defined(ECOS)
 
-ptr_t GC_get_stack_base(void)
+ptr_t GC_get_main_stack_base(void)
 {
 #   if defined(HEURISTIC1) || defined(HEURISTIC2)
       word dummy;
@@ -1042,6 +1060,31 @@ ptr_t GC_get_stack_base(void)
 
 # endif /* ! AMIGA, !OS 2, ! MS Windows, !BEOS, !NOSYS, !ECOS */
 
+#ifndef HAVE_GET_STACK_BASE
+/* Retrieve the stack base into *b.  Returns GC_SUCCESS when	*/
+/* the base was found with GC_find_limit, or GC_UNIMPLEMENTED	*/
+/* if NEED_FIND_LIMIT is not defined.  IIRC, there is a		*/
+/* nonportable way to do this on Linux for non-main threads;	*/
+/* using GC_find_limit seems risky.  FIXME - better strategies.	*/
+int GC_get_stack_base(struct GC_stack_base *b)
+{
+#   ifdef NEED_FIND_LIMIT
+      int dummy;	/* Only its address is used, as a start point.	*/
+#     ifdef STACK_GROWS_DOWN
+    	b -> mem_base = GC_find_limit((ptr_t)(&dummy), TRUE);
+#       ifdef IA64
+	  b -> reg_base = GC_find_limit(GC_save_regs_in_stack(), FALSE);
+#       endif
+#     else
+	b -> mem_base = GC_find_limit((ptr_t)(&dummy), FALSE);
+#     endif
+      return GC_SUCCESS;	/* Previously fell off the end here.	*/
+#   else
+      return GC_UNIMPLEMENTED;
+#   endif
+}
+#endif
+
 /*
  * Register static data segment(s) as roots.
  * If more data segments are added later then they need to be registered
@@ -1156,6 +1199,46 @@ void GC_register_data_segments(void)
   	/* This used to be set for gcc, to avoid dealing with		*/
   	/* the structured exception handling issues.  But we now have	*/
   	/* assembly code to do that right.				*/
+
+# if defined(GWW_VDB)
+
+#   ifndef _BASETSD_H_
+      typedef ULONG * PULONG_PTR;
+#   endif
+    typedef UINT (WINAPI * GetWriteWatch_type)(
+      DWORD, PVOID, SIZE_T, PVOID*, PULONG_PTR, PULONG);
+    static GetWriteWatch_type GetWriteWatch_func;
+    static DWORD GetWriteWatch_alloc_flag;
+
+#   define GC_GWW_AVAILABLE() (GetWriteWatch_func != NULL)
+
+    /* Look up GetWriteWatch once.  GC_GWW_AVAILABLE() stays false   */
+    /* unless VirtualAlloc also accepts MEM_WRITE_WATCH.             */
+    static void detect_GetWriteWatch(void)
+    {
+      static GC_bool done;
+      if (done) return;
+      GetWriteWatch_func = (GetWriteWatch_type)
+        GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "GetWriteWatch");
+      if (GetWriteWatch_func != NULL) {
+        /* Also check whether VirtualAlloc accepts MEM_WRITE_WATCH,   */
+        /* as some versions of kernel32.dll have one but not the      */
+        /* other, making the feature completely broken.               */
+        void * page = VirtualAlloc(NULL, GC_page_size,
+                                    MEM_WRITE_WATCH | MEM_RESERVE,
+                                    PAGE_READWRITE);
+        if (page != NULL) {
+          GetWriteWatch_alloc_flag = MEM_WRITE_WATCH;
+          VirtualFree(page, 0 /* must be 0 for MEM_RELEASE */, MEM_RELEASE);
+        } else {
+          /* GetWriteWatch will be useless. */
+          GetWriteWatch_func = NULL;
+        }
+      }
+      done = TRUE;
+    }
+
+# endif
   
   void GC_init_win32(void)
   {
@@ -1402,7 +1485,7 @@ ptr_t GC_FreeBSDGetDataStart(size_t max_page_size, ptr_t etext_addr)
 
 void GC_register_data_segments(void)
 {
-#   if !defined(PCR) && !defined(SRC_M3) && !defined(MACOS)
+#   if !defined(PCR) && !defined(MACOS)
 #     if defined(REDIRECT_MALLOC) && defined(GC_SOLARIS_THREADS)
 	/* As of Solaris 2.3, the Solaris threads implementation	*/
 	/* allocates the data structure for the initial thread with	*/
@@ -1472,36 +1555,6 @@ void GC_register_data_segments(void)
 # define SBRK_ARG_T ptrdiff_t
 
 
-# ifdef RS6000
-/* The compiler seems to generate speculative reads one past the end of	*/
-/* an allocated object.  Hence we need to make sure that the page 	*/
-/* following the last heap page is also mapped.				*/
-ptr_t GC_unix_get_mem(word bytes)
-{
-    caddr_t cur_brk = (caddr_t)sbrk(0);
-    caddr_t result;
-    SBRK_ARG_T lsbs = (word)cur_brk & (GC_page_size-1);
-    static caddr_t my_brk_val = 0;
-    
-    if ((SBRK_ARG_T)bytes < 0) return(0); /* too big */
-    if (lsbs != 0) {
-        if((caddr_t)(sbrk(GC_page_size - lsbs)) == (caddr_t)(-1)) return(0);
-    }
-    if (cur_brk == my_brk_val) {
-    	/* Use the extra block we allocated last time. */
-        result = (ptr_t)sbrk((SBRK_ARG_T)bytes);
-        if (result == (caddr_t)(-1)) return(0);
-        result -= GC_page_size;
-    } else {
-        result = (ptr_t)sbrk(GC_page_size + (SBRK_ARG_T)bytes);
-        if (result == (caddr_t)(-1)) return(0);
-    }
-    my_brk_val = result + bytes + GC_page_size;	/* Always page aligned */
-    return((ptr_t)result);
-}
-
-#else  /* Not RS6000 */
-
 #if defined(USE_MMAP) || defined(USE_MUNMAP)
 
 #ifdef USE_MMAP_FIXED
@@ -1569,7 +1622,7 @@ ptr_t GC_unix_get_mem(word bytes)
     return((ptr_t)result);
 }
 
-#else /* Not RS6000, not USE_MMAP */
+#else /* Not USE_MMAP */
 ptr_t GC_unix_get_mem(word bytes)
 {
   ptr_t result;
@@ -1596,7 +1649,6 @@ ptr_t GC_unix_get_mem(word bytes)
 }
 
 #endif /* Not USE_MMAP */
-#endif /* Not RS6000 */
 
 # endif /* UN*X */
 
@@ -1651,7 +1703,15 @@ ptr_t GC_win32_get_mem(word bytes)
 	/* This wastes a small amount of memory, and risks	*/
 	/* increased fragmentation.  But better alternatives	*/
 	/* would require effort.				*/
+        /* Pass the MEM_WRITE_WATCH only if GetWriteWatch-based */
+        /* VDBs are enabled and the GetWriteWatch function is   */
+        /* available.  Otherwise we waste resources or possibly */
+        /* cause VirtualAlloc to fail (observed in Windows 2000 */
+        /* SP2).                                                */
         result = (ptr_t) VirtualAlloc(NULL, bytes + 1,
+#                                     ifdef GWW_VDB
+                                        GetWriteWatch_alloc_flag |
+#                                     endif
     				      MEM_COMMIT | MEM_RESERVE,
     				      PAGE_EXECUTE_READWRITE);
     }
@@ -1706,10 +1766,10 @@ ptr_t GC_wince_get_mem(word bytes)
 	/* Reserve more pages */
 	word res_bytes = (bytes + GC_sysinfo.dwAllocationGranularity-1)
 			 & ~(GC_sysinfo.dwAllocationGranularity-1);
-	/* If we ever support MPROTECT_VDB here, we will probably need to	*/
-	/* ensure that res_bytes is strictly > bytes, so that VirtualProtect	*/
-	/* never spans regions.  It seems to be OK for a VirtualFree argument	*/
-	/* to span regions, so we should be OK for now.				*/
+	/* If we ever support MPROTECT_VDB here, we will probably need to    */
+	/* ensure that res_bytes is strictly > bytes, so that VirtualProtect */
+	/* never spans regions.  It seems to be OK for a VirtualFree	     */
+	/* argument to span regions, so we should be OK for now.	     */
 	result = (ptr_t) VirtualAlloc(NULL, res_bytes,
     				      MEM_RESERVE | MEM_TOP_DOWN,
     				      PAGE_EXECUTE_READWRITE);
@@ -1954,50 +2014,6 @@ void GC_default_push_other_roots(void)
 
 # endif /* PCR */
 
-# ifdef SRC_M3
-
-# ifdef ALL_INTERIOR_POINTERS
-    --> misconfigured
-# endif
-
-void GC_push_thread_structures(void)
-{
-    /* Not our responsibibility. */
-}
-
-extern void ThreadF__ProcessStacks(void);
-
-void GC_push_thread_stack(word start, word stop)
-{
-   GC_push_all_stack((ptr_t)start, (ptr_t)stop + sizeof(word));
-}
-
-/* Push routine with M3 specific calling convention. */
-GC_m3_push_root(ptr_t dummy1, word *p, ptr_t dummy2, int dummy3)
-{
-    word q = *p;
-    
-    GC_PUSH_ONE_STACK(q, p);
-}
-
-/* M3 set equivalent to RTHeap.TracedRefTypes */
-typedef struct { int elts[1]; }  RefTypeSet;
-RefTypeSet GC_TracedRefTypes = {{0x1}};
-
-void GC_default_push_other_roots(void)
-{
-    /* Use the M3 provided routine for finding static roots.	 */
-    /* This is a bit dubious, since it presumes no C roots.	 */
-    /* We handle the collector roots explicitly in GC_push_roots */
-      	RTMain__GlobalMapProc(GC_m3_push_root, 0, GC_TracedRefTypes);
-	if (GC_bytes_allocd > 0) {
-	    ThreadF__ProcessStacks(GC_push_thread_stack);
-	}
-	/* Otherwise this isn't absolutely necessary, and we have	*/
-	/* startup ordering problems.					*/
-}
-
-# endif /* SRC_M3 */
 
 # if defined(GC_SOLARIS_THREADS) || defined(GC_PTHREADS) || \
      defined(GC_WIN32_THREADS)
@@ -2017,7 +2033,7 @@ void (*GC_push_other_roots)(void) = GC_default_push_other_roots;
 
 /*
  * Routines for accessing dirty  bits on virtual pages.
- * There are five ways to maintain this information:
+ * There are six ways to maintain this information:
  * DEFAULT_VDB:	A simple dummy implementation that treats every page
  *		as possibly dirty.  This makes incremental collection
  *		useless, but the implementation is still correct.
@@ -2031,8 +2047,11 @@ void (*GC_push_other_roots)(void) = GC_default_push_other_roots;
  * 		In order to avoid races, an object must be marked dirty
  * 		after it is written, and a reference to the object
  * 		must be kept on a stack or in a register in the interim.
- * 		In this mode, an object directly reachable from the
+ * 		With threads enabled, an object directly reachable from the
  * 		stack at the time of a collection is treated as dirty.
+ * 		In single-threaded mode, it suffices to ensure that no
+ * 		collection can take place between the pointer assignment
+ * 		and the GC_dirty() call.
  * PCR_VDB:	Use PPCRs virtual dirty bit facility.
  * PROC_VDB:	Use the /proc facility for reading dirty bits.  Only
  *		works under some SVR4 variants.  Even then, it may be
@@ -2046,9 +2065,125 @@ void (*GC_push_other_roots)(void) = GC_default_push_other_roots;
  *		call from doing so.  It is the clients responsibility to
  *		make sure that other system calls are similarly protected
  *		or write only to the stack.
+ * GWW_VDB:     Use the Win32 GetWriteWatch functions, if available, to
+ *              read dirty bits.  In case it is not available (because we
+ *              are running on Windows 95, Windows 2000 or earlier),
+ *              MPROTECT_VDB may be defined as a fallback strategy.
  */
 GC_bool GC_dirty_maintained = FALSE;
 
+#if defined(PROC_VDB) || defined(GWW_VDB)
+
+/* Merge pht2 into pht1: afterwards pht1 has every bit set in either. */
+void GC_or_pages(page_hash_table pht1, page_hash_table pht2)
+{
+    int slot = 0;
+
+    while (slot < PHT_SIZE) { pht1[slot] |= pht2[slot]; slot++; }
+}
+
+#endif
+
+#ifdef GWW_VDB
+
+# define GC_GWW_BUF_LEN 1024
+  static PVOID gww_buf[GC_GWW_BUF_LEN];
+
+# ifdef MPROTECT_VDB
+    /* Probe for GetWriteWatch and report whether it is available.  */
+    static GC_bool GC_gww_dirty_init(void)
+    {
+      return (detect_GetWriteWatch(), GC_GWW_AVAILABLE());
+    }
+# else
+    /* Dirty bits are maintained only when GetWriteWatch is usable. */
+    void GC_dirty_init(void)
+    {
+      GC_dirty_maintained = (detect_GetWriteWatch(), GC_GWW_AVAILABLE());
+    }
+# endif
+
+# ifdef MPROTECT_VDB
+    static void GC_gww_read_dirty(void)
+# else
+    void GC_read_dirty(void)
+# endif
+  {
+    word i;
+    /* Rebuild GC_grungy_pages from GetWriteWatch, one heap section at a time. */
+    BZERO(GC_grungy_pages, sizeof(GC_grungy_pages));
+
+    for (i = 0; i != GC_n_heap_sects; ++i) {
+      DWORD count;  /* NOTE(review): callee takes PULONG_PTR; verify on Win64 */
+      /* A completely filled buffer may mean more pages remain: ask again.   */
+      do {
+        PVOID * pages, * pages_end;
+        DWORD page_size;
+
+        pages = gww_buf;
+        count = GC_GWW_BUF_LEN;
+        /*
+        * GetWriteWatch is documented as returning non-zero when it fails,
+        * but the documentation doesn't explicitly say why it would fail or
+        * what its behaviour will be if it fails.  If there are more dirty
+        * pages than will fit in the buffer, this is not treated as a
+        * failure; we must check the page count in the loop condition.
+        */
+        if (GetWriteWatch_func(WRITE_WATCH_FLAG_RESET,
+                               GC_heap_sects[i].hs_start,
+                               GC_heap_sects[i].hs_bytes,
+                               pages,
+                               &count,
+                               &page_size) != 0)
+        {
+          GC_err_printf(
+            "GC_gww_read_dirty fell back to marking all pages dirty\n");
+          memset(GC_grungy_pages, 0xff, sizeof(page_hash_table));
+          memset(GC_written_pages, 0xff, sizeof(page_hash_table));
+          return;
+        }
+        /* Set the grungy bit of every heap block inside each reported page. */
+        pages_end = pages + count;
+        while (pages != pages_end) {
+          struct hblk * h = (struct hblk *) *pages++;
+          struct hblk * h_end = (struct hblk *) ((char *) h + page_size);
+          do
+            set_pht_entry_from_index(GC_grungy_pages, PHT_HASH(h));
+          while (++h < h_end);
+        }
+      } while (count == GC_GWW_BUF_LEN);
+    }
+    /* Fold this cycle's dirty set into the cumulative GC_written_pages set. */
+    GC_or_pages(GC_written_pages, GC_grungy_pages);
+  }
+
+# ifdef MPROTECT_VDB
+    static GC_bool GC_gww_page_was_dirty(struct hblk * h)
+# else /* GWW is the only VDB here: define the public entry point itself */
+    GC_bool GC_page_was_dirty(struct hblk * h)
+# endif
+  { /* Nonzero if h has no header or its bit is set in GC_grungy_pages. */
+    return HDR(h) == 0 || get_pht_entry_from_index(GC_grungy_pages, PHT_HASH(h));
+  }
+
+# ifdef MPROTECT_VDB
+    static GC_bool GC_gww_page_was_ever_dirty(struct hblk * h)
+# else /* GWW is the only VDB here: define the public entry point itself */
+    GC_bool GC_page_was_ever_dirty(struct hblk * h)
+# endif
+  { /* Nonzero if h has no header or its bit is set in GC_written_pages. */
+    return HDR(h) == 0 || get_pht_entry_from_index(GC_written_pages, PHT_HASH(h));
+  }
+
+# ifndef MPROTECT_VDB /* GWW-only configuration: these two do nothing */
+    void GC_is_fresh(struct hblk *h, word n)
+    {} /* intentionally empty */
+    void GC_remove_protection(struct hblk *h, word nblocks, GC_bool is_ptrfree)
+    {} /* intentionally empty */
+# endif
+
+# endif /* GWW_VDB */
+
 # ifdef DEFAULT_VDB
 
 /* All of the following assume the allocation lock is held, and	*/
@@ -2120,7 +2255,7 @@ void GC_dirty_init(void)
 {
     if (GC_print_stats == VERBOSE)
       GC_log_printf("Initializing MANUAL_VDB...\n");
-    /* FIXME - implement me.	*/
+    /* GC_dirty_pages and GC_grungy_pages are already cleared. */
     GC_dirty_maintained = TRUE;
 }
 
@@ -2128,7 +2263,9 @@ void GC_dirty_init(void)
 /* Restore the systems notion of which pages are dirty.		*/
 void GC_read_dirty(void)
 {
-    /* FIXME - implement me.	*/
+    BCOPY((word *)GC_dirty_pages, GC_grungy_pages,
+          (sizeof GC_dirty_pages));
+    BZERO((word *)GC_dirty_pages, (sizeof GC_dirty_pages));
 }
 
 /* Is the HBLKSIZE sized page at h marked dirty in the local buffer?	*/
@@ -2138,10 +2275,11 @@ void GC_read_dirty(void)
 /*ARGSUSED*/
 GC_bool GC_page_was_dirty(struct hblk *h)
 {
-    /* FIXME - implement me.	*/
-    return(TRUE);
+    register word index;
+    
+    index = PHT_HASH(h);
+    return(HDR(h) == 0 || get_pht_entry_from_index(GC_grungy_pages, index));
 }
-
  
 /* Could any valid GC heap pointer ever have been written to this page?	*/
 /*ARGSUSED*/
@@ -2151,6 +2289,13 @@ GC_bool GC_page_was_ever_dirty(struct hblk *h)
     return(TRUE);
 }
 
+/* Mark the page containing p as dirty.  Logically, this dirties the	*/
+/* entire object.  The page-table slot is derived from p via PHT_HASH.	*/
+void GC_dirty(ptr_t p)
+{
+    async_set_pht_entry_from_index(GC_dirty_pages, PHT_HASH(p));
+}
+
 /* Reset the n pages starting at h to "was never dirty" status.	*/
 void GC_is_fresh(struct hblk *h, word n)
 {
@@ -2323,7 +2468,7 @@ GC_bool GC_old_segv_handler_used_si;
 #if !defined(DARWIN)
 #   include <errno.h>
 #   if defined(FREEBSD)
-#     define SIG_OK (sig == SIGBUS)
+#     define SIG_OK TRUE
 #     define CODE_OK (code == BUS_PAGE_FAULT)
 #   elif defined(OSF1)
 #     define SIG_OK (sig == SIGSEGV)
@@ -2342,17 +2487,17 @@ GC_bool GC_old_segv_handler_used_si;
 	/* architectures.						*/
 #   elif defined(HPUX)
 #     define SIG_OK (sig == SIGSEGV || sig == SIGBUS)
-#     define CODE_OK (scp -> si_code == SEGV_ACCERR) \
-		     || (scp -> si_code == BUS_ADRERR) \
-		     || (scp -> si_code == BUS_UNKNOWN) \
-		     || (scp -> si_code == SEGV_UNKNOWN) \
-		     || (scp -> si_code == BUS_OBJERR)
+#     define CODE_OK (si -> si_code == SEGV_ACCERR) \
+		     || (si -> si_code == BUS_ADRERR) \
+		     || (si -> si_code == BUS_UNKNOWN) \
+		     || (si -> si_code == SEGV_UNKNOWN) \
+		     || (si -> si_code == BUS_OBJERR)
 #   elif defined(FREEBSD)
 #     define SIG_OK (sig == SIGBUS)
-#     define CODE_OK (scp -> si_code == BUS_PAGE_FAULT)
+#     define CODE_OK (si -> si_code == BUS_PAGE_FAULT)
 #   elif defined(SUNOS5SIGS)
 #     define SIG_OK (sig == SIGSEGV)
-#     define CODE_OK (scp -> si_code == SEGV_ACCERR)
+#     define CODE_OK (si -> si_code == SEGV_ACCERR)
 #   elif defined(MSWIN32) || defined(MSWINCE)
 #     define SIG_OK (exc_info -> ExceptionRecord -> ExceptionCode \
 		     == STATUS_ACCESS_VIOLATION)
@@ -2485,6 +2630,9 @@ void GC_remove_protection(struct hblk *h, word nblocks, GC_bool is_ptrfree)
     struct hblk * current;
     GC_bool found_clean;
     
+#   if defined(GWW_VDB)
+      if (GC_GWW_AVAILABLE()) return;
+#   endif
     if (!GC_dirty_maintained) return;
     h_trunc = (struct hblk *)((word)h & ~(GC_page_size-1));
     h_end = (struct hblk *)(((word)(h + nblocks) + GC_page_size-1)
@@ -2569,10 +2717,15 @@ void GC_dirty_init(void)
       }
 #   endif /* HPUX || LINUX || HURD || (FREEBSD && SUNOS5SIGS) */
 #   if defined(MSWIN32)
+#     if defined(GWW_VDB)
+        if (GC_gww_dirty_init())
+          return;
+#     endif
       GC_old_segv_handler = SetUnhandledExceptionFilter(GC_write_fault_handler);
       if (GC_old_segv_handler != NULL) {
-	if (GC_print_stats == VERBOSE)
-          GC_log_printf("Replaced other UnhandledExceptionFilter\n");
+	if (GC_print_stats == VERBOSE) { /* log as the other VDB inits do */
+          GC_log_printf("Replaced other UnhandledExceptionFilter\n");
+	}
       } else {
           GC_old_segv_handler = SIG_DFL;
       }
@@ -2597,7 +2750,7 @@ int GC_incremental_protection_needs(void)
 void GC_protect_heap(void)
 {
     ptr_t start;
-    word len;
+    size_t len;
     struct hblk * current;
     struct hblk * current_start;  /* Start of block to be protected. */
     struct hblk * limit;
@@ -2658,6 +2811,12 @@ void GC_protect_heap(void)
 /* bits while this is happenning (as in GC_enable_incremental).		*/
 void GC_read_dirty(void)
 {
+#   if defined(GWW_VDB)
+      if (GC_GWW_AVAILABLE()) {
+        GC_gww_read_dirty();
+        return;
+      }
+#   endif
     BCOPY((word *)GC_dirty_pages, GC_grungy_pages,
           (sizeof GC_dirty_pages));
     BZERO((word *)GC_dirty_pages, (sizeof GC_dirty_pages));
@@ -2666,8 +2825,14 @@ void GC_read_dirty(void)
 
 GC_bool GC_page_was_dirty(struct hblk *h)
 {
-    register word index = PHT_HASH(h);
+    register word index;
     
+#   if defined(GWW_VDB)
+      if (GC_GWW_AVAILABLE())
+        return GC_gww_page_was_dirty(h);
+#   endif
+
+    index = PHT_HASH(h);
     return(HDR(h) == 0 || get_pht_entry_from_index(GC_grungy_pages, index));
 }
 
@@ -2802,6 +2967,10 @@ ssize_t read(int fd, void *buf, size_t nbyte)
 /*ARGSUSED*/
 GC_bool GC_page_was_ever_dirty(struct hblk *h)
 {
+#   if defined(GWW_VDB)
+      if (GC_GWW_AVAILABLE())
+        return GC_gww_page_was_ever_dirty(h);
+#   endif
     return(TRUE);
 }
 
@@ -2855,14 +3024,6 @@ char *GC_proc_buf;
 	(GC_fresh_pages[FRESH_PAGE_SLOT(h)] == (h) && (h) != 0)
 #endif
 
-/* Add all pages in pht2 to pht1 */
-void GC_or_pages(pgae_hash_table pht1, page_hash_table pht2)
-{
-    register int i;
-    
-    for (i = 0; i < PHT_SIZE; i++) pht1[i] |= pht2[i];
-}
-
 int GC_proc_fd;
 
 void GC_dirty_init(void)
@@ -3706,8 +3867,10 @@ kern_return_t catch_exception_raise_state_identity(
      };
 #  elif defined (DRSNX)
 #    include <sys/sparc/frame.h>
-#  elif defined(OPENBSD) || defined(NETBSD)
+#  elif defined(OPENBSD)
 #    include <frame.h>
+#  elif defined(FREEBSD) || defined(NETBSD)
+#    include <machine/frame.h>
 #  else
 #    include <sys/frame.h>
 #  endif
@@ -3751,7 +3914,7 @@ void GC_save_callers (struct callinfo info[NFRAMES])
 
 #else /* No builtin backtrace; do it ourselves */
 
-#if (defined(OPENBSD) || defined(NETBSD)) && defined(SPARC)
+#if (defined(OPENBSD) || defined(NETBSD) || defined(FREEBSD)) && defined(SPARC)
 #  define FR_SAVFP fr_fp
 #  define FR_SAVPC fr_pc
 #else
diff --git a/pc_excludes b/pc_excludes
deleted file mode 100644
index 15c90455..00000000
--- a/pc_excludes
+++ /dev/null
@@ -1,21 +0,0 @@
-solaris_threads.c
-solaris_pthreads.c
-irix_threads.c
-pcr_interface.c
-real_malloc.c
-mips_mach_dep.s
-rs6000_mach_dep.s
-alpha_mach_dep.s
-sparc_mach_dep.s
-PCR-Makefile
-setjmp_t.c
-callprocs
-doc/gc.man
-pc_excludes
-barrett_diagram
-include/gc_c++.h
-include/gc_inline.h
-doc/README.hp
-doc/README.rs6000
-doc/README.sgi
-
diff --git a/powerpc_darwin_mach_dep.s b/powerpc_darwin_mach_dep.s
index 7586f459..fd23110b 100644
--- a/powerpc_darwin_mach_dep.s
+++ b/powerpc_darwin_mach_dep.s
@@ -64,7 +64,8 @@ _GC_push_regs:
 ; PIC stuff, generated by GCC
 
 .data
-.picsymbol_stub
+.section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32
+	.align 2
 L_GC_push_one$stub:
 	.indirect_symbol _GC_push_one
 	mflr r0
@@ -73,12 +74,12 @@ L0$_GC_push_one:
 	mflr r11
 	addis r11,r11,ha16(L_GC_push_one$lazy_ptr-L0$_GC_push_one)
 	mtlr r0
-	lwz r12,lo16(L_GC_push_one$lazy_ptr-L0$_GC_push_one)(r11)
+	lwzu r12,lo16(L_GC_push_one$lazy_ptr-L0$_GC_push_one)(r11)
 	mtctr r12
-	addi r11,r11,lo16(L_GC_push_one$lazy_ptr-L0$_GC_push_one)
 	bctr
 .data
 .lazy_symbol_pointer
 L_GC_push_one$lazy_ptr:
 	.indirect_symbol _GC_push_one
 	.long dyld_stub_binding_helper
+
diff --git a/pthread_stop_world.c b/pthread_stop_world.c
index efc18d15..f0652346 100644
--- a/pthread_stop_world.c
+++ b/pthread_stop_world.c
@@ -99,8 +99,25 @@ word GC_stop_count;	/* Incremented at the beginning of GC_stop_world. */
 
 sem_t GC_suspend_ack_sem;
 
-void GC_suspend_handler(int sig)
+void GC_suspend_handler_inner(ptr_t sig_arg, void *context);
+
+#if defined(IA64) || defined(HP_PA)
+void GC_suspend_handler(int sig, siginfo_t *info, void *context)
+{ /* info is unused; presumably callee-saves are pushed before the call */
+  GC_with_callee_saves_pushed(GC_suspend_handler_inner, (ptr_t)(word)sig);
+}
+#else
+/* We believe that in all other cases the full context is already	*/
+/* in the signal handler frame, so the inner handler is called directly.	*/
+void GC_suspend_handler(int sig, siginfo_t *info, void *context)
+{ /* info is unused; sig travels as a ptr_t to match the inner signature */
+  GC_suspend_handler_inner((ptr_t)(word)sig, context);
+}
+#endif
+
+void GC_suspend_handler_inner(ptr_t sig_arg, void *context)
 {
+    int sig = (int)(word)sig_arg;
     int dummy;
     pthread_t my_thread = pthread_self();
     GC_thread me;
@@ -366,13 +383,10 @@ void GC_stop_world()
     for (i = 0; i < n_live_threads; i++) {
 	retry:
 	  if (0 != (code = sem_wait(&GC_suspend_ack_sem))) {
-	      GC_err_printf("Sem_wait returned %d (errno = %d)\n", code, errno);
-#	      ifdef LINUX
-	        GC_err_printf("\tSem_wait is documented to never do this.\n");
-#	      endif
+	      /* On Linux, sem_wait is documented to always return zero.*/
+	      /* But the documentation appears to be incorrect.		*/
 	      if (errno == EINTR) {
 		/* Seems to happen with some versions of gdb.	*/
-		GC_err_printf("\tRetrying anyway\n");
 		goto retry;
 	      }
 	      ABORT("sem_wait for handler failed");
@@ -437,18 +451,19 @@ void GC_stop_init() {
     if (sem_init(&GC_suspend_ack_sem, 0, 0) != 0)
         ABORT("sem_init failed");
 
-    act.sa_flags = SA_RESTART;
+    act.sa_flags = SA_RESTART | SA_SIGINFO;
     if (sigfillset(&act.sa_mask) != 0) {
     	ABORT("sigfillset() failed");
     }
     GC_remove_allowed_signals(&act.sa_mask);
     /* SIG_THR_RESTART is set in the resulting mask.		*/
     /* It is unmasked by the handler when necessary. 		*/
-    act.sa_handler = GC_suspend_handler;
+    act.sa_sigaction = GC_suspend_handler;
     if (sigaction(SIG_SUSPEND, &act, NULL) != 0) {
     	ABORT("Cannot set SIG_SUSPEND handler");
     }
 
+    act.sa_flags &= ~ SA_SIGINFO;
     act.sa_handler = GC_restart_handler;
     if (sigaction(SIG_THR_RESTART, &act, NULL) != 0) {
     	ABORT("Cannot set SIG_THR_RESTART handler");
diff --git a/pthread_support.c b/pthread_support.c
index 688297f8..0c70732e 100644
--- a/pthread_support.c
+++ b/pthread_support.c
@@ -2,7 +2,7 @@
  * Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
  * Copyright (c) 1996 by Silicon Graphics.  All rights reserved.
  * Copyright (c) 1998 by Fergus Henderson.  All rights reserved.
- * Copyright (c) 2000-2001 by Hewlett-Packard Company.  All rights reserved.
+ * Copyright (c) 2000-2005 by Hewlett-Packard Company.  All rights reserved.
  *
  * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
  * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
@@ -14,17 +14,14 @@
  * modified is included with the above copyright notice.
  */
 /*
- * Support code for LinuxThreads, the clone()-based kernel
+ * Support code originally for LinuxThreads, the clone()-based kernel
  * thread package for Linux which is included in libc6.
  *
- * This code relies on implementation details of LinuxThreads,
- * (i.e. properties not guaranteed by the Pthread standard),
- * though this version now does less of that than the other Pthreads
- * support code.
- *
- * Note that there is a lot of code duplication between linux_threads.c
- * and thread support for some of the other Posix platforms; any changes
- * made here may need to be reflected there too.
+ * This code no doubt makes some assumptions beyond what is
+ * guaranteed by the pthread standard, though it now does
+ * very little of that.  It now also supports NPTL, and many
+ * other Posix thread implementations.  We are trying to merge
+ * all flavors of pthread support code into this file.
  */
  /* DG/UX ix86 support <takis@xfree86.org> */
 /*
@@ -45,7 +42,6 @@
  */
 
 /*#define DEBUG_THREADS 1*/
-/*#define GC_ASSERTIONS*/
 
 # include "private/pthread_support.h"
 
@@ -110,6 +106,8 @@
 # include <fcntl.h>
 # include <signal.h>
 
+# include "gc_inline.h"
+
 #if defined(GC_DARWIN_THREADS)
 # include "private/darwin_semaphore.h"
 #else
@@ -264,83 +262,49 @@ void GC_destroy_thread_local(GC_thread p)
 #   endif
 }
 
-void * GC_local_malloc(size_t bytes)
+void * GC_malloc(size_t bytes)
 {
-    if (EXPECT(!SMALL_ENOUGH(bytes),0)) {
-        return(GC_malloc(bytes));
-    } else {
-	int index = INDEX_FROM_REQUESTED_BYTES(bytes);
-	void ** my_fl;
-	void * my_entry;
-#	if defined(REDIRECT_MALLOC) && !defined(USE_PTHREAD_SPECIFIC)
-	GC_key_t k = GC_thread_key;
-#	endif
-	void * tsd;
-
-#	if defined(REDIRECT_MALLOC) && !defined(USE_PTHREAD_SPECIFIC)
-	    if (EXPECT(0 == k, 0)) {
-		/* This can happen if we get called when the world is	*/
-		/* being initialized.  Whether we can actually complete	*/
-		/* the initialization then is unclear.			*/
-		GC_init_parallel();
-		k = GC_thread_key;
-	    }
-#	endif
-	tsd = GC_getspecific(GC_thread_key);
-#	ifdef GC_ASSERTIONS
-	  LOCK();
-	  GC_ASSERT(tsd == (void *)GC_lookup_thread(pthread_self()));
-	  UNLOCK();
-#	endif
-	my_fl = ((GC_thread)tsd) -> normal_freelists + index;
-	my_entry = *my_fl;
-	if (EXPECT((word)my_entry >= HBLKSIZE, 1)) {
-	    void * next = obj_link(my_entry);
-	    void * result = (void *)my_entry;
-	    *my_fl = next;
-	    obj_link(my_entry) = 0;
-	    PREFETCH_FOR_WRITE(next);
-	    GC_ASSERT(GC_size(result) >= bytes + EXTRA_BYTES);
-	    GC_ASSERT(((word *)result)[1] == 0);
-	    return result;
-	} else if ((word)my_entry - 1 < DIRECT_GRANULES) {
-	    *my_fl = my_entry + index + 1;
-            return GC_malloc(bytes);
-	} else {
-	    GC_generic_malloc_many(RAW_BYTES_FROM_INDEX(index), NORMAL, my_fl);
-	    if (*my_fl == 0) return GC_oom_fn(bytes);
-	    return GC_local_malloc(bytes);
-	}
-    }
+    size_t granules = ROUNDED_UP_GRANULES(bytes);
+    void *tsd;
+    void *result;
+    void **tiny_fl;
+    /* Fetch this thread's data; fall back to GC_core_malloc without it. */
+#   if defined(REDIRECT_MALLOC) && !defined(USE_PTHREAD_SPECIFIC)
+      GC_key_t k = GC_thread_key;
+      if (EXPECT(0 == k, 0)) {
+	/* We haven't yet run GC_init_parallel.  That means	*/
+	/* we also aren't locking, so this is fairly cheap.	*/
+	return GC_core_malloc(bytes);
+      }
+      tsd = GC_getspecific(k);
+#   else
+      tsd = GC_getspecific(GC_thread_key);
+#   endif
+#   if defined(REDIRECT_MALLOC) && defined(USE_PTHREAD_SPECIFIC)
+      if (EXPECT(NULL == tsd, 0)) { /* no thread-specific data available */
+	return GC_core_malloc(bytes);
+      }
+#   endif
+#   ifdef GC_ASSERTIONS
+      LOCK();
+      GC_ASSERT(tsd == (void *)GC_lookup_thread(pthread_self()));
+      UNLOCK();
+#   endif
+    tiny_fl = ((GC_thread)tsd) -> normal_freelists;
+    GC_FAST_MALLOC_GRANS(result, granules, tiny_fl, DIRECT_GRANULES,
+		         NORMAL, GC_core_malloc(bytes), obj_link(result)=0);
+    return result; /* the init argument above clears the object's link word */
 }
 
-void * GC_local_malloc_atomic(size_t bytes)
+void * GC_malloc_atomic(size_t bytes)
 {
-    if (EXPECT(!SMALL_ENOUGH(bytes), 0)) {
-        return(GC_malloc_atomic(bytes));
-    } else {
-	int index = INDEX_FROM_REQUESTED_BYTES(bytes);
-	void **my_fl = ((GC_thread)GC_getspecific(GC_thread_key))
-		        -> ptrfree_freelists + index;
-	void *my_entry = *my_fl;
-    
-	if (EXPECT((word)my_entry >= HBLKSIZE, 1)) {
-	    void * result = my_entry;
-	    *my_fl = obj_link(my_entry);
-	    GC_ASSERT(GC_size(result) >= bytes + EXTRA_BYTES);
-	    return result;
-	} else if ((word)my_entry - 1 < DIRECT_GRANULES) {
-	    *my_fl = my_entry + index + 1;
-            return GC_malloc_atomic(bytes);
-	} else {
-	    GC_generic_malloc_many(RAW_BYTES_FROM_INDEX(index), PTRFREE, my_fl);
-	    /* *my_fl is updated while the collector is excluded;	*/
-	    /* the free list is always visible to the collector as 	*/
-	    /* such.							*/
-	    if (*my_fl == 0) return GC_oom_fn(bytes);
-	    return GC_local_malloc_atomic(bytes);
-	}
-    }
+    size_t granules = ROUNDED_UP_GRANULES(bytes); /* index, as in GC_malloc */
+    void *result;
+    void **tiny_fl = ((GC_thread)GC_getspecific(GC_thread_key))
+		        		-> ptrfree_freelists;
+    GC_FAST_MALLOC_GRANS(result, granules, tiny_fl, DIRECT_GRANULES,
+		         PTRFREE, GC_core_malloc_atomic(bytes), /* no init */);
+    return result;
 }
 
 #ifdef GC_GCJ_SUPPORT
@@ -353,48 +317,35 @@ void * GC_local_malloc_atomic(size_t bytes)
 
 extern int GC_gcj_kind;
 
-void * GC_local_gcj_malloc(size_t bytes,
-			   void * ptr_to_struct_containing_descr)
+void * GC_gcj_malloc(size_t bytes,
+		     void * ptr_to_struct_containing_descr)
 {
+    size_t granules = ROUNDED_UP_GRANULES(bytes);
+    void *result;
+    void **tiny_fl = ((GC_thread)GC_getspecific(GC_thread_key))
+		        		-> gcj_freelists; /* cast first, then -> */
     GC_ASSERT(GC_gcj_malloc_initialized);
-    if (EXPECT(!SMALL_ENOUGH(bytes), 0)) {
-        return GC_gcj_malloc(bytes, ptr_to_struct_containing_descr);
-    } else {
-	int index = INDEX_FROM_REQUESTED_BYTES(bytes);
-	void **my_fl = ((GC_thread)GC_getspecific(GC_thread_key))
-	                -> gcj_freelists + index;
-	void *my_entry = *my_fl;
-	if (EXPECT((word)my_entry >= HBLKSIZE, 1)) {
-	    void * result = my_entry;
-	    GC_ASSERT(!GC_incremental);
-	    /* We assert that any concurrent marker will stop us.	*/
-	    /* Thus it is impossible for a mark procedure to see the 	*/
-	    /* allocation of the next object, but to see this object 	*/
-	    /* still containing a free list pointer.  Otherwise the 	*/
-	    /* marker might find a random "mark descriptor".		*/
-	    *(volatile ptr_t *)my_fl = obj_link(my_entry);
-	    /* We must update the freelist before we store the pointer.	*/
-	    /* Otherwise a GC at this point would see a corrupted	*/
-	    /* free list.						*/
-	    /* A memory barrier is probably never needed, since the 	*/
-	    /* action of stopping this thread will cause prior writes	*/
-	    /* to complete.						*/
-	    GC_ASSERT(GC_size(result) >= bytes + EXTRA_BYTES);
-	    GC_ASSERT(((void * volatile *)result)[1] == 0); 
-	    *(void * volatile *)result = ptr_to_struct_containing_descr; 
-	    return result;
-	} else if ((word)my_entry - 1 < DIRECT_GRANULES) {
-	    if (!GC_incremental) *my_fl = my_entry + index + 1;
-	    	/* In the incremental case, we always have to take this */
-	    	/* path.  Thus we leave the counter alone.		*/
-            return GC_gcj_malloc(bytes, ptr_to_struct_containing_descr);
-	} else {
-	    GC_generic_malloc_many(RAW_BYTES_FROM_INDEX(index),
-			           GC_gcj_kind, my_fl);
-	    if (*my_fl == 0) return GC_oom_fn(bytes);
-	    return GC_local_gcj_malloc(bytes, ptr_to_struct_containing_descr);
-	}
-    }
+    GC_FAST_MALLOC_GRANS(result, granules, tiny_fl, DIRECT_GRANULES,
+		         GC_gcj_kind,
+			 GC_core_gcj_malloc(bytes, ptr_to_struct_containing_descr),
+			 {AO_compiler_barrier();
+			  *(void **)result = ptr_to_struct_containing_descr;});
+        /* This forces the initialization of the "method ptr".         */
+        /* This is necessary to ensure some very subtle properties     */
+        /* required if a GC is run in the middle of such an allocation.*/
+        /* Here we implicitly also assume atomicity for the free list  */
+        /* and method pointer assignments.                             */
+        /* We must update the freelist before we store the pointer.    */
+        /* Otherwise a GC at this point would see a corrupted free list.*/
+        /* A real memory barrier is not needed, since the              */
+        /* action of stopping this thread will cause prior writes      */
+        /* to complete.                                                */
+        /* We assert that any concurrent marker will stop us.          */
+        /* Thus it is impossible for a mark procedure to see the       */
+        /* allocation of the next object, but to see this object       */
+        /* still containing a free list pointer.  Otherwise the        */
+        /* marker might find a random "mark descriptor".               */
+    return result;
 }
 
 #endif /* GC_GCJ_SUPPORT */
@@ -405,20 +356,6 @@ void * GC_local_gcj_malloc(size_t bytes,
 
 # endif /* !THREAD_LOCAL_ALLOC */
 
-#if 0
-/*
-To make sure that we're using LinuxThreads and not some other thread
-package, we generate a dummy reference to `pthread_kill_other_threads_np'
-(was `__pthread_initial_thread_bos' but that disappeared),
-which is a symbol defined in LinuxThreads, but (hopefully) not in other
-thread packages.
-
-We no longer do this, since this code is now portable enough that it might
-actually work for something else.
-*/
-void (*dummy_var_to_force_linux_threads)() = pthread_kill_other_threads_np;
-#endif /* 0 */
-
 long GC_nprocs = 1;	/* Number of processors.  We may not have	*/
 			/* access to all of them, but this is as good	*/
 			/* a guess as any ...				*/
@@ -1000,64 +937,45 @@ int WRAP_FUNC(pthread_sigmask)(int how, const sigset_t *set, sigset_t *oset)
 }
 #endif /* !GC_DARWIN_THREADS */
 
-/* Wrappers for functions that are likely to block for an appreciable	*/
-/* length of time.  Must be called in pairs, if at all.			*/
-/* Nothing much beyond the system call itself should be executed	*/
-/* between these.							*/
+/* Wrapper for functions that are likely to block for an appreciable	*/
+/* length of time.							*/
+
+struct blocking_data {
+    void (*fn)(void *);
+    void *arg;
+};
 
-void GC_start_blocking(void) {
-#   define SP_SLOP 128
+static void GC_do_blocking_inner(ptr_t data, void * context) {
+    struct blocking_data * d = (struct blocking_data *) data;
     GC_thread me;
     LOCK();
     me = GC_lookup_thread(pthread_self());
     GC_ASSERT(!(me -> thread_blocked));
 #   ifdef SPARC
 	me -> stop_info.stack_ptr = (ptr_t)GC_save_regs_in_stack();
-#   else
-#   ifndef GC_DARWIN_THREADS
+#   elif !defined(GC_DARWIN_THREADS)
 	me -> stop_info.stack_ptr = (ptr_t)GC_approx_sp();
 #   endif
-#   endif
 #   ifdef IA64
-	me -> backing_store_ptr = (ptr_t)GC_save_regs_in_stack() + SP_SLOP;
-#   endif
-    /* Add some slop to the stack pointer, since the wrapped call may 	*/
-    /* end up pushing more callee-save registers.			*/
-#   ifndef GC_DARWIN_THREADS
-#   ifdef STACK_GROWS_UP
-	me -> stop_info.stack_ptr += SP_SLOP;
-#   else
-	me -> stop_info.stack_ptr -= SP_SLOP;
-#   endif
+	me -> backing_store_ptr = (ptr_t)GC_save_regs_in_stack();
 #   endif
     me -> thread_blocked = TRUE;
+    /* Save context here if we want to support precise stack marking */
     UNLOCK();
-}
-
-void GC_end_blocking(void) {
-    GC_thread me;
+    (d -> fn)(d -> arg);
     LOCK();   /* This will block if the world is stopped.	*/
-    me = GC_lookup_thread(pthread_self());
-    GC_ASSERT(me -> thread_blocked);
     me -> thread_blocked = FALSE;
     UNLOCK();
 }
-    
-#if defined(GC_DGUX386_THREADS)
-#define __d10_sleep sleep
-#endif /* GC_DGUX386_THREADS */
 
-/* A wrapper for the standard C sleep function	*/
-int WRAP_FUNC(sleep) (unsigned int seconds)
-{
-    int result;
+void GC_do_blocking(void (*fn)(void *), void *arg) {
+    struct blocking_data my_data;
 
-    GC_start_blocking();
-    result = REAL_FUNC(sleep)(seconds);
-    GC_end_blocking();
-    return result;
+    my_data.fn = fn;
+    my_data.arg = arg;
+    GC_with_callee_saves_pushed(GC_do_blocking_inner, (ptr_t)(&my_data));
 }
-
+    
 struct start_info {
     void *(*start_routine)(void *);
     void *arg;
@@ -1066,16 +984,14 @@ struct start_info {
 				/* parent hasn't yet noticed.		*/
 };
 
-/* Called at thread exit.				*/
-/* Never called for main thread.  That's OK, since it	*/
-/* results in at most a tiny one-time leak.  And 	*/
-/* linuxthreads doesn't reclaim the main threads 	*/
-/* resources or id anyway.				*/
-void GC_thread_exit_proc(void *arg)
+int GC_unregister_my_thread(void)
 {
     GC_thread me;
 
     LOCK();
+    /* Wait for any GC that may be marking from our stack to	*/
+    /* complete before we remove this thread.			*/
+    GC_wait_for_gc_completion(FALSE);
     me = GC_lookup_thread(pthread_self());
     GC_destroy_thread_local(me);
     if (me -> flags & DETACHED) {
@@ -1087,9 +1003,18 @@ void GC_thread_exit_proc(void *arg)
        && !defined(USE_COMPILER_TLS) && !defined(DBG_HDRS_ALL)
       GC_remove_specific(GC_thread_key);
 #   endif
-    /* The following may run the GC from "nonexistent" thread.	*/
-    GC_wait_for_gc_completion(FALSE);
     UNLOCK();
+    return GC_SUCCESS;
+}
+
+/* Called at thread exit.				*/
+/* Never called for main thread.  That's OK, since it	*/
+/* results in at most a tiny one-time leak.  And 	*/
+/* linuxthreads doesn't reclaim the main threads 	*/
+/* resources or id anyway.				*/
+void GC_thread_exit_proc(void *arg)
+{
+    GC_unregister_my_thread();
 }
 
 int WRAP_FUNC(pthread_join)(pthread_t thread, void **retval)
@@ -1147,7 +1072,46 @@ WRAP_FUNC(pthread_detach)(pthread_t thread)
 
 GC_bool GC_in_thread_creation = FALSE;
 
-void * GC_start_routine(void * arg)
+GC_thread GC_register_my_thread_inner(struct GC_stack_base *sb,
+				      pthread_t my_pthread)
+{
+    GC_thread me;
+    /* Create the entry for my_pthread and record the stack base from sb. */
+    GC_in_thread_creation = TRUE; /* OK to collect from unknown thread. */
+    me = GC_new_thread(my_pthread);
+    GC_in_thread_creation = FALSE;
+#   ifdef GC_DARWIN_THREADS
+      me -> stop_info.mach_thread = mach_thread_self();
+#   else
+      me -> stop_info.stack_ptr = sb -> mem_base;
+#   endif
+    me -> stack_end = sb -> mem_base;
+#   ifdef IA64
+      me -> backing_store_end = sb -> reg_base;
+#   endif /* IA64 */
+    return me;
+}
+
+int GC_register_my_thread(struct GC_stack_base *sb)
+{
+    pthread_t my_pthread = pthread_self();
+    GC_thread me;	/* Declared before LOCK(): C89 has no mixed decls. */
+    LOCK();
+    me = GC_lookup_thread(my_pthread);
+    if (0 == me) {
+        me = GC_register_my_thread_inner(sb, my_pthread);
+	me -> flags |= DETACHED;
+    	  /* Treat as detached, since we do not need to worry about	*/
+    	  /* pointer results.						*/
+	UNLOCK();
+        return GC_SUCCESS;
+    } else {
+	UNLOCK();
+	return GC_DUPLICATE;
+    }
+}
+
+void * GC_inner_start_routine(struct GC_stack_base *sb, void * arg)
 {
     int dummy;
     struct start_info * si = arg;
@@ -1164,39 +1128,8 @@ void * GC_start_routine(void * arg)
         GC_printf("sp = 0x%lx\n", (long) &arg);
 #   endif
     LOCK();
-    GC_in_thread_creation = TRUE;
-    me = GC_new_thread(my_pthread);
-    GC_in_thread_creation = FALSE;
-#ifdef GC_DARWIN_THREADS
-    me -> stop_info.mach_thread = mach_thread_self();
-#else
-    me -> stop_info.stack_ptr = 0;
-#endif
+    me = GC_register_my_thread_inner(sb, my_pthread);
     me -> flags = si -> flags;
-    /* me -> stack_end = GC_linux_stack_base(); -- currently (11/99)	*/
-    /* doesn't work because the stack base in /proc/self/stat is the 	*/
-    /* one for the main thread.  There is a strong argument that that's	*/
-    /* a kernel bug, but a pervasive one.				*/
-#   ifdef STACK_GROWS_DOWN
-      me -> stack_end = (ptr_t)(((word)(&dummy) + (GC_page_size - 1))
-		                & ~(GC_page_size - 1));
-#	  ifndef GC_DARWIN_THREADS
-        me -> stop_info.stack_ptr = me -> stack_end - 0x10;
-#	  endif
-	/* Needs to be plausible, since an asynchronous stack mark	*/
-	/* should not crash.						*/
-#   else
-      me -> stack_end = (ptr_t)((word)(&dummy) & ~(GC_page_size - 1));
-      me -> stop_info.stack_ptr = me -> stack_end + 0x10;
-#   endif
-    /* This is dubious, since we may be more than a page into the stack, */
-    /* and hence skip some of it, though it's not clear that matters.	 */
-#   ifdef IA64
-      me -> backing_store_end = (ptr_t)
-			(GC_save_regs_in_stack() & ~(GC_page_size - 1));
-      /* This is also < 100% convincing.  We should also read this 	*/
-      /* from /proc, but the hook to do so isn't there yet.		*/
-#   endif /* IA64 */
     UNLOCK();
     start = si -> start_routine;
 #   ifdef DEBUG_THREADS
@@ -1212,9 +1145,9 @@ void * GC_start_routine(void * arg)
 	UNLOCK();
 #   endif
     result = (*start)(start_arg);
-#if DEBUG_THREADS
+#   if DEBUG_THREADS
         GC_printf("Finishing thread 0x%x\n", (unsigned)pthread_self());
-#endif
+#   endif
     me -> status = result;
     pthread_cleanup_pop(1);
     /* Cleanup acquires lock, ensuring that we can't exit		*/
@@ -1223,6 +1156,11 @@ void * GC_start_routine(void * arg)
     return(result);
 }
 
+void * GC_start_routine(void * arg)
+{   /* Propagate the inner routine's result instead of falling off the end. */
+    return GC_call_with_stack_base(GC_inner_start_routine, arg);
+}
+
 int
 WRAP_FUNC(pthread_create)(pthread_t *new_thread,
 		  const pthread_attr_t *attr,
diff --git a/ptr_chck.c b/ptr_chck.c
index 23e183c7..ad20c9ab 100644
--- a/ptr_chck.c
+++ b/ptr_chck.c
@@ -209,8 +209,7 @@ void * GC_is_visible(void *p)
     	    if (GC_is_static_root(p)) return(p);
     	    /* Else do it again correctly:	*/
 #           if (defined(DYNAMIC_LOADING) || defined(MSWIN32) || \
-		defined(MSWINCE) || defined(PCR)) \
-                && !defined(SRC_M3)
+		defined(MSWINCE) || defined(PCR))
     	        GC_register_dynamic_libraries();
     	        result = GC_is_static_root(p);
     	        if (result) return(p);
diff --git a/reclaim.c b/reclaim.c
index 279b75b7..6cb8b47e 100644
--- a/reclaim.c
+++ b/reclaim.c
@@ -332,13 +332,14 @@ int GC_n_set_marks(hdr *hhdr)
 {
     int result = 0;
     int i;
-    int n_objs = HBLK_OBJS(hhdr -> hb_sz);
-    
-    if (0 == n_objs) n_objs = 1;
-    for (i = 0; i < n_objs; i++) {
+    size_t sz = hhdr -> hb_sz;
+    int offset = MARK_BIT_OFFSET(sz);
+    int limit = FINAL_MARK_BIT(sz);
+
+    for (i = 0; i < limit; i += offset) {
         result += hhdr -> hb_marks[i];
     }
-    GC_ASSERT(hhdr -> hb_marks[n_objs]);
+    GC_ASSERT(hhdr -> hb_marks[limit]);
     return(result);
 }
 
@@ -362,16 +363,24 @@ int GC_n_set_marks(hdr *hhdr)
 {
     int result = 0;
     int i;
-    int n_objs = HBLK_OBJS(hhdr -> hb_sz);
     int n_mark_words;
+#   ifdef MARK_BIT_PER_OBJ
+      int n_objs = HBLK_OBJS(hhdr -> hb_sz);
     
-    if (0 == n_objs) n_objs = 1;
-    n_mark_words = divWORDSZ(n_objs + WORDSZ - 1);
+      if (0 == n_objs) n_objs = 1;
+      n_mark_words = divWORDSZ(n_objs + WORDSZ - 1);
+#   else /* MARK_BIT_PER_GRANULE */
+      n_mark_words = MARK_BITS_SZ;
+#   endif
     for (i = 0; i < n_mark_words - 1; i++) {
         result += set_bits(hhdr -> hb_marks[i]);
     }
-    result += set_bits((hhdr -> hb_marks[n_mark_words])
-		       << (n_mark_words * WORDSZ - n_objs));
+#   ifdef MARK_BIT_PER_OBJ
+      result += set_bits((hhdr -> hb_marks[n_mark_words - 1])
+		         << (n_mark_words * WORDSZ - n_objs));
+#   else
+      result += set_bits(hhdr -> hb_marks[n_mark_words - 1]);
+#   endif
     return(result - 1);
 }
 
@@ -405,7 +414,7 @@ void GC_print_block_list()
 {
     struct Print_stats pstats;
 
-    GC_printf("(kind(0=ptrfree,1=normal,2=unc.,3=stubborn):size_in_bytes, #_marks_set)\n");
+    GC_printf("(kind(0=ptrfree,1=normal,2=unc.):size_in_bytes, #_marks_set)\n");
     pstats.number_of_blocks = 0;
     pstats.total_bytes = 0;
     GC_apply_to_all_blocks(GC_print_block_descr, (word)&pstats);
diff --git a/tests/Makefile.am b/tests/Makefile.am
new file mode 100644
index 00000000..0064f3ba
--- /dev/null
+++ b/tests/Makefile.am
@@ -0,0 +1,37 @@
+AM_CPPFLAGS=-I$(top_builddir)/include -I$(top_builddir)/include/private
+
+if CPLUSPLUS
+extra_checks = test_cpp
+else
+extra_checks = 
+endif
+
+## FIXME: trace_test doesn't work on Mac OS X 10.3:
+## gcc -g -O2 -o .libs/tracetest trace_test.o  ../.libs/libgc.dylib -lpthread
+## ld: Undefined symbols:
+## _GC_generate_random_backtrace
+
+check_PROGRAMS = gctest leaktest middletest threadleaktest $(extra_checks)
+
+gctest_SOURCES = test.c
+gctest_LDADD = $(top_builddir)/libgc.la $(THREADLIBS) $(UNWINDLIBS) $(EXTRA_TEST_LIBS)
+gctest_DEPENDENCIES = $(top_builddir)/libgc.la
+
+leaktest_SOURCES = leak_test.c
+leaktest_LDADD = $(top_builddir)/libgc.la $(THREADLIBS) $(UNWINDLIBS) $(EXTRA_TEST_LIBS)
+
+middletest_SOURCES = middle.c
+middletest_LDADD = $(top_builddir)/libgc.la $(THREADLIBS) $(UNWINDLIBS) $(EXTRA_TEST_LIBS)
+
+## tracetest_SOURCES = trace_test.c
+## tracetest_LDADD = $(top_builddir)/libgc.la $(THREADLIBS) $(UNWINDLIBS) $(EXTRA_TEST_LIBS)
+
+threadleaktest_SOURCES = thread_leak_test.c
+threadleaktest_LDADD = $(top_builddir)/libgc.la $(THREADLIBS) $(UNWINDLIBS) $(EXTRA_TEST_LIBS)
+
+if CPLUSPLUS
+test_cpp_SOURCES = test_cpp.cc
+test_cpp_LDADD = $(top_builddir)/libgc.la $(top_builddir)/libgccpp.la $(THREADLIBS) $(UNWINDLIBS) $(EXTRA_TEST_LIBS)
+endif
+
+TESTS = gctest leaktest middletest threadleaktest $(extra_checks)
diff --git a/tests/test.c b/tests/test.c
index 14989bab..1abf30e3 100644
--- a/tests/test.c
+++ b/tests/test.c
@@ -39,9 +39,6 @@
 # include <assert.h>	/* Not normally used, but handy for debugging. */
 # include "gc.h"
 # include "gc_typed.h"
-# ifdef THREAD_LOCAL_ALLOC
-#   include "gc_local_alloc.h"
-# endif
 # include "private/gc_priv.h"	/* For output, locking, MIN_WORDS, 	*/
 				/* and some statistics.			*/
 # include "private/gcconfig.h"
@@ -242,53 +239,6 @@ struct GC_ms_entry * fake_gcj_mark_proc(word * addr,
 
 #endif /* GC_GCJ_SUPPORT */
 
-#ifdef THREAD_LOCAL_ALLOC
-
-#undef GC_REDIRECT_TO_LOCAL
-#include "gc_local_alloc.h"
-
-sexpr local_cons (x, y)
-sexpr x;
-sexpr y;
-{
-    register sexpr r;
-    register int *p;
-    register int my_extra = extra_count;
-    static int my_random = 0;
-    
-    collectable_count++;
-    r = (sexpr) GC_LOCAL_MALLOC(sizeof(struct SEXPR) + my_extra);
-#   ifdef GC_GCJ_SUPPORT
-      if (collectable_count % 2 == 0) {
-        r = (sexpr) GC_LOCAL_GCJ_MALLOC(sizeof(struct SEXPR) + sizeof(GC_word) + my_extra,
-					&gcj_class_struct1);
-        r = (sexpr) ((GC_word *)r + 1);
-      }
-#   endif
-    if (r == 0) {
-        (void)GC_printf("Out of memory\n");
-        exit(1);
-    }
-    for (p = (int *)r;
-         ((char *)p) < ((char *)r) + my_extra + sizeof(struct SEXPR); p++) {
-	if (*p) {
-	    (void)GC_printf(
-		"Found nonzero at %p (local) - allocator is broken\n", p);
-	    FAIL;
-        }
-        *p = (7 << 12) + ((p - (int *)r) & 0xfff);
-    }
-    r -> sexpr_car = x;
-    r -> sexpr_cdr = y;
-    my_extra++;
-    if ( my_extra >= 5000 || (my_extra == 200 && ++my_random % 37 != 0)) {
-        extra_count = 0;
-    } else {
-        extra_count = my_extra;
-    }
-    return(r);
-}
-#endif /* THREAD_LOCAL_ALLOC */
 
 sexpr small_cons (x, y)
 sexpr x;
@@ -416,35 +366,6 @@ int low, up;
 }
 #endif /* GC_GCJ_SUPPORT */
 
-#ifdef THREAD_LOCAL_ALLOC
-/* Return reverse(x) concatenated with y */
-sexpr local_reverse1(x, y)
-sexpr x, y;
-{
-    if (is_nil(x)) {
-        return(y);
-    } else {
-        return( local_reverse1(cdr(x), local_cons(car(x), y)) );
-    }
-}
-
-sexpr local_reverse(x)
-sexpr x;
-{
-    return( local_reverse1(x, nil) );
-}
-
-sexpr local_ints(low, up)
-int low, up;
-{
-    if (low > up) {
-	return(nil);
-    } else {
-        return(local_cons(local_cons(INT_TO_SEXPR(low), nil), local_ints(low+1, up)));
-    }
-}
-#endif /* THREAD_LOCAL_ALLOC */
-
 /* To check uncollectable allocation we build lists with disguised cdr	*/
 /* pointers, and make sure they don't go away.				*/
 sexpr uncollectable_ints(low, up)
@@ -549,9 +470,6 @@ void check_marks_int_list(sexpr x)
     int i;
     for (i = 0; i < 5; ++i) {
       check_ints(reverse(reverse(ints(1,10))), 1, 10);
-#     ifdef THREAD_LOCAL_ALLOC
-        check_ints(local_reverse(local_reverse(local_ints(1,10))), 1, 10);
-#     endif
     }
     return 0;
 }
@@ -700,9 +618,6 @@ void reverse_test()
     	/* 49 integers.  Thus this is thread safe without locks,	  */
     	/* assuming atomic pointer assignments.				  */
         a = reverse(reverse(a));
-#       ifdef THREAD_LOCAL_ALLOC
-	  a = local_reverse(local_reverse(a));
-#	endif
 #	if !defined(AT_END) && !defined(THREADS)
 	  /* This is not thread safe, since realloc explicitly deallocates */
           if (i & 1) {
@@ -808,20 +723,9 @@ int live_indicators_count = 0;
 tn * mktree(n)
 int n;
 {
-#   ifdef THREAD_LOCAL_ALLOC
-      tn * result = (tn *)GC_LOCAL_MALLOC(sizeof(tn));
-#   else
-      tn * result = (tn *)GC_MALLOC(sizeof(tn));
-#   endif
+    tn * result = (tn *)GC_MALLOC(sizeof(tn));
     
     collectable_count++;
-#   ifdef THREAD_LOCAL_ALLOC
-       /* Minimally exercise thread local allocation */
-       {
-         char * result = (char *)GC_LOCAL_MALLOC_ATOMIC(17);
-	 memset(result, 'a', 17);
-       }
-#   endif /* THREAD_LOCAL_ALLOC */
 #   if defined(MACOS)
 	/* get around static data limitations. */
 	if (!live_indicators)
@@ -1512,7 +1416,7 @@ void SetMinimumStack(long minSize)
 #   endif
     GC_INIT();	/* Only needed on a few platforms.	*/
     (void) GC_set_warn_proc(warn_proc);
-#   if (defined(MPROTECT_VDB) || defined(PROC_VDB)) \
+#   if (defined(MPROTECT_VDB) || defined(PROC_VDB) || defined(GWW_VDB)) \
           && !defined(MAKE_BACK_GRAPH)
       GC_enable_incremental();
       (void) GC_printf("Switched to incremental mode\n");
@@ -1652,8 +1556,8 @@ int APIENTRY WinMain(HINSTANCE instance, HINSTANCE prev, LPSTR cmd, int n)
     HANDLE win_thr_h;
 # endif
   DWORD thread_id;
+  GC_INIT();
   GC_enable_incremental();
-  GC_init();
   InitializeCriticalSection(&incr_cs);
   (void) GC_set_warn_proc(warn_proc);
 # ifdef MSWINCE
@@ -1727,7 +1631,7 @@ test()
 }
 #endif
 
-#if defined(GC_SOLARIS_THREADS) || defined(GC_PTHREADS)
+#if defined(GC_PTHREADS)
 void * thr_run_one_test(void * arg)
 {
     run_one_test();
@@ -1738,50 +1642,6 @@ void * thr_run_one_test(void * arg)
 #  define GC_free GC_debug_free
 #endif
 
-#if defined(GC_SOLARIS_THREADS) && !defined(GC_SOLARIS_PTHREADS)
-main()
-{
-    thread_t th1;
-    thread_t th2;
-    int code;
-
-    n_tests = 0;
-    GC_INIT();	/* Only needed if gc is dynamic library.	*/
-#   ifndef MAKE_BACK_GRAPH
-      GC_enable_incremental();
-#   endif
-    (void) GC_set_warn_proc(warn_proc);
-    if (thr_keycreate(&fl_key, GC_free) != 0) {
-        (void)GC_printf("Key creation failed %d\n", code);
-    	FAIL;
-    }
-    if ((code = thr_create(0, 1024*1024, thr_run_one_test, 0, 0, &th1)) != 0) {
-    	(void)GC_printf("Thread 1 creation failed %d\n", code);
-    	FAIL;
-    }
-    if ((code = thr_create(0, 1024*1024, thr_run_one_test, 0, THR_NEW_LWP, &th2)) != 0) {
-    	(void)GC_printf("Thread 2 creation failed %d\n", code);
-    	FAIL;
-    }
-    run_one_test();
-    if ((code = thr_join(th1, 0, 0)) != 0) {
-        (void)GC_printf("Thread 1 failed %d\n", code);
-        FAIL;
-    }
-    if (thr_join(th2, 0, 0) != 0) {
-        (void)GC_printf("Thread 2 failed %d\n", code);
-        FAIL;
-    }
-    check_heap_stats();
-    (void)fflush(stdout);
-    return(0);
-}
-#else /* pthreads */
-
-#ifndef GC_PTHREADS
-  --> bad news
-#endif
-
 int main()
 {
     pthread_t th1;
@@ -1853,4 +1713,3 @@ int main()
     return(0);
 }
 #endif /* GC_PTHREADS */
-#endif /* GC_SOLARIS_THREADS || GC_PTHREADS */
diff --git a/threadlibs.c b/threadlibs.c
index 264b7240..9078c8d8 100644
--- a/threadlibs.c
+++ b/threadlibs.c
@@ -11,10 +11,17 @@ int main()
 	       "-Wl,--wrap -Wl,pthread_sigmask -Wl,--wrap -Wl,sleep\n");
 #   endif
 #   if defined(GC_LINUX_THREADS) || defined(GC_IRIX_THREADS) \
-	|| defined(GC_FREEBSD_THREADS) || defined(GC_SOLARIS_PTHREADS) \
+	|| defined(GC_SOLARIS_PTHREADS) \
 	|| defined(GC_DARWIN_THREADS) || defined(GC_AIX_THREADS)
         printf("-lpthread\n");
 #   endif
+#   if defined(GC_FREEBSD_THREADS)
+#       if defined(__FreeBSD__) && (__FreeBSD__ >= 5) /* compiler-predefined major */
+          printf("-lpthread\n");
+#       else
+          printf("-pthread\n");
+#       endif
+#   endif
 #   if defined(GC_HPUX_THREADS) || defined(GC_OSF1_THREADS)
 	printf("-lpthread -lrt\n");
 #   endif
diff --git a/version.h b/version.h
index 43a2d593..10d8edbd 100644
--- a/version.h
+++ b/version.h
@@ -3,7 +3,7 @@
 /* it to keep the old-style build process working.		*/
 #define GC_TMP_VERSION_MAJOR 7
 #define GC_TMP_VERSION_MINOR 0
-#define GC_TMP_ALPHA_VERSION 1
+#define GC_TMP_ALPHA_VERSION 2
 
 #ifndef GC_NOT_ALPHA
 #   define GC_NOT_ALPHA 0xff
diff --git a/win32_threads.c b/win32_threads.c
index 51ed87e0..6f7efaa5 100755
--- a/win32_threads.c
+++ b/win32_threads.c
@@ -11,6 +11,7 @@
 # undef pthread_create 
 # undef pthread_sigmask 
 # undef pthread_join 
+# undef pthread_detach
 # undef dlopen 
 
 # define DEBUG_CYGWIN_THREADS 0
@@ -37,7 +38,7 @@ typedef LONG * IE_t;
 GC_bool GC_thr_initialized = FALSE;
 
 #ifdef GC_DLL
-  GC_bool GC_need_to_lock = TRUE;
+  GC_API GC_bool GC_need_to_lock = TRUE;
   	/* Cannot intercept thread creation.	*/
 #else
   GC_bool GC_need_to_lock = FALSE;
@@ -46,7 +47,7 @@ GC_bool GC_thr_initialized = FALSE;
 DWORD GC_main_thread = 0;
 
 struct GC_thread_Rep {
-  LONG in_use; /* Updated without lock.	*/
+  AO_t in_use; 		/* Updated without lock.	*/
   			/* We assert that unused 	*/
   			/* entries have invalid ids of	*/
   			/* zero and zero stack fields.  */
@@ -69,8 +70,8 @@ struct GC_thread_Rep {
 typedef volatile struct GC_thread_Rep * GC_thread;
 
 /*
- * We generally assume that volatile ==> memory ordering, at least among
- * volatiles.
+ * We assumed that volatile ==> memory ordering, at least among
+ * volatiles.  This code should consistently use atomic_ops.
  */
 
 volatile GC_bool GC_please_stop = FALSE;
@@ -84,9 +85,12 @@ extern LONG WINAPI GC_write_fault_handler(struct _EXCEPTION_POINTERS *exc_info);
 
 /*
  * This may be called from DllMain, and hence operates under unusual
- * constraints.
+ * constraints.  In particular, it must be lock-free.
+ * Always called from the thread being added.
  */
-static GC_thread GC_new_thread(void) {
+static GC_thread GC_register_my_thread_inner(struct GC_stack_base *sb,
+					     DWORD thread_id)
+{
   int i;
   /* It appears to be unsafe to acquire a lock here, since this	*/
   /* code is apparently not preeemptible on some systems.	*/
@@ -108,6 +112,8 @@ static GC_thread GC_new_thread(void) {
     /* InterlockedExchange is supposed to be replaced by		*/
     /* InterlockedExchangePointer, but that's not really what I		*/
     /* want here.							*/
+    /* FIXME: We should eventually declare Win95 dead and use AO_	*/
+    /* primitives here.							*/
     if (i == MAX_THREADS - 1)
       ABORT("too many threads");
   }
@@ -139,14 +145,14 @@ static GC_thread GC_new_thread(void) {
 	GC_err_printf("Last error code: %d\n", last_error);
 	ABORT("DuplicateHandle failed");
   }
-  thread_table[i].stack_base = GC_get_stack_base();
+  thread_table[i].stack_base = sb -> mem_base;
   /* Up until this point, GC_push_all_stacks considers this thread	*/
   /* invalid.								*/
   if (thread_table[i].stack_base == NULL) 
-    ABORT("Failed to find stack base in GC_new_thread");
+    ABORT("Bad stack base in GC_register_my_thread");
   /* Up until this point, this entry is viewed as reserved but invalid	*/
   /* by GC_delete_thread.						*/
-  thread_table[i].id = GetCurrentThreadId();
+  thread_table[i].id = thread_id;
   /* If this thread is being created while we are trying to stop	*/
   /* the world, wait here.  Hopefully this can't happen on any	*/
   /* systems that don't allow us to block here.			*/
@@ -169,9 +175,44 @@ LONG GC_get_max_thread_index()
   return my_max;
 }
 
-/* This is intended to be lock-free, though that			*/
-/* assumes that the CloseHandle becomes visible before the 		*/
-/* in_use assignment.							*/
+/* Return the GC_thread corresponding to a thread id.  May be called 	*/
+/* without a lock, but should be called in contexts in which the	*/
+/* requested thread cannot be asynchronously deleted, e.g. from the	*/
+/* thread itself.							*/
+static GC_thread GC_lookup_thread(DWORD thread_id) {
+  int i;
+  LONG my_max = GC_get_max_thread_index();
+
+  for (i = 0;
+       i <= my_max &&
+       (!AO_load_acquire(&(thread_table[i].in_use))
+	|| thread_table[i].id != thread_id);
+       /* Must still be in_use, since nobody else can store our thread_id. */
+       i++) {}
+  if (i > my_max) {
+    return 0;
+  } else {
+    return thread_table + i;
+  }
+}
+
+int GC_register_my_thread(struct GC_stack_base *sb) {
+  DWORD t = GetCurrentThreadId();
+
+  if (0 == GC_lookup_thread(t)) {
+    /* We lock here, since we want to wait for an ongoing GC.	*/
+    LOCK();
+    GC_register_my_thread_inner(sb, t);
+    UNLOCK();
+    return GC_SUCCESS;
+  } else {
+    return GC_DUPLICATE;
+  }
+}
+
+/* This is intended to be lock-free.					*/
+/* It is either called synchronously from the thread being deleted,	*/
+/* or by the joining thread.						*/
 static void GC_delete_gc_thread(GC_thread thr)
 {
     CloseHandle(thr->handle);
@@ -181,41 +222,43 @@ static void GC_delete_gc_thread(GC_thread thr)
 #   ifdef CYGWIN32
       thr->pthread_id = 0;
 #   endif /* CYGWIN32 */
-    thr->in_use = FALSE;
+    AO_store_release(&(thr->in_use), FALSE);
 }
 
+
 static void GC_delete_thread(DWORD thread_id) {
-  int i;
-  LONG my_max = GC_get_max_thread_index();
+  GC_thread t = GC_lookup_thread(thread_id);
 
-  for (i = 0;
-       i <= my_max &&
-       (!thread_table[i].in_use || thread_table[i].id != thread_id);
-       /* Must still be in_use, since nobody else can store our thread_id. */
-       i++) {}
-  if (i > my_max) {
+  if (0 == t) {
     WARN("Removing nonexisiting thread %ld\n", (GC_word)thread_id);
   } else {
-    GC_delete_gc_thread(thread_table+i);
+    GC_delete_gc_thread(t);
   }
 }
 
+int GC_unregister_my_thread(void)
+{
+    GC_delete_thread(GetCurrentThreadId());
+    return GC_SUCCESS;
+}
+
 
 #ifdef CYGWIN32
 
 /* Return a GC_thread corresponding to a given pthread_t.	*/
 /* Returns 0 if it's not there.					*/
 /* We assume that this is only called for pthread ids that	*/
-/* have not yet terminated or are still joinable.		*/
-static GC_thread GC_lookup_thread(pthread_t id)
+/* have not yet terminated or are still joinable, and		*/
+/* cannot be concurrently terminated.				*/
+static GC_thread GC_lookup_pthread(pthread_t id)
 {
   int i;
   LONG my_max = GC_get_max_thread_index();
 
   for (i = 0;
        i <= my_max &&
-       (!thread_table[i].in_use || thread_table[i].pthread_id != id
-	|| !thread_table[i].in_use);
+       (!AO_load_acquire(&(thread_table[i].in_use))
+	|| thread_table[i].pthread_id != id);
        /* Must still be in_use, since nobody else can store our thread_id. */
        i++);
   if (i > my_max) return 0;
@@ -269,7 +312,7 @@ void GC_stop_world(void)
 #         ifndef CYGWIN32
             /* this breaks pthread_join on Cygwin, which is guaranteed to  */
 	    /* only see user pthreads 					   */
-	    thread_table[i].in_use = FALSE;
+	    AO_store(&(thread_table[i].in_use), FALSE);
 	    CloseHandle(thread_table[i].handle);
 #         endif
 	  continue;
@@ -497,8 +540,10 @@ static DWORD WINAPI thread_start(LPVOID arg)
 {
     DWORD ret = 0;
     thread_args *args = (thread_args *)arg;
+    struct GC_stack_base sb;	/* Object, not pointer: &sb is passed below. */
 
-    GC_new_thread();
+    GC_get_stack_base(&sb);
+    GC_register_my_thread(&sb); /* This waits for an in-progress GC. */
 
     /* Clear the thread entry even if we exit with an exception.	*/
     /* This is probably pointless, since an uncaught exception is	*/
@@ -576,12 +621,15 @@ DWORD WINAPI main_thread_start(LPVOID arg)
 
 /* Called by GC_init() - we hold the allocation lock.	*/
 void GC_thr_init(void) {
+    struct GC_stack_base sb;
+
     if (GC_thr_initialized) return;
     GC_main_thread = GetCurrentThreadId();
     GC_thr_initialized = TRUE;
 
     /* Add the initial thread, so we can stop it.	*/
-    GC_new_thread();
+    GC_get_stack_base(&sb);
+    GC_register_my_thread(&sb);
 }
 
 #ifdef CYGWIN32
@@ -595,7 +643,7 @@ struct start_info {
 int GC_pthread_join(pthread_t pthread_id, void **retval) {
     int result;
     int i;
-    GC_thread me;
+    GC_thread joinee;
 
 #   if DEBUG_CYGWIN_THREADS
       GC_printf("thread 0x%x(0x%x) is joining thread 0x%x.\n",
@@ -607,11 +655,13 @@ int GC_pthread_join(pthread_t pthread_id, void **retval) {
     /* FIXME: It would be better if this worked more like	 */
     /* pthread_support.c.					 */
 
-    while ((me = GC_lookup_thread(pthread_id)) == 0) Sleep(10);
+    while ((joinee = GC_lookup_pthread(pthread_id)) == 0) Sleep(10);
 
     result = pthread_join(pthread_id, retval);
 
-    GC_delete_gc_thread(me);
+    /* FIXME:  This is an asynchronous deletion, which we said can't	*/
+    /* happen?								*/
+    GC_delete_gc_thread(joinee);
 
 #   if DEBUG_CYGWIN_THREADS
       GC_printf("thread 0x%x(0x%x) completed join with thread 0x%x.\n",
@@ -669,13 +719,15 @@ void * GC_start_routine(void * arg)
     void *(*start)(void *);
     void *start_arg;
     pthread_t pthread_id;
+    DWORD thread_id = GetCurrentThreadId();
     GC_thread me;
     GC_bool detached;
     int i;
+    struct GC_stack_base sb;
 
 #   if DEBUG_CYGWIN_THREADS
       GC_printf("thread 0x%x(0x%x) starting...\n",(int)pthread_self(),
-		      				  GetCurrentThreadId());
+		      				  thread_id);
 #   endif
 
     /* If a GC occurs before the thread is registered, that GC will	*/
@@ -685,7 +737,8 @@ void * GC_start_routine(void * arg)
     LOCK();
     /* We register the thread here instead of in the parent, so that	*/
     /* we don't need to hold the allocation lock during pthread_create. */
-    me = GC_new_thread();
+    GC_get_stack_base(&sb);
+    me = GC_register_my_thread_inner(&sb, thread_id);
     UNLOCK();
 
     start = si -> start_routine;
@@ -739,7 +792,7 @@ int GC_pthread_detach(pthread_t thread)
     GC_thread thread_gc_id;
     
     LOCK();
-    thread_gc_id = GC_lookup_thread(thread);
+    thread_gc_id = GC_lookup_pthread(thread);
     UNLOCK();
     result = pthread_detach(thread);
     if (result == 0) {
@@ -758,24 +811,33 @@ int GC_pthread_detach(pthread_t thread)
 
 /*
  * We avoid acquiring locks here, since this doesn't seem to be preemptable.
- * Pontus Rydin suggests wrapping the thread start routine instead.
+ * Pontus Rydin suggested wrapping the thread start routine instead, which
+ * we do in other places.
  */
 #ifdef GC_DLL
 BOOL WINAPI DllMain(HINSTANCE inst, ULONG reason, LPVOID reserved)
 {
+  struct GC_stack_base sb;
+  DWORD thread_id;
+
   switch (reason) {
   case DLL_PROCESS_ATTACH:
     GC_init();	/* Force initialization before thread attach.	*/
     /* fall through */
   case DLL_THREAD_ATTACH:
     GC_ASSERT(GC_thr_initialized);
-    if (GC_main_thread != GetCurrentThreadId()) {
-        GC_new_thread();
+    thread_id = GetCurrentThreadId();
+    if (GC_main_thread != thread_id) {
+	/* Don't lock here.	*/
+	GC_get_stack_base(&sb);
+	GC_register_my_thread_inner(&sb, thread_id);
     } /* o.w. we already did it during GC_thr_init(), called by GC_init() */
     break;
 
   case DLL_THREAD_DETACH:
+    LOCK();	/* Safe? DllMain description is ambiguous.	*/
     GC_delete_thread(GetCurrentThreadId());
+    UNLOCK();
     break;
 
   case DLL_PROCESS_DETACH:
@@ -785,7 +847,7 @@ BOOL WINAPI DllMain(HINSTANCE inst, ULONG reason, LPVOID reserved)
       LOCK();
       for (i = 0; i <= GC_get_max_thread_index(); ++i)
       {
-          if (thread_table[i].in_use)
+          if (AO_load(&(thread_table[i].in_use)))
 	    GC_delete_gc_thread(thread_table + i);
       }
       UNLOCK();
@@ -803,4 +865,12 @@ BOOL WINAPI DllMain(HINSTANCE inst, ULONG reason, LPVOID reserved)
 
 # endif /* !MSWINCE */
 
+# if defined(THREAD_LOCAL_ALLOC) && !defined(DBG_HDRS_ALL)
+
+/* We don't really support thread-local allocation with DBG_HDRS_ALL */
+
+/* Add thread-local allocation support.  Microsoft uses __declspec(thread) */
+
+#endif /* THREAD_LOCAL_ALLOC ... */
+
 #endif /* GC_WIN32_THREADS */
author	Ivan Maidanski <ivmai@mail.ru>	2011-07-26 20:18:03 +0400
committer	Ivan Maidanski <ivmai@mail.ru>	2011-07-26 20:18:03 +0400
commit	64040040407b11d8740516fad2438109e7f22c02 (patch)
tree	6d18a075ceb6f63855730afe630e50b0afa7ed32
parent	e35a4171fe47dfbf847e08988ea6cec4dfc8d124 (diff)
download	bdwgc-64040040407b11d8740516fad2438109e7f22c02.tar.gz