Commit aa7a966b by Bryce McKinlay

This commit was generated by cvs2svn to compensate for changes in r85899,
which included commits to RCS files with non-trunk default branches.

From-SVN: r85900
parent 5c4dc108
@@ -422,12 +422,7 @@ void GC_thr_init()
    struct sigaction act;
    if (GC_thr_initialized) return;
#if 0
    /* unfortunately, GC_init_inner calls us without the lock, so
     * this assertion is not always true. */
    /* Why doesn't GC_init_inner hold the lock? - HB */
    GC_ASSERT(I_HOLD_LOCK());
#endif
    GC_thr_initialized = TRUE;
#ifndef GC_AIX_THREADS
    (void) sigaction(SIG_SUSPEND, 0, &act);
...
@@ -2,8 +2,6 @@
# if defined(GC_DARWIN_THREADS)

#define DEBUG_THREADS 0

/* From "Inside Mac OS X - Mach-O Runtime Architecture" published by Apple
   Page 49:
   "The space beneath the stack pointer, where a new stack frame would normally
@@ -16,101 +14,258 @@
*/
#define PPC_RED_ZONE_SIZE 224
/* Not 64-bit clean. Wait until Apple defines their 64-bit ABI */
typedef struct StackFrame {
unsigned int savedSP;
unsigned int savedCR;
unsigned int savedLR;
unsigned int reserved[2];
unsigned int savedRTOC;
} StackFrame;
unsigned int FindTopOfStack(unsigned int stack_start) {
StackFrame *frame;
if (stack_start == 0) {
__asm__ volatile("lwz %0,0(r1)" : "=r" (frame));
} else {
frame = (StackFrame *)stack_start;
}
# ifdef DEBUG_THREADS
/* GC_printf1("FindTopOfStack start at sp = %p\n", frame); */
# endif
do {
if (frame->savedSP == NULL) break;
/* if there are no more stack frames, stop */
frame = (StackFrame*)frame->savedSP;
/* we do these next two checks after going to the next frame
because the LR for the first stack frame in the loop
is not set up on purpose, so we shouldn't check it. */
if ((frame->savedLR & ~3) == 0) break; /* if the next LR is bogus, stop */
if ((~(frame->savedLR) & ~3) == 0) break; /* ditto */
} while (1);
# ifdef DEBUG_THREADS
/* GC_printf1("FindTopOfStack finish at sp = %p\n", frame); */
# endif
return (unsigned int)frame;
}
void GC_push_all_stacks() {
  int i;
  kern_return_t r;
  mach_port_t me;
  ptr_t lo, hi;
  thread_act_array_t act_list = 0;
  mach_msg_type_number_t listcount = 0;

  me = mach_thread_self();
  if (!GC_thr_initialized) GC_thr_init();

  r = task_threads(current_task(), &act_list, &listcount);
  if(r != KERN_SUCCESS) ABORT("task_threads failed");
  for(i = 0; i < listcount; i++) {
    thread_act_t thread = act_list[i];
    if (thread == me) {
      lo = GC_approx_sp();
      hi = (ptr_t)FindTopOfStack(0);
    } else {
#     ifdef POWERPC
      ppc_thread_state_t info;
      mach_msg_type_number_t outCount = THREAD_STATE_MAX;
      r = thread_get_state(thread, MACHINE_THREAD_STATE,
                           (natural_t *)&info, &outCount);
      if(r != KERN_SUCCESS) ABORT("task_get_state failed");

      lo = (void*)(info.r1 - PPC_RED_ZONE_SIZE);
      hi = (ptr_t)FindTopOfStack(info.r1);

      GC_push_one(info.r0);
      GC_push_one(info.r2);
      GC_push_one(info.r3);
      GC_push_one(info.r4);
      GC_push_one(info.r5);
      GC_push_one(info.r6);
      GC_push_one(info.r7);
      GC_push_one(info.r8);
      GC_push_one(info.r9);
      GC_push_one(info.r10);
      GC_push_one(info.r11);
      GC_push_one(info.r12);
      GC_push_one(info.r13);
      GC_push_one(info.r14);
      GC_push_one(info.r15);
      GC_push_one(info.r16);
      GC_push_one(info.r17);
      GC_push_one(info.r18);
      GC_push_one(info.r19);
      GC_push_one(info.r20);
      GC_push_one(info.r21);
      GC_push_one(info.r22);
      GC_push_one(info.r23);
      GC_push_one(info.r24);
      GC_push_one(info.r25);
      GC_push_one(info.r26);
      GC_push_one(info.r27);
      GC_push_one(info.r28);
      GC_push_one(info.r29);
      GC_push_one(info.r30);
      GC_push_one(info.r31);
#     else
      /* FIXME: Remove after testing: */
      WARN("This is completely untested and likely will not work\n", 0);
      i386_thread_state_t info;
      mach_msg_type_number_t outCount = THREAD_STATE_MAX;
      r = thread_get_state(thread, MACHINE_THREAD_STATE,
                           (natural_t *)&info, &outCount);
      if(r != KERN_SUCCESS) ABORT("task_get_state failed");

      lo = (void*)info.esp;
      hi = (ptr_t)FindTopOfStack(info.esp);

      GC_push_one(info.eax);
      GC_push_one(info.ebx);
      GC_push_one(info.ecx);
      GC_push_one(info.edx);
      GC_push_one(info.edi);
      GC_push_one(info.esi);
      /* GC_push_one(info.ebp); */
      /* GC_push_one(info.esp); */
      GC_push_one(info.ss);
      GC_push_one(info.eip);
      GC_push_one(info.cs);
      GC_push_one(info.ds);
      GC_push_one(info.es);
      GC_push_one(info.fs);
      GC_push_one(info.gs);
#     endif /* !POWERPC */
    }
#   if DEBUG_THREADS
    GC_printf3("Darwin: Stack for thread 0x%lx = [%lx,%lx)\n",
               (unsigned long) thread,
               (unsigned long) lo,
               (unsigned long) hi
              );
#   endif
    GC_push_all_stack(lo, hi);
  } /* for(p=GC_threads[i]...) */
}
static mach_port_t GC_mach_handler_thread;
static int GC_use_mach_handler_thread = 0;
static struct GC_mach_thread GC_mach_threads[THREAD_TABLE_SZ];
static int GC_mach_threads_count;
void GC_stop_init() {
int i;
for (i = 0; i < THREAD_TABLE_SZ; i++) {
GC_mach_threads[i].thread = 0;
GC_mach_threads[i].already_suspended = 0;
}
GC_mach_threads_count = 0;
}
/* returns true if there's a thread in act_list that wasn't in old_list */
int GC_suspend_thread_list(thread_act_array_t act_list, int count,
thread_act_array_t old_list, int old_count) {
mach_port_t my_thread = mach_thread_self();
int i, j;
int changed = 0;
for(i = 0; i < count; i++) {
thread_act_t thread = act_list[i];
# if DEBUG_THREADS
GC_printf1("Attempting to suspend thread %p\n", thread);
# endif
/* find the current thread in the old list */
int found = 0;
for(j = 0; j < old_count; j++) {
thread_act_t old_thread = old_list[j];
if (old_thread == thread) {
found = 1;
break;
}
}
if (!found) {
/* add it to the GC_mach_threads list */
GC_mach_threads[GC_mach_threads_count].thread = thread;
/* default is not suspended */
GC_mach_threads[GC_mach_threads_count].already_suspended = 0;
changed = 1;
}
if (thread != my_thread &&
(!GC_use_mach_handler_thread
|| (GC_use_mach_handler_thread
&& GC_mach_handler_thread != thread))) {
struct thread_basic_info info;
mach_msg_type_number_t outCount = THREAD_INFO_MAX;
kern_return_t kern_result = thread_info(thread, THREAD_BASIC_INFO,
(thread_info_t)&info, &outCount);
if(kern_result != KERN_SUCCESS) {
/* the thread may have quit since the thread_threads () call
* we mark already_suspended so it's not dealt with anymore later
*/
if (!found) {
GC_mach_threads[GC_mach_threads_count].already_suspended = TRUE;
GC_mach_threads_count++;
}
continue;
}
# if DEBUG_THREADS
GC_printf2("Thread state for 0x%lx = %d\n", thread, info.run_state);
# endif
if (!found) {
GC_mach_threads[GC_mach_threads_count].already_suspended = info.suspend_count;
}
if (info.suspend_count) continue;
# if DEBUG_THREADS
GC_printf1("Suspending 0x%lx\n", thread);
# endif
/* Suspend the thread */
kern_result = thread_suspend(thread);
if(kern_result != KERN_SUCCESS) {
/* the thread may have quit since the thread_threads () call
* we mark already_suspended so it's not dealt with anymore later
*/
if (!found) {
GC_mach_threads[GC_mach_threads_count].already_suspended = TRUE;
GC_mach_threads_count++;
}
continue;
}
}
if (!found) GC_mach_threads_count++;
}
return changed;
}

/* Caller holds allocation lock. */
void GC_stop_world()
{
    int i, changes;
    GC_thread p;
    mach_port_t my_thread = mach_thread_self();
    kern_return_t kern_result;
    thread_act_array_t act_list, prev_list;
    mach_msg_type_number_t listcount, prevcount;

#   if DEBUG_THREADS
      GC_printf1("Stopping the world from 0x%lx\n", mach_thread_self());
#   endif

    /* clear out the mach threads list table */
    GC_stop_init();

    /* Make sure all free list construction has stopped before we start. */
    /* No new construction can start, since free list construction is    */
@@ -122,43 +277,40 @@ void GC_stop_world()
    /* We should have previously waited for it to become zero. */
#   endif /* PARALLEL_MARK */

    /* Loop stopping threads until you have gone over the whole list
       twice without a new one appearing. thread_create() won't
       return (and thus the thread stop) until the new thread
       exists, so there is no window whereby you could stop a
       thread, recognise it is stopped, but then have a new thread
       it created before stopping show up later.
    */

    changes = 1;
    prev_list = NULL;
    prevcount = 0;
    do {
      int result;
      kern_result = task_threads(current_task(), &act_list, &listcount);
      result = GC_suspend_thread_list(act_list, listcount,
                                      prev_list, prevcount);
      changes = result;
      prev_list = act_list;
      prevcount = listcount;
    } while (changes);

#   ifdef MPROTECT_VDB
    if(GC_incremental) {
        extern void GC_mprotect_stop();
        GC_mprotect_stop();
    }
#   endif

#   ifdef PARALLEL_MARK
    GC_release_mark_lock();
#   endif
#if DEBUG_THREADS
    GC_printf1("World stopped from 0x%lx\n", my_thread);
#endif
}
@@ -166,44 +318,63 @@ void GC_stop_world()
/* the world stopped. */
void GC_start_world()
{
    mach_port_t my_thread = mach_thread_self();
    int i, j;
    GC_thread p;
    kern_return_t kern_result;
    thread_act_array_t act_list;
    mach_msg_type_number_t listcount;

#   if DEBUG_THREADS
      GC_printf0("World starting\n");
#   endif

#   ifdef MPROTECT_VDB
    if(GC_incremental) {
        extern void GC_mprotect_resume();
        GC_mprotect_resume();
    }
#   endif

    kern_result = task_threads(current_task(), &act_list, &listcount);
    for(i = 0; i < listcount; i++) {
      thread_act_t thread = act_list[i];
      if (thread != my_thread &&
          (!GC_use_mach_handler_thread ||
           (GC_use_mach_handler_thread && GC_mach_handler_thread != thread))) {
        for(j = 0; j < GC_mach_threads_count; j++) {
          if (thread == GC_mach_threads[j].thread) {
            if (GC_mach_threads[j].already_suspended) {
#             if DEBUG_THREADS
                GC_printf1("Not resuming already suspended thread %p\n", thread);
#             endif
              continue;
            }
            struct thread_basic_info info;
            mach_msg_type_number_t outCount = THREAD_INFO_MAX;
            kern_result = thread_info(thread, THREAD_BASIC_INFO,
                                      (thread_info_t)&info, &outCount);
            if(kern_result != KERN_SUCCESS) ABORT("thread_info failed");
#           if DEBUG_THREADS
              GC_printf2("Thread state for 0x%lx = %d\n", thread,
                         info.run_state);
              GC_printf1("Resuming 0x%lx\n", thread);
#           endif
            /* Resume the thread */
            kern_result = thread_resume(thread);
            if(kern_result != KERN_SUCCESS) ABORT("thread_resume failed");
          }
        }
      }
    }
#   if DEBUG_THREADS
      GC_printf0("World started\n");
#   endif
}

void GC_darwin_register_mach_handler_thread(mach_port_t thread) {
    GC_mach_handler_thread = thread;
    GC_use_mach_handler_thread = 1;
}

#endif
Darwin/MacOSX Support - December 16, 2003
=========================================

Important Usage Notes
=====================
@@ -15,7 +15,7 @@ run and perhaps called GC_malloc(), create an initialization routine
for each library to call GC_init():

#include <gc/gc.h>
extern "C" void my_library_init() { GC_init(); }

Compile this code into a my_library_init.o, and link it into your
dylib. When you link the dylib, pass the -init argument with
@@ -34,6 +34,12 @@ work reliably with workarounds for a few possible bugs in place, however
these workarounds may not work correctly in all cases. There may also
be additional problems that I have not found.
Thread-local GC allocation will not work with threads that are not
created using the GC-provided override of pthread_create(). Threads
created without the GC-provided pthread_create() do not have the
necessary data structures in the GC to store this data.
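
For illustration only (a minimal sketch, not taken from the collector
sources): when the appropriate GC_XXXX_THREADS macro is defined before
gc.h is included, gc.h redirects pthread_create() to the collector's
wrapper, so threads created this way are registered with the GC and can
use thread-local allocation:

  #define GC_THREADS              /* or GC_DARWIN_THREADS on MacOSX    */
  #include <pthread.h>
  #include <gc/gc.h>              /* redirects pthread_create() et al. */

  static void *worker(void *arg)
  {
      return GC_MALLOC(128);      /* this thread is known to the GC    */
  }

  int main(void)
  {
      pthread_t t;
      GC_INIT();
      pthread_create(&t, NULL, worker, NULL);   /* wrapped by the GC   */
      pthread_join(t, NULL);
      return 0;
  }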
Implementation Information
==========================
Darwin/MacOSX support is nearly complete. Thread support is reliable on
@@ -42,11 +48,27 @@ Darwin versions (MacOSX 10.1). Shared library support has also been
added and the GC can be run from a shared library. There is currently only
support for Darwin/PPC, although adding x86 support should be trivial.

Thread support is implemented in terms of mach thread_suspend and
thread_resume calls. These provide a very clean interface to thread
suspension. This implementation doesn't rely on pthread_kill so the
code works on Darwin < 6.0 (MacOSX 10.1). All the code to stop and
start the world is located in darwin_stop_world.c.
Since not all uses of the GC enable clients to override pthread_create()
before threads have been created, the code for stopping the world has
been rewritten to look for threads using Mach kernel calls. Each
thread identified in this way is suspended and resumed as above. In
addition, since Mach kernel threads do not contain pointers to their
stacks, a stack-walking function has been written to find the stack
limits. Given an initial stack pointer (for the current thread, a
pointer to a stack-allocated local variable will do; for a non-active
thread, we grab the value of register 1 (on PowerPC)), it
will walk the PPC Mach-O-ABI compliant stack chain until it reaches the
top of the stack. This appears to work correctly for GCC-compiled C,
C++, Objective-C, and Objective-C++ code, as well as for Java
programs that use JNI. If you run code that does not follow the stack
layout or stack pointer conventions laid out in the PPC Mach-O ABI,
then this will likely crash the garbage collector.
The original incremental collector support unfortunately no longer works
on recent Darwin versions. It also relied on some undocumented kernel
...
@@ -5,31 +5,33 @@
<BODY>
<H1>C Interface</h1>
On many platforms, a single-threaded garbage collector library can be built
to act as a plug-in malloc replacement.
(Build with <TT>-DREDIRECT_MALLOC=GC_malloc -DIGNORE_FREE</tt>.)
This is often the best way to deal with third-party libraries
which leak or prematurely free objects. <TT>-DREDIRECT_MALLOC</tt> is intended
primarily as an easy way to adapt old code, not for new development.
<P>
New code should use the interface discussed below.
<P>
Code must be linked against the GC library. On most UNIX platforms,
depending on how the collector is built, this will be <TT>gc.a</tt>
or <TT>libgc.{a,so}</tt>.
<P>
The following describes the standard C interface to the garbage collector.
It is not a complete definition of the interface. It describes only the
most commonly used functionality, approximately in decreasing order of
frequency of use.
The full interface is described in
<A HREF="http://hpl.hp.com/personal/Hans_Boehm/gc/gc_source/gch.txt">gc.h</a>
or <TT>gc.h</tt> in the distribution.
<P>
Clients should include <TT>gc.h</tt>.
<P>
In the case of multithreaded code,
<TT>gc.h</tt> should be included after the threads header file, and
after defining the appropriate <TT>GC_</tt><I>XXXX</i><TT>_THREADS</tt> macro.
(For 6.2alpha4 and later, simply defining <TT>GC_THREADS</tt> should suffice.)
The header file <TT>gc.h</tt> must be included
in files that use either GC or threads primitives, since threads primitives
will be redefined to cooperate with the GC on many platforms.
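<P>
For example (an illustrative sketch, not a complete program; the exact
macro name for a given platform is listed in <TT>gc.h</tt>):
<PRE>
#define GC_THREADS                /* or a platform-specific GC_XXXX_THREADS macro */
#include &lt;pthread.h&gt;              /* threads header first                          */
#include "gc.h"                   /* gc.h last, so thread primitives are wrapped   */
</pre>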
<DL>
@@ -39,9 +41,10 @@ Allocates and clears <I>nbytes</i> of storage.
Requires (amortized) time proportional to <I>nbytes</i>.
The resulting object will be automatically deallocated when unreferenced.
References from objects allocated with the system malloc are usually not
considered by the collector. (See <TT>GC_MALLOC_UNCOLLECTABLE</tt>, however.)
<TT>GC_MALLOC</tt> is a macro which invokes <TT>GC_malloc</tt> by default or,
if <TT>GC_DEBUG</tt>
is defined before <TT>gc.h</tt> is included, a debugging version that checks
occasionally for overwrite errors, and the like.
<DT> <B>void * GC_MALLOC_ATOMIC(size_t <I>nbytes</i>)</b>
<DD>
@@ -57,60 +60,70 @@ collector using the interface in
<A HREF="http://www.hpl.hp.com/personal/Hans_Boehm/gc/gc_source/gc_typedh.txt">gc_typed.h</a> in the distribution.
<DT> <B>void * GC_MALLOC_UNCOLLECTABLE(size_t <I>nbytes</i>)</b>
<DD>
Identical to <TT>GC_MALLOC</tt>,
except that the resulting object is not automatically
deallocated. Unlike the system-provided malloc, the collector does
scan the object for pointers to garbage-collectable memory, even if the
block itself does not appear to be reachable. (Objects allocated in this way
are effectively treated as roots by the collector.)
<DT> <B> void * GC_REALLOC(void *<I>old</i>, size_t <I>new_size</i>) </b>
<DD>
Allocate a new object of the indicated size and copy (a prefix of) the
old object into the new object. The old object is reused in place if
convenient. If the original object was allocated with
<TT>GC_MALLOC_ATOMIC</tt>,
the new object is subject to the same constraints. If it was allocated
as an uncollectable object, then the new object is uncollectable, and
the old object (if different) is deallocated.
<DT> <B> void GC_FREE(void *<I>dead</i>) </b>
<DD>
Explicitly deallocate an object. Typically not useful for small
collectable objects.
<DT> <B> void * GC_MALLOC_IGNORE_OFF_PAGE(size_t <I>nbytes</i>) </b>
<DD>
<DT> <B> void * GC_MALLOC_ATOMIC_IGNORE_OFF_PAGE(size_t <I>nbytes</i>) </b>
<DD>
Analogous to <TT>GC_MALLOC</tt> and <TT>GC_MALLOC_ATOMIC</tt>,
except that the client
guarantees that as long
as the resulting object is of use, a pointer is maintained to someplace
inside the first 512 bytes of the object. This pointer should be declared
volatile to avoid interference from compiler optimizations.
(Other nonvolatile pointers to the object may exist as well.)
This is the
preferred way to allocate objects that are likely to be &gt; 100KBytes in size.
It greatly reduces the risk that such objects will be accidentally retained
when they are no longer needed. Thus space usage may be significantly reduced.
<DT> <B> void GC_INIT(void) </b>
<DD>
On some platforms, it is necessary to invoke this
<I>from the main executable, not from a dynamic library,</i> before
the initial invocation of a GC routine. It is recommended that this be done
in portable code, though we try to ensure that it expands to a no-op
on as many platforms as possible.
<DT> <B> void GC_gcollect(void) </b>
<DD>
Explicitly force a garbage collection.
<DT> <B> void GC_enable_incremental(void) </b>
<DD>
Cause the garbage collector to perform a small amount of work
every few invocations of <TT>GC_MALLOC</tt> or the like, instead of performing
an entire collection at once. This is likely to increase total
running time. It will improve response on a platform that either has
suitable support in the garbage collector (Linux and most Unix
versions, win32 if the collector was suitably built) or if "stubborn"
allocation is used (see
<A HREF="http://www.hpl.hp.com/personal/Hans_Boehm/gc/gc_source/gch.txt">gc.h</a>).
On many platforms this interacts poorly with system calls
that write to the garbage collected heap.
<DT> <B> GC_warn_proc GC_set_warn_proc(GC_warn_proc <I>p</i>) </b>
<DD>
Replace the default procedure used by the collector to print warnings.
The collector
may otherwise write to stderr, most commonly because GC_malloc was used
in a situation in which GC_malloc_ignore_off_page would have been more
appropriate. See <A HREF="http://www.hpl.hp.com/personal/Hans_Boehm/gc/gc_source/gch.txt">gc.h</a> for details.
<DT> <B> void GC_REGISTER_FINALIZER(...) </b>
<DD>
Register a function to be called when an object becomes inaccessible.
This is often useful as a backup method for releasing system resources
@@ -123,7 +136,8 @@ See <A HREF="http://www.hpl.hp.com/personal/Hans_Boehm/gc/finalization.html">her
of the design.
<P>
Note that an object may become inaccessible before client code is done
operating on objects referenced by its fields.
Suitable synchronization is usually required.
See <A HREF="http://portal.acm.org/citation.cfm?doid=604131.604153">here</a>
or <A HREF="http://www.hpl.hp.com/techreports/2002/HPL-2002-335.html">here</a>
for details.
@@ -131,13 +145,13 @@ for details.
<P>
If you are concerned with multiprocessor performance and scalability,
you should consider enabling and using thread local allocation (<I>e.g.</i>
<TT>GC_LOCAL_MALLOC</tt>, see <TT>gc_local_alloc.h</tt>). If your platform
supports it, you should build the collector with parallel marking support
(<TT>-DPARALLEL_MARK</tt>, or <TT>--enable-parallel-mark</tt>).
<P>
If the collector is used in an environment in which pointer location
information for heap objects is easily available, this can be passed on
to the collector using the interfaces in either <TT>gc_typed.h</tt>
or <TT>gc_gcj.h</tt>.
<P>
The collector distribution also includes a <B>string package</b> that takes
@@ -145,7 +159,31 @@ advantage of the collector. For details see
<A HREF="http://www.hpl.hp.com/personal/Hans_Boehm/gc/gc_source/cordh.txt">cord.h</a>
<H1>C++ Interface</h1>
Usage of the collector from C++ is complicated by the fact that there
are many "standard" ways to allocate memory in C++. The default ::new
operator, default malloc, and default STL allocators allocate memory
that is not garbage collected, and is not normally "traced" by the
collector. This means that any pointers in memory allocated by these
default allocators will not be seen by the collector. Garbage-collectable
memory referenced only by pointers stored in such default-allocated
objects is likely to be reclaimed prematurely by the collector.
<P>
It is the programmer's responsibility to ensure that garbage-collectable
memory is referenced by pointers stored in one of
<UL>
<LI> Program variables
<LI> Garbage-collected objects
<LI> Uncollected but "traceable" objects
</ul>
"Traceable" objects are not necessarily reclaimed by the collector,
but are scanned for pointers to collectable objects.
They are allocated by <TT>GC_MALLOC_UNCOLLECTABLE</tt>, as described
above, and through some interfaces described below.
<P>
The easiest way to ensure that collectable objects are properly referenced
is to allocate only collectable objects. This requires that every
allocation go through one of the following interfaces, each one of
which replaces a standard C++ allocation mechanism:
<DL>
<DT> <B> STL allocators </b>
<DD>
@@ -170,7 +208,7 @@ multiple threads, but are faster.
For an example, click <A HREF="http://hpl.hp.com/personal/Hans_Boehm/gc/gc_alloc_exC.txt">here</a>.
<P>
Recent versions of the collector also include a more standard-conforming
allocator implementation in <TT>gc_allocator.h</tt>. It defines
<UL>
<LI> traceable_allocator
<LI> gc_allocator
@@ -179,9 +217,15 @@ Again the former allocates uncollectable but traced memory.
This should work with any fully standard-conforming C++ compiler.
<DT> <B> Class inheritance based interface </b>
<DD>
Users may include gc_cpp.h and then cause members of classes to
be allocated in garbage collectable memory by having those classes
inherit from class gc.
For details see <A HREF="http://hpl.hp.com/personal/Hans_Boehm/gc/gc_source/gc_cpph.txt">gc_cpp.h</a>.
<P>
Linking against libgccpp in addition to the gc library overrides
::new (and friends) to allocate traceable but uncollectable
memory, making it safe to refer to collectable objects from the resulting
memory.
<DT> <B> C interface </b>
<DD>
It is also possible to use the C interface from
@@ -189,14 +233,15 @@ It is also possible to use the C interface from
On platforms which use malloc to implement ::new, it should usually be possible
to use a version of the collector that has been compiled as a malloc
replacement. It is also possible to replace ::new and other allocation
functions suitably, as is done by libgccpp.
<P>
Note that user-implemented small-block allocation often works poorly with
an underlying garbage-collected large block allocator, since the collector
has to view all objects accessible from the user's free list as reachable.
This is likely to cause problems if <TT>GC_MALLOC</tt>
is used with something like
the original HP version of STL.
This approach works well with the SGI versions of the STL only if the
<TT>malloc_alloc</tt> allocator is used.
</dl>
</body>
...
@@ -35,7 +35,17 @@
 * library, which itself was derived from the SGI STL implementation.
 */

#ifndef GC_ALLOCATOR_H
#define GC_ALLOCATOR_H
#include "gc.h"
#if defined(__GNUC__)
# define GC_ATTR_UNUSED __attribute__((unused))
#else
# define GC_ATTR_UNUSED
#endif
/* First some helpers to allow us to dispatch on whether or not a type
 * is known to be pointerfree.
@@ -118,7 +128,7 @@ public:
  }

  // __p is not permitted to be a null pointer.
  void deallocate(pointer __p, size_type GC_ATTR_UNUSED GC_n)
    { GC_FREE(__p); }

  size_type max_size() const throw()
@@ -194,7 +204,7 @@ public:
  }

  // __p is not permitted to be a null pointer.
  void deallocate(pointer __p, size_type GC_ATTR_UNUSED GC_n)
    { GC_FREE(__p); }

  size_type max_size() const throw()
@@ -230,3 +240,4 @@ inline bool operator!=(const traceable_allocator<GC_T1>&, const traceable_alloca
  return false;
}
#endif /* GC_ALLOCATOR_H */
@@ -97,7 +97,10 @@
# endif
#endif /* GC_THREADS */

#if defined(GC_THREADS) && !defined(GC_PTHREADS) && \
(defined(_WIN32) || defined(_MSC_VER) || defined(__CYGWIN__) \
|| defined(__MINGW32__) || defined(__BORLANDC__) \
|| defined(_WIN32_WCE))
# define GC_WIN32_THREADS
#endif
@@ -106,8 +109,9 @@
#endif

# define __GC
# ifndef _WIN32_WCE
#   include <stddef.h>
# else /* ! _WIN32_WCE */
    /* Yet more kluges for WinCE */
#   include <stdlib.h> /* size_t is defined here */
    typedef long ptrdiff_t; /* ptrdiff_t is not defined */
...
@@ -12,4 +12,11 @@ struct thread_stop_info {
  mach_port_t mach_thread;
};
struct GC_mach_thread {
thread_act_t thread;
int already_suspended;
};
void GC_darwin_register_mach_handler_thread(mach_port_t thread);
#endif
@@ -93,5 +93,10 @@ GC_thread GC_lookup_thread(pthread_t id);
void GC_stop_init();
extern GC_bool GC_in_thread_creation;
/* We may currently be in thread creation or destruction. */
/* Only set to TRUE while allocation lock is held. */
/* When set, it is OK to run GC from unknown thread. */
#endif /* GC_PTHREADS && !GC_SOLARIS_THREADS.... etc */
#endif /* GC_PTHREAD_SUPPORT_H */
@@ -39,6 +39,34 @@ void GC_print_sig_mask()
#endif
/* Remove the signals that we want to allow in thread stopping */
/* handler from a set. */
void GC_remove_allowed_signals(sigset_t *set)
{
# ifdef NO_SIGNALS
if (sigdelset(set, SIGINT) != 0
|| sigdelset(set, SIGQUIT) != 0
|| sigdelset(set, SIGABRT) != 0
|| sigdelset(set, SIGTERM) != 0) {
ABORT("sigdelset() failed");
}
# endif
# ifdef MPROTECT_VDB
/* Handlers write to the thread structure, which is in the heap, */
/* and hence can trigger a protection fault. */
if (sigdelset(set, SIGSEGV) != 0
# ifdef SIGBUS
|| sigdelset(set, SIGBUS) != 0
# endif
) {
ABORT("sigdelset() failed");
}
# endif
}
static sigset_t suspend_handler_mask;
word GC_stop_count;     /* Incremented at the beginning of GC_stop_world. */

#ifdef GC_OSF1_THREADS
@@ -78,7 +106,6 @@ void GC_suspend_handler(int sig)
    int dummy;
    pthread_t my_thread = pthread_self();
    GC_thread me;
#   ifdef PARALLEL_MARK
        word my_mark_no = GC_mark_no;
        /* Marker can't proceed until we acknowledge.  Thus this is */
@@ -125,17 +152,9 @@ void GC_suspend_handler(int sig)
    /* this thread a SIG_THR_RESTART signal.                       */
    /* SIG_THR_RESTART should be masked at this point.  Thus there */
    /* is no race.                                                 */
    do {
        me->stop_info.signal = 0;
        sigsuspend(&suspend_handler_mask);        /* Wait for signal */
    } while (me->stop_info.signal != SIG_THR_RESTART);

    /* If the RESTART signal gets lost, we can still lose.  That should be */
    /* less likely than losing the SUSPEND signal, since we don't do much  */
@@ -186,6 +205,7 @@ void GC_restart_handler(int sig)
/* world is stopped.  Should not fail if it isn't. */
void GC_push_all_stacks()
{
    GC_bool found_me = FALSE;
    int i;
    GC_thread p;
    ptr_t lo, hi;
@@ -206,6 +226,7 @@ void GC_push_all_stacks()
#       else
            lo = GC_approx_sp();
#       endif
        found_me = TRUE;
        IF_IA64(bs_hi = (ptr_t)GC_save_regs_in_stack();)
      } else {
        lo = p -> stop_info.stack_ptr;
@@ -232,6 +253,11 @@ void GC_push_all_stacks()
        GC_push_all_stack(lo, hi);
#       endif
#       ifdef IA64
# if DEBUG_THREADS
GC_printf3("Reg stack for thread 0x%lx = [%lx,%lx)\n",
(unsigned long) p -> id,
(unsigned long) bs_lo, (unsigned long) bs_hi);
# endif
          if (pthread_equal(p -> id, me)) {
            GC_push_all_eager(bs_lo, bs_hi);
          } else {
@@ -240,6 +266,8 @@
#       endif
      }
    }
if (!found_me && !GC_in_thread_creation)
ABORT("Collecting from unknown thread.");
}

/* There seems to be a very rare thread stopping problem.  To help us */
@@ -408,16 +436,9 @@ void GC_stop_init() {
    if (sigfillset(&act.sa_mask) != 0) {
        ABORT("sigfillset() failed");
    }
    GC_remove_allowed_signals(&act.sa_mask);
    /* SIG_THR_RESTART is set in the resulting mask. */
    /* It is unmasked by the handler when necessary. */
    act.sa_handler = GC_suspend_handler;
    if (sigaction(SIG_SUSPEND, &act, NULL) != 0) {
        ABORT("Cannot set SIG_SUSPEND handler");
@@ -428,6 +449,12 @@ void GC_stop_init() {
        ABORT("Cannot set SIG_THR_RESTART handler");
    }

    /* Initialize suspend_handler_mask.  It excludes SIG_THR_RESTART. */
if (sigfillset(&suspend_handler_mask) != 0) ABORT("sigfillset() failed");
GC_remove_allowed_signals(&suspend_handler_mask);
if (sigdelset(&suspend_handler_mask, SIG_THR_RESTART) != 0)
ABORT("sigdelset() failed");
    /* Check for GC_RETRY_SIGNALS. */
    if (0 != GETENV("GC_RETRY_SIGNALS")) {
        GC_retry_signals = TRUE;
...
@@ -54,8 +54,17 @@
     && !defined(GC_AIX_THREADS)

# if defined(GC_HPUX_THREADS) && !defined(USE_PTHREAD_SPECIFIC) \
     && !defined(USE_COMPILER_TLS)
#   ifdef __GNUC__
# define USE_PTHREAD_SPECIFIC
/* Empirically, as of gcc 3.3, USE_COMPILER_TLS doesn't work. */
# else
# define USE_COMPILER_TLS
# endif
# endif
# if defined USE_HPUX_TLS
--> Macro replaced by USE_COMPILER_TLS
# endif

# if (defined(GC_DGUX386_THREADS) || defined(GC_OSF1_THREADS) || \
@@ -72,7 +81,7 @@
# endif

# ifdef THREAD_LOCAL_ALLOC
#   if !defined(USE_PTHREAD_SPECIFIC) && !defined(USE_COMPILER_TLS)
#     include "private/specific.h"
#   endif
#   if defined(USE_PTHREAD_SPECIFIC)
@@ -81,7 +90,7 @@
#     define GC_key_create pthread_key_create
      typedef pthread_key_t GC_key_t;
#   endif
#   if defined(USE_COMPILER_TLS)
#     define GC_getspecific(x) (x)
#     define GC_setspecific(key, v) ((key) = (v), 0)
#     define GC_key_create(key, d) 0
@@ -99,6 +108,7 @@
# include <sys/types.h>
# include <sys/stat.h>
# include <fcntl.h>
# include <signal.h>

#if defined(GC_DARWIN_THREADS)
# include "private/darwin_semaphore.h"
@@ -158,7 +168,7 @@ void GC_init_parallel();
/* We don't really support thread-local allocation with DBG_HDRS_ALL */

#ifdef USE_COMPILER_TLS
  __thread
#endif
GC_key_t GC_thread_key;
@@ -500,19 +510,6 @@ static __inline__ void start_mark_threads()
#endif /* !PARALLEL_MARK */
/* Defining INSTALL_LOOPING_SEGV_HANDLER causes SIGSEGV and SIGBUS to */
/* result in an infinite loop in a signal handler. This can be very */
/* useful for debugging, since (as of RH7) gdb still seems to have */
/* serious problems with threads. */
#ifdef INSTALL_LOOPING_SEGV_HANDLER
void GC_looping_handler(int sig)
{
GC_printf3("Signal %ld in thread %lx, pid %ld\n",
sig, pthread_self(), getpid());
for (;;);
}
#endif
GC_bool GC_thr_initialized = FALSE;

volatile GC_thread GC_threads[THREAD_TABLE_SZ];
@@ -622,7 +619,7 @@ void GC_delete_gc_thread(pthread_t id, GC_thread gc_id)
    GC_INTERNAL_FREE(p);
}

/* Return a GC_thread corresponding to a given pthread_t. */
/* Returns 0 if it's not there.                           */
/* Caller holds allocation lock or otherwise inhibits     */
/* updates.                                               */
@@ -747,7 +744,9 @@ void GC_wait_for_gc_completion(GC_bool wait_for_all)
        while (GC_incremental && GC_collection_in_progress()
               && (wait_for_all || old_gc_no == GC_gc_no)) {
            ENTER_GC();
            GC_in_thread_creation = TRUE;
            GC_collect_a_little_inner(1);
            GC_in_thread_creation = FALSE;
            EXIT_GC();
            UNLOCK();
            sched_yield();
@@ -1055,9 +1054,10 @@ void GC_thread_exit_proc(void *arg)
        me -> flags |= FINISHED;
    }
#   if defined(THREAD_LOCAL_ALLOC) && !defined(USE_PTHREAD_SPECIFIC) \
       && !defined(USE_COMPILER_TLS) && !defined(DBG_HDRS_ALL)
      GC_remove_specific(GC_thread_key);
#   endif
    /* The following may run the GC from "nonexistent" thread. */
    GC_wait_for_gc_completion(FALSE);
    UNLOCK();
}
@@ -1115,6 +1115,8 @@ WRAP_FUNC(pthread_detach)(pthread_t thread)
    return result;
}
GC_bool GC_in_thread_creation = FALSE;
void * GC_start_routine(void * arg)
{
    int dummy;
@@ -1132,7 +1134,9 @@ void * GC_start_routine(void * arg)
        GC_printf1("sp = 0x%lx\n", (long) &arg);
#   endif
    LOCK();
    GC_in_thread_creation = TRUE;
    me = GC_new_thread(my_pthread);
    GC_in_thread_creation = FALSE;
#ifdef GC_DARWIN_THREADS
    me -> stop_info.mach_thread = mach_thread_self();
#else
@@ -1301,12 +1305,12 @@ WRAP_FUNC(pthread_create)(pthread_t *new_thread,
void GC_pause()
{
    int i;
#   if !defined(__GNUC__) || defined(__INTEL_COMPILER)
      volatile word dummy = 0;
#   endif

    for (i = 0; i < 10; ++i) {
#     if defined(__GNUC__) && !defined(__INTEL_COMPILER)
        __asm__ __volatile__ (" " : : : "memory");
#     else
        /* Something that's unlikely to be optimized away. */
@@ -1315,7 +1319,7 @@ void GC_pause()
    }
}

#define SPIN_MAX 128    /* Maximum number of calls to GC_pause before */
                        /* give up.                                   */

VOLATILE GC_bool GC_collecting = 0;
@@ -1340,19 +1344,34 @@ VOLATILE GC_bool GC_collecting = 0;
/* yield by calling pthread_mutex_lock(); it never makes sense to */
/* explicitly sleep.                                               */
#define LOCK_STATS
#ifdef LOCK_STATS
unsigned long GC_spin_count = 0;
unsigned long GC_block_count = 0;
unsigned long GC_unlocked_count = 0;
#endif
void GC_generic_lock(pthread_mutex_t * lock)
{
#ifndef NO_PTHREAD_TRYLOCK
    unsigned pause_length = 1;
    unsigned i;

    if (0 == pthread_mutex_trylock(lock)) {
# ifdef LOCK_STATS
++GC_unlocked_count;
# endif
return;
}
    for (; pause_length <= SPIN_MAX; pause_length <<= 1) {
        for (i = 0; i < pause_length; ++i) {
            GC_pause();
        }
        switch(pthread_mutex_trylock(lock)) {
            case 0:
# ifdef LOCK_STATS
++GC_spin_count;
# endif
                return;
            case EBUSY:
                break;
@@ -1361,6 +1380,9 @@ void GC_generic_lock(pthread_mutex_t * lock)
        }
    }
#endif /* !NO_PTHREAD_TRYLOCK */
# ifdef LOCK_STATS
++GC_block_count;
# endif
    pthread_mutex_lock(lock);
}
...