6 files changed, 186 insertions, 33 deletions
diff --git a/gdb/ChangeLog b/gdb/ChangeLog
index 7ae3c583477..a8b8850f7f6 100644
--- a/gdb/ChangeLog
+++ b/gdb/ChangeLog
@@ -1,5 +1,18 @@
 2015-03-19  Pedro Alves  <palves@redhat.com>
 
+	* linux-nat.c (linux_resume_one_lwp): Rename to ...
+	(linux_resume_one_lwp_throw): ... this.  Don't handle ESRCH here,
+	instead call perror_with_name.
+	(check_ptrace_stopped_lwp_gone): New function.
+	(linux_resume_one_lwp): Reimplement as wrapper around
+	linux_resume_one_lwp_throw that swallows errors if the LWP is
+	gone.
+	(resume_stopped_resumed_lwps): Try register reads in TRY/CATCH and
+	swallows errors if the LWP is gone.  Use
+	linux_resume_one_lwp_throw instead of linux_resume_one_lwp.
+
+2015-03-19  Pedro Alves  <palves@redhat.com>
+
 	* linux-nat.c (status_callback): Return early if the LWP has no
 	status pending.
 
diff --git a/gdb/gdbserver/ChangeLog b/gdb/gdbserver/ChangeLog
index 0383e67c59a..cbd199b6c9f 100644
--- a/gdb/gdbserver/ChangeLog
+++ b/gdb/gdbserver/ChangeLog
@@ -1,5 +1,15 @@
 2015-03-19  Pedro Alves  <palves@redhat.com>
 
+	* linux-low.c (linux_resume_one_lwp): Rename to ...
+	(linux_resume_one_lwp_throw): ... this.  Don't handle ESRCH here,
+	instead call perror_with_name.
+	(check_ptrace_stopped_lwp_gone): New function.
+	(linux_resume_one_lwp): Reimplement as wrapper around
+	linux_resume_one_lwp_throw that swallows errors if the LWP is
+	gone.
+
+2015-03-19  Pedro Alves  <palves@redhat.com>
+
 	* linux-low.c (count_events_callback, select_event_lwp_callback):
 	No longer check whether the thread has resume_stop as last resume
 	kind.
diff --git a/gdb/gdbserver/linux-low.c b/gdb/gdbserver/linux-low.c
index 2b988ec1361..0c5411510f5 100644
--- a/gdb/gdbserver/linux-low.c
+++ b/gdb/gdbserver/linux-low.c
@@ -3379,13 +3379,12 @@ stop_all_lwps (int suspend, struct lwp_info *except)
     }
 }
 
-/* Resume execution of the inferior process.
-   If STEP is nonzero, single-step it.
-   If SIGNAL is nonzero, give it that signal.  */
+/* Resume execution of LWP.  If STEP is nonzero, single-step it.  If
+   SIGNAL is nonzero, give it that signal.  */
 
 static void
-linux_resume_one_lwp (struct lwp_info *lwp,
-		      int step, int signal, siginfo_t *info)
+linux_resume_one_lwp_throw (struct lwp_info *lwp,
+			    int step, int signal, siginfo_t *info)
 {
   struct thread_info *thread = get_lwp_thread (lwp);
   struct thread_info *saved_thread;
@@ -3566,8 +3565,6 @@ linux_resume_one_lwp (struct lwp_info *lwp,
 
   regcache_invalidate_thread (thread);
   errno = 0;
-  lwp->stopped = 0;
-  lwp->stop_reason = TARGET_STOPPED_BY_NO_REASON;
   lwp->stepping = step;
   ptrace (step ? PTRACE_SINGLESTEP : PTRACE_CONT, lwpid_of (thread),
 	  (PTRACE_TYPE_ARG3) 0,
@@ -3577,19 +3574,68 @@ linux_resume_one_lwp (struct lwp_info *lwp,
 
   current_thread = saved_thread;
   if (errno)
+    perror_with_name ("resuming thread");
+
+  /* Successfully resumed.  Clear state that no longer makes sense,
+     and mark the LWP as running.  Must not do this before resuming
+     otherwise if that fails other code will be confused.  E.g., we'd
+     later try to stop the LWP and hang forever waiting for a stop
+     status.  Note that we must not throw after this is cleared,
+     otherwise handle_zombie_lwp_error would get confused.  */
+  lwp->stopped = 0;
+  lwp->stop_reason = TARGET_STOPPED_BY_NO_REASON;
+}
+
+/* Called when we try to resume a stopped LWP and that errors out.  If
+   the LWP is no longer in ptrace-stopped state (meaning it's zombie,
+   or about to become), discard the error, clear any pending status
+   the LWP may have, and return true (we'll collect the exit status
+   soon enough).  Otherwise, return false.  */
+
+static int
+check_ptrace_stopped_lwp_gone (struct lwp_info *lp)
+{
+  struct thread_info *thread = get_lwp_thread (lp);
+
+  /* If we get an error after resuming the LWP successfully, we'd
+     confuse !T state for the LWP being gone.  */
+  gdb_assert (lp->stopped);
+
+  /* We can't just check whether the LWP is in 'Z (Zombie)' state,
+     because even if ptrace failed with ESRCH, the tracee may be "not
+     yet fully dead", but already refusing ptrace requests.  In that
+     case the tracee has 'R (Running)' state for a little bit
+     (observed in Linux 3.18).  See also the note on ESRCH in the
+     ptrace(2) man page.  Instead, check whether the LWP has any state
+     other than ptrace-stopped.  */
+
+  /* Don't assume anything if /proc/PID/status can't be read.  */
+  if (linux_proc_pid_is_trace_stopped_nowarn (lwpid_of (thread)) == 0)
     {
-      /* ESRCH from ptrace either means that the thread was already
-	 running (an error) or that it is gone (a race condition).  If
-	 it's gone, we will get a notification the next time we wait,
-	 so we can ignore the error.  We could differentiate these
-	 two, but it's tricky without waiting; the thread still exists
-	 as a zombie, so sending it signal 0 would succeed.  So just
-	 ignore ESRCH.  */
-      if (errno == ESRCH)
-	return;
+      lp->stop_reason = TARGET_STOPPED_BY_NO_REASON;
+      lp->status_pending_p = 0;
+      return 1;
+    }
+  return 0;
+}
+
+/* Like linux_resume_one_lwp_throw, but no error is thrown if the LWP
+   disappears while we try to resume it.  */
 
-      perror_with_name ("ptrace");
+static void
+linux_resume_one_lwp (struct lwp_info *lwp,
+		      int step, int signal, siginfo_t *info)
+{
+  TRY
+    {
+      linux_resume_one_lwp_throw (lwp, step, signal, info);
+    }
+  CATCH (ex, RETURN_MASK_ERROR)
+    {
+      if (!check_ptrace_stopped_lwp_gone (lwp))
+	throw_exception (ex);
     }
+  END_CATCH
 }
 
 struct thread_resume_array
diff --git a/gdb/linux-nat.c b/gdb/linux-nat.c
index 40f1e1fce40..8b620417a40 100644
--- a/gdb/linux-nat.c
+++ b/gdb/linux-nat.c
@@ -1503,7 +1503,8 @@ linux_nat_detach (struct target_ops *ops, const char *args, int from_tty)
    single-step it.  If SIGNAL is nonzero, give it that signal.  */
 
 static void
-linux_resume_one_lwp (struct lwp_info *lp, int step, enum gdb_signal signo)
+linux_resume_one_lwp_throw (struct lwp_info *lp, int step,
+			    enum gdb_signal signo)
 {
   lp->step = step;
 
@@ -1522,11 +1523,68 @@ linux_resume_one_lwp (struct lwp_info *lp, int step, enum gdb_signal signo)
   if (linux_nat_prepare_to_resume != NULL)
     linux_nat_prepare_to_resume (lp);
   linux_ops->to_resume (linux_ops, lp->ptid, step, signo);
-  lp->stop_reason = TARGET_STOPPED_BY_NO_REASON;
+
+  /* Successfully resumed.  Clear state that no longer makes sense,
+     and mark the LWP as running.  Must not do this before resuming
+     otherwise if that fails other code will be confused.  E.g., we'd
+     later try to stop the LWP and hang forever waiting for a stop
+     status.  Note that we must not throw after this is cleared,
+     otherwise handle_zombie_lwp_error would get confused.  */
   lp->stopped = 0;
+  lp->stop_reason = TARGET_STOPPED_BY_NO_REASON;
   registers_changed_ptid (lp->ptid);
 }
 
+/* Called when we try to resume a stopped LWP and that errors out.  If
+   the LWP is no longer in ptrace-stopped state (meaning it's zombie,
+   or about to become), discard the error, clear any pending status
+   the LWP may have, and return true (we'll collect the exit status
+   soon enough).  Otherwise, return false.  */
+
+static int
+check_ptrace_stopped_lwp_gone (struct lwp_info *lp)
+{
+  /* If we get an error after resuming the LWP successfully, we'd
+     confuse !T state for the LWP being gone.  */
+  gdb_assert (lp->stopped);
+
+  /* We can't just check whether the LWP is in 'Z (Zombie)' state,
+     because even if ptrace failed with ESRCH, the tracee may be "not
+     yet fully dead", but already refusing ptrace requests.  In that
+     case the tracee has 'R (Running)' state for a little bit
+     (observed in Linux 3.18).  See also the note on ESRCH in the
+     ptrace(2) man page.  Instead, check whether the LWP has any state
+     other than ptrace-stopped.  */
+
+  /* Don't assume anything if /proc/PID/status can't be read.  */
+  if (linux_proc_pid_is_trace_stopped_nowarn (ptid_get_lwp (lp->ptid)) == 0)
+    {
+      lp->stop_reason = TARGET_STOPPED_BY_NO_REASON;
+      lp->status = 0;
+      lp->waitstatus.kind = TARGET_WAITKIND_IGNORE;
+      return 1;
+    }
+  return 0;
+}
+
+/* Like linux_resume_one_lwp_throw, but no error is thrown if the LWP
+   disappears while we try to resume it.  */
+
+static void
+linux_resume_one_lwp (struct lwp_info *lp, int step, enum gdb_signal signo)
+{
+  TRY
+    {
+      linux_resume_one_lwp_throw (lp, step, signo);
+    }
+  CATCH (ex, RETURN_MASK_ERROR)
+    {
+      if (!check_ptrace_stopped_lwp_gone (lp))
+	throw_exception (ex);
+    }
+  END_CATCH
+}
+
 /* Resume LP.  */
 
 static void
@@ -3542,24 +3600,39 @@ resume_stopped_resumed_lwps (struct lwp_info *lp, void *data)
     {
       struct regcache *regcache = get_thread_regcache (lp->ptid);
       struct gdbarch *gdbarch = get_regcache_arch (regcache);
-      CORE_ADDR pc = regcache_read_pc (regcache);
 
-      /* Don't bother if there's a breakpoint at PC that we'd hit
-	 immediately, and we're not waiting for this LWP.  */
-      if (!ptid_match (lp->ptid, *wait_ptid_p))
+      TRY
 	{
-	  if (breakpoint_inserted_here_p (get_regcache_aspace (regcache), pc))
-	    return 0;
-	}
+	  CORE_ADDR pc = regcache_read_pc (regcache);
+	  int leave_stopped = 0;
 
-      if (debug_linux_nat)
-	fprintf_unfiltered (gdb_stdlog,
-			    "RSRL: resuming stopped-resumed LWP %s at %s: step=%d\n",
-			    target_pid_to_str (lp->ptid),
-			    paddress (gdbarch, pc),
-			    lp->step);
+	  /* Don't bother if there's a breakpoint at PC that we'd hit
+	     immediately, and we're not waiting for this LWP.  */
+	  if (!ptid_match (lp->ptid, *wait_ptid_p))
+	    {
+	      if (breakpoint_inserted_here_p (get_regcache_aspace (regcache), pc))
+		leave_stopped = 1;
+	    }
 
-      linux_resume_one_lwp (lp, lp->step, GDB_SIGNAL_0);
+	  if (!leave_stopped)
+	    {
+	      if (debug_linux_nat)
+		fprintf_unfiltered (gdb_stdlog,
+				    "RSRL: resuming stopped-resumed LWP %s at "
+				    "%s: step=%d\n",
+				    target_pid_to_str (lp->ptid),
+				    paddress (gdbarch, pc),
+				    lp->step);
+
+	      linux_resume_one_lwp_throw (lp, lp->step, GDB_SIGNAL_0);
+	    }
+	}
+      CATCH (ex, RETURN_MASK_ERROR)
+	{
+	  if (!check_ptrace_stopped_lwp_gone (lp))
+	    throw_exception (ex);
+	}
+      END_CATCH
     }
 
   return 0;
diff --git a/gdb/nat/linux-procfs.c b/gdb/nat/linux-procfs.c
index f1493839f60..7599b32ed3e 100644
--- a/gdb/nat/linux-procfs.c
+++ b/gdb/nat/linux-procfs.c
@@ -151,6 +151,15 @@ linux_proc_pid_is_stopped (pid_t pid)
   return linux_proc_pid_has_state (pid, "T (stopped)", 1);
 }
 
+/* Detect `T (tracing stop)' in `/proc/PID/status'.
+   Other states including `T (stopped)' are reported as false.  */
+
+int
+linux_proc_pid_is_trace_stopped_nowarn (pid_t pid)
+{
+  return linux_proc_pid_has_state (pid, "T (tracing stop)", 1);
+}
+
 /* Return non-zero if PID is a zombie.  If WARN, warn on failure to
    open the /proc file.  */
 
diff --git a/gdb/nat/linux-procfs.h b/gdb/nat/linux-procfs.h
index 979ae0dbd82..c4f57885b1d 100644
--- a/gdb/nat/linux-procfs.h
+++ b/gdb/nat/linux-procfs.h
@@ -36,6 +36,8 @@ extern pid_t linux_proc_get_tracerpid_nowarn (pid_t lwpid);
 
 extern int linux_proc_pid_is_stopped (pid_t pid);
 
+extern int linux_proc_pid_is_trace_stopped_nowarn (pid_t pid);
+
 /* Return non-zero if PID is a zombie.  Failure to open the
    /proc/pid/status file results in a warning.  */