merge these fixes from 2.1-dev:

*) worker MPM: Fix a problem which could cause httpd processes to
   remain active after shutdown.  [Jeff Trawick]

*) Unix MPMs: Shut down the server more quickly when child processes are
   slow to exit.  [Joe Orton, Jeff Trawick]

Reviewed by: stoddard, striker

git-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/branches/2.0.x@159470 13f79535-47bb-0310-9956-ffa450edef68
author: Jeff Trawick <trawick@apache.org> 2005-03-30 09:42:15 +0000
committer: Jeff Trawick <trawick@apache.org> 2005-03-30 09:42:15 +0000
commit: 17ac47209072655a1b7a1b40210437d7bfa668c8 (patch)
tree: 78f4d05ec9d9b581007dd0b440afa67d8aecee30
parent: 1d3dc2840fe8452b0f1f56fa94202cc93ee91ecb (diff)
download: httpd-17ac47209072655a1b7a1b40210437d7bfa668c8.tar.gz
5 files changed, 226 insertions, 82 deletions
diff --git a/CHANGES b/CHANGES
index cb4cb854ab..480ee17b55 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,5 +1,11 @@
 Changes with Apache 2.0.54
 
+  *) worker MPM: Fix a problem which could cause httpd processes to
+     remain active after shutdown.  [Jeff Trawick]
+
+  *) Unix MPMs: Shut down the server more quickly when child processes are
+     slow to exit.  [Joe Orton, Jeff Trawick]
+
   *) Remove formatting characters from ap_log_error() calls.  These
      were escaped as fallout from CAN-2003-0020.
      [Eric Covener <ecovener gmail.com>]
diff --git a/STATUS b/STATUS
index fab9807f18..e63bdabd2c 100644
--- a/STATUS
+++ b/STATUS
@@ -209,15 +209,6 @@ PATCHES TO BACKPORT FROM TRUNK:
                    it as-is.  For the one or two platforms that don't like 
                    which, they can write their own version of the script.
 
-    * worker MPM: Fix a problem which could cause httpd processes to
-      remain active after shutdown.  (Reliability issue.)
-      Unix MPMs: Shut down the server more quickly when child processes are
-      slow to exit.  (Nice-to-have, but code intersects with the
-      reliability issue)
-      http://svn.apache.org/viewcvs.cgi?rev=109510&view=rev
-      http://svn.apache.org/viewcvs.cgi?rev=105195&view=rev
-      +1: trawick, stoddard, striker
-
     * modules/http/http_request.c (ap_internal_fast_redirect): Take over
       important members of the subrequest. Especially the proxyreq copying
       is interesting for proxying DirectoryIndex'd resources:
diff --git a/include/mpm_common.h b/include/mpm_common.h
index 2cddca899f..3fa0a28ca3 100644
--- a/include/mpm_common.h
+++ b/include/mpm_common.h
@@ -60,7 +60,7 @@ extern "C" {
  * Make sure all child processes that have been spawned by the parent process
  * have died.  This includes process registered as "other_children".
  * @warning This is only defined if the MPM defines 
- *          MPM_NEEDS_RECLAIM_CHILD_PROCESS
+ *          AP_MPM_WANT_RECLAIM_CHILD_PROCESSES
  * @param terminate Either 1 or 0.  If 1, send the child processes SIGTERM
  *        each time through the loop.  If 0, give the process time to die
  *        on its own before signalling it.
@@ -68,12 +68,43 @@ extern "C" {
  *  MPM_CHILD_PID -- Get the pid from the specified spot in the scoreboard
  *  MPM_NOTE_CHILD_KILLED -- Note the child died in the scoreboard
  * </pre>
+ * @tip The MPM child processes which are reclaimed are those listed
+ * in the scoreboard as well as those currently registered via
+ * ap_register_extra_mpm_process().
  */
 #ifdef AP_MPM_WANT_RECLAIM_CHILD_PROCESSES
 void ap_reclaim_child_processes(int terminate);
 #endif
 
 /**
+ * Tell ap_reclaim_child_processes() about an MPM child process which has no
+ * entry in the scoreboard.
+ * @warning This is only defined if the MPM defines
+ *          AP_MPM_WANT_RECLAIM_CHILD_PROCESSES
+ * @param pid The process id of an MPM child process which should be
+ * reclaimed when ap_reclaim_child_processes() is called.
+ * @tip If an extra MPM child process terminates prior to calling
+ * ap_reclaim_child_processes(), remove it from the list of such processes
+ * by calling ap_unregister_extra_mpm_process().
+ */
+#ifdef AP_MPM_WANT_RECLAIM_CHILD_PROCESSES
+void ap_register_extra_mpm_process(pid_t pid);
+#endif
+
+/**
+ * Unregister an MPM child process which was previously registered by a
+ * call to ap_register_extra_mpm_process().
+ * @warning This is only defined if the MPM defines
+ *          AP_MPM_WANT_RECLAIM_CHILD_PROCESSES
+ * @param pid The process id of an MPM child process which no longer needs to
+ * be reclaimed.
+ * @return 1 if the process was found and removed, 0 otherwise
+ */
+#ifdef AP_MPM_WANT_RECLAIM_CHILD_PROCESSES
+int ap_unregister_extra_mpm_process(pid_t pid);
+#endif
+
+/**
  * Determine if any child process has died.  If no child process died, then
  * this process sleeps for the amount of time specified by the MPM defined
  * macro SCOREBOARD_MAINTENANCE_INTERVAL.
diff --git a/server/mpm/worker/worker.c b/server/mpm/worker/worker.c
index 820fc74ebd..9fe8f5e588 100644
--- a/server/mpm/worker/worker.c
+++ b/server/mpm/worker/worker.c
@@ -1285,6 +1285,21 @@ static int make_child(server_rec *s, int slot)
         clean_child_exit(0);
     }
     /* else */
+    if (ap_scoreboard_image->parent[slot].pid != 0) {
+        /* This new child process is squatting on the scoreboard
+         * entry owned by an exiting child process, which cannot
+         * exit until all active requests complete.
+         * Don't forget about this exiting child process, or we
+         * won't be able to kill it if it doesn't exit by the
+         * time the server is shut down.
+         */
+        ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf,
+                     "taking over scoreboard slot from %" APR_PID_T_FMT "%s",
+                     ap_scoreboard_image->parent[slot].pid,
+                     ap_scoreboard_image->parent[slot].quiescing ?
+                         " (quiescing)" : "");
+        ap_register_extra_mpm_process(ap_scoreboard_image->parent[slot].pid);
+    }
     ap_scoreboard_image->parent[slot].quiescing = 0;
     ap_scoreboard_image->parent[slot].pid = pid;
     return 0;
@@ -1499,6 +1514,9 @@ static void server_main_loop(int remaining_children_to_start)
                     make_child(ap_server_conf, child_slot);
                     --remaining_children_to_start;
                 }
+            }
+            else if (ap_unregister_extra_mpm_process(pid.pid) == 1) {
+                /* handled */
 #if APR_HAS_OTHER_CHILD
             }
             else if (apr_proc_other_child_read(&pid, status) == 0) {
diff --git a/server/mpm_common.c b/server/mpm_common.c
index d6bf53de6c..ca5c7a93f2 100644
--- a/server/mpm_common.c
+++ b/server/mpm_common.c
@@ -60,105 +60,203 @@
 #endif
 
 #ifdef AP_MPM_WANT_RECLAIM_CHILD_PROCESSES
+
+typedef enum {DO_NOTHING, SEND_SIGTERM, SEND_SIGKILL, GIVEUP} action_t;
+
+typedef struct extra_process_t {
+    struct extra_process_t *next;
+    pid_t pid;
+} extra_process_t;
+
+static extra_process_t *extras;
+
+void ap_register_extra_mpm_process(pid_t pid)
+{
+    extra_process_t *p = (extra_process_t *)malloc(sizeof(extra_process_t));
+
+    p->next = extras;
+    p->pid = pid;
+    extras = p;
+}
+
+int ap_unregister_extra_mpm_process(pid_t pid)
+{
+    extra_process_t *cur = extras;
+    extra_process_t *prev = NULL;
+
+    while (cur && cur->pid != pid) {
+        prev = cur;
+        cur = cur->next;
+    }
+
+    if (cur) {
+        if (prev) {
+            prev->next = cur->next;
+        }
+        else {
+            extras = cur->next;
+        }
+        free(cur);
+        return 1; /* found */
+    }
+    else {
+        /* we don't know about any such process */
+        return 0;
+    }
+}
+
+static int reclaim_one_pid(pid_t pid, action_t action)
+{
+    apr_proc_t proc;
+    apr_status_t waitret;
+
+    proc.pid = pid;
+    waitret = apr_proc_wait(&proc, NULL, NULL, APR_NOWAIT);
+    if (waitret != APR_CHILD_NOTDONE) {
+        return 1;
+    }
+
+    switch(action) {
+    case DO_NOTHING:
+        break;
+        
+    case SEND_SIGTERM:
+        /* ok, now it's being annoying */
+        ap_log_error(APLOG_MARK, APLOG_WARNING,
+                     0, ap_server_conf,
+                     "child process %" APR_PID_T_FMT
+                     " still did not exit, "
+                     "sending a SIGTERM",
+                     pid);
+        kill(pid, SIGTERM);
+        break;
+        
+    case SEND_SIGKILL:
+        ap_log_error(APLOG_MARK, APLOG_ERR,
+                     0, ap_server_conf,
+                     "child process %" APR_PID_T_FMT
+                     " still did not exit, "
+                     "sending a SIGKILL",
+                     pid);
+#ifndef BEOS
+        kill(pid, SIGKILL);
+#else
+        /* sending a SIGKILL kills the entire team on BeOS, and as
+         * httpd thread is part of that team it removes any chance
+         * of ever doing a restart.  To counter this I'm changing to
+         * use a kinder, gentler way of killing a specific thread
+         * that is just as effective.
+         */
+        kill_thread(pid);
+#endif
+        break;
+                
+    case GIVEUP:
+        /* gave it our best shot, but alas...  If this really
+         * is a child we are trying to kill and it really hasn't
+         * exited, we will likely fail to bind to the port
+         * after the restart.
+         */
+        ap_log_error(APLOG_MARK, APLOG_ERR,
+                     0, ap_server_conf,
+                     "could not make child process %" APR_PID_T_FMT
+                     " exit, "
+                     "attempting to continue anyway",
+                     pid);
+        break;
+    }
+    
+    return 0;
+}
+
 void ap_reclaim_child_processes(int terminate)
 {
+    apr_time_t waittime = 1024 * 16;
     int i;
-    long int waittime = 1024 * 16;      /* in usecs */
-    apr_status_t waitret;
-    int tries;
+    extra_process_t *cur_extra;
     int not_dead_yet;
     int max_daemons;
+    apr_time_t starttime = apr_time_now();
+    /* this table of actions and elapsed times tells what action is taken
+     * at which elapsed time from starting the reclaim
+     */
+    struct {
+        action_t action;
+        apr_time_t action_time;
+    } action_table[] = {
+        {DO_NOTHING, 0}, /* dummy entry for iterations where we reap
+                          * children but take no action against
+                          * stragglers
+                          */
+        {SEND_SIGTERM, apr_time_from_sec(3)},
+        {SEND_SIGTERM, apr_time_from_sec(5)},
+        {SEND_SIGTERM, apr_time_from_sec(7)},
+        {SEND_SIGKILL, apr_time_from_sec(9)},
+        {GIVEUP,       apr_time_from_sec(10)}
+    };
+    int cur_action;      /* index of action we decided to take this
+                          * iteration
+                          */
+    int next_action = 1; /* index of first real action */
 
     ap_mpm_query(AP_MPMQ_MAX_DAEMON_USED, &max_daemons);
 
-    for (tries = terminate ? 4 : 1; tries <= 9; ++tries) {
-        /* don't want to hold up progress any more than
-         * necessary, but we need to allow children a few moments to exit.
-         * Set delay with an exponential backoff.
-         */
+    do {
         apr_sleep(waittime);
+        /* don't let waittime get longer than 1 second; otherwise, we don't
+         * react quickly to the last child exiting, and taking action can
+         * be delayed
+         */
         waittime = waittime * 4;
+        if (waittime > apr_time_from_sec(1)) {
+            waittime = apr_time_from_sec(1);
+        }
+
+        /* see what action to take, if any */
+        if (action_table[next_action].action_time <= apr_time_now() - starttime) {
+            cur_action = next_action;
+            ++next_action;
+        }
+        else {
+            cur_action = 0; /* nothing to do */
+        }
 
         /* now see who is done */
         not_dead_yet = 0;
         for (i = 0; i < max_daemons; ++i) {
             pid_t pid = MPM_CHILD_PID(i);
-            apr_proc_t proc;
 
-            if (pid == 0)
-                continue;
+            if (pid == 0) {
+                continue; /* not every scoreboard entry is in use */
+            }
 
-            proc.pid = pid;
-            waitret = apr_proc_wait(&proc, NULL, NULL, APR_NOWAIT);
-            if (waitret != APR_CHILD_NOTDONE) {
+            if (reclaim_one_pid(pid, action_table[cur_action].action)) {
                 MPM_NOTE_CHILD_KILLED(i);
-                continue;
             }
+            else {
+                ++not_dead_yet;
+            }
+        }
+ 
+        cur_extra = extras;
+        while (cur_extra) {
+            extra_process_t *next = cur_extra->next;
 
-            ++not_dead_yet;
-            switch (tries) {
-            case 1:     /*  16ms */
-            case 2:     /*  82ms */
-            case 3:     /* 344ms */
-            case 4:     /*  16ms */
-                break;
-
-            case 5:     /*  82ms */
-            case 6:     /* 344ms */
-            case 7:     /* 1.4sec */
-                /* ok, now it's being annoying */
-                ap_log_error(APLOG_MARK, APLOG_WARNING,
-                             0, ap_server_conf,
-                             "child process %ld still did not exit, "
-                             "sending a SIGTERM",
-                             (long)pid);
-                kill(pid, SIGTERM);
-                break;
-
-            case 8:     /*  6 sec */
-                /* die child scum */
-                ap_log_error(APLOG_MARK, APLOG_ERR,
-                             0, ap_server_conf,
-                             "child process %ld still did not exit, "
-                             "sending a SIGKILL",
-                             (long)pid);
-#ifndef BEOS
-                kill(pid, SIGKILL);
-#else
-                /* sending a SIGKILL kills the entire team on BeOS, and as
-                 * httpd thread is part of that team it removes any chance
-                 * of ever doing a restart.  To counter this I'm changing to
-                 * use a kinder, gentler way of killing a specific thread
-                 * that is just as effective.
-                 */
-                kill_thread(pid);
-#endif
-                break;
-
-            case 9:     /* 14 sec */
-                /* gave it our best shot, but alas...  If this really
-                 * is a child we are trying to kill and it really hasn't
-                 * exited, we will likely fail to bind to the port
-                 * after the restart.
-                 */
-                ap_log_error(APLOG_MARK, APLOG_ERR,
-                             0, ap_server_conf,
-                             "could not make child process %ld exit, "
-                             "attempting to continue anyway",
-                             (long)pid);
-                break;
+            if (reclaim_one_pid(cur_extra->pid, action_table[cur_action].action)) {
+                AP_DEBUG_ASSERT(1 == ap_unregister_extra_mpm_process(cur_extra->pid));
+            }
+            else {
+                ++not_dead_yet;
             }
+            cur_extra = next;
         }
 
 #if APR_HAS_OTHER_CHILD
         apr_proc_other_child_check();
 #endif
 
-        if (!not_dead_yet) {
-            /* nothing left to wait for */
-            break;
-        }
-    }
+    } while (not_dead_yet > 0 &&
+             action_table[cur_action].action != GIVEUP);
 }
 #endif /* AP_MPM_WANT_RECLAIM_CHILD_PROCESSES */
author	Jeff Trawick <trawick@apache.org>	2005-03-30 09:42:15 +0000
committer	Jeff Trawick <trawick@apache.org>	2005-03-30 09:42:15 +0000
commit	17ac47209072655a1b7a1b40210437d7bfa668c8 (patch)
tree	78f4d05ec9d9b581007dd0b440afa67d8aecee30
parent	1d3dc2840fe8452b0f1f56fa94202cc93ee91ecb (diff)
download	httpd-17ac47209072655a1b7a1b40210437d7bfa668c8.tar.gz