summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordanielsdeleo <dan@opscode.com>2013-12-06 12:41:06 -0800
committerdanielsdeleo <dan@opscode.com>2013-12-09 09:58:20 -0800
commitb148f45428299159697344ee63713650c9083c06 (patch)
treeb4d5bb6811dff35fac4eb5177475c84a85aceff8
parentc195a3f69d9182e52b57eed5eb6ffea24c974ef8 (diff)
downloadmixlib-shellout-b148f45428299159697344ee63713650c9083c06.tar.gz
Run commands with unique pgid to improve timeout cleanup
To ensure that all child processes are properly signaled to exit before forcibly killing them, use `setsid()` in the child process to give the child (and any grandchildren it might create) a new and unique process group. If the command times out, kill and reap the entire process group.
-rw-r--r--lib/mixlib/shellout/unix.rb65
-rw-r--r--spec/mixlib/shellout_spec.rb53
2 files changed, 111 insertions, 7 deletions
diff --git a/lib/mixlib/shellout/unix.rb b/lib/mixlib/shellout/unix.rb
index 15ead60..b5e0d2d 100644
--- a/lib/mixlib/shellout/unix.rb
+++ b/lib/mixlib/shellout/unix.rb
@@ -29,13 +29,20 @@ module Mixlib
# to +stdout+ and +stderr+, and saving its exit status object to +status+
# === Returns
# returns +self+; +stdout+, +stderr+, +status+, and +exitstatus+ will be
- # populated with results of the command
+ # populated with results of the command.
# === Raises
# * Errno::EACCES when you are not privileged to execute the command
# * Errno::ENOENT when the command is not available on the system (or not
# in the current $PATH)
# * Chef::Exceptions::CommandTimeout when the command does not complete
- # within +timeout+ seconds (default: 600s)
+ # within +timeout+ seconds (default: 600s). When this happens, ShellOut
+ # will send a TERM and then KILL to the entire process group to ensure
+ # that any grandchild processes are terminated. If the invocation of
+ # the child process spawned multiple child processes (which commonly
+ # happens if the command is passed as a single string to be interpreted
+ # by bin/sh, and bin/sh is not bash), the exit status object may not
+ # contain the correct exit code of the process (of course there is no
+ # exit code if the command is killed by SIGKILL, also).
def run_command
@child_pid = fork_subprocess
@reaped = false
@@ -51,6 +58,7 @@ module Mixlib
# CHEF-3390: Marshall.load on Ruby < 1.8.7p369 also has a GC bug related
# to Marshall.load, so try disabling GC first.
propagate_pre_exec_failure
+ @child_pgid = -Process.getpgid(@child_pid)
@result = nil
@execution_time = 0
@@ -122,6 +130,12 @@ module Mixlib
Dir.chdir(cwd) if cwd
end
+ # Process group id of the child. Returned as a negative value so you can
+ # put it directly in arguments to kill, wait, etc.
+ def child_pgid
+ @child_pgid
+ end
+
def initialize_ipc
@stdin_pipe, @stdout_pipe, @stderr_pipe, @process_status_pipe = IO.pipe, IO.pipe, IO.pipe, IO.pipe
@process_status_pipe.last.fcntl(Fcntl::F_SETFD, Fcntl::FD_CLOEXEC)
@@ -263,6 +277,14 @@ module Mixlib
initialize_ipc
fork do
+ # Child processes may themselves fork off children. A common case
+ # is when the command is given as a single string (instead of
+ # command name plus Array of arguments) and /bin/sh does not
+ # support the "ONESHOT" optimization (where sh -c does exec without
+ # forking). To support cleaning up all the children, we need to
+ # ensure they're in a unique process group.
+ Process.setsid
+
configure_subprocess_file_descriptors
clean_parent_file_descriptors
@@ -304,13 +326,13 @@ module Mixlib
return if attempt_reap
@terminate_reason = "Command execeded allowed execution time, killed by TERM signal."
logger.error("Command execeded allowed execution time, sending TERM") if logger
- Process.kill(:TERM, @child_pid)
+ Process.kill(:TERM, child_pgid)
sleep 3
- return if attempt_reap
+ return if attempt_reap_all
@terminate_reason = "Command execeded allowed execution time, did not respond to TERM. Killed by KILL signal."
logger.error("Command did not exit from TERM, sending KILL") if logger
- Process.kill(:KILL, @child_pid)
- reap
+ Process.kill(:KILL, child_pgid)
+ reap_all
# Should not hit this but it's possible if something is calling waitall
# in a separate thread.
@@ -323,12 +345,29 @@ module Mixlib
@child_pid && !@reaped
end
+ # Unconditionally reap the child process. This is used in scenarios where
+ # we can be confident the child will exit quickly, and has not spawned
+ # and grandchild processes.
def reap
results = Process.waitpid2(@child_pid)
@reaped = true
@status = results.last
end
+ # Reap app processes with the same process group id as the forked child.
+ # This is used after a timeout when forcibly killing the child process
+ # and any grandchild processes it may have spawned.
+ def reap_all
+ results = []
+ loop do
+ results = Process.waitpid2(child_pgid)
+ end
+ rescue Errno::ECHILD, Errno::ESRCH
+ @reaped = true
+ @status = results.last
+ end
+
+ # Try to reap the child process but don't block if it isn't dead yet.
def attempt_reap
if results = Process.waitpid2(@child_pid, Process::WNOHANG)
@reaped = true
@@ -338,6 +377,20 @@ module Mixlib
end
end
+ # Try to reap all the processes in the child process' process group.
+ # Return a false-y value if any processes in the group cannot be reaped
+ # without blocking (e.g., not dead yet).
+ def attempt_reap_all
+ results = []
+ while results = Process.waitpid2(@child_pid, Process::WNOHANG)
+ @reaped = true
+ @status = results.last
+ end
+ nil
+ rescue Errno::ECHILD, Errno::ESRCH
+ @status
+ end
+
end
end
end
diff --git a/spec/mixlib/shellout_spec.rb b/spec/mixlib/shellout_spec.rb
index 5e92a54..582aef4 100644
--- a/spec/mixlib/shellout_spec.rb
+++ b/spec/mixlib/shellout_spec.rb
@@ -825,7 +825,7 @@ describe Mixlib::ShellOut do
end
end
- context 'with subprocess that takes longer than timeout', :focus do
+ context 'with subprocess that takes longer than timeout' do
def ruby_wo_shell(code)
parts = %w[ruby]
parts << "--disable-gems" if ruby_19?
@@ -889,6 +889,57 @@ describe Mixlib::ShellOut do
end
end
+
+ context "and the child process forks grandchildren" do
+ let(:cmd) do
+ ruby_wo_shell(<<-CODE)
+ STDOUT.sync = true
+ trap(:TERM) { print "got term in child\n"; exit!(123) }
+ fork do
+ trap(:TERM) { print "got term in grandchild\n"; exit!(142) }
+ sleep 10
+ end
+ sleep 10
+ CODE
+ end
+
+ it "should TERM the wayward child and grandchild" do
+ # note: let blocks don't correctly memoize if an exception is raised,
+ # so can't use executed_cmd
+ lambda { shell_cmd.run_command}.should raise_error(Mixlib::ShellOut::CommandTimeout)
+ shell_cmd.stdout.should include("got term in child")
+ shell_cmd.stdout.should include("got term in grandchild")
+ end
+
+ end
+ context "and the child process forks grandchildren that don't respond to TERM" do
+ let(:cmd) do
+ ruby_wo_shell(<<-CODE)
+ STDOUT.sync = true
+ trap(:TERM) { print "got term in child\n" }
+ fork do
+ trap(:TERM) { print "got term in grandchild\n" }
+ sleep 10
+ end
+ sleep 10
+ CODE
+ end
+
+ it "should TERM the wayward child and grandchild, then KILL whoever is left" do
+ # note: let blocks don't correctly memoize if an exception is raised,
+ # so can't use executed_cmd
+ lambda { shell_cmd.run_command}.should raise_error(Mixlib::ShellOut::CommandTimeout)
+ shell_cmd.stdout.should include("got term in child")
+ shell_cmd.stdout.should include("got term in grandchild")
+
+ # A little janky. We get the process group id out of the command
+ # object, then try to kill a process in it to make sure none
+ # exists. Trusting the system under test like this isn't great but
+ # it's difficult to test otherwise.
+ lambda { Process.kill(:INT, shell_cmd.send(:child_pgid)) }.should raise_error(Errno::ESRCH)
+ end
+
+ end
end
context 'with subprocess that exceeds buffersize' do