diff options
author | danielsdeleo <dan@opscode.com> | 2013-12-06 12:41:06 -0800 |
---|---|---|
committer | danielsdeleo <dan@opscode.com> | 2013-12-09 09:58:20 -0800 |
commit | b148f45428299159697344ee63713650c9083c06 (patch) | |
tree | b4d5bb6811dff35fac4eb5177475c84a85aceff8 /lib/mixlib | |
parent | c195a3f69d9182e52b57eed5eb6ffea24c974ef8 (diff) | |
download | mixlib-shellout-b148f45428299159697344ee63713650c9083c06.tar.gz |
Run commands with unique pgid to improve timeout cleanup
To ensure that all child processes are properly signaled to exit before
forcibly killing them, use `setsid()` in the child process to give the
child (and any grandchildren it might create) a new and unique process
group. If the command times out, kill and reap the entire process group.
Diffstat (limited to 'lib/mixlib')
-rw-r--r-- | lib/mixlib/shellout/unix.rb | 65 |
1 files changed, 59 insertions, 6 deletions
diff --git a/lib/mixlib/shellout/unix.rb b/lib/mixlib/shellout/unix.rb index 15ead60..b5e0d2d 100644 --- a/lib/mixlib/shellout/unix.rb +++ b/lib/mixlib/shellout/unix.rb @@ -29,13 +29,20 @@ module Mixlib # to +stdout+ and +stderr+, and saving its exit status object to +status+ # === Returns # returns +self+; +stdout+, +stderr+, +status+, and +exitstatus+ will be - # populated with results of the command + # populated with results of the command. # === Raises # * Errno::EACCES when you are not privileged to execute the command # * Errno::ENOENT when the command is not available on the system (or not # in the current $PATH) # * Chef::Exceptions::CommandTimeout when the command does not complete - # within +timeout+ seconds (default: 600s) + # within +timeout+ seconds (default: 600s). When this happens, ShellOut + # will send a TERM and then KILL to the entire process group to ensure + # that any grandchild processes are terminated. If the invocation of + # the child process spawned multiple child processes (which commonly + # happens if the command is passed as a single string to be interpreted + # by bin/sh, and bin/sh is not bash), the exit status object may not + # contain the correct exit code of the process (of course there is no + # exit code if the command is killed by SIGKILL, also). def run_command @child_pid = fork_subprocess @reaped = false @@ -51,6 +58,7 @@ module Mixlib # CHEF-3390: Marshall.load on Ruby < 1.8.7p369 also has a GC bug related # to Marshall.load, so try disabling GC first. propagate_pre_exec_failure + @child_pgid = -Process.getpgid(@child_pid) @result = nil @execution_time = 0 @@ -122,6 +130,12 @@ module Mixlib Dir.chdir(cwd) if cwd end + # Process group id of the child. Returned as a negative value so you can + # put it directly in arguments to kill, wait, etc. + def child_pgid + @child_pgid + end + def initialize_ipc @stdin_pipe, @stdout_pipe, @stderr_pipe, @process_status_pipe = IO.pipe, IO.pipe, IO.pipe, IO.pipe @process_status_pipe.last.fcntl(Fcntl::F_SETFD, Fcntl::FD_CLOEXEC) @@ -263,6 +277,14 @@ module Mixlib initialize_ipc fork do + # Child processes may themselves fork off children. A common case + # is when the command is given as a single string (instead of + # command name plus Array of arguments) and /bin/sh does not + # support the "ONESHOT" optimization (where sh -c does exec without + # forking). To support cleaning up all the children, we need to + # ensure they're in a unique process group. + Process.setsid + configure_subprocess_file_descriptors clean_parent_file_descriptors @@ -304,13 +326,13 @@ module Mixlib return if attempt_reap @terminate_reason = "Command execeded allowed execution time, killed by TERM signal." logger.error("Command execeded allowed execution time, sending TERM") if logger - Process.kill(:TERM, @child_pid) + Process.kill(:TERM, child_pgid) sleep 3 - return if attempt_reap + return if attempt_reap_all @terminate_reason = "Command execeded allowed execution time, did not respond to TERM. Killed by KILL signal." logger.error("Command did not exit from TERM, sending KILL") if logger - Process.kill(:KILL, @child_pid) - reap + Process.kill(:KILL, child_pgid) + reap_all # Should not hit this but it's possible if something is calling waitall # in a separate thread. @@ -323,12 +345,29 @@ module Mixlib @child_pid && !@reaped end + # Unconditionally reap the child process. This is used in scenarios where + # we can be confident the child will exit quickly, and has not spawned + # and grandchild processes. def reap results = Process.waitpid2(@child_pid) @reaped = true @status = results.last end + # Reap app processes with the same process group id as the forked child. + # This is used after a timeout when forcibly killing the child process + # and any grandchild processes it may have spawned. + def reap_all + results = [] + loop do + results = Process.waitpid2(child_pgid) + end + rescue Errno::ECHILD, Errno::ESRCH + @reaped = true + @status = results.last + end + + # Try to reap the child process but don't block if it isn't dead yet. def attempt_reap if results = Process.waitpid2(@child_pid, Process::WNOHANG) @reaped = true @@ -338,6 +377,20 @@ module Mixlib end end + # Try to reap all the processes in the child process' process group. + # Return a false-y value if any processes in the group cannot be reaped + # without blocking (e.g., not dead yet). + def attempt_reap_all + results = [] + while results = Process.waitpid2(@child_pid, Process::WNOHANG) + @reaped = true + @status = results.last + end + nil + rescue Errno::ECHILD, Errno::ESRCH + @status + end + end end end |