2 files changed, 177 insertions, 149 deletions
diff --git a/lib/chef/run_lock.rb b/lib/chef/run_lock.rb
index cefe637db6..9e0952bdcb 100644
--- a/lib/chef/run_lock.rb
+++ b/lib/chef/run_lock.rb
@@ -87,27 +87,8 @@ class Chef
     # Either acquire() or test() methods should be called in order to
     # get the ownership of run_lock.
     def test
-      # ensure the runlock_file path exists
-      create_path(File.dirname(runlock_file))
-      @runlock = File.open(runlock_file,'a+')
-
-      if Chef::Platform.windows?
-        acquire_win32_mutex
-      else
-        # If we support FD_CLOEXEC, then use it.
-        # NB: ruby-2.0.0-p195 sets FD_CLOEXEC by default, but not
-        # ruby-1.8.7/1.9.3
-        if Fcntl.const_defined?('F_SETFD') && Fcntl.const_defined?('FD_CLOEXEC')
-          runlock.fcntl(Fcntl::F_SETFD, runlock.fcntl(Fcntl::F_GETFD, 0) | Fcntl::FD_CLOEXEC)
-        end
-        # Flock will return 0 if it can acquire the lock otherwise it
-        # will return false
-        if runlock.flock(File::LOCK_NB|File::LOCK_EX) == 0
-          true
-        else
-          false
-        end
-      end
+      create_lock
+      acquire_lock
     end
 
     #
@@ -147,6 +128,34 @@ class Chef
       end
     end
 
+    # @api private solely for race condition tests
+    def create_lock
+      # ensure the runlock_file path exists
+      create_path(File.dirname(runlock_file))
+      @runlock = File.open(runlock_file,'a+')
+    end
+
+    # @api private solely for race condition tests
+    def acquire_lock
+      if Chef::Platform.windows?
+        acquire_win32_mutex
+      else
+        # If we support FD_CLOEXEC, then use it.
+        # NB: ruby-2.0.0-p195 sets FD_CLOEXEC by default, but not
+        # ruby-1.8.7/1.9.3
+        if Fcntl.const_defined?('F_SETFD') && Fcntl.const_defined?('FD_CLOEXEC')
+          runlock.fcntl(Fcntl::F_SETFD, runlock.fcntl(Fcntl::F_GETFD, 0) | Fcntl::FD_CLOEXEC)
+        end
+        # Flock will return 0 if it can acquire the lock otherwise it
+        # will return false
+        if runlock.flock(File::LOCK_NB|File::LOCK_EX) == 0
+          true
+        else
+          false
+        end
+      end
+    end
+
     private
 
     def reset
diff --git a/spec/functional/run_lock_spec.rb b/spec/functional/run_lock_spec.rb
index ce0598bead..78b3847d19 100644
--- a/spec/functional/run_lock_spec.rb
+++ b/spec/functional/run_lock_spec.rb
@@ -34,8 +34,15 @@ describe Chef::RunLock do
     let(:lockfile){ "#{random_temp_root}/this/long/path/does/not/exist/chef-client-running.pid" }
 
     # make sure to start with a clean slate.
-    before(:each){ FileUtils.rm_r(random_temp_root) if File.exist?(random_temp_root) }
-    after(:each){ FileUtils.rm_r(random_temp_root) if File.exist?(random_temp_root) }
+    before(:each){ log_event("rm -rf before"); FileUtils.rm_r(random_temp_root) if File.exist?(random_temp_root) }
+    after(:each){ log_event("rm -rf after"); FileUtils.rm_r(random_temp_root) if File.exist?(random_temp_root) }
+
+    def log_event(message, time=Time.now.strftime("%H:%M:%S.%L"))
+      events << [ message, time ]
+    end
+    def events
+      @events ||= []
+    end
 
     WAIT_ON_LOCK_TIME = 1.0
     def wait_on_lock
@@ -48,118 +55,90 @@ describe Chef::RunLock do
       raise "Lockfile never created, abandoning test"
     end
 
-    ##
-    # Side channel via a pipe allows child processes to send errors to the parent
-
-    # Don't lazy create the pipe or else we might not share it with subprocesses
-    let!(:error_pipe) do
-      r,w = IO.pipe
-      w.sync = true
-      [r,w]
-    end
-
-    let(:error_read) { error_pipe[0] }
-    let(:error_write) { error_pipe[1] }
-
-    after do
-      error_read.close unless error_read.closed?
-      error_write.close unless error_write.closed?
-    end
-
-    # Send a RuntimeError from the child process to the parent process. Also
-    # prints error to $stdout, just in case something goes wrong with the error
-    # marshaling stuff.
-    def send_side_channel_error(message)
-      $stderr.puts(message)
-      $stderr.puts(caller)
-      e = RuntimeError.new(message)
-      error_write.print(Marshal.dump(e))
-    end
+    CLIENT_PROCESS_TIMEOUT = 10
+    BREATHING_ROOM = 1
 
-    # Read the error (if any) from the error channel. If a marhaled error is
-    # present, it is unmarshaled and raised (which will fail the test)
-    def raise_side_channel_error!
-      error_write.close
-      err = error_read.read
-      error_read.close
+    # ClientProcess is defined below
+    let!(:p1) { ClientProcess.new(self, 'p1') }
+    let!(:p2) { ClientProcess.new(self, 'p2') }
+    after(:each) do |example|
       begin
-        # ArgumentError from Marshal.load indicates no data, which we assume
-        # means no error in child process.
-        raise Marshal.load(err)
-      rescue ArgumentError
-        nil
+        p1.stop
+        p2.stop
+      rescue
+        example.exception = $!
+        raise
+      ensure
+        if example.exception
+          print_events
+        end
       end
     end
 
-    ##
-    # Interprocess synchronization via a pipe. This allows us to control the
-    # state of the processes competing over the lock without relying on sleep.
-
-    let!(:sync_pipe) do
-      r,w = IO.pipe
-      w.sync = true
-      [r,w]
-    end
-    let(:sync_read) { sync_pipe[0] }
-    let(:sync_write) { sync_pipe[1] }
-
-    after do
-      sync_read.close unless sync_read.closed?
-      sync_write.close unless sync_write.closed?
-    end
-
-    # Wait on synchronization signal. If not received within the timeout, an
-    # error is sent via the error channel, and the process exits.
-    def sync_wait
-      if IO.select([sync_read], nil, nil, 20).nil?
-        # timeout reading from the sync pipe.
-        send_side_channel_error("Error syncing processes in run lock test (timeout)")
-        exit!(1)
-      else
-        sync_read.getc
+    def print_events
+      # Consume any remaining events that went on the channel and print them all
+      p1.last_event
+      p2.last_event
+      events.each_with_index.sort_by { |(message, time), index| [ time, index ] }.each do |(message, time), index|
+        print "#{time} #{message}\n"
       end
     end
 
-    # Sends a character in the sync pipe, which wakes ("unlocks") another
-    # process that is waiting on the sync signal
-    def sync_send
-      sync_write.putc("!")
-      sync_write.flush
-    end
+    context "when the lockfile does not already exist" do
+      context "when a client creates the lockfile but has not yet acquired the lock" do
+        before { p1.run_to("created lock") }
+        shared_context "second client gets the lock" do
+          it "the lockfile is created" do
+            log_event("lockfile exists? #{File.exist?(lockfile)}")
+            expect(File.exist?(lockfile)).to be_truthy
+          end
 
-    ##
-    # IPC to record test results in a pipe. Tests can read pipe contents to
-    # check that operations occur in the expected order.
+          it "the lockfile is not locked" do
+            run_lock = Chef::RunLock.new(lockfile)
+            begin
+              expect(run_lock.test).to be_truthy
+            ensure
+              run_lock.release
+            end
+          end
 
-    let!(:results_pipe) do
-      r,w = IO.pipe
-      w.sync = true
-      [r,w]
-    end
-    let(:results_read) { results_pipe[0] }
-    let(:results_write) { results_pipe[1] }
+          it "the lockfile is empty" do
+            expect(IO.read(lockfile)).to eq('')
+          end
 
-    after do
-      results_read.close unless results_read.closed?
-      results_write.close unless results_write.closed?
-    end
+          context "and a second client gets the lock" do
+            before { p2.run_to("acquired lock") }
+            it "the first client does not get the lock until the second finishes" do
+              p1.run_to("acquired lock") do
+                p2.run_to_completion
+              end
+            end
+            it "and the first client tries to get the lock and the second is killed, the first client gets the lock immediately" do
+              p1.run_to("acquired lock") do
+                sleep BREATHING_ROOM
+                expect(p1.last_event).to match(/after (started|created lock)/)
+                p2.stop
+              end
+              p1.run_to_completion
+            end
+          end
+        end
 
-    CLIENT_PROCESS_TIMEOUT = 2
-    BREATHING_ROOM = 1
+        context "and the second client has done nothing" do
+          include_context "second client gets the lock"
+        end
 
-    # ClientProcess is defined below
-    let!(:p1) { ClientProcess.new(self, 'p1') }
-    let!(:p2) { ClientProcess.new(self, 'p2') }
-    after do
-      p1.stop
-      p2.stop
-    end
+        context "and the second client has created the lockfile but not yet acquired the lock" do
+          before { p2.run_to("created lock") }
+          include_context "second client gets the lock"
+        end
+      end
 
-    context "when the lockfile does not already exist" do
       context "when a client acquires the lock but has not yet saved the pid" do
         before { p1.run_to("acquired lock") }
 
         it "the lockfile is created" do
+          log_event("lockfile exists? #{File.exist?(lockfile)}")
           expect(File.exist?(lockfile)).to be_truthy
         end
 
@@ -186,7 +165,7 @@ describe Chef::RunLock do
           p2.run_to("acquired lock") do
             # While p2 is trying to acquire, wait a bit and then let p1 complete
             sleep(BREATHING_ROOM)
-            expect(p2.last_event).to eq("started")
+            expect(p2.last_event).to match(/after (started|created lock)/)
             p1.run_to_completion
           end
 
@@ -196,7 +175,7 @@ describe Chef::RunLock do
         it "and a second client tries to get the lock and the first is killed, the second client gets the lock immediately" do
           p2.run_to("acquired lock") do
             sleep BREATHING_ROOM
-            expect(p2.last_event).to eq("started")
+            expect(p2.last_event).to match(/after (started|created lock)/)
             p1.stop
           end
           p2.run_to_completion
@@ -233,7 +212,7 @@ describe Chef::RunLock do
           p2.run_to("acquired lock") do
             # While p2 is trying to acquire, wait a bit and then let p1 complete
             sleep(BREATHING_ROOM)
-            expect(p2.last_event).to eq("started")
+            expect(p2.last_event).to match(/after (started|created lock)/)
             p1.run_to_completion
           end
 
@@ -243,7 +222,7 @@ describe Chef::RunLock do
         it "when a second client tries to get the lock and the first is killed, the second client gets the lock immediately" do
           p2.run_to("acquired lock") do
             sleep BREATHING_ROOM
-            expect(p2.last_event).to eq("started")
+            expect(p2.last_event).to match(/after (started|created lock)/)
             p1.stop
           end
           p2.run_to_completion
@@ -341,73 +320,97 @@ describe Chef::RunLock do
     attr_reader :name
     attr_reader :pid
 
-    def fire_event(event)
-      # Let the caller know what event we've reached
-      write_to_tests.puts(event)
-
-      # Block until the client tells us where to stop
-      if !@run_to_event || event == @run_to_event
-        @run_to_event = read_from_tests.gets.strip
-      end
-    end
-
     def last_event
       while true
-        event = readline_nonblock(read_from_process)
-        break if event.nil?
-        @last_event = event.strip
+        line = readline_nonblock(read_from_process)
+        break if line.nil?
+        event, time = line.split("@")
+        example.log_event("#{name}.last_event got #{event}")
+        example.log_event("[#{name}] #{event}", time.strip)
+        @last_event = event
       end
       @last_event
     end
 
-    def run_to(event, &background_block)
+    def run_to(to_event, &background_block)
+      example.log_event("#{name}.run_to(#{to_event.inspect})")
+
       # Start the process if it's not started
       start if !pid
 
       # Tell the process what to stop at (also means it can go)
-      write_to_process.puts event
+      write_to_process.print "#{to_event}\n"
 
       # Run the background block
       background_block.call if background_block
 
       # Wait until it gets there
       Timeout::timeout(CLIENT_PROCESS_TIMEOUT) do
-        until @last_event == event
-          @last_event = read_from_process.gets.strip
+        until @last_event == "after #{to_event}"
+          got_event, time = read_from_process.gets.split("@")
+          example.log_event("#{name}.last_event got #{got_event}")
+          example.log_event("[#{name}] #{got_event}", time.strip)
+          @last_event = got_event
         end
       end
+
+      example.log_event("#{name}.run_to(#{to_event.inspect}) finished")
     end
 
     def run_to_completion
+      example.log_event("#{name}.run_to_completion")
       # Start the process if it's not started
       start if !pid
 
       # Tell the process to stop at nothing (no blocking)
-      @write_to_process.puts "nothing"
+      @write_to_process.print "nothing\n"
 
       # Wait for the process to exit
       wait_for_exit
+      example.log_event("#{name}.run_to_completion finished")
     end
 
     def wait_for_exit
+      example.log_event("#{name}.wait_for_exit (pid #{pid})")
       Timeout::timeout(CLIENT_PROCESS_TIMEOUT) do
         Process.wait(pid) if pid
       end
+      example.log_event("#{name}.wait_for_exit finished (pid #{pid})")
     end
 
     def stop
       if pid
+        example.log_event("#{name}.stop (pid #{pid})")
         begin
-          Process.kill(:KILL, pid)
+          # Send it the kill signal over and over until it dies
           Timeout::timeout(CLIENT_PROCESS_TIMEOUT) do
-            Process.waitpid2(pid)
+            Process.kill(:KILL, pid)
+            while !Process.waitpid2(pid, Process::WNOHANG)
+              sleep(0.05)
+            end
           end
+          example.log_event("#{name}.stop finished (stopped pid #{pid})")
         # Process not found is perfectly fine when we're trying to kill a process :)
         rescue Errno::ESRCH
+          example.log_event("#{name}.stop finished (pid #{pid} wasn't running)")
         end
       end
     end
 
+    def fire_event(event)
+      # Let the caller know what event we've reached
+      write_to_tests.print("after #{event}@#{Time.now.strftime("%H:%M:%S.%L")}\n")
+
+      # Block until the client tells us where to stop
+      if !@run_to_event || event == @run_to_event
+        write_to_tests.print("waiting for instructions after #{event}@#{Time.now.strftime("%H:%M:%S.%L")}\n")
+        @run_to_event = read_from_tests.gets.strip
+        write_to_tests.print("told to run to #{@run_to_event} after #{event}@#{Time.now.strftime("%H:%M:%S.%L")}\n")
+      elsif @run_to_event
+        write_to_tests.print("continuing until #{@run_to_event} after #{event}@#{Time.now.strftime("%H:%M:%S.%L")}\n")
+      end
+    end
+
     private
 
     attr_reader :read_from_process
@@ -415,18 +418,34 @@ describe Chef::RunLock do
     attr_reader :read_from_tests
     attr_reader :write_to_process
 
+    class TestRunLock < Chef::RunLock
+      attr_accessor :client_process
+      def create_lock
+        super
+        client_process.fire_event("created lock")
+      end
+    end
+
     def start
+      example.log_event("#{name}.start")
       @pid = fork do
-        Timeout::timeout(CLIENT_PROCESS_TIMEOUT) do
-          run_lock = Chef::RunLock.new(example.lockfile)
-          fire_event("started")
-          run_lock.acquire
-          fire_event("acquired lock")
-          run_lock.save_pid
-          fire_event("saved pid")
-          exit!(0)
+        begin
+          Timeout::timeout(CLIENT_PROCESS_TIMEOUT) do
+            run_lock = TestRunLock.new(example.lockfile)
+            run_lock.client_process = self
+            fire_event("started")
+            run_lock.acquire
+            fire_event("acquired lock")
+            run_lock.save_pid
+            fire_event("saved pid")
+            exit!(0)
+          end
+        rescue
+          fire_event($!.message.lines.join(" // "))
+          raise
         end
       end
+      example.log_event("#{name}.start forked (pid #{pid})")
     end
 
     def readline_nonblock(fd)