diff options
author | Martin Blanchard <martin.blanchard@codethink.co.uk> | 2018-09-18 08:56:43 +0100 |
---|---|---|
committer | Martin Blanchard <martin.blanchard@codethink.co.uk> | 2018-10-23 11:54:40 +0100 |
commit | aa0cbf5dfb8e68674c60efe841cf862d6e5cdea6 (patch) | |
tree | 4c7cf7ccc256b7a33985884593873159ab6929a9 | |
parent | ecb58b423ab6896a6bfc4f634089ca73d9816782 (diff) | |
download | buildstream-aa0cbf5dfb8e68674c60efe841cf862d6e5cdea6.tar.gz |
_sandboxremote.py: Try to reopen operation steam on failuremablanch/630-remote-execution-reconn
The REAPI allows a client to reconnect to an ongoing operation stream by
providing a WaitExecution(). If implemented on server side, BuildStream
will try to recover from connection errors using it.
https://gitlab.com/BuildStream/buildstream/issues/630
-rw-r--r-- | buildstream/sandbox/_sandboxremote.py | 58 |
1 files changed, 43 insertions, 15 deletions
diff --git a/buildstream/sandbox/_sandboxremote.py b/buildstream/sandbox/_sandboxremote.py index ab0c31bff..f522cc772 100644 --- a/buildstream/sandbox/_sandboxremote.py +++ b/buildstream/sandbox/_sandboxremote.py @@ -76,8 +76,7 @@ class SandboxRemote(Sandbox): # Upload the Command message to the remote CAS server command_digest = cascache.push_message(self._get_project(), remote_command) if not command_digest or not cascache.verify_digest_pushed(self._get_project(), command_digest): - # Command push failed - return None + raise SandboxError("Failed pushing build command to remote CAS.") # Create and send the action. action = remote_execution_pb2.Action(command_digest=command_digest, @@ -88,27 +87,57 @@ class SandboxRemote(Sandbox): # Upload the Action message to the remote CAS server action_digest = cascache.push_message(self._get_project(), action) if not action_digest or not cascache.verify_digest_pushed(self._get_project(), action_digest): - # Action push failed - return None + raise SandboxError("Failed pushing build action to remote CAS.") # Next, try to create a communication channel to the BuildGrid server. channel = grpc.insecure_channel(self.server_url) stub = remote_execution_pb2_grpc.ExecutionStub(channel) request = remote_execution_pb2.ExecuteRequest(action_digest=action_digest, skip_cache_lookup=False) - try: - operation_iterator = stub.Execute(request) - except grpc.RpcError: - return None + + def __run_remote_command(stub, execute_request=None, running_operation=None): + try: + last_operation = None + if execute_request is not None: + operation_iterator = stub.Execute(execute_request) + else: + request = remote_execution_pb2.WaitExecutionRequest(name=running_operation.name) + operation_iterator = stub.WaitExecution(request) + + for operation in operation_iterator: + if operation.done: + return operation + else: + last_operation = operation + except grpc.RpcError as e: + status_code = e.code() + if status_code == grpc.StatusCode.UNAVAILABLE: + raise SandboxError("Failed contacting remote execution server at {}." + .format(self.server_url)) + + elif status_code in (grpc.StatusCode.INVALID_ARGUMENT, + grpc.StatusCode.FAILED_PRECONDITION, + grpc.StatusCode.RESOURCE_EXHAUSTED, + grpc.StatusCode.INTERNAL, + grpc.StatusCode.DEADLINE_EXCEEDED): + raise SandboxError("{} ({}).".format(e.details(), status_code.name)) + + elif running_operation and status_code == grpc.StatusCode.UNIMPLEMENTED: + raise SandboxError("Failed trying to recover from connection loss: " + "server does not support operation status polling recovery.") + + return last_operation operation = None with self._get_context().timed_activity("Waiting for the remote build to complete"): - # It is advantageous to check operation_iterator.code() is grpc.StatusCode.OK here, - # which will check the server is actually contactable. However, calling it when the - # server is available seems to cause .code() to hang forever. - for operation in operation_iterator: - if operation.done: - break + operation = __run_remote_command(stub, execute_request=request) + if operation is None: + return None + elif operation.done: + return operation + + while operation is not None and not operation.done: + operation = __run_remote_command(stub, running_operation=operation) return operation @@ -192,7 +221,6 @@ class SandboxRemote(Sandbox): if operation is None: # Failure of remote execution, usually due to an error in BuildStream - # NB This error could be raised in __run_remote_command raise SandboxError("No response returned from server") assert not operation.HasField('error') and operation.HasField('response') |