diff options
author | Nobuaki Sukegawa <nsuke@apache.org> | 2016-01-24 01:34:29 +0900 |
---|---|---|
committer | Nobuaki Sukegawa <nsuke@apache.org> | 2016-01-24 03:44:26 +0900 |
commit | 5e432ec37b3c9db3a4534f6acdea12c74d42455e (patch) | |
tree | aa4fdf42807669ccd3f8a5b1fc7d064de00eeb08 /test/crossrunner | |
parent | ccd998a04e8b82a56b1788aa13167e84aa2126d1 (diff) | |
download | thrift-5e432ec37b3c9db3a4534f6acdea12c74d42455e.tar.gz |
THRIFT-3579 Introduce retry to make cross
This closes #817
Diffstat (limited to 'test/crossrunner')
-rw-r--r-- | test/crossrunner/report.py | 92 | ||||
-rw-r--r-- | test/crossrunner/run.py | 77 | ||||
-rw-r--r-- | test/crossrunner/test.py | 3 |
3 files changed, 105 insertions, 67 deletions
diff --git a/test/crossrunner/report.py b/test/crossrunner/report.py index a84e89121..be7271cb1 100644 --- a/test/crossrunner/report.py +++ b/test/crossrunner/report.py @@ -100,14 +100,15 @@ class TestReporter(object): return '%s' % datetime.datetime.now().strftime(cls.DATETIME_FORMAT) def _print_date(self): - self.out.write('%s\n' % self._format_date()) + print(self._format_date(), file=self.out) def _print_bar(self, out=None): - (out or self.out).write( - '======================================================================\n') + print( + '==========================================================================', + file=(out or self.out)) def _print_exec_time(self): - self.out.write('Test execution took {:.1f} seconds.\n'.format(self._elapsed)) + print('Test execution took {:.1f} seconds.'.format(self._elapsed), file=self.out) class ExecReporter(TestReporter): @@ -139,11 +140,13 @@ class ExecReporter(TestReporter): self._lock.release() def killed(self): - self.out.write('Process is killed.\n') + print(file=self.out) + print('Server process is successfully killed.', file=self.out) self.end(None) def died(self): - self.out.write('Process is died unexpectedly.\n') + print(file=self.out) + print('*** Server process has died unexpectedly ***', file=self.out) self.end(None) _init_failure_exprs = { @@ -191,19 +194,19 @@ class ExecReporter(TestReporter): def _print_header(self): self._print_date() - self.out.write('Executing: %s\n' % str_join(' ', self._prog.command)) - self.out.write('Directory: %s\n' % self._prog.workdir) - self.out.write('config:delay: %s\n' % self._test.delay) - self.out.write('config:timeout: %s\n' % self._test.timeout) + print('Executing: %s' % str_join(' ', self._prog.command), file=self.out) + print('Directory: %s' % self._prog.workdir, file=self.out) + print('config:delay: %s' % self._test.delay, file=self.out) + print('config:timeout: %s' % self._test.timeout, file=self.out) self._print_bar() self.out.flush() def _print_footer(self, returncode=None): self._print_bar() if returncode is not None: - self.out.write('Return code: %d\n' % returncode) + print('Return code: %d' % returncode, file=self.out) else: - self.out.write('Process is killed.\n') + print('Process is killed.', file=self.out) self._print_exec_time() self._print_date() @@ -224,6 +227,7 @@ class SummaryReporter(TestReporter): os.mkdir(self.logdir) self._known_failures = load_known_failures(self.testdir) self._unexpected_success = [] + self._flaky_success = [] self._unexpected_failure = [] self._expected_failure = [] self._print_header() @@ -232,6 +236,19 @@ class SummaryReporter(TestReporter): def testdir(self): return path_join(self._basedir, self._testdir_rel) + def _result_string(self, test): + if test.success: + if test.retry_count == 0: + return 'success' + elif test.retry_count == 1: + return 'flaky(1 retry)' + else: + return 'flaky(%d retries)' % test.retry_count + elif test.expired: + return 'failure(timeout)' + else: + return 'failure(%d)' % test.returncode + def _get_revision(self): p = subprocess.Popen(['git', 'rev-parse', '--short', 'HEAD'], cwd=self.testdir, stdout=subprocess.PIPE) @@ -242,23 +259,19 @@ class SummaryReporter(TestReporter): name = '%s-%s' % (test.server.name, test.client.name) trans = '%s-%s' % (test.transport, test.socket) if not with_result: - return '{:19s}{:13s}{:25s}'.format(name[:18], test.protocol[:12], trans[:24]) + return '{:24s}{:13s}{:25s}'.format(name[:23], test.protocol[:12], trans[:24]) else: - result = 'success' if test.success else ( - 'timeout' if test.expired else 'failure') - result_string = '%s(%d)' % (result, test.returncode) - return '{:19s}{:13s}{:25s}{:s}\n'.format(name[:18], test.protocol[:12], trans[:24], result_string) + return '{:24s}{:13s}{:25s}{:s}\n'.format(name[:23], test.protocol[:12], trans[:24], self._result_string(test)) def _print_test_header(self): self._print_bar() - self.out.write( - '{:19s}{:13s}{:25s}{:s}\n'.format('server-client:', 'protocol:', 'transport:', 'result:')) + print( + '{:24s}{:13s}{:25s}{:s}'.format('server-client:', 'protocol:', 'transport:', 'result:'), + file=self.out) def _print_header(self): self._start() - self.out.writelines([ - 'Apache Thrift - Integration Test Suite\n', - ]) + print('Apache Thrift - Integration Test Suite', file=self.out) self._print_date() self._print_test_header() @@ -274,12 +287,23 @@ class SummaryReporter(TestReporter): self.out.write(self._format_test(self._tests[i])) self._print_bar() else: - self.out.write('No unexpected failures.\n') + print('No unexpected failures.', file=self.out) + + def _print_flaky_success(self): + if len(self._flaky_success) > 0: + print( + 'Following %d tests were expected to cleanly succeed but needed retry:' % len(self._flaky_success), + file=self.out) + self._print_test_header() + for i in self._flaky_success: + self.out.write(self._format_test(self._tests[i])) + self._print_bar() def _print_unexpected_success(self): if len(self._unexpected_success) > 0: - self.out.write( - 'Following %d tests were known to fail but succeeded (it\'s normal):\n' % len(self._unexpected_success)) + print( + 'Following %d tests were known to fail but succeeded (maybe flaky):' % len(self._unexpected_success), + file=self.out) self._print_test_header() for i in self._unexpected_success: self.out.write(self._format_test(self._tests[i])) @@ -295,13 +319,14 @@ class SummaryReporter(TestReporter): fail_count = len(self._expected_failure) + len(self._unexpected_failure) self._print_bar() self._print_unexpected_success() + self._print_flaky_success() self._print_unexpected_failure() self._write_html_data() self._assemble_log('unexpected failures', self._unexpected_failure) self._assemble_log('known failures', self._expected_failure) self.out.writelines([ 'You can browse results at:\n', - '\tfile://%s/%s\n' % (self._basedir, RESULT_HTML), + '\tfile://%s/%s\n' % (self.testdir, RESULT_HTML), '# If you use Chrome, run:\n', '# \tcd %s\n#\t%s\n' % (self._basedir, self._http_server_command(8001)), '# then browse:\n', @@ -358,7 +383,7 @@ class SummaryReporter(TestReporter): add_prog_log(fp, test, test.server.kind) add_prog_log(fp, test, test.client.kind) fp.write('**********************************************************************\n\n') - print('%s are logged to test/%s/%s' % (title.capitalize(), LOG_DIR, filename)) + print('%s are logged to %s/%s/%s' % (title.capitalize(), self._testdir_rel, LOG_DIR, filename)) def end(self): self._print_footer() @@ -376,10 +401,11 @@ class SummaryReporter(TestReporter): finally: self._lock.release() - def add_result(self, index, returncode, expired): + def add_result(self, index, returncode, expired, retry_count): self._lock.acquire() try: failed = returncode is None or returncode != 0 + flaky = not failed and retry_count != 0 test = self._tests[index] known = test.name in self._known_failures if failed: @@ -389,17 +415,19 @@ class SummaryReporter(TestReporter): else: self._log.info('unexpected failure: %s' % test.name) self._unexpected_failure.append(index) - elif known: + elif flaky and not known: + self._log.info('unexpected flaky success: %s' % test.name) + self._flaky_success.append(index) + elif not flaky and known: self._log.info('unexpected success: %s' % test.name) self._unexpected_success.append(index) test.success = not failed test.returncode = returncode + test.retry_count = retry_count test.expired = expired test.as_expected = known == failed if not self.concurrent: - result = 'success' if not failed else 'failure' - result_string = '%s(%d)' % (result, returncode) - self.out.write(result_string + '\n') + self.out.write(self._result_string(test) + '\n') else: self.out.write(self._format_test(test)) finally: diff --git a/test/crossrunner/run.py b/test/crossrunner/run.py index 0d617c0e3..68bd92869 100644 --- a/test/crossrunner/run.py +++ b/test/crossrunner/run.py @@ -60,12 +60,12 @@ class ExecutionContext(object): if platform.system() != 'Windows': try: os.killpg(self.proc.pid, signal.SIGKILL) - except Exception as err: - self._log.info('Failed to kill process group : %s' % str(err)) + except Exception: + self._log.info('Failed to kill process group', exc_info=sys.exc_info()) try: self.proc.kill() - except Exception as err: - self._log.info('Failed to kill process : %s' % str(err)) + except Exception: + self._log.info('Failed to kill process', exc_info=sys.exc_info()) def _popen_args(self): args = { @@ -122,15 +122,17 @@ def exec_context(port, logdir, test, prog): return ExecutionContext(prog.command, prog.workdir, prog.env, report) -def run_test(testdir, logdir, test_dict, async=True, max_retry=3): +def run_test(testdir, logdir, test_dict, max_retry, async=True): try: logger = multiprocessing.get_logger() + max_bind_retry = 3 retry_count = 0 + bind_retry_count = 0 test = TestEntry(testdir, **test_dict) while True: if stop.is_set(): logger.debug('Skipping because shutting down') - return None + return (retry_count, None) logger.debug('Start') with PortAllocator.alloc_port_scoped(ports, test.socket) as port: logger.debug('Start with port %d' % port) @@ -142,35 +144,41 @@ def run_test(testdir, logdir, test_dict, async=True, max_retry=3): if test.delay > 0: logger.debug('Delaying client for %.2f seconds' % test.delay) time.sleep(test.delay) - cl_retry_count = 0 - cl_max_retry = 10 - cl_retry_wait = 0.5 + connect_retry_count = 0 + max_connect_retry = 10 + connect_retry_wait = 0.5 while True: logger.debug('Starting client') cl.start(test.timeout) logger.debug('Waiting client') cl.wait() - if not cl.report.maybe_false_positive() or cl_retry_count >= cl_max_retry: - if cl_retry_count > 0 and cl_retry_count < cl_max_retry: - logger.warn('[%s]: Connected after %d retry (%.2f sec each)' % (test.server.name, cl_retry_count, cl_retry_wait)) - # Wait for 50 ms to see if server does not die at the end. + if not cl.report.maybe_false_positive() or connect_retry_count >= max_connect_retry: + if connect_retry_count > 0 and connect_retry_count < max_connect_retry: + logger.warn('[%s]: Connected after %d retry (%.2f sec each)' % (test.server.name, connect_retry_count, connect_retry_wait)) + # Wait for 50ms to see if server does not die at the end. time.sleep(0.05) break - logger.debug('Server may not be ready, waiting %.2f second...' % cl_retry_wait) - time.sleep(cl_retry_wait) - cl_retry_count += 1 + logger.debug('Server may not be ready, waiting %.2f second...' % connect_retry_wait) + time.sleep(connect_retry_wait) + connect_retry_count += 1 - if not sv.report.maybe_false_positive() or retry_count >= max_retry: - logger.debug('Finish') + if sv.report.maybe_false_positive() and bind_retry_count < max_bind_retry: + logger.warn('[%s]: Detected socket bind failure, retrying...', test.server.name) + bind_retry_count += 1 + else: if cl.expired: - return RESULT_TIMEOUT + result = RESULT_TIMEOUT elif not sv.killed and cl.proc.returncode == 0: # Server should be alive at the end. - return RESULT_ERROR + result = RESULT_ERROR + else: + result = cl.proc.returncode + + if result == 0 or retry_count >= max_retry: + return (retry_count, result) else: - return cl.proc.returncode - logger.warn('[%s]: Detected socket bind failure, retrying...' % test.server.name) - retry_count += 1 + logger.info('[%s-%s]: test failed, retrying...', test.server.name, test.client.name) + retry_count += 1 except (KeyboardInterrupt, SystemExit): logger.info('Interrupted execution') if not async: @@ -181,7 +189,7 @@ def run_test(testdir, logdir, test_dict, async=True, max_retry=3): if not async: raise logger.warn('Error executing [%s]', test.name, exc_info=sys.exc_info()) - return RESULT_ERROR + return (retry_count, RESULT_ERROR) class PortAllocator(object): @@ -245,8 +253,8 @@ class PortAllocator(object): self._dom_ports.remove(port) else: self._ports.remove(port) - except IOError as err: - self._log.info('Error while freeing port : %s' % str(err)) + except IOError: + self._log.info('Error while freeing port', exc_info=sys.exc_info()) finally: self._lock.release() @@ -300,26 +308,27 @@ class TestDispatcher(object): m.connect() ports = m.ports() - def _dispatch_sync(self, test, cont): - r = run_test(self.testdir, self.logdir, test, False) + def _dispatch_sync(self, test, cont, max_retry): + r = run_test(self.testdir, self.logdir, test, max_retry, False) cont(r) return NonAsyncResult(r) - def _dispatch_async(self, test, cont): + def _dispatch_async(self, test, cont, max_retry): self._log.debug('_dispatch_async') - return self._pool.apply_async(func=run_test, args=(self.testdir, self.logdir, test,), callback=cont) + return self._pool.apply_async(func=run_test, args=(self.testdir, self.logdir, test, max_retry), callback=cont) - def dispatch(self, test): + def dispatch(self, test, max_retry): index = self._report.add_test(test) - def cont(r): + def cont(result): if not self._stop.is_set(): + retry_count, returncode = result self._log.debug('freeing port') self._log.debug('adding result') - self._report.add_result(index, r, r == RESULT_TIMEOUT) + self._report.add_result(index, returncode, returncode == RESULT_TIMEOUT, retry_count) self._log.debug('finish continuation') fn = self._dispatch_async if self._async else self._dispatch_sync - return fn(test, cont) + return fn(test, cont, max_retry) def wait(self): if self._async: diff --git a/test/crossrunner/test.py b/test/crossrunner/test.py index bb81c4fe5..fc90f7f30 100644 --- a/test/crossrunner/test.py +++ b/test/crossrunner/test.py @@ -70,7 +70,7 @@ class TestProgram(object): def build_command(self, port): cmd = copy.copy(self._base_command) - args = self._extra_args2 + args = copy.copy(self._extra_args2) args.append('--protocol=' + self.protocol) args.append('--transport=' + self.transport) socket_args = self._socket_args(self.socket, port) @@ -109,6 +109,7 @@ class TestEntry(object): self.as_expected = None self.returncode = None self.expired = False + self.retry_count = 0 def _fix_workdir(self, config): key = 'workdir' |