From 4e4503254dd3628059f9f9094e39f9eadf0feb43 Mon Sep 17 00:00:00 2001 From: Richard Ipsum Date: Thu, 13 Feb 2014 11:33:13 +0000 Subject: Add trove upgrade test suite --- test_trove_upgrades.py | 673 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 673 insertions(+) create mode 100644 test_trove_upgrades.py diff --git a/test_trove_upgrades.py b/test_trove_upgrades.py new file mode 100644 index 0000000..980ba81 --- /dev/null +++ b/test_trove_upgrades.py @@ -0,0 +1,673 @@ +#!/usr/bin/python +# +# Copyright (C) 2014 Codethink Limited +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +''' +Baserock test suite for Trove upgrade + +Trove is the most complex system that we currently have in Baserock. Therefore +we test the Baserock toolset's upgrade functionality using a Trove system. + +This will one day be a MUSTARD Loom Yarn. + +This test must be run on a Baserock devel system, which has passwordless SSH +access to the KVM host specified as 'DEPLOY_URL' below. The tests will deploy a +VM named 'brtests-$host', so a single KVM host can be used by multiple test +machines, as long as each test machine has only one test running at a time. + +Ideas for improvement: + - use https://github.com/paramiko/paramiko (ssh library for Python) + +How to fit this into Yarn: + - I don't know! We need to pass state in a totally different way (via the + environment). + - Should be enough to break the tests into function calls that save state + via pickle or the environment. An annoying extra layer of indirection on + an already complex codebase, though. + +Helpful advice: + - There is a '--reuse-fixture' option which reuses 'GIVEN' state for a + test suite instead of deploying a new VM (which takes ~5 minutes). + - Deployment is broken into separate create_config(), run_build() and + run_deploy() steps, so that you can comment out calls to one or more of + these when iterating over a specific test. +''' + +import cliapp +import contextlib +import os +import shutil +import socket +import subprocess +import sys +import tempfile +import time +import urlparse +import yaml + + +# The test host must have passwordless access to this machine. The tests set +# set StrictHostKeyChecking=no for SSH connections so it does not need to be in +# '~/.ssh/known_hosts'. +DEPLOY_URL = 'kvm+ssh://sam@landfill.ducie.codethink.co.uk/' +DEPLOY_PATH = '/home/VIRT-IMAGES/' + +# Seconds to wait for machine to appear on network before assuming it didn't +# boot or connect to network successfully. +BOOT_TIMEOUT=20 + +# FIXME: building should automatically use the version of Morph from the system +# branch, really ... but for now, if the installed Morph can't build +# baserock:baserock/morphs 'master' branch, you can change this! +MORPH_COMMAND = ['/src/morph/morph', '--no-git-update'] +#MORPH_COMMAND = '/src/morph/morph' +#MORPH_COMMAND = 'morph' + +BUILD_TEMPDIR = '/src/tmp' + +#BRANCH = 'master' +BRANCH = 'baserock/sam/trove-upgrades' + +# For debugging. FIXME: would be better if cliapp's logging mechanism supported +# logging to stdout, but .... :( +VERBOSE = True + + +def remote_runcmd(url, command, **kwargs): + ''' + Execute a command on machine 'url'. + + Command must be a list of arguments, not a single string. + + FIXME: perhaps this functionality should be merged into cliapp.ssh_runcmd() + so that we can use that instead. + ''' + if VERBOSE: + print "%s: %s" % (url, ' '.join(command)) + url = urlparse.urlsplit(url) + if url[0] in ['ssh', 'kvm+ssh']: + ssh_host = url[1] + + ssh_cmd = ['ssh'] + + # The identity of the newly-created test machine will never be in + # '~/.ssh/known_hosts'; this switch avoids seeing the 'do you want to + # connect' prompt that SSH would normally present in this situation. + ssh_cmd.extend(['-o', 'StrictHostKeyChecking=no']) + + return cliapp.runcmd(ssh_cmd + [ssh_host, ' '.join(command)], **kwargs) + else: + raise NotImplementedError("Remote machine must be an ssh:// URL") + + +def run_morph(args, **kwargs): + ''' + Run Morph on the current machine. + ''' + morph_command = MORPH_COMMAND + if isinstance(morph_command, str): + morph_command = morph_command.split(' ') + cmd = morph_command + args + if VERBOSE: + print ' '.join(cmd) + if 'stdout' not in kwargs: + kwargs['stdout'] = sys.stdout + if 'stderr' not in kwargs: + kwargs['stderr'] = sys.stdout + return cliapp.runcmd(cmd, **kwargs) + + +def run_git(args, **kwargs): + return cliapp.runcmd(['git'] + args, **kwargs) + + +def read_file(file_path): + with open(file_path, 'r') as f: + return f.read() + + +def write_file(file_path, text): + with open(file_path, 'w') as f: + f.write(text) + + +class Deployment(object): + ''' + Base class for context of an initial deployment or upgrade. + + Creating config, building the system and doing the deployment are + deliberately separated because building and deploying are currently + slow operations even when repeating an identical build or deployment, + so it's often necessary during development to comment out these steps + so that the edit-test cycle is not impossibly long. + ''' + def __init__(self, system_morph_name, deploy_morph_name, systembranch, + vm_name): + self.branch = systembranch + self.vm_name = vm_name + self.system_morph_name = system_morph_name + self.deploy_morph_name = deploy_morph_name + + def create_config(self): + pass + + def run_build(self): + run_morph( + ['build', self.system_morph_name], cwd=self.branch.branch_dir) + + def run_deploy(self): + run_morph( + ['deploy', self.deploy_morph_name, + '%s.AUTOSTART=yes' % self.vm_name], + cwd=self.branch.branch_dir) + + def create_ssh_key(self, key_name): + file_path = os.path.join(self.branch.morphs_dir, '%s.key' % key_name) + # The '-N ""' is very important here: otherwise ssh-keygen will block + # waiting for input. If you try to pass this command to cliapp.runcmd() + # as a list it will mangle the quotes. + comment = "Generated by Baserock automated tests for '%s'" % key_name + keygen_cmd = 'ssh-keygen -N "" -t rsa -b 2048 -f %s -C "%s"' % \ + (file_path, comment) + cliapp.runcmd(['sh', '-c', keygen_cmd]) + + def ensure_configure_extension_enabled(self, extension_name): + system_morph = os.path.join( + self.branch.morphs_dir, '%s.morph' % self.system_morph_name) + morph = yaml.load(read_file(system_morph)) + if extension_name not in morph['configuration-extensions']: + morph['configuration-extensions'].append(extension_name) + write_file(system_morph, yaml.dump(morph)) + + def add_root_ssh_authorized_key(self, public_key_text): + self.ensure_configure_extension_enabled('install-files') + + deploy_files_dir = os.path.join( + self.branch.morphs_dir, '%s-files' % self.deploy_morph_name) + + root_ssh_dir = os.path.join(deploy_files_dir, 'root', '.ssh') + os.makedirs(root_ssh_dir) + with open(os.path.join(root_ssh_dir, 'authorized_keys'), 'w') as f: + f.write("# Added by Baserock automated test runner\n") + f.write(public_key_text) + + with open(os.path.join(deploy_files_dir, 'manifest'), 'wa') as f: + f.write('0040700 0 0 /root/.ssh/\n') + f.write('0100644 0 0 /root/.ssh/authorized_keys\n') + + +class TroveInitialDeployment(Deployment): + ''' + Wraps creating necessary files for a Trove deployment. + + Attribute 'deploy_morph_name' contains a value to passed to 'morph deploy'. + ''' + def __init__(self, systembranch, vm_name): + super(TroveInitialDeployment, self).__init__( + 'trove-system-x86_64', 'trove-test-deploy', systembranch, + vm_name) + + def create_config(self, initial_deploy_type='kvm'): + self.create_ssh_key('lorry') + self.create_ssh_key('mason') + self.create_ssh_key('worker') + self.create_ssh_key('testuser') + + if initial_deploy_type == 'kvm': + deploy_location = ''.join( + [DEPLOY_URL, self.vm_name, DEPLOY_PATH, '%s.img' % + self.vm_name]) + else: + raise NotImplementedError() + + self.create_trove_deployment_morph( + deploy_type='kvm', location=deploy_location) + + testuser_public_key = read_file( + os.path.join(self.branch.morphs_dir, 'testuser.key.pub')) + self.add_root_ssh_authorized_key(testuser_public_key) + + self.admin_id = os.path.join(self.branch.morphs_dir, 'testuser.key') + + def create_trove_deployment_morph(self, deploy_type=None, location=None): + trove_config = dict( + type=deploy_type, + location=location, + DISK_SIZE='3G', + VERSION_LABEL='trove-old', + INSTALL_FILES='%s-files/manifest' % self.deploy_morph_name, + TROVE_ID=self.vm_name, + TROVE_COMPANY='Codethink', + #UPSTREAM_TROVE='git.baserock.org' + #UPSTREAM_TROVE_USER='nobody' + #UPSTREAM_TROVE_EMAIL='nobody@example.com' + TROVE_ADMIN_USER='testuser', + TROVE_ADMIN_EMAIL='test@example.com', + TROVE_ADMIN_NAME='Automated Test Gitano Admin User', + TROVE_ADMIN_SSH_PUBKEY='testuser.key.pub', + LORRY_SSH_KEY='lorry.key', + WORKER_SSH_PUBKEY='worker.key.pub', + MASON_SSH_PUBKEY='mason.key.pub', + ) + + cluster_morph = dict( + name=self.deploy_morph_name, + kind='cluster', + description='Generated by Baserock automated tests', + systems=[ + dict( + morph=self.system_morph_name, + deploy={ + self.vm_name: trove_config + } + ) + ] + ) + + text = yaml.dump(cluster_morph) + file_path = os.path.join( + self.branch.morphs_dir, '%s.morph' % self.deploy_morph_name) + with open(file_path, 'w') as f: + f.write(text) + + +class TroveUpgrade(Deployment): + def __init__(self, systembranch, vm_name): + super(TroveUpgrade, self).__init__( + 'trove-system-x86_64', 'trove-test-upgrade', systembranch, + vm_name) + + def create_config(self, initial_deployment, upgrade_method='ssh-rsync'): + self.ensure_configure_extension_enabled('install-files') + + if upgrade_method == 'ssh-rsync': + location = 'root@%s' % self.vm_name + else: + raise NotImplementedError() + + self.create_trove_upgrade_morph( + initial_deployment, + upgrade_method=upgrade_method, location=location) + + def create_trove_upgrade_morph( + self, initial_deployment, upgrade_method=None, location=None): + ''' + FIXME: this is totally wrong! + + Instead of having to provide exactly the config that the initial + deployment used, we should avoid configuration extensions for upgrades + entirely and propagate the deploy-time configuration using + baserock-system-config-sync. + ''' + def copy_file_from_initial_deployment(filename, dest_filename=None): + src = os.path.join(initial_deployment.branch.morphs_dir, filename) + dest = os.path.join(self.branch.morphs_dir, dest_filename or filename) + shutil.copyfile(src, dest) + + def copy_dir_from_initial_deployment(dirname): + src = os.path.join(initial_deployment.branch.morphs_dir, dirname) + dest = os.path.join(self.branch.morphs_dir, dirname) + shutil.copytree(src, dest) + + for key in ['testuser', 'lorry', 'worker', 'mason']: + copy_file_from_initial_deployment('%s.key' % key) + copy_file_from_initial_deployment('%s.key.pub' % key) + copy_file_from_initial_deployment( + '%s.morph' % initial_deployment.deploy_morph_name, + '%s.morph' % self.deploy_morph_name) + + copy_dir_from_initial_deployment( + '%s-files' % initial_deployment.deploy_morph_name) + + deploy_morph_file = os.path.join( + self.branch.morphs_dir, '%s.morph' % self.deploy_morph_name) + deploy_morph = yaml.load(read_file(deploy_morph_file)) + + deploy_morph['name'] = self.deploy_morph_name + + system_config = deploy_morph['systems'][0]['deploy'][self.vm_name] + system_config['type'] = upgrade_method + system_config['location'] = location + system_config['VERSION_LABEL'] = 'trove-current' + + write_file(deploy_morph_file, yaml.dump(deploy_morph)) + + +class SystemTestBranch(object): + def __init__(self, workspace_dir, name): + self.workspace_dir = workspace_dir + self.branch_dir = os.path.join(workspace_dir, name) + self.morphs_dir = os.path.join( + self.branch_dir, 'baserock:baserock', 'morphs') + + +class TestInitialDeployment(object): + ''' + FIXME: this is out of date! Make it use the 'BaseTestSuite' class instead! + ''' + def initial_deploy(self, branch, vm_name, **deploy_kwargs): + ''' + Initial deployment of trove-system-x86_64 to a newly-created VM. + + Returns a context with the following things tied to it: + - the VM itself (FIXME: isn't actually deleted when the context + exists) + - an SSH identity added to the machine's SSH agent that provides + root access to the deployed VM + ''' + if self.settings['reuse-workspace'] is not None: + # Hack to reuse an existing workspace and running VM because `morph + # deploy` currently takes several minutes. + + class ReuseTroveDeploy(TroveDeployment): + def __init__(self, systembranch): + self.branch = systembranch + self.set_conveniences() + + trove_deploy = ReuseTroveDeploy(branch) + else: + trove_deploy = TroveDeployment(branch, vm_name, **deploy_kwargs) + + def test_rawdisk_upgrade(self, workspace_dir): + branch = self.create_system_branch(workspace_dir, 'testbranch') + + # Script should: + # - deploy trove + # - apply patch in system branch + # - deploy trove as an upgrade + # FIXME: doesn't perform an upgrade, yet. Should we keep the rawdisk + # upgrade path? + run_morph( + ['build', 'trove-system-x86_64'], cwd=branch.branch_dir) + image_path = os.path.join( + branch.workspace_dir, 'deployed-system.img') + trove_deploy = TroveDeployment( + branch, deploy_type='rawdisk', location=image_path) + + run_morph(['deploy', trove_deploy.deploy_morph_name], cwd=branch.branch_dir) + + +class TimeoutError(Exception): + pass + + +class BaseTestSuite(object): + def wait_for_hostname_to_appear(self, hostname, timeout=10): + ''' + Block until given hostname resolves successfully. + + Raises TimeoutError if the hostname has not appeared in 'timeout' seconds. + ''' + start_time = time.time() + while True: + try: + socket.gethostbyname(hostname) + return time.time() - start_time + except socket.gaierror as e: + pass + if time.time() > start_time + timeout: + raise TimeoutError( + "Host %s did not appear after %i seconds" % + (hostname, timeout)) + time.sleep(0.5) + + def wait_for_ssh(self, host_url, timeout=BOOT_TIMEOUT, **kwargs): + print "Waiting for machine to respond over SSH ..." + start_time = time.time() + while True: + try: + print remote_runcmd(host_url, ['whoami'], **kwargs) + break + except cliapp.AppException as e: + if time.time() < start_time + timeout: + # Assume that this is because sshd hasn't started yet. + pass + else: + print("Waited > %s seconds for host %s to respond over " + "SSH" % (timeout, host_url)) + raise + time.sleep(0.5) + + def wait_for_machine_to_boot(self, instance): + wait_time = self.wait_for_hostname_to_appear( + instance.vm_name, timeout=BOOT_TIMEOUT) + if VERBOSE: + print "Host %s appeared after %0.1f seconds" % \ + (instance.vm_name, wait_time) + + # Remove machine from 'known_hosts', as its identity has probably + # changed. + cliapp.runcmd(['ssh-keygen', '-R', instance.vm_name]) + + test_url = 'ssh://root@%s/' % instance.vm_name + self.wait_for_ssh( + test_url, timeout=BOOT_TIMEOUT-wait_time) + + def create_system_branch(self, workspace_dir, name, parent=BRANCH): + run_morph( + ['branch', 'baserock:baserock/morphs', name, parent], + cwd=workspace_dir) + return SystemTestBranch(workspace_dir, name) + + +class TestUpgrades(BaseTestSuite): + def set_lighttpd_version(self, branch, tag='baserock/morph'): + ''' + Use 'morph edit' and 'git reset' to force a specific lighttpd version. + ''' + run_morph(['edit', 'trove-system-x86_64', 'trove', 'lighttpd'], + cwd=branch.morphs_dir) + run_git(['add', 'trove.morph'], cwd=branch.morphs_dir) + run_git(['commit', '-m', 'Edit lighttpd chunk'], cwd=branch.morphs_dir) + + chunk_dir = os.path.join(branch.branch_dir, 'upstream:lighttpd') + # Set the system branch's corresponding Git branch in the chunk repo + # to the specific ref. This chunk doesn't have a chunk morphology so + # there's no further work to do! + run_git(['reset', '--hard', tag], cwd=chunk_dir) + + @contextlib.contextmanager + def given_out_of_date_trove_instance(self, vm_name, fixture_dir, reuse=False): + ''' + GIVEN a running current Trove system but with lighttpd version 1.3.14 + ''' + if reuse: + branch = SystemTestBranch(fixture_dir, 'old') + instance = TroveInitialDeployment(branch, vm_name) + instance.admin_id = os.path.join(branch.morphs_dir, 'testuser.key') + else: + branch = self.create_system_branch(fixture_dir, 'old') + self.set_lighttpd_version(branch, tag='lighttpd-1.3.14') + + instance = TroveInitialDeployment(branch, vm_name) + instance.create_config() + instance.run_build() + instance.run_deploy() + + cliapp.runcmd(['ssh-add', instance.admin_id]) + + try: + self.wait_for_machine_to_boot(instance) + + yield instance + finally: + # Should pass the .pub file really ... + cliapp.runcmd(['ssh-add', '-d', instance.admin_id]) + + def test_scenario_trove_upgrade(self, + vm_name, fixture_dir, workspace_dir, reuse_fixture=False): + ''' + We want to be able to upgrade an old Trove system to the latest + Trove system. While in the real world the user would only want to + deploy a released Trove system, that would preclude using this test in + continuous integration as we would only notice breakages *after* we + had made a release, and the tests would need to be updated for every + release. Better to test that deploying 'master' of Trove still works. + We artificially create an 'out of date' Trove system because we need + to know what to test for (and there is only one public release of + Trove at the time of writing). This is more useful than just deploying + and upgrade and assuming that if there were no errors from the + Baserock deployment tool then it must have been successful. + + Lighttpd is used in the test because it triggers very few rebuilds. + + SCENARIO Bob upgrades his Trove (vague version) + GIVEN a running an out-of-date Trove system + WHEN Bob builds and upgrades to the current version of Trove with + AUTOSTART=1 + THEN the Trove is at the new version + + SCENARIO Bob upgrades his Trove (specific version) + GIVEN a running current Trove system but with lighttpd version 1.3.14 + WHEN Bob upgrades to the current version of Trove and sets it to + autostart immediately + THEN the Trove uses a newer version of lightttpd than 1.3.14 + ''' + + with self.given_out_of_date_trove_instance( + vm_name, fixture_dir, reuse=reuse_fixture) as instance: + branch = self.create_system_branch(workspace_dir, 'current') + + test_url = 'ssh://root@%s/' % vm_name + + old_lighttpd_output = remote_runcmd(test_url, ['lighttpd', '-v']) + print "Lighttpd outout: %s" % old_lighttpd_output + + upgrade = TroveUpgrade( + branch, vm_name) + upgrade.create_config(instance, upgrade_method='ssh-rsync') + upgrade.run_build() + upgrade.run_deploy() + + # FIXME: AUTOSTART=yes should do this + try: + remote_runcmd(test_url, ['reboot']) + except cliapp.AppException: + # Bit of a hack because we get disconnect before the command + # exits so SSH returns failure. + pass + + self.wait_for_machine_to_boot(instance) + + new_lighttpd_output = remote_runcmd(test_url, ['lighttpd', '-v']) + print "OLD Lighttpd outout: %s" % old_lighttpd_output + print "NEW Lighttpd outout: %s" % new_lighttpd_output + + # We have a machine! + # Initial tests to run: + # check system metadata against what should have been built + # Trove tests: + # check you can lorry something + # check you can 'git pull' something + # check you can issue Gitano commands ... + # perhaps crib from Gitano test suite + + +class SimpleTestRunner(cliapp.Application): + ''' + Run a Baserock system test suite. + + There is a test suite-wide Morph workspace provided, which should be shared + by all prerequisites ('GIVEN') implementations. This is called the + 'fixture_dir.' Multiple GIVEN implementations in a single test suite should + use differently-named system branches to avoid conflicting with each other. + It is up to the test suite's GIVEN implementations to deal with a + directory that already contains their data without failing. + + Each test gets another, 'workspace_dir'. This is per-test and should be used + for the 'WHEN' implementations. + + Since initial deployments currently take several minutes it is very useful + to reuse 'GIVEN' state instead of recreating it each time when working on a + specific test suite. + ''' + + def check_access_to_deploy_host(self): + # From: https://stackoverflow.com/questions/3830508/check-if-passwordless-access-has-been-setup + deploy_url = urlparse.urlsplit(DEPLOY_URL) + assert deploy_url[0] == 'kvm+ssh' + try: + cliapp.runcmd( + ['ssh', '-o', 'NumberOfPasswordPrompts=0', deploy_url[1], + 'whoami']) + except cliapp.AppException: + raise cliapp.AppException( + "No passwordless access to deploy host '%s'. Check the SSH " + "authorized keys for the remote account." % deploy_url[1]) + + def maybe_delete_vm(self, vm_name): + # FIXME: Would be better if this would check if the machine was running + # before destroying it, and checked if it existed before undefining + # it, rather than just ignoring exceptions. + def run_virsh(args): + try: + remote_runcmd(DEPLOY_URL, ['virsh', '-c', 'qemu:///system'] + args) + except cliapp.AppException as e: + pass + run_virsh(['destroy', self.vm_name]) + run_virsh(['undefine', self.vm_name]) + + def add_settings(self): + self.settings.string( + ['reuse-fixture', 'r'], + 'use an existing deployment from a test fixture instead of ' + 'building a clean one, to avoid repeating a slow morph deploy', + metavar='DIR', + default=None) + + def process_args(self, args): + self.check_access_to_deploy_host() + + self.vm_name = 'brtests-%s' % (socket.gethostname()) + + if self.settings['reuse-fixture'] is None: + self.maybe_delete_vm(self.vm_name) + + self.run_test() + + def run_test(self): + test = TestUpgrades().test_scenario_trove_upgrade + + if self.settings['reuse-fixture'] is not None: + fixture_dir = self.settings['reuse-fixture'] + else: + fixture_dir = cliapp.runcmd(['mktemp', '-d', '-p', BUILD_TEMPDIR]).strip() + run_morph(['init', fixture_dir]) + + try: + print "Running %s" % test + workspace_dir = cliapp.runcmd(['mktemp', '-d', '-p', BUILD_TEMPDIR]).strip() + + try: + run_morph(['init', workspace_dir]) + reuse_fixture = self.settings['reuse-fixture'] is not None + test(self.vm_name, fixture_dir, workspace_dir, + reuse_fixture=reuse_fixture) + finally: + print "Workspace kept in %s" % workspace_dir + #cliapp.runcmd(['rm', '-r', workspace_dir]) + except Exception as e: + import pdb + print 'Exception: ', e + pdb.post_mortem(sys.exc_traceback) + finally: + # Careful now! + print "Fixture kept in %s" % fixture_dir + #cliapp.runcmd(['rm', '-r', fixture_dir]) + + +if __name__ == '__main__': + SimpleTestRunner().run() -- cgit v1.2.1