diff options
author | Lars Wirzenius <lars.wirzenius@codethink.co.uk> | 2014-09-08 15:50:52 +0000 |
---|---|---|
committer | Lars Wirzenius <lars.wirzenius@codethink.co.uk> | 2014-09-08 15:50:52 +0000 |
commit | 4a6d0ef584a3d87c1d6ad237336660aacd161650 (patch) | |
tree | 71ac6cc3557d6cf340f1dbeec32ce429794ec096 | |
parent | 0ef176c196db439e05026705450f691e678cdccd (diff) | |
parent | 2ce8d016add6dd279c3903fea26645a7499ec50a (diff) | |
download | lorry-controller-4a6d0ef584a3d87c1d6ad237336660aacd161650.tar.gz |
Merge branch 'baserock/liw/de-ghost'
-rw-r--r-- | ARCH | 5 | ||||
-rwxr-xr-x | lorry-controller-webapp | 19 | ||||
-rw-r--r-- | lorrycontroller/__init__.py | 1 | ||||
-rw-r--r-- | lorrycontroller/jobupdate.py | 4 | ||||
-rw-r--r-- | lorrycontroller/removeghostjobs.py | 65 | ||||
-rw-r--r-- | lorrycontroller/statedb.py | 41 | ||||
-rw-r--r-- | units/lorry-controller-remove-ghost-jobs.service | 12 | ||||
-rw-r--r-- | units/lorry-controller-remove-ghost-jobs.timer | 9 | ||||
-rw-r--r-- | yarns.webapp/040-running-jobs.yarn | 79 |
9 files changed, 224 insertions, 11 deletions
@@ -275,6 +275,11 @@ Requests for admins: of all jobs, running or finished, that it knows about. (RQ/ALLJOBS) * `POST /1.0/remove-job` with `job_id=jobid` in the body, removes a stopped job from the state database. +* `POST /1.0/remove-ghost-jobs` looks for any running jobs in STATEDB + that haven't been updated (with `job-update`, see below) in a long + time (see `--ghost-timeout`), and marks them as terminated. This is + used to catch situations when a MINION fails to tell the WEBAPP that + a job has terminated. Requests for MINION: diff --git a/lorry-controller-webapp b/lorry-controller-webapp index 9234498..faabb2d 100755 --- a/lorry-controller-webapp +++ b/lorry-controller-webapp @@ -28,6 +28,9 @@ from flup.server.fcgi import WSGIServer import lorrycontroller +ONE_MINUTE = 60 + + class WEBAPP(cliapp.Application): def add_settings(self): @@ -110,6 +113,22 @@ class WEBAPP(cliapp.Application): metavar='DIR', default='/usr/share/lorry-controller/static') + # The default value of ten minutes for the ghost-timeout + # setting was chosen arbitrarily, by Lars Wirzenius. The value + # needs to be long enough that there's no realistic danger of + # hitting it just because a host is a bit overloaded, but + # still short enough that ghost jobs do get removed often + # enough, especially right after boot, when all jobs are + # ghosts. Experience may show that a different value would + # actually be better, and if so, the code and this comment + # should be changed accordingly. + self.settings.integer( + ['ghost-timeout'], + 'running jobs should get an update from their ' + 'MINION within this time or they will be considered ' + 'ghosts and be removed from STATEDB (in seconds)', + default=10*ONE_MINUTE) + def find_routes(self): '''Return all classes that are API routes. 
diff --git a/lorrycontroller/__init__.py b/lorrycontroller/__init__.py index bc51b88..a65ff02 100644 --- a/lorrycontroller/__init__.py +++ b/lorrycontroller/__init__.py @@ -32,6 +32,7 @@ from movetopbottom import MoveToTop, MoveToBottom from stopjob import StopJob from listjobs import ListAllJobs, ListAllJobsHTML from showjob import ShowJob, ShowJobHTML, JobShower +from removeghostjobs import RemoveGhostJobs from removejob import RemoveJob from lstroves import LsTroves, ForceLsTrove from pretendtime import PretendTime diff --git a/lorrycontroller/jobupdate.py b/lorrycontroller/jobupdate.py index 3bd0e81..efc9ce1 100644 --- a/lorrycontroller/jobupdate.py +++ b/lorrycontroller/jobupdate.py @@ -44,11 +44,13 @@ class JobUpdate(lorrycontroller.LorryControllerRoute): if stderr: statedb.append_to_job_output(job_id, stderr) + now = statedb.get_current_time() + statedb.set_job_updated(job_id, now) + path = statedb.find_lorry_running_job(job_id) lorry_info = statedb.get_lorry_info(path) if exit is not None and exit != 'no': - now = statedb.get_current_time() statedb.set_lorry_last_run(path, int(now)) statedb.set_running_job(path, None) statedb.set_job_exit(job_id, exit, int(now), disk_usage) diff --git a/lorrycontroller/removeghostjobs.py b/lorrycontroller/removeghostjobs.py new file mode 100644 index 0000000..2b2760c --- /dev/null +++ b/lorrycontroller/removeghostjobs.py @@ -0,0 +1,65 @@ +# Copyright (C) 2014 Codethink Limited +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + +import logging +import time + +import bottle + +import lorrycontroller + + +class RemoveGhostJobs(lorrycontroller.LorryControllerRoute): + + http_method = 'POST' + path = '/1.0/remove-ghost-jobs' + + def run(self, **kwargs): + logging.info('%s %s called', self.http_method, self.path) + + ghost_timeout = self.app_settings['ghost-timeout'] + ghosts = [] + with self.open_statedb() as statedb: + for job_id in statedb.get_running_jobs(): + if self.is_ghost_job(statedb, job_id, ghost_timeout): + self.exorcise_ghost_job(statedb, job_id) + ghosts.append(statedb.get_job_info(job_id)) + return { + 'killed-ghost-jobs': ghosts, + } + + def is_ghost_job(self, statedb, job_id, ghost_timeout): + updated = statedb.get_job_updated(job_id) + return self.now(statedb) - updated >= ghost_timeout + + def now(self, statedb): + return statedb.get_current_time() + + def exorcise_ghost_job(self, statedb, job_id): + logging.info('Job %s is a ghost job', job_id) + self.mark_job_to_be_killed_in_case_minion_appears(statedb, job_id) + self.mark_job_as_terminated(statedb, job_id) + + def mark_job_to_be_killed_in_case_minion_appears(self, statedb, job_id): + statedb.set_kill_job(job_id, True) + + def mark_job_as_terminated(self, statedb, job_id): + statedb.append_to_job_output( + job_id, '\nTERMINATED DUE TO GHOST TIMEOUT\n') + statedb.set_job_exit(job_id, 127, self.now(statedb), -1) + + job_info = statedb.get_job_info(job_id) + statedb.set_running_job(job_info['path'], None) diff --git a/lorrycontroller/statedb.py b/lorrycontroller/statedb.py index 2d223e0..fd7857d 100644 --- a/lorrycontroller/statedb.py +++ b/lorrycontroller/statedb.py @@ -129,6 +129,7 @@ class StateDB(object): 'pid INT, ' 'started INT, ' 'ended INT, ' + 'updated INT, ' 'kill INT, ' 'path TEXT, ' 'exit TEXT, ' 
@@ -454,8 +455,8 @@ class StateDB(object): def get_job_info(self, job_id): c = self.get_cursor() c.execute( - 'SELECT job_id, host, pid, started, ended, kill, path, exit, ' - 'disk_usage, output FROM jobs WHERE job_id=?', + 'SELECT job_id, host, pid, started, ended, updated, kill, ' + 'path, exit, disk_usage, output FROM jobs WHERE job_id=?', (job_id,)) row = c.fetchone() return { @@ -464,11 +465,12 @@ class StateDB(object): 'pid': row[2], 'started': row[3], 'ended': row[4], - 'kill': row[5], - 'path': row[6], - 'exit': row[7], - 'disk_usage': row[8], - 'output': row[9], + 'updated': row[5], + 'kill': row[6], + 'path': row[7], + 'exit': row[8], + 'disk_usage': row[9], + 'output': row[10], } def add_new_job(self, job_id, host, pid, path, started): @@ -478,9 +480,10 @@ class StateDB(object): assert self.in_transaction c = self.get_cursor() c.execute( - 'INSERT INTO jobs (job_id, host, pid, path, started, kill) ' - 'VALUES (?, ?, ?, ?, ?, ?)', - (job_id, host, pid, path, started, 0)) + 'INSERT INTO jobs (job_id, host, pid, path, started, ' + 'updated, kill) ' + 'VALUES (?, ?, ?, ?, ?, ?, ?)', + (job_id, host, pid, path, started, started, 0)) def get_job_minion_host(self, job_id): c = self.get_cursor() @@ -514,6 +517,24 @@ class StateDB(object): row = c.fetchone() return row[0], row[1] + def get_job_updated(self, job_id): + c = self.get_cursor() + c.execute( + 'SELECT updated FROM jobs WHERE job_id IS ?', + (job_id,)) + row = c.fetchone() + return row[0] + + def set_job_updated(self, job_id, updated): + logging.debug( + 'StateDB.set_job_updated(%r, %r) called', + job_id, updated) + assert self.in_transaction + c = self.get_cursor() + c.execute( + 'UPDATE jobs SET updated=? 
WHERE job_id IS ?', + (updated, job_id)) + def get_job_exit(self, job_id): c = self.get_cursor() c.execute( diff --git a/units/lorry-controller-remove-ghost-jobs.service b/units/lorry-controller-remove-ghost-jobs.service new file mode 100644 index 0000000..e28a494 --- /dev/null +++ b/units/lorry-controller-remove-ghost-jobs.service @@ -0,0 +1,12 @@ +[Unit] +Description=Lorry Controller remove ghost jobs +After=lighttpd-lorry-controller-webapp.service + +[Install] +WantedBy=multi-user.target + +[Service] +ExecStart=/usr/bin/curl -o /dev/null -X POST -d '' http://localhost:12765/1.0/remove-ghost-jobs +Restart=no +User=lorry +Group=lorry diff --git a/units/lorry-controller-remove-ghost-jobs.timer b/units/lorry-controller-remove-ghost-jobs.timer new file mode 100644 index 0000000..61ebaba --- /dev/null +++ b/units/lorry-controller-remove-ghost-jobs.timer @@ -0,0 +1,9 @@ +[Unit] +Description=Lorry Controller remove ghost jobs +After=lighttpd-lorry-controller-webapp.service + +[Install] +WantedBy=multi-user.target + +[Timer] +OnUnitInactiveSec=60 diff --git a/yarns.webapp/040-running-jobs.yarn b/yarns.webapp/040-running-jobs.yarn index 879d9fa..571afd6 100644 --- a/yarns.webapp/040-running-jobs.yarn +++ b/yarns.webapp/040-running-jobs.yarn @@ -237,6 +237,85 @@ Cleanup. FINALLY WEBAPP terminates + +Forget jobs whose MINION is gone +-------------------------------- + +A job's status is updated when a MINION uses the `/1.0/job-update` +call, and when the MINION uses that to report that the job has +finished, the STATEDB is updated accordingly. However, sometimes the +MINION never tells WEBAPP that the job is finished. This can happen +for a variety of reasons, such as (not limited to these): + +* MINION crashes. +* WEBAPP is unavailable. +* The host reboots, killing MINION and WEBAPP both. + +If this happens, STATEDB still marks the job as running, and WEBAPP +won't start a new job for that lorry specification. 
+ +To deal with these, we need to have a way to clean up "ghost jobs" +like these. We do this with the `/1.0/remove-ghost-jobs` API call, +which marks all jobs finished that haven't had a `job-update` called +on them for a long time. + + SCENARIO forget jobs without MINION updates in a long time + +Set up a WEBAPP that uses a CONFGIT with a Lorry file, so we can start +a job. + + GIVEN a new git repository in CONFGIT + AND an empty lorry-controller.conf in CONFGIT + AND lorry-controller.conf in CONFGIT adds lorries *.lorry using prefix upstream + AND Lorry file CONFGIT/foo.lorry with {"foo":{"type":"git","url":"git://foo"}} + AND WEBAPP uses CONFGIT as its configuration directory + AND a running WEBAPP + +Pretend it is a known time (specifically, the beginning of the epoch). +This is needed so we can trigger the ghost job timeout later. + + WHEN admin makes request POST /1.0/pretend-time with now=0 + +Tell WEBAPP to read the configuration. + + WHEN admin makes request POST /1.0/read-configuration + +Start a new job. + + WHEN admin makes request POST /1.0/give-me-job with host=testhost&pid=123 + THEN response has job_id set to 1 + +Verify that the job is in the list of running jobs. + + WHEN admin makes request GET /1.0/list-running-jobs + THEN response has running_jobs set to [1] + +Remove any ghosts. There aren't any yet, so nothing should be removed. + + WHEN admin makes request POST /1.0/remove-ghost-jobs + AND admin makes request GET /1.0/list-running-jobs + THEN response has running_jobs set to [1] + +Now, pretend a long time has passed, and clean up the ghost job. The +default value for the ghost timeout is reasonably short (less than a +day), so we pretend it is about 10 days later (one million seconds). 
+ + WHEN admin makes request POST /1.0/pretend-time with now=1000000 + AND admin makes request POST /1.0/remove-ghost-jobs + AND admin makes request GET /1.0/list-running-jobs + THEN response has running_jobs set to [] + +Further, if we request for a new job now, we'll get one for the same +lorry specification. + + WHEN admin makes request POST /1.0/give-me-job with host=testhost&pid=123 + THEN response has job_id set to 2 + AND response has path set to "upstream/foo" + +Finally, clean up. + + FINALLY WEBAPP terminates + Remove a terminated job ----------------------- |