summary | refs | log | tree | commit | diff
path: root/server-tools/instance-manager/guardian.cc
diff options
context:
space:
mode:
Diffstat (limited to 'server-tools/instance-manager/guardian.cc')
-rw-r--r-- server-tools/instance-manager/guardian.cc 544
1 files changed, 259 insertions, 285 deletions
diff --git a/server-tools/instance-manager/guardian.cc b/server-tools/instance-manager/guardian.cc
index 289e80d8b74..bad43b1f92e 100644
--- a/server-tools/instance-manager/guardian.cc
+++ b/server-tools/instance-manager/guardian.cc
@@ -27,102 +27,126 @@
#include "instance_map.h"
#include "log.h"
#include "mysql_manager_error.h"
+#include "options.h"
-const char *
-Guardian::get_instance_state_name(enum_instance_state state)
-{
- switch (state) {
- case NOT_STARTED:
- return "offline";
-
- case STARTING:
- return "starting";
-
- case STARTED:
- return "online";
- case JUST_CRASHED:
- return "failed";
+/*************************************************************************
+ {{{ Constructor & destructor.
+*************************************************************************/
- case CRASHED:
- return "crashed";
-
- case CRASHED_AND_ABANDONED:
- return "abandoned";
-
- case STOPPING:
- return "stopping";
- }
+/**
+ Guardian constructor.
- return NULL; /* just to ignore compiler warning. */
-}
+ SYNOPSIS
+ Guardian()
+ thread_registry_arg
+ instance_map_arg
-/* {{{ Constructor & destructor. */
+ DESCRIPTION
+ Nominal constructor intended for assigning references and initializing
+ trivial objects. Real initialization is made by init() method.
+*/
Guardian::Guardian(Thread_registry *thread_registry_arg,
- Instance_map *instance_map_arg,
- uint monitoring_interval_arg)
- :stopped(FALSE),
- monitoring_interval(monitoring_interval_arg),
+ Instance_map *instance_map_arg)
+ :shutdown_requested(FALSE),
+ stopped(FALSE),
thread_registry(thread_registry_arg),
- instance_map(instance_map_arg),
- guarded_instances(0),
- shutdown_requested(FALSE)
+ instance_map(instance_map_arg)
{
pthread_mutex_init(&LOCK_guardian, 0);
pthread_cond_init(&COND_guardian, 0);
- init_alloc_root(&alloc, MEM_ROOT_BLOCK_SIZE, 0);
}
Guardian::~Guardian()
{
- /* delay guardian destruction to the moment when no one needs it */
- pthread_mutex_lock(&LOCK_guardian);
- free_root(&alloc, MYF(0));
- pthread_mutex_unlock(&LOCK_guardian);
+ /*
+ NOTE: it's necessary to synchronize here, because the Guardian thread can
+ still be alive and hold the mutex (because it is detached and we have no
+ control over it).
+ */
+
+ lock();
+ unlock();
+
pthread_mutex_destroy(&LOCK_guardian);
pthread_cond_destroy(&COND_guardian);
}
-/* }}} */
+/*************************************************************************
+ }}}
+*************************************************************************/
+/**
+ Send request to stop Guardian.
+
+ SYNOPSIS
+ request_shutdown()
+*/
+
void Guardian::request_shutdown()
{
- pthread_mutex_lock(&LOCK_guardian);
- /* STOP Instances or just clean up Guardian repository */
stop_instances();
+
+ lock();
shutdown_requested= TRUE;
- pthread_mutex_unlock(&LOCK_guardian);
+ unlock();
+
+ ping();
}
-void Guardian::process_instance(Instance *instance,
- GUARD_NODE *current_node,
- LIST **guarded_instances,
- LIST *node)
+/**
+ Process an instance.
+
+ SYNOPSIS
+ process_instance()
+ instance a pointer to the instance for processing
+
+ MT-NOTE:
+ - the given instance must be locked before calling this operation;
+ - Guardian must be locked before calling this operation.
+*/
+
+void Guardian::process_instance(Instance *instance)
{
- uint waitchild= (uint) Instance::DEFAULT_SHUTDOWN_DELAY;
- /* The amount of times, Guardian attempts to restart an instance */
int restart_retry= 100;
time_t current_time= time(NULL);
- if (current_node->state == STOPPING)
+ if (instance->get_state() == Instance::STOPPING)
{
- waitchild= instance->options.get_shutdown_delay();
+ /* This branch is executed during shutdown. */
- /* this returns TRUE if and only if an instance was stopped for sure */
+ /* This returns TRUE if and only if an instance was stopped for sure. */
if (instance->is_crashed())
- *guarded_instances= list_delete(*guarded_instances, node);
- else if ( (uint) (current_time - current_node->last_checked) > waitchild)
{
+ log_info("Guardian: '%s' stopped.",
+ (const char *) instance->get_name()->str);
+
+ instance->set_state(Instance::STOPPED);
+ }
+ else if ((uint) (current_time - instance->last_checked) >=
+ instance->options.get_shutdown_delay())
+ {
+ log_info("Guardian: '%s' hasn't stopped within %d secs.",
+ (const char *) instance->get_name()->str,
+ (int) instance->options.get_shutdown_delay());
+
instance->kill_mysqld(SIGKILL);
- /*
- Later we do node= node->next. This is ok, as we are only removing
- the node from the list. The pointer to the next one is still valid.
- */
- *guarded_instances= list_delete(*guarded_instances, node);
+
+ log_info("Guardian: pretend that '%s' is killed.",
+ (const char *) instance->get_name()->str);
+
+ instance->set_state(Instance::STOPPED);
+ }
+ else
+ {
+ log_info("Guardian: waiting for '%s' to stop (%d secs left).",
+ (const char *) instance->get_name()->str,
+ (int) (instance->options.get_shutdown_delay() -
+ current_time + instance->last_checked));
}
return;
@@ -133,83 +157,90 @@ void Guardian::process_instance(Instance *instance,
/* The instance can be contacted on its port */
/* If STARTING also check that pidfile has been created */
- if (current_node->state == STARTING &&
- current_node->instance->options.load_pid() == 0)
+ if (instance->get_state() == Instance::STARTING &&
+ instance->options.load_pid() == 0)
{
/* Pid file not created yet, don't go to STARTED state yet */
}
- else if (current_node->state != STARTED)
+ else if (instance->get_state() != Instance::STARTED)
{
/* clear status fields */
log_info("Guardian: '%s' is running, set state to STARTED.",
(const char *) instance->options.instance_name.str);
- current_node->restart_counter= 0;
- current_node->crash_moment= 0;
- current_node->state= STARTED;
+ instance->reset_stat();
+ instance->set_state(Instance::STARTED);
}
}
else
{
- switch (current_node->state) {
- case NOT_STARTED:
+ switch (instance->get_state()) {
+ case Instance::NOT_STARTED:
log_info("Guardian: starting '%s'...",
(const char *) instance->options.instance_name.str);
- /* NOTE, set state to STARTING _before_ start() is called */
- current_node->state= STARTING;
- instance->start();
- current_node->last_checked= current_time;
- break;
- case STARTED: /* fallthrough */
- case STARTING: /* let the instance start or crash */
- if (instance->is_crashed())
- {
- current_node->crash_moment= current_time;
- current_node->last_checked= current_time;
- current_node->state= JUST_CRASHED;
- /* fallthrough -- restart an instance immediately */
- }
- else
- break;
- case JUST_CRASHED:
- if (current_time - current_node->crash_moment <= 2)
+ /* NOTE: set state to STARTING _before_ start() is called. */
+ instance->set_state(Instance::STARTING);
+ instance->last_checked= current_time;
+
+ instance->start_mysqld();
+
+ return;
+
+ case Instance::STARTED: /* fallthrough */
+ case Instance::STARTING: /* let the instance start or crash */
+ if (!instance->is_crashed())
+ return;
+
+ instance->crash_moment= current_time;
+ instance->last_checked= current_time;
+ instance->set_state(Instance::JUST_CRASHED);
+ /* fallthrough -- restart an instance immediately */
+
+ case Instance::JUST_CRASHED:
+ if (current_time - instance->crash_moment <= 2)
{
if (instance->is_crashed())
{
- instance->start();
+ instance->start_mysqld();
log_info("Guardian: starting '%s'...",
(const char *) instance->options.instance_name.str);
}
}
else
- current_node->state= CRASHED;
- break;
- case CRASHED: /* just regular restarts */
- if (current_time - current_node->last_checked >
- monitoring_interval)
+ instance->set_state(Instance::CRASHED);
+
+ return;
+
+ case Instance::CRASHED: /* just regular restarts */
+ if (current_time - instance->last_checked <=
+ Options::Main::monitoring_interval)
+ return;
+
+ if (instance->restart_counter < restart_retry)
{
- if ((current_node->restart_counter < restart_retry))
- {
- if (instance->is_crashed())
- {
- instance->start();
- current_node->last_checked= current_time;
- current_node->restart_counter++;
- log_info("Guardian: restarting '%s'...",
- (const char *) instance->options.instance_name.str);
- }
- }
- else
+ if (instance->is_crashed())
{
- log_info("Guardian: can not start '%s'. "
- "Abandoning attempts to (re)start it",
+ instance->start_mysqld();
+ instance->last_checked= current_time;
+
+ log_info("Guardian: restarting '%s'...",
(const char *) instance->options.instance_name.str);
- current_node->state= CRASHED_AND_ABANDONED;
}
}
- break;
- case CRASHED_AND_ABANDONED:
- break; /* do nothing */
+ else
+ {
+ log_info("Guardian: can not start '%s'. "
+ "Abandoning attempts to (re)start it",
+ (const char *) instance->options.instance_name.str);
+
+ instance->set_state(Instance::CRASHED_AND_ABANDONED);
+ }
+
+ return;
+
+ case Instance::CRASHED_AND_ABANDONED:
+ return; /* do nothing */
+
default:
DBUG_ASSERT(0);
}
@@ -217,56 +248,78 @@ void Guardian::process_instance(Instance *instance,
}
-/*
+/**
Main function of Guardian thread.
SYNOPSIS
run()
DESCRIPTION
- Check for all guarded instances and restart them if needed. If everything
- is fine go and sleep for some time.
+ Check for all guarded instances and restart them if needed.
*/
void Guardian::run()
{
- Instance *instance;
- LIST *node;
struct timespec timeout;
log_info("Guardian: started.");
thread_registry->register_thread(&thread_info);
- pthread_mutex_lock(&LOCK_guardian);
+ /* Loop, until all instances were shut down at the end. */
- /* loop, until all instances were shut down at the end */
- while (!(shutdown_requested && (guarded_instances == NULL)))
+ while (true)
{
- node= guarded_instances;
+ Instance_map::Iterator instances_it(instance_map);
+ Instance *instance;
+ bool all_instances_stopped= TRUE;
+
+ instance_map->lock();
- while (node != NULL)
+ while ((instance= instances_it.next()))
{
- GUARD_NODE *current_node= (GUARD_NODE *) node->data;
- instance= ((GUARD_NODE *) node->data)->instance;
- process_instance(instance, current_node, &guarded_instances, node);
+ instance->lock();
- node= node->next;
+ if (!instance->is_guarded() ||
+ instance->get_state() == Instance::STOPPED)
+ {
+ instance->unlock();
+ continue;
+ }
+
+ process_instance(instance);
+
+ if (instance->get_state() != Instance::STOPPED)
+ all_instances_stopped= FALSE;
+
+ instance->unlock();
}
- timeout.tv_sec= time(NULL) + monitoring_interval;
+
+ instance_map->unlock();
+
+ lock();
+
+ if (shutdown_requested && all_instances_stopped)
+ {
+ log_info("Guardian: all guarded mysqlds stopped.");
+
+ stopped= TRUE;
+ unlock();
+ break;
+ }
+
+ timeout.tv_sec= time(NULL) + Options::Main::monitoring_interval;
timeout.tv_nsec= 0;
- /* check the loop predicate before sleeping */
- if (!(shutdown_requested && (!(guarded_instances))))
- thread_registry->cond_timedwait(&thread_info, &COND_guardian,
- &LOCK_guardian, &timeout);
+ thread_registry->cond_timedwait(&thread_info, &COND_guardian,
+ &LOCK_guardian, &timeout);
+ unlock();
}
log_info("Guardian: stopped.");
- stopped= TRUE;
- pthread_mutex_unlock(&LOCK_guardian);
- /* now, when the Guardian is stopped we can stop the IM */
+ /* Now, when the Guardian is stopped we can stop the IM. */
+
thread_registry->unregister_thread(&thread_info);
thread_registry->request_shutdown();
@@ -274,129 +327,65 @@ void Guardian::run()
}
-int Guardian::is_stopped()
+/**
+ Return the value of stopped flag.
+*/
+
+bool Guardian::is_stopped()
{
int var;
- pthread_mutex_lock(&LOCK_guardian);
+
+ lock();
var= stopped;
- pthread_mutex_unlock(&LOCK_guardian);
+ unlock();
+
return var;
}
-/*
- Initialize the list of guarded instances: loop through the Instance_map and
- add all of the instances, which don't have 'nonguarded' option specified.
-
- SYNOPSIS
- Guardian::init()
+/**
+ Wake up Guardian thread.
- NOTE: The operation should be invoked with the following locks acquired:
- - Guardian;
- - Instance_map;
-
- RETURN
- 0 - ok
- 1 - error occurred
+ MT-NOTE: although the mutex associated with a condition variable should
+ usually be acquired before signalling the variable, that is not needed here.
+ Signalling under a locked mutex is used to avoid lost signals. In the current
+ logic, however, locking the mutex does not guarantee that the signal will not
+ be lost.
*/
-int Guardian::init()
+void Guardian::ping()
{
- Instance *instance;
- Instance_map::Iterator iterator(instance_map);
-
- /* clear the list of guarded instances */
- free_root(&alloc, MYF(0));
- init_alloc_root(&alloc, MEM_ROOT_BLOCK_SIZE, 0);
- guarded_instances= NULL;
-
- while ((instance= iterator.next()))
- {
- if (instance->options.nonguarded)
- continue;
-
- if (guard(instance, TRUE)) /* do not lock guardian */
- return 1;
- }
-
- return 0;
+ pthread_cond_signal(&COND_guardian);
}
-/*
- Add instance to the Guardian list
+/**
+ Prepare list of instances.
SYNOPSIS
- guard()
- instance the instance to be guarded
- nolock whether we prefer do not lock Guardian here,
- but use external locking instead
-
- DESCRIPTION
-
- The instance is added to the guarded instances list. Usually guard() is
- called after we start an instance.
+ init()
- RETURN
- 0 - ok
- 1 - error occurred
+ MT-NOTE: Instance Map must be locked before calling the operation.
*/
-int Guardian::guard(Instance *instance, bool nolock)
+void Guardian::init()
{
- LIST *node;
- GUARD_NODE *content;
-
- node= (LIST *) alloc_root(&alloc, sizeof(LIST));
- content= (GUARD_NODE *) alloc_root(&alloc, sizeof(GUARD_NODE));
-
- if ((!(node)) || (!(content)))
- return 1;
- /* we store the pointers to instances from the instance_map's MEM_ROOT */
- content->instance= instance;
- content->restart_counter= 0;
- content->crash_moment= 0;
- content->state= NOT_STARTED;
- node->data= (void*) content;
-
- if (nolock)
- guarded_instances= list_add(guarded_instances, node);
- else
- {
- pthread_mutex_lock(&LOCK_guardian);
- guarded_instances= list_add(guarded_instances, node);
- pthread_mutex_unlock(&LOCK_guardian);
- }
-
- return 0;
-}
-
-
-/*
- TODO: perhaps it would make sense to create a pool of the LIST nodeents
- and give them upon request. Now we are loosing a bit of memory when
- guarded instance was stopped and then restarted (since we cannot free just
- a piece of the MEM_ROOT).
-*/
-
-int Guardian::stop_guard(Instance *instance)
-{
- LIST *node;
-
- pthread_mutex_lock(&LOCK_guardian);
-
- node= find_instance_node(instance);
+ Instance *instance;
+ Instance_map::Iterator iterator(instance_map);
- if (node != NULL)
- guarded_instances= list_delete(guarded_instances, node);
+ while ((instance= iterator.next()))
+ {
+ instance->lock();
- pthread_mutex_unlock(&LOCK_guardian);
+ instance->reset_stat();
+ instance->set_state(Instance::NOT_STARTED);
- /* if there is nothing to delete it is also fine */
- return 0;
+ instance->unlock();
+ }
}
-/*
+
+/**
An internal method which is called at shutdown to unregister instances and
attempt to stop them if requested.
@@ -409,86 +398,71 @@ int Guardian::stop_guard(Instance *instance)
accordingly.
NOTE
- Guardian object should be locked by the calling function.
+ Guardian object should be locked by the caller.
- RETURN
- 0 - ok
- 1 - error occurred
*/
-int Guardian::stop_instances()
+void Guardian::stop_instances()
{
- LIST *node;
- node= guarded_instances;
- while (node != NULL)
+ Instance_map::Iterator instances_it(instance_map);
+ Instance *instance;
+
+ instance_map->lock();
+
+ while ((instance= instances_it.next()))
{
- GUARD_NODE *current_node= (GUARD_NODE *) node->data;
+ instance->lock();
+
+ if (!instance->is_guarded() ||
+ instance->get_state() == Instance::STOPPED)
+ {
+ instance->unlock();
+ continue;
+ }
+
/*
If instance is running or was running (and now probably hanging),
request stop.
*/
- if (current_node->instance->is_mysqld_running() ||
- (current_node->state == STARTED))
+
+ if (instance->is_mysqld_running() ||
+ instance->get_state() == Instance::STARTED)
{
- current_node->state= STOPPING;
- current_node->last_checked= time(NULL);
+ instance->set_state(Instance::STOPPING);
+ instance->last_checked= time(NULL);
}
else
- /* otherwise remove it from the list */
- guarded_instances= list_delete(guarded_instances, node);
- /* But try to kill it anyway. Just in case */
- current_node->instance->kill_mysqld(SIGTERM);
- node= node->next;
+ {
+ /* Otherwise mark it as STOPPED. */
+ instance->set_state(Instance::STOPPED);
+ }
+
+ /* Request mysqld to stop. */
+
+ instance->kill_mysqld(SIGTERM);
+
+ instance->unlock();
}
- return 0;
+
+ instance_map->unlock();
}
+/**
+ Lock Guardian.
+*/
+
void Guardian::lock()
{
pthread_mutex_lock(&LOCK_guardian);
}
+/**
+ Unlock Guardian.
+*/
+
void Guardian::unlock()
{
pthread_mutex_unlock(&LOCK_guardian);
}
-
-
-LIST *Guardian::find_instance_node(Instance *instance)
-{
- LIST *node= guarded_instances;
-
- while (node != NULL)
- {
- /*
- We compare only pointers, as we always use pointers from the
- instance_map's MEM_ROOT.
- */
- if (((GUARD_NODE *) node->data)->instance == instance)
- return node;
-
- node= node->next;
- }
-
- return NULL;
-}
-
-
-bool Guardian::is_active(Instance *instance)
-{
- bool guarded;
-
- lock();
-
- guarded= find_instance_node(instance) != NULL;
-
- /* is_running() can take a long time, so let's unlock mutex first. */
- unlock();
-
- if (guarded)
- return true;
-
- return instance->is_mysqld_running();
-}