diff options
Diffstat (limited to 'server-tools/instance-manager/guardian.cc')
-rw-r--r-- | server-tools/instance-manager/guardian.cc | 543 |
1 files changed, 259 insertions, 284 deletions
diff --git a/server-tools/instance-manager/guardian.cc b/server-tools/instance-manager/guardian.cc index e601ce0111c..1b451cd9933 100644 --- a/server-tools/instance-manager/guardian.cc +++ b/server-tools/instance-manager/guardian.cc @@ -28,101 +28,126 @@ #include "instance_map.h" #include "log.h" #include "mysql_manager_error.h" +#include "options.h" -const char * -Guardian::get_instance_state_name(enum_instance_state state) -{ - switch (state) { - case NOT_STARTED: - return "offline"; - - case STARTING: - return "starting"; - - case STARTED: - return "online"; - case JUST_CRASHED: - return "failed"; +/************************************************************************* + {{{ Constructor & destructor. +*************************************************************************/ - case CRASHED: - return "crashed"; - - case CRASHED_AND_ABANDONED: - return "abandoned"; - - case STOPPING: - return "stopping"; - } +/** + Guardian constructor. - return NULL; /* just to ignore compiler warning. */ -} + SYNOPSIS + Guardian() + thread_registry_arg + instance_map_arg -/* {{{ Constructor & destructor. */ + DESCRIPTION + Nominal contructor intended for assigning references and initialize + trivial objects. Real initialization is made by init() method. +*/ Guardian::Guardian(Thread_registry *thread_registry_arg, - Instance_map *instance_map_arg, - uint monitoring_interval_arg) - :stopped(FALSE), - monitoring_interval(monitoring_interval_arg), + Instance_map *instance_map_arg) + :shutdown_requested(FALSE), + stopped(FALSE), thread_registry(thread_registry_arg), - instance_map(instance_map_arg), - shutdown_requested(FALSE) + instance_map(instance_map_arg) { pthread_mutex_init(&LOCK_guardian, 0); pthread_cond_init(&COND_guardian, 0); - init_alloc_root(&alloc, MEM_ROOT_BLOCK_SIZE, 0); } Guardian::~Guardian() { - /* delay guardian destruction to the moment when no one needs it */ - pthread_mutex_lock(&LOCK_guardian); - free_root(&alloc, MYF(0)); - pthread_mutex_unlock(&LOCK_guardian); + /* + NOTE: it's necessary to synchronize here, because Guiardian thread can be + still alive an hold the mutex (because it is detached and we have no + control over it). + */ + + lock(); + unlock(); + pthread_mutex_destroy(&LOCK_guardian); pthread_cond_destroy(&COND_guardian); } -/* }}} */ +/************************************************************************* + }}} +*************************************************************************/ +/** + Send request to stop Guardian. + + SYNOPSIS + request_shutdown() +*/ + void Guardian::request_shutdown() { - pthread_mutex_lock(&LOCK_guardian); - /* STOP Instances or just clean up Guardian repository */ stop_instances(); + + lock(); shutdown_requested= TRUE; - pthread_mutex_unlock(&LOCK_guardian); + unlock(); + + ping(); } -void Guardian::process_instance(Instance *instance, - GUARD_NODE *current_node, - LIST **guarded_instances, - LIST *node) +/** + Process an instance. + + SYNOPSIS + process_instance() + instance a pointer to the instance for processing + + MT-NOTE: + - the given instance must be locked before calling this operation; + - Guardian must be locked before calling this operation. +*/ + +void Guardian::process_instance(Instance *instance) { - uint waitchild= (uint) Instance::DEFAULT_SHUTDOWN_DELAY; - /* The amount of times, Guardian attempts to restart an instance */ int restart_retry= 100; time_t current_time= time(NULL); - if (current_node->state == STOPPING) + if (instance->get_state() == Instance::STOPPING) { - waitchild= instance->options.get_shutdown_delay(); + /* This brach is executed during shutdown. */ - /* this returns TRUE if and only if an instance was stopped for sure */ + /* This returns TRUE if and only if an instance was stopped for sure. */ if (instance->is_crashed()) - *guarded_instances= list_delete(*guarded_instances, node); - else if ( (uint) (current_time - current_node->last_checked) > waitchild) { + log_info("Guardian: '%s' stopped.", + (const char *) instance->get_name()->str); + + instance->set_state(Instance::STOPPED); + } + else if ((uint) (current_time - instance->last_checked) >= + instance->options.get_shutdown_delay()) + { + log_info("Guardian: '%s' hasn't stopped within %d secs.", + (const char *) instance->get_name()->str, + (int) instance->options.get_shutdown_delay()); + instance->kill_mysqld(SIGKILL); - /* - Later we do node= node->next. This is ok, as we are only removing - the node from the list. The pointer to the next one is still valid. - */ - *guarded_instances= list_delete(*guarded_instances, node); + + log_info("Guardian: pretend that '%s' is killed.", + (const char *) instance->get_name()->str); + + instance->set_state(Instance::STOPPED); + } + else + { + log_info("Guardian: waiting for '%s' to stop (%d secs left).", + (const char *) instance->get_name()->str, + (int) (instance->options.get_shutdown_delay() - + current_time + instance->last_checked)); } return; @@ -133,83 +158,90 @@ void Guardian::process_instance(Instance *instance, /* The instance can be contacted on it's port */ /* If STARTING also check that pidfile has been created */ - if (current_node->state == STARTING && - current_node->instance->options.load_pid() == 0) + if (instance->get_state() == Instance::STARTING && + instance->options.load_pid() == 0) { /* Pid file not created yet, don't go to STARTED state yet */ } - else if (current_node->state != STARTED) + else if (instance->get_state() != Instance::STARTED) { /* clear status fields */ log_info("Guardian: '%s' is running, set state to STARTED.", (const char *) instance->options.instance_name.str); - current_node->restart_counter= 0; - current_node->crash_moment= 0; - current_node->state= STARTED; + instance->reset_stat(); + instance->set_state(Instance::STARTED); } } else { - switch (current_node->state) { - case NOT_STARTED: + switch (instance->get_state()) { + case Instance::NOT_STARTED: log_info("Guardian: starting '%s'...", (const char *) instance->options.instance_name.str); - /* NOTE, set state to STARTING _before_ start() is called */ - current_node->state= STARTING; - instance->start(); - current_node->last_checked= current_time; - break; - case STARTED: /* fallthrough */ - case STARTING: /* let the instance start or crash */ - if (instance->is_crashed()) - { - current_node->crash_moment= current_time; - current_node->last_checked= current_time; - current_node->state= JUST_CRASHED; - /* fallthrough -- restart an instance immediately */ - } - else - break; - case JUST_CRASHED: - if (current_time - current_node->crash_moment <= 2) + /* NOTE: set state to STARTING _before_ start() is called. */ + instance->set_state(Instance::STARTING); + instance->last_checked= current_time; + + instance->start_mysqld(); + + return; + + case Instance::STARTED: /* fallthrough */ + case Instance::STARTING: /* let the instance start or crash */ + if (!instance->is_crashed()) + return; + + instance->crash_moment= current_time; + instance->last_checked= current_time; + instance->set_state(Instance::JUST_CRASHED); + /* fallthrough -- restart an instance immediately */ + + case Instance::JUST_CRASHED: + if (current_time - instance->crash_moment <= 2) { if (instance->is_crashed()) { - instance->start(); + instance->start_mysqld(); log_info("Guardian: starting '%s'...", (const char *) instance->options.instance_name.str); } } else - current_node->state= CRASHED; - break; - case CRASHED: /* just regular restarts */ - if (current_time - current_node->last_checked > - monitoring_interval) + instance->set_state(Instance::CRASHED); + + return; + + case Instance::CRASHED: /* just regular restarts */ + if (current_time - instance->last_checked <= + Options::Main::monitoring_interval) + return; + + if (instance->restart_counter < restart_retry) { - if ((current_node->restart_counter < restart_retry)) - { - if (instance->is_crashed()) - { - instance->start(); - current_node->last_checked= current_time; - current_node->restart_counter++; - log_info("Guardian: restarting '%s'...", - (const char *) instance->options.instance_name.str); - } - } - else + if (instance->is_crashed()) { - log_info("Guardian: can not start '%s'. " - "Abandoning attempts to (re)start it", + instance->start_mysqld(); + instance->last_checked= current_time; + + log_info("Guardian: restarting '%s'...", (const char *) instance->options.instance_name.str); - current_node->state= CRASHED_AND_ABANDONED; } } - break; - case CRASHED_AND_ABANDONED: - break; /* do nothing */ + else + { + log_info("Guardian: can not start '%s'. " + "Abandoning attempts to (re)start it", + (const char *) instance->options.instance_name.str); + + instance->set_state(Instance::CRASHED_AND_ABANDONED); + } + + return; + + case Instance::CRASHED_AND_ABANDONED: + return; /* do nothing */ + default: DBUG_ASSERT(0); } @@ -217,56 +249,78 @@ void Guardian::process_instance(Instance *instance, } -/* +/** Main function of Guardian thread. SYNOPSIS run() DESCRIPTION - Check for all guarded instances and restart them if needed. If everything - is fine go and sleep for some time. + Check for all guarded instances and restart them if needed. */ void Guardian::run() { - Instance *instance; - LIST *node; struct timespec timeout; log_info("Guardian: started."); thread_registry->register_thread(&thread_info); - pthread_mutex_lock(&LOCK_guardian); + /* Loop, until all instances were shut down at the end. */ - /* loop, until all instances were shut down at the end */ - while (!(shutdown_requested && (guarded_instances == NULL))) + while (true) { - node= guarded_instances; + Instance_map::Iterator instances_it(instance_map); + Instance *instance; + bool all_instances_stopped= TRUE; + + instance_map->lock(); - while (node != NULL) + while ((instance= instances_it.next())) { - GUARD_NODE *current_node= (GUARD_NODE *) node->data; - instance= ((GUARD_NODE *) node->data)->instance; - process_instance(instance, current_node, &guarded_instances, node); + instance->lock(); - node= node->next; + if (!instance->is_guarded() || + instance->get_state() == Instance::STOPPED) + { + instance->unlock(); + continue; + } + + process_instance(instance); + + if (instance->get_state() != Instance::STOPPED) + all_instances_stopped= FALSE; + + instance->unlock(); } - timeout.tv_sec= time(NULL) + monitoring_interval; + + instance_map->unlock(); + + lock(); + + if (shutdown_requested && all_instances_stopped) + { + log_info("Guardian: all guarded mysqlds stopped."); + + stopped= TRUE; + unlock(); + break; + } + + timeout.tv_sec= time(NULL) + Options::Main::monitoring_interval; timeout.tv_nsec= 0; - /* check the loop predicate before sleeping */ - if (!(shutdown_requested && (!(guarded_instances)))) - thread_registry->cond_timedwait(&thread_info, &COND_guardian, - &LOCK_guardian, &timeout); + thread_registry->cond_timedwait(&thread_info, &COND_guardian, + &LOCK_guardian, &timeout); + unlock(); } log_info("Guardian: stopped."); - stopped= TRUE; - pthread_mutex_unlock(&LOCK_guardian); - /* now, when the Guardian is stopped we can stop the IM */ + /* Now, when the Guardian is stopped we can stop the IM. */ + thread_registry->unregister_thread(&thread_info); thread_registry->request_shutdown(); @@ -274,129 +328,65 @@ void Guardian::run() } -int Guardian::is_stopped() +/** + Return the value of stopped flag. +*/ + +bool Guardian::is_stopped() { int var; - pthread_mutex_lock(&LOCK_guardian); + + lock(); var= stopped; - pthread_mutex_unlock(&LOCK_guardian); + unlock(); + return var; } -/* - Initialize the list of guarded instances: loop through the Instance_map and - add all of the instances, which don't have 'nonguarded' option specified. - - SYNOPSIS - Guardian::init() +/** + Wake up Guardian thread. - NOTE: The operation should be invoked with the following locks acquired: - - Guardian; - - Instance_map; - - RETURN - 0 - ok - 1 - error occurred + MT-NOTE: though usually the mutex associated with condition variable should + be acquired before signalling the variable, here this is not needed. + Signalling under locked mutex is used to avoid lost signals. In the current + logic however locking mutex does not guarantee that the signal will not be + lost. */ -int Guardian::init() +void Guardian::ping() { - Instance *instance; - Instance_map::Iterator iterator(instance_map); - - /* clear the list of guarded instances */ - free_root(&alloc, MYF(0)); - init_alloc_root(&alloc, MEM_ROOT_BLOCK_SIZE, 0); - guarded_instances= NULL; - - while ((instance= iterator.next())) - { - if (instance->options.nonguarded) - continue; - - if (guard(instance, TRUE)) /* do not lock guardian */ - return 1; - } - - return 0; + pthread_cond_signal(&COND_guardian); } -/* - Add instance to the Guardian list +/** + Prepare list of instances. SYNOPSIS - guard() - instance the instance to be guarded - nolock whether we prefer do not lock Guardian here, - but use external locking instead - - DESCRIPTION - - The instance is added to the guarded instances list. Usually guard() is - called after we start an instance. + init() - RETURN - 0 - ok - 1 - error occurred + MT-NOTE: Instance Map must be locked before calling the operation. */ -int Guardian::guard(Instance *instance, bool nolock) +void Guardian::init() { - LIST *node; - GUARD_NODE *content; - - node= (LIST *) alloc_root(&alloc, sizeof(LIST)); - content= (GUARD_NODE *) alloc_root(&alloc, sizeof(GUARD_NODE)); - - if ((!(node)) || (!(content))) - return 1; - /* we store the pointers to instances from the instance_map's MEM_ROOT */ - content->instance= instance; - content->restart_counter= 0; - content->crash_moment= 0; - content->state= NOT_STARTED; - node->data= (void*) content; - - if (nolock) - guarded_instances= list_add(guarded_instances, node); - else - { - pthread_mutex_lock(&LOCK_guardian); - guarded_instances= list_add(guarded_instances, node); - pthread_mutex_unlock(&LOCK_guardian); - } - - return 0; -} - - -/* - TODO: perhaps it would make sense to create a pool of the LIST nodeents - and give them upon request. Now we are loosing a bit of memory when - guarded instance was stopped and then restarted (since we cannot free just - a piece of the MEM_ROOT). -*/ - -int Guardian::stop_guard(Instance *instance) -{ - LIST *node; - - pthread_mutex_lock(&LOCK_guardian); - - node= find_instance_node(instance); + Instance *instance; + Instance_map::Iterator iterator(instance_map); - if (node != NULL) - guarded_instances= list_delete(guarded_instances, node); + while ((instance= iterator.next())) + { + instance->lock(); - pthread_mutex_unlock(&LOCK_guardian); + instance->reset_stat(); + instance->set_state(Instance::NOT_STARTED); - /* if there is nothing to delete it is also fine */ - return 0; + instance->unlock(); + } } -/* + +/** An internal method which is called at shutdown to unregister instances and attempt to stop them if requested. @@ -409,86 +399,71 @@ int Guardian::stop_guard(Instance *instance) accordingly. NOTE - Guardian object should be locked by the calling function. + Guardian object should be locked by the caller. - RETURN - 0 - ok - 1 - error occurred */ -int Guardian::stop_instances() +void Guardian::stop_instances() { - LIST *node; - node= guarded_instances; - while (node != NULL) + Instance_map::Iterator instances_it(instance_map); + Instance *instance; + + instance_map->lock(); + + while ((instance= instances_it.next())) { - GUARD_NODE *current_node= (GUARD_NODE *) node->data; + instance->lock(); + + if (!instance->is_guarded() || + instance->get_state() == Instance::STOPPED) + { + instance->unlock(); + continue; + } + /* If instance is running or was running (and now probably hanging), request stop. */ - if (current_node->instance->is_mysqld_running() || - (current_node->state == STARTED)) + + if (instance->is_mysqld_running() || + instance->get_state() == Instance::STARTED) { - current_node->state= STOPPING; - current_node->last_checked= time(NULL); + instance->set_state(Instance::STOPPING); + instance->last_checked= time(NULL); } else - /* otherwise remove it from the list */ - guarded_instances= list_delete(guarded_instances, node); - /* But try to kill it anyway. Just in case */ - current_node->instance->kill_mysqld(SIGTERM); - node= node->next; + { + /* Otherwise mark it as STOPPED. */ + instance->set_state(Instance::STOPPED); + } + + /* Request mysqld to stop. */ + + instance->kill_mysqld(SIGTERM); + + instance->unlock(); } - return 0; + + instance_map->unlock(); } +/** + Lock Guardian. +*/ + void Guardian::lock() { pthread_mutex_lock(&LOCK_guardian); } +/** + Unlock Guardian. +*/ + void Guardian::unlock() { pthread_mutex_unlock(&LOCK_guardian); } - - -LIST *Guardian::find_instance_node(Instance *instance) -{ - LIST *node= guarded_instances; - - while (node != NULL) - { - /* - We compare only pointers, as we always use pointers from the - instance_map's MEM_ROOT. - */ - if (((GUARD_NODE *) node->data)->instance == instance) - return node; - - node= node->next; - } - - return NULL; -} - - -bool Guardian::is_active(Instance *instance) -{ - bool guarded; - - lock(); - - guarded= find_instance_node(instance) != NULL; - - /* is_running() can take a long time, so let's unlock mutex first. */ - unlock(); - - if (guarded) - return true; - - return instance->is_mysqld_running(); -} |