diff options
Diffstat (limited to 'server-tools/instance-manager/guardian.cc')
-rw-r--r-- | server-tools/instance-manager/guardian.cc | 317 |
1 files changed, 215 insertions, 102 deletions
diff --git a/server-tools/instance-manager/guardian.cc b/server-tools/instance-manager/guardian.cc index bc05fda1a8f..750c3495870 100644 --- a/server-tools/instance-manager/guardian.cc +++ b/server-tools/instance-manager/guardian.cc @@ -21,9 +21,32 @@ #include "guardian.h" #include "instance_map.h" +#include "instance.h" #include "mysql_manager_error.h" #include "log.h" #include <string.h> +#include <sys/types.h> +#include <signal.h> + + +/* + The Guardian list node structure. Guardian utilizes it to store + guarded instances plus some additional info. +*/ + +struct GUARD_NODE +{ + Instance *instance; + /* state of an instance (i.e. STARTED, CRASHED, etc.) */ + int state; + /* the amount of attemts to restart instance (cleaned up at success) */ + int restart_counter; + /* triggered at a crash */ + time_t crash_moment; + /* General time field. Used to provide timeouts (at shutdown and restart) */ + time_t last_checked; +}; + C_MODE_START @@ -42,15 +65,13 @@ Guardian_thread::Guardian_thread(Thread_registry &thread_registry_arg, uint monitoring_interval_arg) : Guardian_thread_args(thread_registry_arg, instance_map_arg, monitoring_interval_arg), - thread_info(pthread_self()) + thread_info(pthread_self()), guarded_instances(0) { pthread_mutex_init(&LOCK_guardian, 0); pthread_cond_init(&COND_guardian, 0); - shutdown_guardian= FALSE; - is_stopped= FALSE; + shutdown_requested= FALSE; + stopped= FALSE; init_alloc_root(&alloc, MEM_ROOT_BLOCK_SIZE, 0); - guarded_instances= NULL; - starting_instances= NULL; } @@ -65,19 +86,107 @@ Guardian_thread::~Guardian_thread() } -void Guardian_thread::shutdown() +void Guardian_thread::request_shutdown(bool stop_instances_arg) { pthread_mutex_lock(&LOCK_guardian); - shutdown_guardian= TRUE; + /* stop instances or just clean up Guardian repository */ + stop_instances(stop_instances_arg); + shutdown_requested= TRUE; pthread_mutex_unlock(&LOCK_guardian); } -void Guardian_thread::request_stop_instances() +void Guardian_thread::process_instance(Instance *instance, + GUARD_NODE *current_node, + LIST **guarded_instances, + LIST *elem) { - pthread_mutex_lock(&LOCK_guardian); - request_stop= TRUE; - pthread_mutex_unlock(&LOCK_guardian); + int waitchild= Instance::DEFAULT_SHUTDOWN_DELAY; + /* The amount of times, Guardian attempts to restart an instance */ + int restart_retry= 100; + time_t current_time= time(NULL); + + if (current_node->state == STOPPING) + { + /* this brach is executed during shutdown */ + if (instance->options.shutdown_delay != NULL) + waitchild= atoi(instance->options.shutdown_delay); + + /* this returns true if and only if an instance was stopped for shure */ + if (instance->is_crashed()) + *guarded_instances= list_delete(*guarded_instances, elem); + else if (current_time - current_node->last_checked > waitchild) + { + instance->kill_instance(SIGKILL); + /* + Later we do elem= elem->next. This is ok, as we are only removing + the node from the list. The pointer to the next one is still valid. + */ + *guarded_instances= list_delete(*guarded_instances, elem); + } + + return; + } + + if (instance->is_running()) + { + /* clear status fields */ + current_node->restart_counter= 0; + current_node->crash_moment= 0; + current_node->state= STARTED; + } + else + { + switch (current_node->state) + { + case NOT_STARTED: + instance->start(); + current_node->last_checked= current_time; + log_info("guardian: starting instance %s", + instance->options.instance_name); + current_node->state= STARTING; + break; + case STARTED: /* fallthrough */ + case STARTING: /* let the instance start or crash */ + if (instance->is_crashed()) + { + current_node->crash_moment= current_time; + current_node->last_checked= current_time; + current_node->state= JUST_CRASHED; + /* fallthrough -- restart an instance immediately */ + } + else + break; + case JUST_CRASHED: + if (current_time - current_node->crash_moment <= 2) + { + instance->start(); + log_info("guardian: starting instance %s", + instance->options.instance_name); + } + else current_node->state= CRASHED; + break; + case CRASHED: /* just regular restarts */ + if (current_time - current_node->last_checked > + monitoring_interval) + { + if ((current_node->restart_counter < restart_retry)) + { + instance->start(); + current_node->last_checked= current_time; + ((GUARD_NODE *) elem->data)->restart_counter++; + log_info("guardian: starting instance %s", + instance->options.instance_name); + } + else current_node->state= CRASHED_AND_ABANDONED; + } + break; + case CRASHED_AND_ABANDONED: + break; /* do nothing */ + default: + DBUG_ASSERT(0); + } + } } @@ -96,8 +205,7 @@ void Guardian_thread::request_stop_instances() void Guardian_thread::run() { Instance *instance; - int restart_retry= 100; - LIST *loop; + LIST *elem; struct timespec timeout; thread_registry.register_thread(&thread_info); @@ -105,68 +213,31 @@ void Guardian_thread::run() my_thread_init(); pthread_mutex_lock(&LOCK_guardian); - - while (!shutdown_guardian) + /* loop, until all instances were shut down at the end */ + while (!(shutdown_requested && (guarded_instances == NULL))) { - int status= 0; - loop= guarded_instances; + elem= guarded_instances; - while (loop != NULL) + while (elem != NULL) { - instance= ((GUARD_NODE *) loop->data)->instance; - if (!instance->is_running()) - { - int state= 0; /* state of guardian */ - - if ((((GUARD_NODE *) loop->data)->crash_moment == 0)) - state= 1; /* an instance just crashed */ - else - if (time(NULL) - ((GUARD_NODE *) loop->data)->crash_moment <= 2) - /* try to restart an instance immediately */ - state= 2; - else - state= 3; /* try to restart it */ + struct timespec timeout; - if (state == 1) - ((GUARD_NODE *) loop->data)->crash_moment= time(NULL); + GUARD_NODE *current_node= (GUARD_NODE *) elem->data; + instance= ((GUARD_NODE *) elem->data)->instance; + process_instance(instance, current_node, &guarded_instances, elem); - if ((state == 1) || (state == 2)) - { - instance->start(); - ((GUARD_NODE *) loop->data)->restart_counter++; - log_info("guardian: starting instance %s", - instance->options.instance_name); - } - else - { - if ((status == ETIMEDOUT) && - (((GUARD_NODE *) loop->data)->restart_counter < restart_retry)) - { - instance->start(); - ((GUARD_NODE *) loop->data)->restart_counter++; - log_info("guardian: starting instance %s", - instance->options.instance_name); - } - } - } - else /* clear status fields */ - { - ((GUARD_NODE *) loop->data)->restart_counter= 0; - ((GUARD_NODE *) loop->data)->crash_moment= 0; - } - loop= loop->next; + elem= elem->next; } - move_to_list(&starting_instances, &guarded_instances); timeout.tv_sec= time(NULL) + monitoring_interval; timeout.tv_nsec= 0; - status= pthread_cond_timedwait(&COND_guardian, &LOCK_guardian, &timeout); + /* check the loop predicate before sleeping */ + if (!(shutdown_requested && (guarded_instances == NULL))) + pthread_cond_timedwait(&COND_guardian, &LOCK_guardian, &timeout); } + stopped= TRUE; pthread_mutex_unlock(&LOCK_guardian); - if (request_stop) - stop_instances(); - is_stopped= TRUE; /* now, when the Guardian is stopped we can stop the IM */ thread_registry.unregister_thread(&thread_info); thread_registry.request_shutdown(); @@ -174,7 +245,29 @@ void Guardian_thread::run() } -int Guardian_thread::start() +int Guardian_thread::is_stopped() +{ + int var; + pthread_mutex_lock(&LOCK_guardian); + var= stopped; + pthread_mutex_unlock(&LOCK_guardian); + return var; +} + + +/* + Initialize the list of guarded instances: loop through the Instance_map and + add all of the instances, which don't have 'nonguarded' option specified. + + SYNOPSYS + Guardian_thread::init() + + RETURN + 0 - ok + 1 - error occured +*/ + +int Guardian_thread::init() { Instance *instance; Instance_map::Iterator iterator(instance_map); @@ -183,7 +276,7 @@ int Guardian_thread::start() while ((instance= iterator.next())) { if ((instance->options.nonguarded == NULL)) - if (add_instance_to_list(instance, &guarded_instances)) + if (guard(instance)) return 1; } instance_map->unlock(); @@ -193,7 +286,7 @@ int Guardian_thread::start() /* - Start instance guarding + Add instance to the Guardian list SYNOPSYS guard() @@ -201,37 +294,16 @@ int Guardian_thread::start() DESCRIPTION - The instance is added to the list of starting instances. Then after one guardian - loop it is moved to the guarded instances list. Usually guard() is called after we - start an instance, so we need to give some time to the instance to start. + The instance is added to the guarded instances list. Usually guard() is + called after we start an instance. RETURN 0 - ok 1 - error occured */ - int Guardian_thread::guard(Instance *instance) { - return add_instance_to_list(instance, &starting_instances); -} - - -void Guardian_thread::move_to_list(LIST **from, LIST **to) -{ - LIST *tmp; - - while (*from) - { - tmp= rest(*from); - *to= list_add(*to, *from); - *from= tmp; - } -} - - -int Guardian_thread::add_instance_to_list(Instance *instance, LIST **list) -{ LIST *node; GUARD_NODE *content; @@ -244,10 +316,11 @@ int Guardian_thread::add_instance_to_list(Instance *instance, LIST **list) content->instance= instance; content->restart_counter= 0; content->crash_moment= 0; + content->state= NOT_STARTED; node->data= (void *) content; pthread_mutex_lock(&LOCK_guardian); - *list= list_add(*list, node); + guarded_instances= list_add(guarded_instances, node); pthread_mutex_unlock(&LOCK_guardian); return 0; @@ -256,7 +329,7 @@ int Guardian_thread::add_instance_to_list(Instance *instance, LIST **list) /* TODO: perhaps it would make sense to create a pool of the LIST elements - elements and give them upon request. Now we are loosing a bit of memory when + and give them upon request. Now we are loosing a bit of memory when guarded instance was stopped and then restarted (since we cannot free just a piece of the MEM_ROOT). */ @@ -288,21 +361,61 @@ int Guardian_thread::stop_guard(Instance *instance) return 0; } -int Guardian_thread::stop_instances() -{ - Instance *instance; - Instance_map::Iterator iterator(instance_map); +/* + Start Guardian shutdown. Attempt to start instances if requested. - while ((instance= iterator.next())) + SYNOPSYS + stop_instances() + stop_instances_arg whether we should stop instances at shutdown + + DESCRIPTION + + Loops through the guarded_instances list and prepares them for shutdown. + If stop_instances was requested, we need to issue a stop command and change + the state accordingly. Otherwise we could simply delete an entry. + NOTE: Guardian should be locked by the calling function + + RETURN + 0 - ok + 1 - error occured +*/ + +int Guardian_thread::stop_instances(bool stop_instances_arg) +{ + LIST *node; + node= guarded_instances; + while (node != NULL) { - if ((instance->options.nonguarded == NULL)) + if (!stop_instances_arg) { - if (stop_guard(instance)) - return 1; - /* let us try to stop the server */ - instance->stop(); + /* just forget about an instance */ + guarded_instances= list_delete(guarded_instances, node); + /* + This should still work fine, as we have only removed the + node from the list. The pointer to the next one is still valid + */ + node= node->next; + } + else + { + GUARD_NODE *current_node= (GUARD_NODE *) node->data; + /* + If instance is running or was running (and now probably hanging), + request stop. + */ + if (current_node->instance->is_running() || + (current_node->state == STARTED)) + { + current_node->state= STOPPING; + current_node->last_checked= time(NULL); + } + else + /* otherwise remove it from the list */ + guarded_instances= list_delete(guarded_instances, node); + /* But try to kill it anyway. Just in case */ + current_node->instance->kill_instance(SIGTERM); + node= node->next; } } - return 0; } |