diff options
Diffstat (limited to 'server-tools/instance-manager/guardian.cc')
-rw-r--r-- | server-tools/instance-manager/guardian.cc | 307 |
1 files changed, 257 insertions, 50 deletions
diff --git a/server-tools/instance-manager/guardian.cc b/server-tools/instance-manager/guardian.cc index 7375453673b..750c3495870 100644 --- a/server-tools/instance-manager/guardian.cc +++ b/server-tools/instance-manager/guardian.cc @@ -21,9 +21,32 @@ #include "guardian.h" #include "instance_map.h" +#include "instance.h" #include "mysql_manager_error.h" #include "log.h" #include <string.h> +#include <sys/types.h> +#include <signal.h> + + +/* + The Guardian list node structure. Guardian utilizes it to store + guarded instances plus some additional info. +*/ + +struct GUARD_NODE +{ + Instance *instance; + /* state of an instance (i.e. STARTED, CRASHED, etc.) */ + int state; + /* the amount of attempts to restart instance (cleaned up at success) */ + int restart_counter; + /* triggered at a crash */ + time_t crash_moment; + /* General time field. Used to provide timeouts (at shutdown and restart) */ + time_t last_checked; +}; + C_MODE_START @@ -42,13 +65,13 @@ Guardian_thread::Guardian_thread(Thread_registry &thread_registry_arg, uint monitoring_interval_arg) : Guardian_thread_args(thread_registry_arg, instance_map_arg, monitoring_interval_arg), - thread_info(pthread_self()) + thread_info(pthread_self()), guarded_instances(0) { pthread_mutex_init(&LOCK_guardian, 0); - thread_registry.register_thread(&thread_info); + pthread_cond_init(&COND_guardian, 0); + shutdown_requested= FALSE; + stopped= FALSE; init_alloc_root(&alloc, MEM_ROOT_BLOCK_SIZE, 0); - guarded_instances= NULL; - starting_instances= NULL; } @@ -57,9 +80,113 @@ Guardian_thread::~Guardian_thread() /* delay guardian destruction to the moment when no one needs it */ pthread_mutex_lock(&LOCK_guardian); free_root(&alloc, MYF(0)); - thread_registry.unregister_thread(&thread_info); pthread_mutex_unlock(&LOCK_guardian); pthread_mutex_destroy(&LOCK_guardian); + pthread_cond_destroy(&COND_guardian); +} + + +void Guardian_thread::request_shutdown(bool stop_instances_arg) +{ + pthread_mutex_lock(&LOCK_guardian); + /* 
stop instances or just clean up Guardian repository */ + stop_instances(stop_instances_arg); + shutdown_requested= TRUE; + pthread_mutex_unlock(&LOCK_guardian); +} + + +void Guardian_thread::process_instance(Instance *instance, + GUARD_NODE *current_node, + LIST **guarded_instances, + LIST *elem) +{ + int waitchild= Instance::DEFAULT_SHUTDOWN_DELAY; + /* The number of times Guardian attempts to restart an instance */ + int restart_retry= 100; + time_t current_time= time(NULL); + + if (current_node->state == STOPPING) + { + /* this branch is executed during shutdown */ + if (instance->options.shutdown_delay != NULL) + waitchild= atoi(instance->options.shutdown_delay); + + /* this returns true if and only if an instance was stopped for sure */ + if (instance->is_crashed()) + *guarded_instances= list_delete(*guarded_instances, elem); + else if (current_time - current_node->last_checked > waitchild) + { + instance->kill_instance(SIGKILL); + /* + Later we do elem= elem->next. This is ok, as we are only removing + the node from the list. The pointer to the next one is still valid. 
+ */ + *guarded_instances= list_delete(*guarded_instances, elem); + } + + return; + } + + if (instance->is_running()) + { + /* clear status fields */ + current_node->restart_counter= 0; + current_node->crash_moment= 0; + current_node->state= STARTED; + } + else + { + switch (current_node->state) + { + case NOT_STARTED: + instance->start(); + current_node->last_checked= current_time; + log_info("guardian: starting instance %s", + instance->options.instance_name); + current_node->state= STARTING; + break; + case STARTED: /* fallthrough */ + case STARTING: /* let the instance start or crash */ + if (instance->is_crashed()) + { + current_node->crash_moment= current_time; + current_node->last_checked= current_time; + current_node->state= JUST_CRASHED; + /* fallthrough -- restart an instance immediately */ + } + else + break; + case JUST_CRASHED: + if (current_time - current_node->crash_moment <= 2) + { + instance->start(); + log_info("guardian: starting instance %s", + instance->options.instance_name); + } + else current_node->state= CRASHED; + break; + case CRASHED: /* just regular restarts */ + if (current_time - current_node->last_checked > + monitoring_interval) + { + if ((current_node->restart_counter < restart_retry)) + { + instance->start(); + current_node->last_checked= current_time; + ((GUARD_NODE *) elem->data)->restart_counter++; + log_info("guardian: starting instance %s", + instance->options.instance_name); + } + else current_node->state= CRASHED_AND_ABANDONED; + } + break; + case CRASHED_AND_ABANDONED: + break; /* do nothing */ + default: + DBUG_ASSERT(0); + } + } } @@ -78,33 +205,69 @@ Guardian_thread::~Guardian_thread() void Guardian_thread::run() { Instance *instance; - LIST *loop; + LIST *elem; + struct timespec timeout; + + thread_registry.register_thread(&thread_info); my_thread_init(); + pthread_mutex_lock(&LOCK_guardian); - while (!thread_registry.is_shutdown()) + /* loop, until all instances were shut down at the end */ + while 
(!(shutdown_requested && (guarded_instances == NULL))) { - pthread_mutex_lock(&LOCK_guardian); - loop= guarded_instances; - while (loop != NULL) + elem= guarded_instances; + + while (elem != NULL) { - instance= (Instance *) loop->data; - /* instance-> start already checks whether instance is running */ - if (instance->start() != ER_INSTANCE_ALREADY_STARTED) - log_info("guardian attempted to restart instance %s", - instance->options.instance_name); - loop= loop->next; + struct timespec timeout; + + GUARD_NODE *current_node= (GUARD_NODE *) elem->data; + instance= ((GUARD_NODE *) elem->data)->instance; + process_instance(instance, current_node, &guarded_instances, elem); + + elem= elem->next; } - move_to_list(&starting_instances, &guarded_instances); - pthread_mutex_unlock(&LOCK_guardian); - sleep(monitoring_interval); + timeout.tv_sec= time(NULL) + monitoring_interval; + timeout.tv_nsec= 0; + + /* check the loop predicate before sleeping */ + if (!(shutdown_requested && (guarded_instances == NULL))) + pthread_cond_timedwait(&COND_guardian, &LOCK_guardian, &timeout); } + stopped= TRUE; + pthread_mutex_unlock(&LOCK_guardian); + /* now, when the Guardian is stopped we can stop the IM */ + thread_registry.unregister_thread(&thread_info); + thread_registry.request_shutdown(); my_thread_end(); } -int Guardian_thread::start() +int Guardian_thread::is_stopped() +{ + int var; + pthread_mutex_lock(&LOCK_guardian); + var= stopped; + pthread_mutex_unlock(&LOCK_guardian); + return var; +} + + +/* + Initialize the list of guarded instances: loop through the Instance_map and + add all of the instances, which don't have 'nonguarded' option specified. 
+ + SYNOPSYS + Guardian_thread::init() + + RETURN + 0 - ok + 1 - error occured +*/ + +int Guardian_thread::init() { Instance *instance; Instance_map::Iterator iterator(instance_map); @@ -112,7 +275,7 @@ int Guardian_thread::start() instance_map->lock(); while ((instance= iterator.next())) { - if ((instance->options.is_guarded != NULL) && (instance->is_running())) + if ((instance->options.nonguarded == NULL)) if (guard(instance)) return 1; } @@ -123,7 +286,7 @@ int Guardian_thread::start() /* - Start instance guarding + Add instance to the Guardian list SYNOPSYS guard() @@ -131,47 +294,33 @@ int Guardian_thread::start() DESCRIPTION - The instance is added to the list of starting instances. Then after one guardian - loop it is moved to the guarded instances list. Usually guard() is called after we - start an instance, so we need to give some time to the instance to start. + The instance is added to the guarded instances list. Usually guard() is + called after we start an instance. RETURN 0 - ok 1 - error occured */ - int Guardian_thread::guard(Instance *instance) { - return add_instance_to_list(instance, &starting_instances); -} - - -void Guardian_thread::move_to_list(LIST **from, LIST **to) -{ - LIST *tmp; - - while (*from) - { - tmp= rest(*from); - *to= list_add(*to, *from); - *from= tmp; - } -} - - -int Guardian_thread::add_instance_to_list(Instance *instance, LIST **list) -{ LIST *node; + GUARD_NODE *content; node= (LIST *) alloc_root(&alloc, sizeof(LIST)); - if (node == NULL) + content= (GUARD_NODE *) alloc_root(&alloc, sizeof(GUARD_NODE)); + + if ((node == NULL) || (content == NULL)) return 1; /* we store the pointers to instances from the instance_map's MEM_ROOT */ - node->data= (void *) instance; + content->instance= instance; + content->restart_counter= 0; + content->crash_moment= 0; + content->state= NOT_STARTED; + node->data= (void *) content; pthread_mutex_lock(&LOCK_guardian); - *list= list_add(*list, node); + guarded_instances= 
list_add(guarded_instances, node); pthread_mutex_unlock(&LOCK_guardian); return 0; @@ -180,7 +329,7 @@ int Guardian_thread::add_instance_to_list(Instance *instance, LIST **list) /* TODO: perhaps it would make sense to create a pool of the LIST elements - elements and give them upon request. Now we are loosing a bit of memory when + and give them upon request. Now we are losing a bit of memory when guarded instance was stopped and then restarted (since we cannot free just a piece of the MEM_ROOT). */ @@ -198,7 +347,7 @@ int Guardian_thread::stop_guard(Instance *instance) We compare only pointers, as we always use pointers from the instance_map's MEM_ROOT. */ - if ((Instance *) node->data == instance) + if (((GUARD_NODE *) node->data)->instance == instance) { guarded_instances= list_delete(guarded_instances, node); pthread_mutex_unlock(&LOCK_guardian); @@ -212,3 +361,61 @@ int Guardian_thread::stop_guard(Instance *instance) return 0; } +/* + Start Guardian shutdown. Attempt to stop instances if requested. + + SYNOPSYS + stop_instances() + stop_instances_arg whether we should stop instances at shutdown + + DESCRIPTION + + Loops through the guarded_instances list and prepares them for shutdown. + If stop_instances was requested, we need to issue a stop command and change + the state accordingly. Otherwise we could simply delete an entry. + NOTE: Guardian should be locked by the calling function + + RETURN + 0 - ok + 1 - error occured +*/ + +int Guardian_thread::stop_instances(bool stop_instances_arg) +{ + LIST *node; + node= guarded_instances; + while (node != NULL) + { + if (!stop_instances_arg) + { + /* just forget about an instance */ + guarded_instances= list_delete(guarded_instances, node); + /* + This should still work fine, as we have only removed the + node from the list. 
The pointer to the next one is still valid + */ + node= node->next; + } + else + { + GUARD_NODE *current_node= (GUARD_NODE *) node->data; + /* + If instance is running or was running (and now probably hanging), + request stop. + */ + if (current_node->instance->is_running() || + (current_node->state == STARTED)) + { + current_node->state= STOPPING; + current_node->last_checked= time(NULL); + } + else + /* otherwise remove it from the list */ + guarded_instances= list_delete(guarded_instances, node); + /* But try to kill it anyway. Just in case */ + current_node->instance->kill_instance(SIGTERM); + node= node->next; + } + } + return 0; +} |