summaryrefslogtreecommitdiff
path: root/server-tools/instance-manager/guardian.cc
diff options
context:
space:
mode:
Diffstat (limited to 'server-tools/instance-manager/guardian.cc')
-rw-r--r--server-tools/instance-manager/guardian.cc307
1 files changed, 257 insertions, 50 deletions
diff --git a/server-tools/instance-manager/guardian.cc b/server-tools/instance-manager/guardian.cc
index 7375453673b..750c3495870 100644
--- a/server-tools/instance-manager/guardian.cc
+++ b/server-tools/instance-manager/guardian.cc
@@ -21,9 +21,32 @@
#include "guardian.h"
#include "instance_map.h"
+#include "instance.h"
#include "mysql_manager_error.h"
#include "log.h"
#include <string.h>
+#include <sys/types.h>
+#include <signal.h>
+
+
+/*
+ The Guardian list node structure. Guardian utilizes it to store
+ guarded instances plus some additional info.
+*/
+
+struct GUARD_NODE
+{
+ Instance *instance;
+ /* state of an instance (i.e. STARTED, CRASHED, etc.) */
+ int state;
+ /* the amount of attemts to restart instance (cleaned up at success) */
+ int restart_counter;
+ /* triggered at a crash */
+ time_t crash_moment;
+ /* General time field. Used to provide timeouts (at shutdown and restart) */
+ time_t last_checked;
+};
+
C_MODE_START
@@ -42,13 +65,13 @@ Guardian_thread::Guardian_thread(Thread_registry &thread_registry_arg,
uint monitoring_interval_arg) :
Guardian_thread_args(thread_registry_arg, instance_map_arg,
monitoring_interval_arg),
- thread_info(pthread_self())
+ thread_info(pthread_self()), guarded_instances(0)
{
pthread_mutex_init(&LOCK_guardian, 0);
- thread_registry.register_thread(&thread_info);
+ pthread_cond_init(&COND_guardian, 0);
+ shutdown_requested= FALSE;
+ stopped= FALSE;
init_alloc_root(&alloc, MEM_ROOT_BLOCK_SIZE, 0);
- guarded_instances= NULL;
- starting_instances= NULL;
}
@@ -57,9 +80,113 @@ Guardian_thread::~Guardian_thread()
/* delay guardian destruction to the moment when no one needs it */
pthread_mutex_lock(&LOCK_guardian);
free_root(&alloc, MYF(0));
- thread_registry.unregister_thread(&thread_info);
pthread_mutex_unlock(&LOCK_guardian);
pthread_mutex_destroy(&LOCK_guardian);
+ pthread_cond_destroy(&COND_guardian);
+}
+
+
+void Guardian_thread::request_shutdown(bool stop_instances_arg)
+{
+ pthread_mutex_lock(&LOCK_guardian);
+ /* stop instances or just clean up Guardian repository */
+ stop_instances(stop_instances_arg);
+ shutdown_requested= TRUE;
+ pthread_mutex_unlock(&LOCK_guardian);
+}
+
+
+void Guardian_thread::process_instance(Instance *instance,
+ GUARD_NODE *current_node,
+ LIST **guarded_instances,
+ LIST *elem)
+{
+ int waitchild= Instance::DEFAULT_SHUTDOWN_DELAY;
+ /* The amount of times, Guardian attempts to restart an instance */
+ int restart_retry= 100;
+ time_t current_time= time(NULL);
+
+ if (current_node->state == STOPPING)
+ {
+ /* this brach is executed during shutdown */
+ if (instance->options.shutdown_delay != NULL)
+ waitchild= atoi(instance->options.shutdown_delay);
+
+ /* this returns true if and only if an instance was stopped for shure */
+ if (instance->is_crashed())
+ *guarded_instances= list_delete(*guarded_instances, elem);
+ else if (current_time - current_node->last_checked > waitchild)
+ {
+ instance->kill_instance(SIGKILL);
+ /*
+ Later we do elem= elem->next. This is ok, as we are only removing
+ the node from the list. The pointer to the next one is still valid.
+ */
+ *guarded_instances= list_delete(*guarded_instances, elem);
+ }
+
+ return;
+ }
+
+ if (instance->is_running())
+ {
+ /* clear status fields */
+ current_node->restart_counter= 0;
+ current_node->crash_moment= 0;
+ current_node->state= STARTED;
+ }
+ else
+ {
+ switch (current_node->state)
+ {
+ case NOT_STARTED:
+ instance->start();
+ current_node->last_checked= current_time;
+ log_info("guardian: starting instance %s",
+ instance->options.instance_name);
+ current_node->state= STARTING;
+ break;
+ case STARTED: /* fallthrough */
+ case STARTING: /* let the instance start or crash */
+ if (instance->is_crashed())
+ {
+ current_node->crash_moment= current_time;
+ current_node->last_checked= current_time;
+ current_node->state= JUST_CRASHED;
+ /* fallthrough -- restart an instance immediately */
+ }
+ else
+ break;
+ case JUST_CRASHED:
+ if (current_time - current_node->crash_moment <= 2)
+ {
+ instance->start();
+ log_info("guardian: starting instance %s",
+ instance->options.instance_name);
+ }
+ else current_node->state= CRASHED;
+ break;
+ case CRASHED: /* just regular restarts */
+ if (current_time - current_node->last_checked >
+ monitoring_interval)
+ {
+ if ((current_node->restart_counter < restart_retry))
+ {
+ instance->start();
+ current_node->last_checked= current_time;
+ ((GUARD_NODE *) elem->data)->restart_counter++;
+ log_info("guardian: starting instance %s",
+ instance->options.instance_name);
+ }
+ else current_node->state= CRASHED_AND_ABANDONED;
+ }
+ break;
+ case CRASHED_AND_ABANDONED:
+ break; /* do nothing */
+ default:
+ DBUG_ASSERT(0);
+ }
+ }
}
@@ -78,33 +205,69 @@ Guardian_thread::~Guardian_thread()
void Guardian_thread::run()
{
Instance *instance;
- LIST *loop;
+ LIST *elem;
+ struct timespec timeout;
+
+ thread_registry.register_thread(&thread_info);
my_thread_init();
+ pthread_mutex_lock(&LOCK_guardian);
- while (!thread_registry.is_shutdown())
+ /* loop, until all instances were shut down at the end */
+ while (!(shutdown_requested && (guarded_instances == NULL)))
{
- pthread_mutex_lock(&LOCK_guardian);
- loop= guarded_instances;
- while (loop != NULL)
+ elem= guarded_instances;
+
+ while (elem != NULL)
{
- instance= (Instance *) loop->data;
- /* instance-> start already checks whether instance is running */
- if (instance->start() != ER_INSTANCE_ALREADY_STARTED)
- log_info("guardian attempted to restart instance %s",
- instance->options.instance_name);
- loop= loop->next;
+ struct timespec timeout;
+
+ GUARD_NODE *current_node= (GUARD_NODE *) elem->data;
+ instance= ((GUARD_NODE *) elem->data)->instance;
+ process_instance(instance, current_node, &guarded_instances, elem);
+
+ elem= elem->next;
}
- move_to_list(&starting_instances, &guarded_instances);
- pthread_mutex_unlock(&LOCK_guardian);
- sleep(monitoring_interval);
+ timeout.tv_sec= time(NULL) + monitoring_interval;
+ timeout.tv_nsec= 0;
+
+ /* check the loop predicate before sleeping */
+ if (!(shutdown_requested && (guarded_instances == NULL)))
+ pthread_cond_timedwait(&COND_guardian, &LOCK_guardian, &timeout);
}
+ stopped= TRUE;
+ pthread_mutex_unlock(&LOCK_guardian);
+ /* now, when the Guardian is stopped we can stop the IM */
+ thread_registry.unregister_thread(&thread_info);
+ thread_registry.request_shutdown();
my_thread_end();
}
-int Guardian_thread::start()
+int Guardian_thread::is_stopped()
+{
+ int var;
+ pthread_mutex_lock(&LOCK_guardian);
+ var= stopped;
+ pthread_mutex_unlock(&LOCK_guardian);
+ return var;
+}
+
+
+/*
+ Initialize the list of guarded instances: loop through the Instance_map and
+ add all of the instances, which don't have 'nonguarded' option specified.
+
+ SYNOPSYS
+ Guardian_thread::init()
+
+ RETURN
+ 0 - ok
+ 1 - error occured
+*/
+
+int Guardian_thread::init()
{
Instance *instance;
Instance_map::Iterator iterator(instance_map);
@@ -112,7 +275,7 @@ int Guardian_thread::start()
instance_map->lock();
while ((instance= iterator.next()))
{
- if ((instance->options.is_guarded != NULL) && (instance->is_running()))
+ if ((instance->options.nonguarded == NULL))
if (guard(instance))
return 1;
}
@@ -123,7 +286,7 @@ int Guardian_thread::start()
/*
- Start instance guarding
+ Add instance to the Guardian list
SYNOPSYS
guard()
@@ -131,47 +294,33 @@ int Guardian_thread::start()
DESCRIPTION
- The instance is added to the list of starting instances. Then after one guardian
- loop it is moved to the guarded instances list. Usually guard() is called after we
- start an instance, so we need to give some time to the instance to start.
+ The instance is added to the guarded instances list. Usually guard() is
+ called after we start an instance.
RETURN
0 - ok
1 - error occured
*/
-
int Guardian_thread::guard(Instance *instance)
{
- return add_instance_to_list(instance, &starting_instances);
-}
-
-
-void Guardian_thread::move_to_list(LIST **from, LIST **to)
-{
- LIST *tmp;
-
- while (*from)
- {
- tmp= rest(*from);
- *to= list_add(*to, *from);
- *from= tmp;
- }
-}
-
-
-int Guardian_thread::add_instance_to_list(Instance *instance, LIST **list)
-{
LIST *node;
+ GUARD_NODE *content;
node= (LIST *) alloc_root(&alloc, sizeof(LIST));
- if (node == NULL)
+ content= (GUARD_NODE *) alloc_root(&alloc, sizeof(GUARD_NODE));
+
+ if ((node == NULL) || (content == NULL))
return 1;
/* we store the pointers to instances from the instance_map's MEM_ROOT */
- node->data= (void *) instance;
+ content->instance= instance;
+ content->restart_counter= 0;
+ content->crash_moment= 0;
+ content->state= NOT_STARTED;
+ node->data= (void *) content;
pthread_mutex_lock(&LOCK_guardian);
- *list= list_add(*list, node);
+ guarded_instances= list_add(guarded_instances, node);
pthread_mutex_unlock(&LOCK_guardian);
return 0;
@@ -180,7 +329,7 @@ int Guardian_thread::add_instance_to_list(Instance *instance, LIST **list)
/*
TODO: perhaps it would make sense to create a pool of the LIST elements
- elements and give them upon request. Now we are loosing a bit of memory when
+ and give them upon request. Now we are loosing a bit of memory when
guarded instance was stopped and then restarted (since we cannot free just
a piece of the MEM_ROOT).
*/
@@ -198,7 +347,7 @@ int Guardian_thread::stop_guard(Instance *instance)
We compare only pointers, as we always use pointers from the
instance_map's MEM_ROOT.
*/
- if ((Instance *) node->data == instance)
+ if (((GUARD_NODE *) node->data)->instance == instance)
{
guarded_instances= list_delete(guarded_instances, node);
pthread_mutex_unlock(&LOCK_guardian);
@@ -212,3 +361,61 @@ int Guardian_thread::stop_guard(Instance *instance)
return 0;
}
+/*
+ Start Guardian shutdown. Attempt to start instances if requested.
+
+ SYNOPSYS
+ stop_instances()
+ stop_instances_arg whether we should stop instances at shutdown
+
+ DESCRIPTION
+
+ Loops through the guarded_instances list and prepares them for shutdown.
+ If stop_instances was requested, we need to issue a stop command and change
+ the state accordingly. Otherwise we could simply delete an entry.
+ NOTE: Guardian should be locked by the calling function
+
+ RETURN
+ 0 - ok
+ 1 - error occured
+*/
+
+int Guardian_thread::stop_instances(bool stop_instances_arg)
+{
+ LIST *node;
+ node= guarded_instances;
+ while (node != NULL)
+ {
+ if (!stop_instances_arg)
+ {
+ /* just forget about an instance */
+ guarded_instances= list_delete(guarded_instances, node);
+ /*
+ This should still work fine, as we have only removed the
+ node from the list. The pointer to the next one is still valid
+ */
+ node= node->next;
+ }
+ else
+ {
+ GUARD_NODE *current_node= (GUARD_NODE *) node->data;
+ /*
+ If instance is running or was running (and now probably hanging),
+ request stop.
+ */
+ if (current_node->instance->is_running() ||
+ (current_node->state == STARTED))
+ {
+ current_node->state= STOPPING;
+ current_node->last_checked= time(NULL);
+ }
+ else
+ /* otherwise remove it from the list */
+ guarded_instances= list_delete(guarded_instances, node);
+ /* But try to kill it anyway. Just in case */
+ current_node->instance->kill_instance(SIGTERM);
+ node= node->next;
+ }
+ }
+ return 0;
+}