summaryrefslogtreecommitdiff
path: root/server-tools/instance-manager/guardian.cc
diff options
context:
space:
mode:
authorunknown <petr@mysql.com>2005-01-31 23:54:08 +0300
committerunknown <petr@mysql.com>2005-01-31 23:54:08 +0300
commit92a52cccf6fe4f41f2b234b162daaae3d2e3ab26 (patch)
tree2f2dbe002f0f9d45952166fd4ac2722be38b39e7 /server-tools/instance-manager/guardian.cc
parent28b1aefa000d752d7275fb3cb5d01545a39911ea (diff)
downloadmariadb-git-92a52cccf6fe4f41f2b234b162daaae3d2e3ab26.tar.gz
more fixes for IM to substitute mysqld_safe in startup scripts
BitKeeper/deleted/.del-thread_repository.cc~bba09f64f8cb4037: Delete: server-tools/instance-manager/thread_repository.cc BitKeeper/deleted/.del-thread_repository.h~e6a3b9cab7a6612a: Delete: server-tools/instance-manager/thread_repository.h server-tools/instance-manager/commands.cc: All instances are guarded by default now, so we need to perform a check on whether the instance is nonguarded, rather than guarded, when adding it to the list of guarded instances. server-tools/instance-manager/guardian.cc: Guardian rewritten to start instances by default, and shut them down, when exiting. Behaviour of the guardian in case of the instance crash has changed. Now it tries to restart an instance constantly in the first 2 seconds after the crash was noticed, and then it tries to restart an instance once per MONITORING_INTERVAL. If it fails to restart the instance "restart_retry" (compiled-in value) times, guardian stops trying to restart it. server-tools/instance-manager/guardian.h: Several new functions and variables declared. server-tools/instance-manager/instance.cc: now start doesn't call stop(), but rather tries to remove the pidfile itself server-tools/instance-manager/instance.h: cleanup server-tools/instance-manager/instance_map.cc: no more "admin" options server-tools/instance-manager/instance_map.h: User and password purged from instance_map options, as IM shouldn't know them server-tools/instance-manager/instance_options.cc: new option added -- shutdown_delay, guarded option now called nonguarded and has the opposite meaning server-tools/instance-manager/instance_options.h: appropriate changes, reflecting options addition/removal server-tools/instance-manager/manager.cc: shutdown process is complicated a bit -- at first signal thread should stop guardian, and only then the IM itself server-tools/instance-manager/messages.cc: update error message server-tools/instance-manager/options.cc: admin user/password purged from mysqlmanager options
Diffstat (limited to 'server-tools/instance-manager/guardian.cc')
-rw-r--r--server-tools/instance-manager/guardian.cc107
1 files changed, 96 insertions, 11 deletions
diff --git a/server-tools/instance-manager/guardian.cc b/server-tools/instance-manager/guardian.cc
index f68ef6575a0..e8f9068dbb9 100644
--- a/server-tools/instance-manager/guardian.cc
+++ b/server-tools/instance-manager/guardian.cc
@@ -46,6 +46,8 @@ Guardian_thread::Guardian_thread(Thread_registry &thread_registry_arg,
{
pthread_mutex_init(&LOCK_guardian, 0);
pthread_cond_init(&COND_guardian, 0);
+ shutdown_guardian= FALSE;
+ is_stopped= FALSE;
thread_registry.register_thread(&thread_info);
init_alloc_root(&alloc, MEM_ROOT_BLOCK_SIZE, 0);
guarded_instances= NULL;
@@ -65,6 +67,22 @@ Guardian_thread::~Guardian_thread()
}
+void Guardian_thread::shutdown() /* set shutdown_guardian, the flag tested by the while loop in run() */
+{
+ pthread_mutex_lock(&LOCK_guardian); /* the flag is written while holding LOCK_guardian */
+ shutdown_guardian= TRUE; /* COND_guardian is not signalled here; run() re-tests the flag after its timed wait returns */
+ pthread_mutex_unlock(&LOCK_guardian);
+}
+
+
+void Guardian_thread::request_stop_instances() /* set request_stop; run() calls stop_instances() after its loop when this flag is set */
+{
+ pthread_mutex_lock(&LOCK_guardian); /* the flag is written while holding LOCK_guardian */
+ request_stop= TRUE;
+ pthread_mutex_unlock(&LOCK_guardian);
+}
+
+
/*
Run guardian thread
@@ -80,6 +98,7 @@ Guardian_thread::~Guardian_thread()
void Guardian_thread::run()
{
Instance *instance;
+ int restart_retry= 100;
LIST *loop;
struct timespec timeout;
@@ -87,26 +106,68 @@ void Guardian_thread::run()
pthread_mutex_lock(&LOCK_guardian);
- while (!thread_registry.is_shutdown())
+ while (!shutdown_guardian)
{
+ int status= 0;
loop= guarded_instances;
while (loop != NULL)
{
- instance= (Instance *) loop->data;
- /* instance-> start already checks whether the instance is running */
- if (instance->start() != ER_INSTANCE_ALREADY_STARTED)
- log_info("guardian attempted to restart instance %s",
- instance->options.instance_name);
+ instance= ((GUARD_NODE *) loop->data)->instance;
+ if (!instance->is_running())
+ {
+ int state= 0; /* state of guardian */
+
+ if ((((GUARD_NODE *) loop->data)->crash_moment == 0))
+ state= 1; /* an instance just crashed */
+ else
+ if (time(NULL) - ((GUARD_NODE *) loop->data)->crash_moment <= 2)
+ /* try to restart an instance immediately */
+ state= 2;
+ else
+ state= 3; /* try to restart it */
+
+ if (state == 1)
+ ((GUARD_NODE *) loop->data)->crash_moment= time(NULL);
+
+ if ((state == 1) || (state == 2))
+ {
+ instance->start();
+ ((GUARD_NODE *) loop->data)->restart_counter++;
+ log_info("guardian: starting instance %s",
+ instance->options.instance_name);
+ }
+ else
+ {
+ if ((status == ETIMEDOUT) &&
+ (((GUARD_NODE *) loop->data)->restart_counter < restart_retry))
+ {
+ instance->start();
+ ((GUARD_NODE *) loop->data)->restart_counter++;
+ log_info("guardian: starting instance %s",
+ instance->options.instance_name);
+ }
+ }
+ }
+ else /* clear status fields */
+ {
+ ((GUARD_NODE *) loop->data)->restart_counter= 0;
+ ((GUARD_NODE *) loop->data)->crash_moment= 0;
+ }
loop= loop->next;
}
move_to_list(&starting_instances, &guarded_instances);
timeout.tv_sec= time(NULL) + monitoring_interval;
timeout.tv_nsec= 0;
- pthread_cond_timedwait(&COND_guardian, &LOCK_guardian, &timeout);
+ status= pthread_cond_timedwait(&COND_guardian, &LOCK_guardian, &timeout);
}
pthread_mutex_unlock(&LOCK_guardian);
+ if (request_stop)
+ stop_instances();
+ is_stopped= TRUE;
+ /* now, when the Guardian is stopped we can stop the IM */
+ thread_registry.request_shutdown();
my_thread_end();
}
@@ -119,7 +180,7 @@ int Guardian_thread::start()
instance_map->lock();
while ((instance= iterator.next()))
{
- if ((instance->options.is_guarded != NULL) && (instance->is_running()))
+ if ((instance->options.nonguarded == NULL))
if (guard(instance))
return 1;
}
@@ -170,12 +231,18 @@ void Guardian_thread::move_to_list(LIST **from, LIST **to)
int Guardian_thread::add_instance_to_list(Instance *instance, LIST **list)
{
LIST *node;
+ GUARD_NODE *content;
node= (LIST *) alloc_root(&alloc, sizeof(LIST));
- if (node == NULL)
+ content= (GUARD_NODE *) alloc_root(&alloc, sizeof(GUARD_NODE));
+
+ if ((node == NULL) || (content == NULL))
return 1;
/* we store the pointers to instances from the instance_map's MEM_ROOT */
- node->data= (void *) instance;
+ content->instance= instance;
+ content->restart_counter= 0;
+ content->crash_moment= 0;
+ node->data= (void *) content;
pthread_mutex_lock(&LOCK_guardian);
*list= list_add(*list, node);
@@ -205,7 +272,7 @@ int Guardian_thread::stop_guard(Instance *instance)
We compare only pointers, as we always use pointers from the
instance_map's MEM_ROOT.
*/
- if ((Instance *) node->data == instance)
+ if (((GUARD_NODE *) node->data)->instance == instance)
{
guarded_instances= list_delete(guarded_instances, node);
pthread_mutex_unlock(&LOCK_guardian);
@@ -219,3 +286,21 @@ int Guardian_thread::stop_guard(Instance *instance)
return 0;
}
+int Guardian_thread::stop_instances() /* stop every instance in instance_map whose nonguarded option is NULL; returns 1 if stop_guard() fails, 0 otherwise */
+{
+ Instance *instance;
+ Instance_map::Iterator iterator(instance_map); /* NOTE(review): instance_map->lock() is not called here, unlike in start() -- confirm this is intended */
+
+ while ((instance= iterator.next()))
+ {
+ if ((instance->options.nonguarded == NULL)) /* nonguarded == NULL selects the instances that start() passes to guard() */
+ {
+ if (stop_guard(instance)) /* remove the instance from the guarded list before stopping it */
+ return 1; /* a nonzero stop_guard() result aborts the loop; remaining instances are not stopped */
+ /* let us try to stop the server */
+ instance->stop(); /* the result of stop() is ignored */
+ }
+ }
+
+ return 0;
+}