diff options
Diffstat (limited to 'daemons/lvmlockd/lvmlockd-core.c')
-rw-r--r-- | daemons/lvmlockd/lvmlockd-core.c | 5715 |
1 files changed, 5715 insertions, 0 deletions
diff --git a/daemons/lvmlockd/lvmlockd-core.c b/daemons/lvmlockd/lvmlockd-core.c new file mode 100644 index 000000000..d058ea1c3 --- /dev/null +++ b/daemons/lvmlockd/lvmlockd-core.c @@ -0,0 +1,5715 @@ +/* + * Copyright (C) 2014 Red Hat, Inc. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU Lesser General Public License v.2.1. + */ + +#define _XOPEN_SOURCE 500 /* pthread */ +#define _ISOC99_SOURCE +#define _GNU_SOURCE + +#include "configure.h" +#include "daemon-io.h" +#include "daemon-server.h" +#include "daemon-log.h" +#include "config-util.h" +#include "lvm-version.h" +#include "lvmetad-client.h" +#include "lvmlockd-client.h" + +#include <assert.h> +#include <pthread.h> +#include <stdint.h> +#include <stddef.h> +#include <stdlib.h> +#include <unistd.h> +#include <poll.h> +#include <errno.h> +#include <signal.h> +#include <getopt.h> +#include <syslog.h> +#include <dirent.h> +#include <time.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/utsname.h> +#include <sys/un.h> + +#define EXTERN +#include "lvmlockd-internal.h" + +/* + * Basic operation of lvmlockd + * + * lvmlockd main process runs main_loop() which uses poll(). + * poll listens for new connections from lvm commands and for + * messages from existing connected lvm commands. + * + * lvm command starts and connects to lvmlockd. + * + * lvmlockd receives a connection request from command and adds a + * 'struct client' to keep track of the connection to the command. + * The client's fd is added to the set of fd's in poll(). + * + * lvm command sends a lock request to lvmlockd. The lock request + * can be for the global lock, a vg lock, or an lv lock. + * + * lvmlockd main_loop/poll sees a message from an existing client. + * It sets client.recv = 1, then wakes up client_thread_main. 
+ * + * client_thread_main iterates through client structs (cl), looking + * for any that need processing, finds the one with cl->recv set, + * and calls client_recv_action(cl). + * + * client_recv_action(cl) reads the message/request from the client, + * allocates a new 'struct action' (act) to represent the request, + * sets the act with what is found in the request, then looks at + * the specific operation in act->op (LD_OP_FOO) to decide what to + * do with the action: + * + * . If the action is to start a lockspace, create a new thread + * to manage that lockspace: add_lockspace(act). + * + * . If the action is a lock request, pass the act to the thread + * that is managing that lockspace: add_lock_action(act). + * + * . Other misc actions are passed to the worker_thread: + * add_work_action(act). + * + * Once the client_thread has passed the action off to another + * thread to process, it goes back to waiting for more client + * handling work to do. + * + * The thread that was given the action by the client_thread + * now processes that action according to the operation, act->op. + * This is either a lockspace_thread (for lock ops or ops that + * add/rem a lockspace), or the worker_thread. See below for + * how these ops are processed by these threads. When the + * given thread is done processing the action, the result is + * set in act->result, and the act struct for the completed action + * is passed back to the client_thread (client_results list). + * + * The client_thread takes completed actions (from client_results + * list), and sends the result back to the client that sent the + * request represented by the action. The act struct is then freed. + * + * This completes the cycle of work between lvm commands (clients) + * and lvmlockd. 
In summary: + * + * - main process polls for new client connections and new requests + * from lvm commands + * - client_thread reads requests from clients + * - client_thread creates an action struct for each request + * - client_thread passes the act to another thread for processing + * - other threads pass completed act structs back to client_thread + * - client_thread sends the act result back to the client and frees the act + * + * + * Lockspace threads: + * Each lockd VG has its own lockspace that contains locks for that VG. + * Each 'struct lockspace' is managed by a separate lockspace_thread. + * When the lockspace_thread is first created, the first thing it does + * is join the lockspace in the lock manager. This can take a long time. + * If the join fails, the thread exits. After the join, the thread + * enters a loop waiting for lock actions to perform in the lockspace. + * + * The request to remove/leave a lockspace causes a flag to be set in + * the lockspace struct. When the lockspace_thread sees this flag + * set, it leaves the lockspace, and exits. + * + * When the client_thread passes a new action to a lockspace_thread, + * i.e. a new lock request, the lockspace_thread identifies which resource + * is being locked (GL, VG, LV), and gets the 'struct resource' (r) for it. + * r->type will be LD_RT_GL, LD_RT_VG, or LD_RT_LV. r->name is the + * resource name, and is fixed for GL and VG resources, but is based on + * the LV name for LV resources. The act is added to the resource's + * list of actions: r->actions, i.e. outstanding lock requests on the + * resource. + * + * The lockspace thread then iterates through each resource in the + * lockspace, processing any outstanding actions on each: res_process(ls, r). + * + * res_process() compares the outstanding actions/requests in r->actions + * against any existing locks on the resource in r->locks. If the + * action is blocked by existing locks, it's left on r->actions. 
If not, + * the action/request is passed to the lock manager. If the result from + * the lock manager is success, a new 'struct lock' is created for the + * action and saved on r->locks. The result is set in act->result and + * the act is passed back to the client_thread to be returned to the client. + */ + +static const char *lvmlockd_protocol = "lvmlockd"; +static const int lvmlockd_protocol_version = 1; +static int daemon_quit; +static int adopt_opt; + +static daemon_handle lvmetad_handle; +static pthread_mutex_t lvmetad_mutex; +static int lvmetad_connected; + +/* + * We use a separate socket for dumping daemon info. + * This will not interfere with normal operations, and allows + * free-form debug data to be dumped instead of the libdaemon + * protocol that wants all data in the cft format. + * 1MB should fit all the info we need to dump. + */ +#define DUMP_SOCKET_NAME "lvmlockd-dump.sock" +#define DUMP_BUF_SIZE (1024 * 1024) +static char dump_buf[DUMP_BUF_SIZE]; +static struct sockaddr_un dump_addr; +static socklen_t dump_addrlen; + +/* + * Main program polls client connections, adds new clients, + * adds work for client thread. + * + * pollfd_mutex is used for adding vs removing entries, + * and for resume vs realloc. + */ +#define POLL_FD_UNUSED -1 /* slot if free */ +#define POLL_FD_IGNORE -2 /* slot is used but ignore in poll */ +#define ADD_POLL_SIZE 16 /* increment slots by this amount */ + +static pthread_mutex_t pollfd_mutex; +static struct pollfd *pollfd; +static int pollfd_size; +static int pollfd_maxi; +static int listen_pi; +static int listen_fd; +static int restart_pi; +static int restart_fds[2]; + +/* + * Each lockspace has its own thread to do locking. + * The lockspace thread makes synchronous lock requests to dlm/sanlock. + * Every vg with a lockd type, i.e. "dlm", "sanlock", should be on this list. + * + * lockspaces_inactive holds old ls structs for vgs that have been + * stopped, or for vgs that failed to start. 
The old ls structs + * are removed from the inactive list and freed when a new ls with + * the same name is started and added to the standard lockspaces list. + * Keeping this bit of "history" for the ls allows us to return a + * more informative error message if a vg lock request is made for + * an ls that has been stopped or failed to start. + */ +static pthread_mutex_t lockspaces_mutex; +static struct list_head lockspaces; +static struct list_head lockspaces_inactive; + +/* + * This flag is set to 1 if we see multiple vgs with the global + * lock enabled. While this is set, we return a special flag + * with the vg lock result indicating to the lvm command that + * there is a duplicate gl in the vg which should be resolved. + * While this is set, find_lockspace_name has the side job of + * counting the number of lockspaces with enabled gl's so that + * this can be set back to zero when the duplicates are disabled. + */ +static int sanlock_gl_dup; + +/* + * Client thread reads client requests and writes client results. + */ +static pthread_t client_thread; +static pthread_mutex_t client_mutex; +static pthread_cond_t client_cond; +static struct list_head client_list; /* connected clients */ +static struct list_head client_results; /* actions to send back to clients */ +static uint32_t client_ids; /* 0 and ADOPT_CLIENT_ID are skipped */ +static int client_stop; /* stop the thread */ +static int client_work; /* a client on client_list has work to do */ + +#define ADOPT_CLIENT_ID 0xFFFFFFFF /* special client_id for adopt actions */ +static struct list_head adopt_results; /* special start actions from adopt_locks() */ + +/* + * Worker thread performs misc non-locking actions, e.g. init/free. 
+ */ +static pthread_t worker_thread; +static pthread_mutex_t worker_mutex; +static pthread_cond_t worker_cond; +static struct list_head worker_list; /* actions for worker_thread */ +static int worker_stop; /* stop the thread */ +static int worker_wake; /* wake the thread without adding work */ + +/* + * The content of every log_foo() statement is saved in the + * circular buffer, which can be dumped to a client and printed. + */ +#define LOG_LINE_SIZE 256 +#define LOG_DUMP_SIZE DUMP_BUF_SIZE +#define LOG_SYSLOG_PRIO LOG_WARNING +static char log_dump[LOG_DUMP_SIZE]; +static unsigned int log_point; +static unsigned int log_wrap; +static pthread_mutex_t log_mutex; +static int syslog_priority = LOG_SYSLOG_PRIO; + +/* + * Structure pools to avoid repeated malloc/free. + */ +#define MAX_UNUSED_ACTION 64 +#define MAX_UNUSED_CLIENT 64 +#define MAX_UNUSED_RESOURCE 64 +#define MAX_UNUSED_LOCK 64 +static pthread_mutex_t unused_struct_mutex; +static struct list_head unused_action; +static struct list_head unused_client; +static struct list_head unused_resource; +static struct list_head unused_lock; +static int unused_action_count; +static int unused_client_count; +static int unused_resource_count; +static int unused_lock_count; +static int resource_lm_data_size; /* max size of lm_data from sanlock|dlm */ +static int alloc_new_structs; /* used for initializing in setup_structs */ + +#define DO_STOP 1 +#define NO_STOP 0 +#define DO_FREE 1 +#define NO_FREE 0 +#define DO_FORCE 1 +#define NO_FORCE 0 + +static int add_lock_action(struct action *act); +static int str_to_lm(const char *str); +static int clear_lockspace_inactive(char *name); + +static int _syslog_name_to_num(const char *name) +{ + if (!strcmp(name, "emerg")) + return LOG_EMERG; + if (!strcmp(name, "alert")) + return LOG_ALERT; + if (!strcmp(name, "crit")) + return LOG_CRIT; + if (!strcmp(name, "err") || !strcmp(name, "error")) + return LOG_ERR; + if (!strcmp(name, "warning") || !strcmp(name, "warn")) + return 
LOG_WARNING; + if (!strcmp(name, "notice")) + return LOG_NOTICE; + if (!strcmp(name, "info")) + return LOG_INFO; + if (!strcmp(name, "debug")) + return LOG_DEBUG; + return LOG_WARNING; +} + +static const char *_syslog_num_to_name(int num) +{ + switch (num) { + case LOG_EMERG: + return "emerg"; + case LOG_ALERT: + return "alert"; + case LOG_CRIT: + return "crit"; + case LOG_ERR: + return "err"; + case LOG_WARNING: + return "warning"; + case LOG_NOTICE: + return "notice"; + case LOG_INFO: + return "info"; + case LOG_DEBUG: + return "debug"; + } + return "unknown"; +} + +static uint64_t monotime(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec; +} + +static void log_save_line(int len, char *line, + char *log_buf, unsigned int *point, unsigned int *wrap) +{ + unsigned int p = *point; + unsigned int w = *wrap; + int i; + + if (len < LOG_DUMP_SIZE - p) { + memcpy(log_buf + p, line, len); + p += len; + + if (p == LOG_DUMP_SIZE) { + p = 0; + w = 1; + } + goto out; + } + + for (i = 0; i < len; i++) { + log_buf[p++] = line[i]; + + if (p == LOG_DUMP_SIZE) { + p = 0; + w = 1; + } + } + out: + *point = p; + *wrap = w; +} + +void log_level(int level, const char *fmt, ...) 
+{ + char line[LOG_LINE_SIZE]; + va_list ap; + int len = LOG_LINE_SIZE - 1; + int ret, pos = 0; + + memset(line, 0, sizeof(line)); + + ret = snprintf(line, len, "%llu ", (unsigned long long)time(NULL)); + pos += ret; + + va_start(ap, fmt); + ret = vsnprintf(line + pos, len - pos, fmt, ap); + va_end(ap); + + if (ret >= len - pos) + pos = len - 1; + else + pos += ret; + + line[pos++] = '\n'; + line[pos++] = '\0'; + + pthread_mutex_lock(&log_mutex); + log_save_line(pos - 1, line, log_dump, &log_point, &log_wrap); + pthread_mutex_unlock(&log_mutex); + + if (level <= syslog_priority) + syslog(level, "%s", line); + + if (daemon_debug) + fprintf(stderr, "%s", line); +} + +static int dump_log(int *dump_len) +{ + int tail_len; + + pthread_mutex_lock(&log_mutex); + + if (!log_wrap && !log_point) { + *dump_len = 0; + } else if (log_wrap) { + tail_len = LOG_DUMP_SIZE - log_point; + memcpy(dump_buf, log_dump+log_point, tail_len); + if (log_point) + memcpy(dump_buf+tail_len, log_dump, log_point); + *dump_len = LOG_DUMP_SIZE; + } else { + memcpy(dump_buf, log_dump, log_point-1); + *dump_len = log_point-1; + } + pthread_mutex_unlock(&log_mutex); + + return 0; +} + +struct lockspace *alloc_lockspace(void) +{ + struct lockspace *ls; + + if (!(ls = malloc(sizeof(struct lockspace)))) { + log_error("out of memory for lockspace"); + return NULL; + } + + memset(ls, 0, sizeof(struct lockspace)); + INIT_LIST_HEAD(&ls->actions); + INIT_LIST_HEAD(&ls->resources); + pthread_mutex_init(&ls->mutex, NULL); + pthread_cond_init(&ls->cond, NULL); + return ls; +} + +static struct action *alloc_action(void) +{ + struct action *act; + + pthread_mutex_lock(&unused_struct_mutex); + if (!unused_action_count || alloc_new_structs) { + act = malloc(sizeof(struct action)); + } else { + act = list_first_entry(&unused_action, struct action, list); + list_del(&act->list); + unused_action_count--; + } + pthread_mutex_unlock(&unused_struct_mutex); + if (act) + memset(act, 0, sizeof(struct action)); + else + 
log_error("out of memory for action"); + return act; +} + +static struct client *alloc_client(void) +{ + struct client *cl; + + pthread_mutex_lock(&unused_struct_mutex); + if (!unused_client_count || alloc_new_structs) { + cl = malloc(sizeof(struct client)); + } else { + cl = list_first_entry(&unused_client, struct client, list); + list_del(&cl->list); + unused_client_count--; + } + pthread_mutex_unlock(&unused_struct_mutex); + if (cl) + memset(cl, 0, sizeof(struct client)); + else + log_error("out of memory for client"); + return cl; +} + +static struct resource *alloc_resource(void) +{ + struct resource *r; + + pthread_mutex_lock(&unused_struct_mutex); + if (!unused_resource_count || alloc_new_structs) { + r = malloc(sizeof(struct resource) + resource_lm_data_size); + } else { + r = list_first_entry(&unused_resource, struct resource, list); + list_del(&r->list); + unused_resource_count--; + } + pthread_mutex_unlock(&unused_struct_mutex); + if (r) { + memset(r, 0, sizeof(struct resource) + resource_lm_data_size); + INIT_LIST_HEAD(&r->locks); + INIT_LIST_HEAD(&r->actions); + } else { + log_error("out of memory for resource"); + } + return r; +} + +static struct lock *alloc_lock(void) +{ + struct lock *lk; + + pthread_mutex_lock(&unused_struct_mutex); + if (!unused_lock_count || alloc_new_structs) { + lk = malloc(sizeof(struct lock)); + } else { + lk = list_first_entry(&unused_lock, struct lock, list); + list_del(&lk->list); + unused_lock_count--; + } + pthread_mutex_unlock(&unused_struct_mutex); + if (lk) + memset(lk, 0, sizeof(struct lock)); + else + log_error("out of memory for lock"); + return lk; +} + +static void free_action(struct action *act) +{ + pthread_mutex_lock(&unused_struct_mutex); + if (unused_action_count >= MAX_UNUSED_ACTION) { + free(act); + } else { + list_add_tail(&act->list, &unused_action); + unused_action_count++; + } + pthread_mutex_unlock(&unused_struct_mutex); +} + +static void free_client(struct client *cl) +{ + 
pthread_mutex_lock(&unused_struct_mutex); + if (unused_client_count >= MAX_UNUSED_CLIENT) { + free(cl); + } else { + list_add_tail(&cl->list, &unused_client); + unused_client_count++; + } + pthread_mutex_unlock(&unused_struct_mutex); +} + +static void free_resource(struct resource *r) +{ + pthread_mutex_lock(&unused_struct_mutex); + if (unused_resource_count >= MAX_UNUSED_RESOURCE) { + free(r); + } else { + list_add_tail(&r->list, &unused_resource); + unused_resource_count++; + } + pthread_mutex_unlock(&unused_struct_mutex); +} + +static void free_lock(struct lock *lk) +{ + pthread_mutex_lock(&unused_struct_mutex); + if (unused_lock_count >= MAX_UNUSED_LOCK) { + free(lk); + } else { + list_add_tail(&lk->list, &unused_lock); + unused_lock_count++; + } + pthread_mutex_unlock(&unused_struct_mutex); +} + +static int setup_structs(void) +{ + struct action *act; + struct client *cl; + struct resource *r; + struct lock *lk; + int data_san = lm_data_size_sanlock(); + int data_dlm = lm_data_size_dlm(); + int i; + + resource_lm_data_size = data_san > data_dlm ? data_san : data_dlm; + + pthread_mutex_init(&unused_struct_mutex, NULL); + INIT_LIST_HEAD(&unused_action); + INIT_LIST_HEAD(&unused_client); + INIT_LIST_HEAD(&unused_resource); + INIT_LIST_HEAD(&unused_lock); + + /* + * For setup, force the alloc_ functions to alloc new structs instead + * of taking them unused. This allows alloc_struct/free_struct loop to + * populate the unused lists. 
+ */ + alloc_new_structs = 1; + + for (i = 0; i < MAX_UNUSED_ACTION/2; i++) { + if (!(act = alloc_action())) + goto fail; + free_action(act); + } + + for (i = 0; i < MAX_UNUSED_CLIENT/2; i++) { + if (!(cl = alloc_client())) + goto fail; + free_client(cl); + } + + for (i = 0; i < MAX_UNUSED_RESOURCE/2; i++) { + if (!(r = alloc_resource())) + goto fail; + free_resource(r); + } + + for (i = 0; i < MAX_UNUSED_LOCK/2; i++) { + if (!(lk = alloc_lock())) + goto fail; + free_lock(lk); + } + + alloc_new_structs = 0; + return 0; +fail: + alloc_new_structs = 0; + return -ENOMEM; +} + +static int add_pollfd(int fd) +{ + int i, new_size; + + pthread_mutex_lock(&pollfd_mutex); + for (i = 0; i < pollfd_size; i++) { + if (pollfd[i].fd != POLL_FD_UNUSED) + continue; + + pollfd[i].fd = fd; + pollfd[i].events = POLLIN; + pollfd[i].revents = 0; + + if (i > pollfd_maxi) + pollfd_maxi = i; + + pthread_mutex_unlock(&pollfd_mutex); + return i; + } + + new_size = pollfd_size + ADD_POLL_SIZE; + + pollfd = realloc(pollfd, new_size * sizeof(struct pollfd)); + if (!pollfd) { + log_error("can't alloc new size %d for pollfd", new_size); + return -ENOMEM; + } + + for (i = pollfd_size; i < new_size; i++) { + pollfd[i].fd = POLL_FD_UNUSED; + pollfd[i].events = 0; + pollfd[i].revents = 0; + } + + i = pollfd_size; + pollfd[i].fd = fd; + pollfd[i].events = POLLIN; + pollfd[i].revents = 0; + pollfd_maxi = i; + + pollfd_size = new_size; + + pthread_mutex_unlock(&pollfd_mutex); + return i; +} + +static void rem_pollfd(int pi) +{ + if (pi < 0) { + log_error("rem_pollfd %d", pi); + return; + } + pthread_mutex_lock(&pollfd_mutex); + pollfd[pi].fd = POLL_FD_UNUSED; + pollfd[pi].events = 0; + pollfd[pi].revents = 0; + pthread_mutex_unlock(&pollfd_mutex); +} + +static const char *lm_str(int x) +{ + switch (x) { + case LD_LM_NONE: + return "none"; + case LD_LM_DLM: + return "dlm"; + case LD_LM_SANLOCK: + return "sanlock"; + default: + return "lm_unknown"; + } +} + +static const char *rt_str(int x) +{ + switch 
(x) { + case LD_RT_GL: + return "gl"; + case LD_RT_VG: + return "vg"; + case LD_RT_LV: + return "lv"; + default: + return "."; + }; +} + +static const char *op_str(int x) +{ + switch (x) { + case LD_OP_INIT: + return "init"; + case LD_OP_FREE: + return "free"; + case LD_OP_START: + return "start"; + case LD_OP_STOP: + return "stop"; + case LD_OP_LOCK: + return "lock"; + case LD_OP_UPDATE: + return "update"; + case LD_OP_CLOSE: + return "close"; + case LD_OP_ENABLE: + return "enable"; + case LD_OP_DISABLE: + return "disable"; + case LD_OP_START_WAIT: + return "start_wait"; + case LD_OP_STOP_ALL: + return "stop_all"; + case LD_OP_RENAME_BEFORE: + return "rename_before"; + case LD_OP_RENAME_FINAL: + return "rename_final"; + case LD_OP_RUNNING_LM: + return "running_lm"; + case LD_OP_FIND_FREE_LOCK: + return "find_free_lock"; + case LD_OP_FORGET_VG_NAME: + return "forget_vg_name"; + default: + return "op_unknown"; + }; +} + +static const char *mode_str(int x) +{ + switch (x) { + case LD_LK_IV: + return "iv"; + case LD_LK_UN: + return "un"; + case LD_LK_NL: + return "nl"; + case LD_LK_SH: + return "sh"; + case LD_LK_EX: + return "ex"; + default: + return "."; + }; +} + +int last_string_from_args(char *args_in, char *last) +{ + const char *args = args_in; + const char *colon, *str = NULL; + + while (1) { + if (!args || (*args == '\0')) + break; + colon = strstr(args, ":"); + if (!colon) + break; + str = colon; + args = colon + 1; + } + + if (str) { + snprintf(last, MAX_ARGS, "%s", str + 1); + return 0; + } + return -1; +} + +int version_from_args(char *args, unsigned int *major, unsigned int *minor, unsigned int *patch) +{ + char version[MAX_ARGS]; + char *major_str, *minor_str, *patch_str; + char *n, *d1, *d2; + + strncpy(version, args, MAX_ARGS); + + n = strstr(version, ":"); + if (n) + *n = '\0'; + + d1 = strstr(version, "."); + if (!d1) + return -1; + + d2 = strstr(d1 + 1, "."); + if (!d2) + return -1; + + major_str = version; + minor_str = d1 + 1; + patch_str = d2 + 
1; + + *d1 = '\0'; + *d2 = '\0'; + + if (major) + *major = atoi(major_str); + if (minor) + *minor = atoi(minor_str); + if (patch) + *patch = atoi(patch_str); + + return 0; +} + +/* + * These are few enough that arrays of function pointers can + * be avoided. + */ + +static int lm_prepare_lockspace(struct lockspace *ls, struct action *act) +{ + int rv; + + if (ls->lm_type == LD_LM_DLM) + rv = lm_prepare_lockspace_dlm(ls); + else if (ls->lm_type == LD_LM_SANLOCK) + rv = lm_prepare_lockspace_sanlock(ls); + else + return -1; + + if (act) + act->lm_rv = rv; + return rv; +} + +static int lm_add_lockspace(struct lockspace *ls, struct action *act, int adopt) +{ + int rv; + + if (ls->lm_type == LD_LM_DLM) + rv = lm_add_lockspace_dlm(ls, adopt); + else if (ls->lm_type == LD_LM_SANLOCK) + rv = lm_add_lockspace_sanlock(ls, adopt); + else + return -1; + + if (act) + act->lm_rv = rv; + return rv; +} + +static int lm_rem_lockspace(struct lockspace *ls, struct action *act, int free_vg) +{ + int rv; + + if (ls->lm_type == LD_LM_DLM) + rv = lm_rem_lockspace_dlm(ls, free_vg); + else if (ls->lm_type == LD_LM_SANLOCK) + rv = lm_rem_lockspace_sanlock(ls, free_vg); + else + return -1; + + if (act) + act->lm_rv = rv; + return rv; +} + +static int lm_lock(struct lockspace *ls, struct resource *r, int mode, struct action *act, + uint32_t *r_version, int *retry, int adopt) +{ + int rv; + + if (ls->lm_type == LD_LM_DLM) + rv = lm_lock_dlm(ls, r, mode, r_version, adopt); + else if (ls->lm_type == LD_LM_SANLOCK) + rv = lm_lock_sanlock(ls, r, mode, r_version, retry, adopt); + else + return -1; + + if (act) + act->lm_rv = rv; + return rv; +} + +static int lm_convert(struct lockspace *ls, struct resource *r, + int mode, struct action *act, uint32_t r_version) +{ + int rv; + + if (ls->lm_type == LD_LM_DLM) + rv = lm_convert_dlm(ls, r, mode, r_version); + else if (ls->lm_type == LD_LM_SANLOCK) + rv = lm_convert_sanlock(ls, r, mode, r_version); + else + return -1; + + if (act) + act->lm_rv = rv; + 
return rv; +} + +static int lm_unlock(struct lockspace *ls, struct resource *r, struct action *act, + uint32_t r_version, uint32_t lmu_flags) +{ + int rv; + + if (ls->lm_type == LD_LM_DLM) + return lm_unlock_dlm(ls, r, r_version, lmu_flags); + else if (ls->lm_type == LD_LM_SANLOCK) + return lm_unlock_sanlock(ls, r, r_version, lmu_flags); + else + return -1; + + if (act) + act->lm_rv = rv; + return rv; +} + +static int lm_hosts(struct lockspace *ls, int notify) +{ + if (ls->lm_type == LD_LM_DLM) + return 0; + else if (ls->lm_type == LD_LM_SANLOCK) + return lm_hosts_sanlock(ls, notify); + return -1; +} + +static void lm_rem_resource(struct lockspace *ls, struct resource *r) +{ + if (ls->lm_type == LD_LM_DLM) + lm_rem_resource_dlm(ls, r); + else if (ls->lm_type == LD_LM_SANLOCK) + lm_rem_resource_sanlock(ls, r); +} + +static int lm_find_free_lock(struct lockspace *ls, uint64_t *free_offset) +{ + if (ls->lm_type == LD_LM_DLM) + return 0; + else if (ls->lm_type == LD_LM_SANLOCK) + return lm_find_free_lock_sanlock(ls, free_offset); + return -1; +} + +/* + * While adopting locks, actions originate from the adopt_locks() + * function, not from a client. So, these actions (flagged ADOPT), + * should be passed back to the adopt_locks() function through the + * adopt_results list, and not be sent back to a client via the + * client_list/client_thread. 
+ */ + +static void add_client_result(struct action *act) +{ + pthread_mutex_lock(&client_mutex); + if (act->flags & LD_AF_ADOPT) + list_add_tail(&act->list, &adopt_results); + else + list_add_tail(&act->list, &client_results); + pthread_cond_signal(&client_cond); + pthread_mutex_unlock(&client_mutex); +} + +static struct lock *find_lock_client(struct resource *r, uint32_t client_id) +{ + struct lock *lk; + + list_for_each_entry(lk, &r->locks, list) { + if (lk->client_id == client_id) + return lk; + } + return NULL; +} + +static struct lock *find_lock_persistent(struct resource *r) +{ + struct lock *lk; + + list_for_each_entry(lk, &r->locks, list) { + if (lk->flags & LD_LF_PERSISTENT) + return lk; + } + return NULL; +} + +static struct action *find_action_client(struct resource *r, uint32_t client_id) +{ + struct action *act; + + list_for_each_entry(act, &r->actions, list) { + if (act->client_id != client_id) + continue; + return act; + } + return NULL; +} + +static void add_work_action(struct action *act) +{ + pthread_mutex_lock(&worker_mutex); + if (!worker_stop) { + list_add_tail(&act->list, &worker_list); + pthread_cond_signal(&worker_cond); + } + pthread_mutex_unlock(&worker_mutex); +} + +static int res_lock(struct lockspace *ls, struct resource *r, struct action *act, int *retry) +{ + struct lock *lk; + uint32_t r_version = 0; + int rv; + + log_debug("S %s R %s res_lock mode %s", ls->name, r->name, mode_str(act->mode)); + + if (r->mode == LD_LK_SH && act->mode == LD_LK_SH) + goto add_lk; + + if (r->type == LD_RT_LV && act->lv_args[0]) + memcpy(r->lv_args, act->lv_args, MAX_ARGS); + + rv = lm_lock(ls, r, act->mode, act, &r_version, retry, act->flags & LD_AF_ADOPT); + if (rv == -EAGAIN) + return rv; + if (rv < 0) { + log_error("S %s R %s res_lock lm error %d", ls->name, r->name, rv); + return rv; + } + + log_debug("S %s R %s res_lock lm done r_version %u", + ls->name, r->name, r_version); + + /* lm_lock() reads new r_version */ + + if ((r_version > r->version) 
|| (!r->version && !r->version_zero_valid)) { + /* + * New r_version of the lock: means that another + * host has changed data protected by this lock + * since the last time we acquired it. We + * should invalidate any local cache of the data + * protected by this lock and reread it from disk. + */ + r->version = r_version; + + /* + * When a new global lock is enabled in a new vg, + * it will have version zero, and the first time + * we use it we need to validate the global cache + * since we don't have any version history to know + * the state of the cache. The version could remain + * zero for a long time if no global state is changed + * to cause the GL version to be incremented to 1. + */ + r->version_zero_valid = 1; + + /* + * r is vglk: tell lvmetad to set the vg invalid + * flag, and provide the new r_version. If lvmetad finds + * that its cached vg has seqno less than the value + * we send here, it will set the vg invalid flag. + * lvm commands that read the vg from lvmetad, will + * see the invalid flag returned, will reread the + * vg from disk, update the lvmetad copy, and go on. + * + * r is global: tell lvmetad to set the global invalid + * flag. When commands see this flag returned from lvmetad, + * they will reread metadata from disk, update the lvmetad + * caches, and tell lvmetad to set global invalid to 0. 
+ */ + + if ((r->type == LD_RT_VG) && lvmetad_connected) { + daemon_reply reply; + char *uuid; + + log_debug("S %s R %s res_lock set lvmetad vg version %u", + ls->name, r->name, r_version); + + if (!ls->vg_uuid[0] || !strcmp(ls->vg_uuid, "none")) + uuid = ls->name; + else + uuid = ls->vg_uuid; + + pthread_mutex_lock(&lvmetad_mutex); + reply = daemon_send_simple(lvmetad_handle, "set_vg_info", + "token = %s", "skip", + "uuid = %s", uuid, + "version = %d", (int)r_version, + NULL); + pthread_mutex_unlock(&lvmetad_mutex); + + if (reply.error || strcmp(daemon_reply_str(reply, "response", ""), "OK")) + log_error("set_vg_info in lvmetad failed %d", reply.error); + daemon_reply_destroy(reply); + } + + if ((r->type == LD_RT_GL) && lvmetad_connected) { + daemon_reply reply; + + log_debug("S %s R %s res_lock set lvmetad global invalid", + ls->name, r->name); + + pthread_mutex_lock(&lvmetad_mutex); + reply = daemon_send_simple(lvmetad_handle, "set_global_info", + "token = %s", "skip", + "global_invalid = %d", 1, + NULL); + pthread_mutex_unlock(&lvmetad_mutex); + + if (reply.error || strcmp(daemon_reply_str(reply, "response", ""), "OK")) + log_error("set_global_info in lvmetad failed %d", reply.error); + daemon_reply_destroy(reply); + } + } + + r->mode = act->mode; + +add_lk: + if (r->mode == LD_LK_SH) + r->sh_count++; + + if (!(lk = alloc_lock())) + return -ENOMEM; + + lk->client_id = act->client_id; + lk->mode = act->mode; + + if (act->flags & LD_AF_PERSISTENT) { + lk->flags |= LD_LF_PERSISTENT; + lk->client_id = 0; + } + + list_add_tail(&lk->list, &r->locks); + + return 0; +} + +static int res_convert(struct lockspace *ls, struct resource *r, + struct lock *lk, struct action *act) +{ + uint32_t r_version; + int rv; + + log_debug("S %s R %s res_convert mode %d", ls->name, r->name, act->mode); + + if (act->mode == LD_LK_EX && lk->mode == LD_LK_SH && r->sh_count > 1) + return -EAGAIN; + + /* + * lm_convert() writes new version (from ex) + * Same as lm_unlock() + */ + + if 
((r->type == LD_RT_GL) && (r->mode == LD_LK_EX)) { + r->version++; + lk->version = r->version; + r_version = r->version; + log_debug("S %s R %s res_convert r_version inc %u", + ls->name, r->name, r_version); + + } else if ((r->type == LD_RT_VG) && (r->mode == LD_LK_EX) && (lk->version > r->version)) { + r->version = lk->version; + r_version = r->version; + log_debug("S %s R %s res_convert r_version new %u", ls->name, r->name, r_version); + } else { + r_version = 0; + } + + rv = lm_convert(ls, r, act->mode, act, r_version); + if (rv < 0) { + log_error("S %s R %s res_convert lm error %d", ls->name, r->name, rv); + return rv; + } + + log_debug("S %s R %s res_convert lm done", ls->name, r->name); + + if (lk->mode == LD_LK_EX && act->mode == LD_LK_SH) { + r->sh_count = 1; + } else if (lk->mode == LD_LK_SH && act->mode == LD_LK_EX) { + r->sh_count = 0; + } else { + /* should not be possible */ + log_error("S %s R %s res_convert invalid modes %d %d", + ls->name, r->name, lk->mode, act->mode); + return -1; + } + + r->mode = act->mode; + lk->mode = act->mode; + + return 0; +} + +static int res_cancel(struct lockspace *ls, struct resource *r, + struct action *act) +{ + struct action *cact; + + /* + * a client can cancel its own non-persistent lock requests, + * when could this happen? + * + * a client can cancel other client's persistent lock requests, + * when could this happen? 
 */

	if (act->flags & LD_AF_PERSISTENT) {
		/* any queued persistent request can be cancelled */
		list_for_each_entry(cact, &r->actions, list) {
			if (!(cact->flags & LD_AF_PERSISTENT))
				continue;
			goto do_cancel;
		}
	} else {
		/* only this client's own queued request */
		cact = find_action_client(r, act->client_id);
		if (cact)
			goto do_cancel;
	}

	return -ENOENT;

do_cancel:
	log_debug("S %s R %s res_cancel client %d", ls->name, r->name, cact->client_id);
	cact->result = -ECANCELED;
	list_del(&cact->list);
	add_client_result(cact);

	return -ECANCELED;
}

/*
 * lm_unlock() writes new a r_version (from ex)
 *
 * The r_version of the vg resource is incremented if
 * an "update" was received for the vg lock.  The update
 * contains the new vg seqno from the vg metadata which is
 * used as the r_version.
 *
 * The r_version of the global resource is automatically
 * incremented when it is unlocked from ex mode.
 *
 * r_version is incremented every time a command releases
 * the global lock from ex.
 */

/*
 * persistent locks will not be unlocked for OP_CLOSE/act_close
 * because act_close->flags does not have the PERSISTENT flag
 * set, and a persistent lk->client_id is zero, which will not
 * match the client in act_close->client_id.
 */

static int res_unlock(struct lockspace *ls, struct resource *r,
		      struct action *act)
{
	struct lock *lk;
	uint32_t r_version;
	int rv;

	if (act->flags & LD_AF_PERSISTENT) {
		lk = find_lock_persistent(r);
		if (lk)
			goto do_unlock;
	} else {
		lk = find_lock_client(r, act->client_id);
		if (lk)
			goto do_unlock;
	}

	/*
	 * OP_CLOSE is a blanket "release anything held" on client
	 * exit, so finding nothing to unlock is not an error there.
	 */
	if (act->op != LD_OP_CLOSE)
		log_error("S %s R %s res_unlock no locks", ls->name, r->name);
	return -ENOENT;

do_unlock:
	log_debug("S %s R %s res_unlock %s", ls->name, r->name,
		  (act->op == LD_OP_CLOSE) ? "from close" : "");

	/* send unlock to lm when last sh lock is unlocked */
	if (lk->mode == LD_LK_SH) {
		r->sh_count--;
		if (r->sh_count > 0)
			goto rem_lk;
	}

	if ((r->type == LD_RT_GL) && (r->mode == LD_LK_EX)) {
		/* gl version is bumped on every ex release */
		r->version++;
		lk->version = r->version;
		r_version = r->version;

		log_debug("S %s R %s res_unlock r_version inc %u", ls->name, r->name, r_version);

	} else if ((r->type == LD_RT_VG) && (r->mode == LD_LK_EX) && (lk->version > r->version)) {
		/* vg version was set by a preceding "update" action */
		r->version = lk->version;
		r_version = r->version;

		log_debug("S %s R %s res_unlock r_version new %u",
			  ls->name, r->name, r_version);
	} else {
		r_version = 0;
	}

	rv = lm_unlock(ls, r, act, r_version, 0);
	if (rv < 0) {
		/* should never happen, retry? */
		log_error("S %s R %s res_unlock lm error %d", ls->name, r->name, rv);
		return rv;
	}

	log_debug("S %s R %s res_unlock lm done", ls->name, r->name);

rem_lk:
	list_del(&lk->list);
	free_lock(lk);

	/* resource is unlocked when the last holder is gone */
	if (list_empty(&r->locks))
		r->mode = LD_LK_UN;

	return 0;
}

/*
 * Record a new lock version from the client.  The version is only
 * written out by the lock manager when the ex lock is later unlocked.
 */
static int res_update(struct lockspace *ls, struct resource *r,
		      struct action *act)
{
	struct lock *lk;

	lk = find_lock_client(r, act->client_id);
	if (!lk) {
		log_error("S %s R %s res_update client %u lock not found",
			  ls->name, r->name, act->client_id);
		return -ENOENT;
	}

	if (r->mode != LD_LK_EX) {
		log_error("S %s R %s res_update version on non-ex lock",
			  ls->name, r->name);
		return -EINVAL;
	}

	/* lk version will be written to lm by unlock */

	if (act->flags & LD_AF_NEXT_VERSION)
		lk->version = r->version + 1;
	else
		lk->version = act->version;

	log_debug("S %s R %s res_update lk version to %u", ls->name, r->name, lk->version);

	return 0;
}

/*
 * There is nothing to deallocate when freeing a dlm LV, the LV
 * will simply be unlocked by rem_resource.
+ */ + +static int free_lv(struct lockspace *ls, struct resource *r) +{ + if (ls->lm_type == LD_LM_SANLOCK) + return lm_free_lv_sanlock(ls, r); + else if (ls->lm_type == LD_LM_DLM) + return 0; + else + return -EINVAL; +} + +/* + * NB. we can't do this if sanlock is holding any locks on + * the resource; we'd be rewriting the resource from under + * sanlock and would confuse or break it badly. We don't + * know what another host is doing, so these must be used + * very carefully. + */ + +static int res_able(struct lockspace *ls, struct resource *r, + struct action *act) +{ + int rv; + + if (ls->lm_type != LD_LM_SANLOCK) { + log_error("enable/disable only applies to sanlock"); + return -EINVAL; + } + + if (r->type != LD_RT_GL) { + log_error("enable/disable only applies to global lock"); + return -EINVAL; + } + + if (r->mode != LD_LK_UN) { + log_error("enable/disable only allowed on unlocked resource"); + return -EINVAL; + } + + if (act->op == LD_OP_ENABLE && gl_lsname_sanlock[0]) { + log_error("disable global lock in %s before enable in %s", + gl_lsname_sanlock, ls->name); + return -EINVAL; + } + + if ((act->op == LD_OP_DISABLE) && (act->flags & LD_AF_EX_DISABLE)) { + rv = lm_ex_disable_gl_sanlock(ls); + goto out; + } + + rv = lm_able_gl_sanlock(ls, act->op == LD_OP_ENABLE); +out: + return rv; +} + +/* + * Go through queued actions, and make lock/unlock calls on the resource + * based on the actions and the existing lock state. + * + * All lock operations sent to the lock manager are non-blocking. + * This is because sanlock does not support lock queueing. + * Eventually we could enhance this to take advantage of lock + * queueing when available (i.e. for the dlm). + * + * act_close_list: list of CLOSE actions, identifying clients that have + * closed/terminated their lvmlockd connection, and whose locks should + * be released. Do not remove these actions from act_close_list. 
 * retry_out: set to 1 if the lock manager said we should retry,
 * meaning we should call res_process() again in a short while to retry.
 */

static void res_process(struct lockspace *ls, struct resource *r,
			struct list_head *act_close_list, int *retry_out)
{
	struct action *act, *safe, *act_close;
	struct lock *lk;
	int lm_retry;
	int rv;

	/*
	 * handle version updates for ex locks
	 * (new version will be written by unlock)
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (act->op == LD_OP_UPDATE) {
			rv = res_update(ls, r, act);
			act->result = rv;
			list_del(&act->list);
			add_client_result(act);
		}
	}

	/*
	 * handle explicit unlock actions
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		/* IV/NL are not lockable modes; reject them up front */
		if ((act->op == LD_OP_LOCK) &&
		    (act->mode == LD_LK_IV || act->mode == LD_LK_NL)) {
			act->result = -EINVAL;
			list_del(&act->list);
			add_client_result(act);
		}

		if (act->op == LD_OP_LOCK && act->mode == LD_LK_UN) {
			rv = res_unlock(ls, r, act);

			if (rv == -ENOENT && (act->flags & LD_AF_UNLOCK_CANCEL))
				rv = res_cancel(ls, r, act);

			/*
			 * possible unlock results:
			 * 0: unlock succeeded
			 * -ECANCELED: cancel succeeded
			 * -ENOENT: nothing to unlock or cancel
			 */

			act->result = rv;
			list_del(&act->list);
			add_client_result(act);
		}
	}

	/*
	 * handle implicit unlocks due to client exit,
	 * also clear any outstanding actions for the client
	 */

	list_for_each_entry(act_close, act_close_list, list) {
		res_unlock(ls, r, act_close);
		res_cancel(ls, r, act_close);
	}

	/*
	 * handle freeing a lock for an lv that has been removed
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (act->op == LD_OP_FREE && act->rt == LD_RT_LV) {
			log_debug("S %s R %s free_lv", ls->name, r->name);
			rv = free_lv(ls, r);
			act->result = rv;
			list_del(&act->list);
			add_client_result(act);
			goto r_free;

		}
	}

	/*
	 * handle enable/disable
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (act->op == LD_OP_ENABLE || act->op == LD_OP_DISABLE) {
			rv = res_able(ls, r, act);
			act->result = rv;
			list_del(&act->list);
			add_client_result(act);

			/* a disabled gl resource is dropped entirely */
			if (!rv && act->op == LD_OP_DISABLE) {
				log_debug("S %s R %s free disabled", ls->name, r->name);
				goto r_free;
			}
		}
	}

	/*
	 * transient requests on existing transient locks
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (act->flags & LD_AF_PERSISTENT)
			continue;

		lk = find_lock_client(r, act->client_id);
		if (!lk)
			continue;

		if (lk->mode != act->mode) {
			/* convert below */
			/*
			act->result = -EEXIST;
			list_del(&act->list);
			add_client_result(act);
			*/
			continue;
		} else {
			/* success */
			act->result = -EALREADY;
			list_del(&act->list);
			add_client_result(act);
		}
	}

	/*
	 * persistent requests on existing persistent locks
	 *
	 * persistent locks are not owned by a client, so any
	 * existing with matching mode satisfies a request.
	 * only one persistent lock is kept on a resource.
	 * a single "unowned" persistent lock satisfies
	 * any/multiple client requests for a persistent lock.
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (!(act->flags & LD_AF_PERSISTENT))
			continue;

		lk = find_lock_persistent(r);
		if (!lk)
			continue;

		if (lk->mode != act->mode) {
			/* convert below */
			/*
			act->result = -EEXIST;
			list_del(&act->list);
			add_client_result(act);
			*/
			continue;
		} else {
			/* success */
			act->result = -EALREADY;
			list_del(&act->list);
			add_client_result(act);
		}
	}

	/*
	 * transient requests with existing persistent locks
	 *
	 * Just grant the transient request and do not
	 * keep a record of it.  Assume that the persistent
	 * lock will not go away while the transient lock
	 * is needed.
	 *
	 * This would be used when an ex, persistent lv lock
	 * exists from activation, and then something like
	 * lvextend asks for a transient ex lock to change
	 * the lv.  The lv could not be unlocked by deactivation
	 * while the lvextend was running.
	 *
	 * The logic here for mixing T/P locks is not general
	 * support; there are a number of cases where it will
	 * not work: updating version number (lv locks have
	 * none), ex locks from multiple clients will not
	 * conflict, explicit un of the transient lock will fail.
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (act->flags & LD_AF_PERSISTENT)
			continue;

		lk = find_lock_persistent(r);
		if (!lk)
			continue;

		if ((lk->mode == LD_LK_EX) ||
		    (lk->mode == LD_LK_SH && act->mode == LD_LK_SH)) {
			act->result = 0;
			list_del(&act->list);
			add_client_result(act);
		} else {
			/* persistent lock is sh, transient request is ex */
			/* FIXME: can we remove this case? do a convert here? */
			log_debug("res_process %s existing persistent lock new transient", r->name);
			act->result = -EEXIST;
			list_del(&act->list);
			add_client_result(act);
		}
	}

	/*
	 * persistent requests with existing transient locks
	 *
	 * If a client requests a P (persistent) lock for a T (transient)
	 * lock it already holds, we can just change T to P.  Fail if the
	 * same happens for locks from different clients.  Changing
	 * another client's lock from T to P may cause problems
	 * if that client tries to unlock or update version.
	 *
	 * I don't think this P/T combination will be used.
	 * It might be used if a command was able to take a P
	 * vg lock, in which case the T vg lock would already
	 * be held for reading.  If the T lock was sh, it would
	 * be converted to P ex.  If the T/P modes matched, the
	 * lock could just be changed from T to P.
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (!(act->flags & LD_AF_PERSISTENT))
			continue;

		lk = find_lock_client(r, act->client_id);
		if (!lk)
			continue;

		if (lk->mode != act->mode) {
			/* FIXME: convert and change to persistent? */
			log_debug("res_process %s existing transient lock new persistent", r->name);
			act->result = -EEXIST;
			list_del(&act->list);
			add_client_result(act);
		} else {
			/* same mode: just change ownership from client to "unowned" */
			lk->flags |= LD_LF_PERSISTENT;
			lk->client_id = 0;
			act->result = 0;
			list_del(&act->list);
			add_client_result(act);
		}
	}

	/*
	 * convert mode of existing locks
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (act->flags & LD_AF_PERSISTENT)
			lk = find_lock_persistent(r);
		else
			lk = find_lock_client(r, act->client_id);
		if (!lk)
			continue;

		if (lk->mode == act->mode) {
			/* should never happen, should be found above */
			log_error("convert same mode");
			continue;
		}

		/* convert fails immediately, no EAGAIN retry */
		rv = res_convert(ls, r, lk, act);
		act->result = rv;
		list_del(&act->list);
		add_client_result(act);
	}

	/*
	 * Cases above are all requests addressed by existing locks.
	 * Below handles the rest.  Transient and persistent are
	 * handled the same, except
	 * - if mode of existing lock is incompat with requested,
	 *   leave the act on r->actions
	 * - if r mode is EX, any lock action is blocked, just quit
	 *
	 * Retry a lock request that fails due to a lock conflict (-EAGAIN):
	 * if we have not exceeded max retries and lm sets lm_retry (sanlock
	 * transient conflicts from shared lock implementation), or r type
	 * is gl or vg (transient real conflicts we want to hide from command).
	 * lv lock conflicts won't be transient so don't retry them.
	 */


	if (r->mode == LD_LK_EX)
		return;

	/*
	 * r mode is SH or UN, pass lock-sh actions to lm
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		/* grant in order, so break here */
		if (act->op == LD_OP_LOCK && act->mode == LD_LK_EX)
			break;

		if (act->op == LD_OP_LOCK && act->mode == LD_LK_SH) {
			lm_retry = 0;

			rv = res_lock(ls, r, act, &lm_retry);
			if ((rv == -EAGAIN) &&
			    (act->retries <= act->max_retries) &&
			    (lm_retry || (r->type != LD_RT_LV))) {
				/* leave act on list */
				log_debug("S %s R %s res_lock EAGAIN retry", ls->name, r->name);
				act->retries++;
				*retry_out = 1;
			} else {
				act->result = rv;
				list_del(&act->list);
				add_client_result(act);
			}
			/* EUNATCH: lock manager is gone; drop the resource */
			if (rv == -EUNATCH)
				goto r_free;
		}
	}

	/*
	 * r mode is SH, any ex lock action is blocked, just quit
	 */

	if (r->mode == LD_LK_SH)
		return;

	/*
	 * r mode is UN, pass lock-ex action to lm
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (act->op == LD_OP_LOCK && act->mode == LD_LK_EX) {
			lm_retry = 0;

			rv = res_lock(ls, r, act, &lm_retry);
			if ((rv == -EAGAIN) &&
			    (act->retries <= act->max_retries) &&
			    (lm_retry || (r->type != LD_RT_LV))) {
				/* leave act on list */
				log_debug("S %s R %s res_lock EAGAIN retry", ls->name, r->name);
				act->retries++;
				*retry_out = 1;
			} else {
				act->result = rv;
				list_del(&act->list);
				add_client_result(act);
			}
			if (rv == -EUNATCH)
				goto r_free;
			break;
		}
	}

	return;

r_free:
	/* For the EUNATCH case it may be possible there are queued actions?
 */
	list_for_each_entry_safe(act, safe, &r->actions, list) {
		log_error("S %s R %s res_process r_free cancel %s client %d",
			  ls->name, r->name, op_str(act->op), act->client_id);
		act->result = -ECANCELED;
		list_del(&act->list);
		add_client_result(act);
	}
	log_debug("S %s R %s res_process free", ls->name, r->name);
	lm_rem_resource(ls, r);
	list_del(&r->list);
	free_resource(r);
}

#define LOCKS_EXIST_ANY 1
#define LOCKS_EXIST_GL  2
#define LOCKS_EXIST_VG  3
#define LOCKS_EXIST_LV  4

/*
 * Return 1 if the lockspace currently holds any lock matching
 * locks_do (any lock at all, or a gl/vg/lv lock specifically).
 */
static int for_each_lock(struct lockspace *ls, int locks_do)
{
	struct resource *r;
	struct lock *lk;

	list_for_each_entry(r, &ls->resources, list) {
		list_for_each_entry(lk, &r->locks, list) {
			if (locks_do == LOCKS_EXIST_ANY)
				return 1;

			if (locks_do == LOCKS_EXIST_GL && r->type == LD_RT_GL)
				return 1;

			if (locks_do == LOCKS_EXIST_VG && r->type == LD_RT_VG)
				return 1;

			if (locks_do == LOCKS_EXIST_LV && r->type == LD_RT_LV)
				return 1;
		}
	}

	return 0;
}

/*
 * Forcibly drop all locks and free all resources in the lockspace,
 * writing final lock versions through lm_unlock.  Used when the
 * lockspace is stopping.  Returns the number of locks cleared.
 */
static int clear_locks(struct lockspace *ls, int free_vg)
{
	struct resource *r, *r_safe;
	struct lock *lk, *lk_safe;
	struct action *act, *act_safe;
	uint32_t lk_version;
	uint32_t r_version;
	int lk_count = 0;
	int rv;

	list_for_each_entry_safe(r, r_safe, &ls->resources, list) {
		lk_version = 0;

		list_for_each_entry_safe(lk, lk_safe, &r->locks, list) {
			lk_count++;

			/* locks should normally be released before stopping */
			if (lk->flags & LD_LF_PERSISTENT)
				log_error("S %s R %s clear lock persistent", ls->name, r->name);
			else
				log_error("S %s R %s clear lock client %d", ls->name, r->name, lk->client_id);

			/* remember the highest pending version to write out */
			if (lk->version > lk_version)
				lk_version = lk->version;

			list_del(&lk->list);
			free_lock(lk);
		}

		if (r->mode == LD_LK_UN)
			goto r_free;

		if ((r->type == LD_RT_GL) && (r->mode == LD_LK_EX)) {
			r->version++;
			r_version = r->version;
			log_debug("S %s R %s clear_locks r_version inc %u",
				  ls->name, r->name, r_version);

		} else if ((r->type == LD_RT_VG) && (r->mode == LD_LK_EX) && (lk_version > r->version)) {
			r->version = lk_version;
			r_version = r->version;
			log_debug("S %s R %s clear_locks r_version new %u",
				  ls->name, r->name, r_version);

		} else {
			r_version = 0;
		}

		rv = lm_unlock(ls, r, NULL, r_version, free_vg ? LMUF_FREE_VG : 0);
		if (rv < 0) {
			/* should never happen */
			log_error("S %s R %s clear_locks free %d lm unlock error %d",
				  ls->name, r->name, free_vg, rv);
		}

		list_for_each_entry_safe(act, act_safe, &r->actions, list) {
			log_error("S %s R %s clear_locks cancel %s client %d",
				  ls->name, r->name, op_str(act->op), act->client_id);
			act->result = -ECANCELED;
			list_del(&act->list);
			add_client_result(act);
		}
 r_free:
		log_debug("S %s R %s free", ls->name, r->name);
		lm_rem_resource(ls, r);
		list_del(&r->list);
		free_resource(r);
	}

	return lk_count;
}

/*
 * find and return the resource that is referenced by the action
 * - there is a single gl resource per lockspace
 * - there is a single vg resource per lockspace
 * - there can be many lv resources per lockspace, compare names
 */

static struct resource *find_resource_act(struct lockspace *ls,
					  struct action *act,
					  int nocreate)
{
	struct resource *r;

	list_for_each_entry(r, &ls->resources, list) {
		if (r->type != act->rt)
			continue;

		if (r->type == LD_RT_GL && act->rt == LD_RT_GL)
			return r;

		if (r->type == LD_RT_VG && act->rt == LD_RT_VG)
			return r;

		if (r->type == LD_RT_LV && act->rt == LD_RT_LV &&
		    !strcmp(r->name, act->lv_uuid))
			return r;
	}

	if (nocreate)
		return NULL;

	if (!(r = alloc_resource()))
		return NULL;

	r->type = act->rt;

	r->mode = LD_LK_UN;

	if (r->type == LD_RT_GL)
		strncpy(r->name, R_NAME_GL, MAX_NAME);
	else if (r->type == LD_RT_VG)
		strncpy(r->name, R_NAME_VG, MAX_NAME);
	else if (r->type == LD_RT_LV)
		strncpy(r->name, act->lv_uuid, MAX_NAME);

	list_add_tail(&r->list, &ls->resources);

	return r;
}

/* Free every remaining resource in the lockspace (no unlocking). */
static void free_ls_resources(struct lockspace *ls)
{
	struct
	resource *r, *r_safe;

	list_for_each_entry_safe(r, r_safe, &ls->resources, list) {
		lm_rem_resource(ls, r);
		list_del(&r->list);
		free_resource(r);
	}
}

/*
 * Process actions queued for this lockspace by
 * client_recv_action / add_lock_action.
 *
 * The lockspace_thread can touch its own ls struct without holding
 * lockspaces_mutex until it sets ls->thread_done, after which it
 * cannot touch ls without holding lockspaces_mutex.
 */

#define LOCK_RETRY_MS 1000 /* milliseconds to delay between retry */

static void *lockspace_thread_main(void *arg_in)
{
	struct lockspace *ls = arg_in;
	struct resource *r, *r2;
	struct action *add_act, *act, *safe;
	struct list_head tmp_act;
	struct list_head act_close;
	int free_vg = 0;
	int error = 0;
	int adopt_flag = 0;
	int wait_flag = 0;
	int retry;
	int rv;

	INIT_LIST_HEAD(&act_close);

	/* first action may be client add */
	pthread_mutex_lock(&ls->mutex);
	act = NULL;
	add_act = NULL;
	if (!list_empty(&ls->actions)) {
		act = list_first_entry(&ls->actions, struct action, list);
		if (act->op == LD_OP_START) {
			add_act = act;
			list_del(&add_act->list);

			if (add_act->flags & LD_AF_WAIT)
				wait_flag = 1;
			if (add_act->flags & LD_AF_ADOPT)
				adopt_flag = 1;
		}
	}
	pthread_mutex_unlock(&ls->mutex);

	log_debug("S %s lm_add_lockspace %s wait %d adopt %d",
		  ls->name, lm_str(ls->lm_type), wait_flag, adopt_flag);

	/*
	 * The prepare step does not wait for anything and is quick;
	 * it tells us if the parameters are valid and the lm is running.
	 */
	error = lm_prepare_lockspace(ls, add_act);

	if (add_act && (!wait_flag || error)) {
		/* send initial join result back to client */
		add_act->result = error;
		add_client_result(add_act);
		add_act = NULL;
	}

	/*
	 * The actual lockspace join can take a while.
	 */
	if (!error) {
		error = lm_add_lockspace(ls, add_act, adopt_flag);

		log_debug("S %s lm_add_lockspace done %d", ls->name, error);

		/* a second gl-enabled sanlock lockspace is a conflict */
		if (ls->sanlock_gl_enabled && gl_lsname_sanlock[0] &&
		    strcmp(ls->name, gl_lsname_sanlock))
			sanlock_gl_dup = 1;

		if (add_act) {
			/* send final join result back to client */
			add_act->result = error;
			add_client_result(add_act);
		}
	}

	pthread_mutex_lock(&ls->mutex);
	if (error) {
		ls->thread_stop = 1;
		ls->create_fail = 1;
	} else {
		ls->create_done = 1;
	}
	pthread_mutex_unlock(&ls->mutex);

	if (error)
		goto out_act;

	while (1) {
		pthread_mutex_lock(&ls->mutex);
		while (!ls->thread_work) {
			if (ls->thread_stop) {
				pthread_mutex_unlock(&ls->mutex);
				goto out_rem;
			}
			pthread_cond_wait(&ls->cond, &ls->mutex);
		}

		/*
		 * Process all the actions queued for this lockspace.
		 * The client thread queues actions on ls->actions.
		 *
		 * Here, take all the actions off of ls->actions, and:
		 *
		 * - For lock operations, move the act to r->actions.
		 *   These lock actions/operations processed by res_process().
		 *
		 * - For non-lock operations, e.g. related to managing
		 *   the lockspace, process them in this loop.
		 */

		while (1) {
			if (list_empty(&ls->actions)) {
				ls->thread_work = 0;
				break;
			}

			act = list_first_entry(&ls->actions, struct action, list);

			if (sanlock_gl_dup && ls->sanlock_gl_enabled)
				act->flags |= LD_AF_DUP_GL_LS;

			if (act->op == LD_OP_STOP) {
				ls->thread_work = 0;
				break;
			}

			if (act->op == LD_OP_FREE && act->rt == LD_RT_VG) {
				/* vgremove */
				log_debug("S %s checking for lockspace hosts", ls->name);
				rv = lm_hosts(ls, 1);
				if (rv) {
					/*
					 * Checking for hosts here in addition to after the
					 * main loop allows vgremove to fail and be rerun
					 * after the ls is stopped on other hosts.
					 */
					log_error("S %s lockspace hosts %d", ls->name, rv);
					list_del(&act->list);
					act->result = -EBUSY;
					add_client_result(act);
					continue;
				}
				ls->thread_work = 0;
				ls->thread_stop = 1;
				free_vg = 1;
				break;
			}

			if (act->op == LD_OP_RENAME_BEFORE && act->rt == LD_RT_VG) {
				/* vgrename */
				log_debug("S %s checking for lockspace hosts", ls->name);
				rv = lm_hosts(ls, 1);
				if (rv) {
					log_error("S %s lockspace hosts %d", ls->name, rv);
					list_del(&act->list);
					act->result = -EBUSY;
					add_client_result(act);
					continue;
				}
				ls->thread_work = 0;
				ls->thread_stop = 1;
				/* Do we want to check hosts again below like vgremove? */
				break;
			}

			if (act->op == LD_OP_FIND_FREE_LOCK && act->rt == LD_RT_VG) {
				uint64_t free_offset = 0;
				log_debug("S %s find free lock", ls->name);
				rv = lm_find_free_lock(ls, &free_offset);
				log_debug("S %s find free lock %d offset %llu",
					  ls->name, rv, (unsigned long long)free_offset);
				ls->free_lock_offset = free_offset;
				list_del(&act->list);
				act->result = rv;
				add_client_result(act);
				continue;
			}

			list_del(&act->list);

			/* applies to all resources */
			if (act->op == LD_OP_CLOSE) {
				list_add(&act->list, &act_close);
				continue;
			}

			/*
			 * All the other op's are for locking.
			 * Find the specific resource that the lock op is for,
			 * and add the act to the resource's list of lock ops.
			 *
			 * (This creates a new resource if the one named in
			 * the act is not found.)
			 */

			r = find_resource_act(ls, act, (act->op == LD_OP_FREE) ? 1 : 0);
			if (!r) {
				act->result = (act->op == LD_OP_FREE) ? -ENOENT : -ENOMEM;
				add_client_result(act);
				continue;
			}

			list_add_tail(&act->list, &r->actions);

			log_debug("S %s R %s action %s %s", ls->name, r->name,
				  op_str(act->op), mode_str(act->mode));
		}
		pthread_mutex_unlock(&ls->mutex);

		/*
		 * Process the lock operations that have been queued for each
		 * resource.
		 */

		retry = 0;

		list_for_each_entry_safe(r, r2, &ls->resources, list)
			res_process(ls, r, &act_close, &retry);

		list_for_each_entry_safe(act, safe, &act_close, list) {
			list_del(&act->list);
			free_action(act);
		}

		if (retry) {
			ls->thread_work = 1;
			usleep(LOCK_RETRY_MS * 1000);
		}
	}

out_rem:
	log_debug("S %s stopping", ls->name);

	/*
	 * For sanlock, we need to unlock any existing locks
	 * before removing the lockspace, otherwise the sanlock
	 * daemon will kill us when the lockspace goes away.
	 * For dlm, we leave with force, so all locks will
	 * automatically be dropped when we leave the lockspace,
	 * so unlocking all before leaving could be skipped.
	 *
	 * Blindly dropping all existing locks must only be
	 * allowed in emergency/force situations, otherwise it's
	 * obviously dangerous, since the lock holders are still
	 * operating under the assumption that they hold the lock.
	 *
	 * For vgremove of a sanlock vg, the vg lock will be held,
	 * and possibly the gl lock if this vg holds the gl.
	 * sanlock vgremove wants to unlock-rename these locks.
	 */

	log_debug("S %s clearing locks", ls->name);

	rv = clear_locks(ls, free_vg);

	/*
	 * Tell any other hosts in the lockspace to leave it
	 * before we remove it (for vgremove).  We do this
	 * before leaving the lockspace ourself because we
	 * need to be in the lockspace to see others.
	 */

	if (free_vg) {
		log_debug("S %s checking for lockspace hosts", ls->name);
		rv = lm_hosts(ls, 1);
		if (rv)
			log_error("S %s other lockspace hosts %d", ls->name, rv);
	}

	/*
	 * Leave the lockspace.
	 */

	rv = lm_rem_lockspace(ls, NULL, free_vg);

	log_debug("S %s rem_lockspace done %d", ls->name, rv);

out_act:
	/*
	 * Move remaining actions to results; this will usually (always?)
	 * be only the stop action.
+ */ + INIT_LIST_HEAD(&tmp_act); + + pthread_mutex_lock(&ls->mutex); + list_for_each_entry_safe(act, safe, &ls->actions, list) { + if (act->op == LD_OP_FREE) + act->result = 0; + else if (act->op == LD_OP_STOP) + act->result = 0; + else if (act->op == LD_OP_RENAME_BEFORE) + act->result = 0; + else + act->result = -ENOLS; + list_del(&act->list); + list_add_tail(&act->list, &tmp_act); + } + pthread_mutex_unlock(&ls->mutex); + + pthread_mutex_lock(&client_mutex); + list_for_each_entry_safe(act, safe, &tmp_act, list) { + list_del(&act->list); + list_add_tail(&act->list, &client_results); + } + pthread_cond_signal(&client_cond); + pthread_mutex_unlock(&client_mutex); + + pthread_mutex_lock(&lockspaces_mutex); + ls->thread_done = 1; + pthread_mutex_unlock(&lockspaces_mutex); + + /* + * worker_thread will join this thread, and move the + * ls struct from lockspaces list to lockspaces_inactive. + */ + pthread_mutex_lock(&worker_mutex); + worker_wake = 1; + pthread_cond_signal(&worker_cond); + pthread_mutex_unlock(&worker_mutex); + + return NULL; +} + +int lockspaces_empty(void) +{ + int rv; + pthread_mutex_lock(&lockspaces_mutex); + rv = list_empty(&lockspaces); + pthread_mutex_unlock(&lockspaces_mutex); + return rv; +} + +/* + * lockspaces_mutex is locked + * + * When duplicate sanlock global locks have been seen, + * this function has a secondary job of counting the + * number of lockspaces that exist with the gl enabled, + * with the side effect of setting sanlock_gl_dup back to + * zero when the duplicates have been removed/disabled. 
+ */ + +static struct lockspace *find_lockspace_name(char *ls_name) +{ + struct lockspace *ls_found = NULL; + struct lockspace *ls; + int gl_count = 0; + + list_for_each_entry(ls, &lockspaces, list) { + if (!strcmp(ls->name, ls_name)) + ls_found = ls; + + if (!sanlock_gl_dup && ls_found) + return ls_found; + + if (sanlock_gl_dup && ls->sanlock_gl_enabled) + gl_count++; + } + + /* this is the side effect we want from this function */ + if (sanlock_gl_dup && gl_count < 2) + sanlock_gl_dup = 0; + + return ls_found; +} + +/* + * If lvm_<vg_name> is longer than max lockspace name (64) we just ignore the + * extra characters. For sanlock vgs, the name is shortened further to 48 in + * the sanlock code. + */ + +static int vg_ls_name(const char *vg_name, char *ls_name) +{ + if (strlen(vg_name) + 4 > MAX_NAME) { + log_error("vg name too long %s", vg_name); + return -1; + } + + snprintf(ls_name, MAX_NAME, "%s%s", LVM_LS_PREFIX, vg_name); + return 0; +} + +/* FIXME: add mutex for gl_lsname_ ? */ + +static int gl_ls_name(char *ls_name) +{ + if (gl_use_dlm) + memcpy(ls_name, gl_lsname_dlm, MAX_NAME); + else if (gl_use_sanlock) + memcpy(ls_name, gl_lsname_sanlock, MAX_NAME); + else { + log_error("gl_ls_name: global lockspace type unknown"); + return -1; + } + return 0; +} + +/* + * When this function returns an error, the caller needs to deal + * with act (in the cases where act exists). 
+ */ + +static int add_lockspace_thread(const char *ls_name, + const char *vg_name, + const char *vg_uuid, + int lm_type, const char *vg_args, + struct action *act) +{ + struct lockspace *ls, *ls2; + struct resource *r; + uint32_t version = 0; + int rv; + + if (act) + version = act->version; + + log_debug("add_lockspace_thread %s %s version %u", + lm_str(lm_type), ls_name, version); + + if (!(ls = alloc_lockspace())) + return -ENOMEM; + + strncpy(ls->name, ls_name, MAX_NAME); + ls->lm_type = lm_type; + + if (act) + ls->start_client_id = act->client_id; + + if (vg_uuid) + strncpy(ls->vg_uuid, vg_uuid, 64); + + if (vg_name) + strncpy(ls->vg_name, vg_name, MAX_NAME); + + if (vg_args) + strncpy(ls->vg_args, vg_args, MAX_ARGS); + + if (act) + ls->host_id = act->host_id; + + if (!(r = alloc_resource())) { + free(ls); + return -ENOMEM; + } + + r->type = LD_RT_VG; + r->mode = LD_LK_UN; + r->version = version; + strncpy(r->name, R_NAME_VG, MAX_NAME); + list_add_tail(&r->list, &ls->resources); + + pthread_mutex_lock(&lockspaces_mutex); + ls2 = find_lockspace_name(ls->name); + if (ls2) { + if (ls2->thread_stop) + rv = -EAGAIN; + else + rv = -EEXIST; + pthread_mutex_unlock(&lockspaces_mutex); + free_resource(r); + free(ls); + return rv; + } + + /* + * act will be null when this lockspace is added automatically/internally + * and not by an explicit client action that wants a result. + */ + if (act) + list_add(&act->list, &ls->actions); + + clear_lockspace_inactive(ls->name); + + list_add_tail(&ls->list, &lockspaces); + pthread_mutex_unlock(&lockspaces_mutex); + + rv = pthread_create(&ls->thread, NULL, lockspace_thread_main, ls); + if (rv < 0) { + pthread_mutex_lock(&lockspaces_mutex); + list_del(&ls->list); + pthread_mutex_unlock(&lockspaces_mutex); + free_resource(r); + free(ls); + return rv; + } + + return 0; +} + +/* + * There is no add_sanlock_global_lockspace or + * rem_sanlock_global_lockspace because with sanlock, + * the global lockspace is one of the vg lockspaces. 
+ */ + +static int add_dlm_global_lockspace(struct action *act) +{ + int rv; + + if (gl_running_dlm) + return -EEXIST; + + gl_running_dlm = 1; + + /* Keep track of whether we automatically added + the global ls, so we know to automatically + remove it. */ + + if (act) + gl_auto_dlm = 0; + else + gl_auto_dlm = 1; + + /* + * There's a short period after which a previous gl lockspace thread + * has set gl_running_dlm = 0, but before its ls struct has been + * deleted, during which this add_lockspace_thread() can fail with + * -EAGAIN. + */ + + rv = add_lockspace_thread(gl_lsname_dlm, NULL, NULL, LD_LM_DLM, NULL, act); + + if (rv < 0) { + log_error("add_dlm_global_lockspace add_lockspace_thread %d", rv); + gl_running_dlm = 0; + gl_auto_dlm = 0; + } + + return rv; +} + +/* + * If dlm gl lockspace is the only one left, then stop it. + * This is not used for an explicit rem_lockspace action from + * the client, only for auto remove. + */ + +static int rem_dlm_global_lockspace(void) +{ + struct lockspace *ls, *ls_gl = NULL; + int others = 0; + int rv = 0; + + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry(ls, &lockspaces, list) { + if (!strcmp(ls->name, gl_lsname_dlm)) { + ls_gl = ls; + continue; + } + if (ls->thread_stop) + continue; + others++; + break; + } + + if (others) { + rv = -EAGAIN; + goto out; + } + + if (!ls_gl) { + rv = -ENOENT; + goto out; + } + + ls = ls_gl; + pthread_mutex_lock(&ls->mutex); + ls->thread_stop = 1; + ls->thread_work = 1; + pthread_cond_signal(&ls->cond); + pthread_mutex_unlock(&ls->mutex); + rv = 0; +out: + pthread_mutex_unlock(&lockspaces_mutex); + return rv; +} + +/* + * When the first dlm lockspace is added for a vg, + * automatically add a separate dlm lockspace for the + * global lock if it hasn't been done explicitly. + * This is to make the dlm global lockspace work similarly to + * the sanlock global lockspace, which is "automatic" by + * nature of being one of the vg lockspaces. 
+ * + * For sanlock, a separate lockspace is not used for + * the global lock, but the gl lock lives in a vg + * lockspace, (although it's recommended to create a + * special vg dedicated to holding the gl). + * + * N.B. for dlm, if this is an add+WAIT action for a vg + * lockspace, and this triggered the automatic addition + * of the global lockspace, then the action may complete + * for the vg ls add, while the gl ls add is still in + * progress. If the caller wants to ensure that the + * gl ls add is complete, they should explicitly add+WAIT + * the gl ls. + * + * If this function returns and error, the caller + * will queue the act with that error for the client. + */ + +static int add_lockspace(struct action *act) +{ + char ls_name[MAX_NAME+1]; + int rv; + + memset(ls_name, 0, sizeof(ls_name)); + + if (act->rt == LD_RT_GL) { + if (gl_use_dlm) { + rv = add_dlm_global_lockspace(act); + return rv; + } else { + return -EINVAL; + } + } + + if (act->rt == LD_RT_VG) { + if (gl_use_dlm) { + rv = add_dlm_global_lockspace(NULL); + if (rv < 0 && rv != -EEXIST) + return rv; + } + + vg_ls_name(act->vg_name, ls_name); + + rv = add_lockspace_thread(ls_name, act->vg_name, act->vg_uuid, + act->lm_type, act->vg_args, + act); + + if (rv) + log_error("add_lockspace %s add_lockspace_thread %d", ls_name, rv); + return rv; + } + + log_error("add_lockspace bad type %d", act->rt); + return -1; +} + +/* + * vgchange --lock-stop vgname will lock the vg ex, then send a stop, + * so we exect to find the ex vg lock held here, and will automatically + * unlock it when stopping. + * + * Should we attempt to stop the lockspace containing the gl last? 
+ */ + +static int rem_lockspace(struct action *act) +{ + struct lockspace *ls; + char ls_name[MAX_NAME+1]; + int force = act->flags & LD_AF_FORCE; + int rt = act->rt; + + if (act->rt == LD_RT_GL && act->lm_type != LD_LM_DLM) + return -EINVAL; + + memset(ls_name, 0, sizeof(ls_name)); + + if (act->rt == LD_RT_GL) + gl_ls_name(ls_name); + else + vg_ls_name(act->vg_name, ls_name); + + pthread_mutex_lock(&lockspaces_mutex); + ls = find_lockspace_name(ls_name); + if (!ls) { + pthread_mutex_unlock(&lockspaces_mutex); + return -ENOLS; + } + + pthread_mutex_lock(&ls->mutex); + if (ls->thread_stop) { + pthread_mutex_unlock(&ls->mutex); + pthread_mutex_unlock(&lockspaces_mutex); + return -ESTALE; + } + + if (!force && for_each_lock(ls, LOCKS_EXIST_LV)) { + pthread_mutex_unlock(&ls->mutex); + pthread_mutex_unlock(&lockspaces_mutex); + return -EBUSY; + } + ls->thread_work = 1; + ls->thread_stop = 1; + if (act) + list_add_tail(&act->list, &ls->actions); + pthread_cond_signal(&ls->cond); + pthread_mutex_unlock(&ls->mutex); + pthread_mutex_unlock(&lockspaces_mutex); + + /* + * If the dlm global lockspace was automatically added when + * the first dlm vg lockspace was added, then reverse that + * by automatically removing the dlm global lockspace when + * the last dlm vg lockspace is removed. + */ + + if (rt == LD_RT_VG && gl_use_dlm && gl_auto_dlm) + rem_dlm_global_lockspace(); + + return 0; +} + +/* + * count how many lockspaces started by this client are still starting; + * the client will use this to wait for all its start operations to finish + * (START_WAIT). 
+ */ + +static int count_lockspace_starting(uint32_t client_id) +{ + struct lockspace *ls; + int count = 0; + int done = 0; + int fail = 0; + + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry(ls, &lockspaces, list) { + if (ls->start_client_id != client_id) + continue; + + if (!ls->create_done && !ls->create_fail) { + count++; + continue; + } + + if (ls->create_done) + done++; + if (ls->create_fail) + fail++; + } + pthread_mutex_unlock(&lockspaces_mutex); + + log_debug("count_lockspace_starting client %u count %d done %d fail %d", + client_id, count, done, fail); + + return count; +} + +/* lockspaces_mutex is held */ +static struct lockspace *find_lockspace_inactive(char *ls_name) +{ + struct lockspace *ls; + + list_for_each_entry(ls, &lockspaces_inactive, list) { + if (!strcmp(ls->name, ls_name)) + return ls; + } + + return NULL; +} + +/* lockspaces_mutex is held */ +static int clear_lockspace_inactive(char *ls_name) +{ + struct lockspace *ls; + + ls = find_lockspace_inactive(ls_name); + if (ls) { + list_del(&ls->list); + free(ls); + return 1; + } + + return 0; +} + +static int forget_lockspace_inactive(char *vg_name) +{ + char ls_name[MAX_NAME+1]; + int found; + + memset(ls_name, 0, sizeof(ls_name)); + vg_ls_name(vg_name, ls_name); + + log_debug("forget_lockspace_inactive %s", ls_name); + + pthread_mutex_lock(&lockspaces_mutex); + found = clear_lockspace_inactive(ls_name); + pthread_mutex_unlock(&lockspaces_mutex); + + if (found) + return 0; + return -ENOENT; +} + +static void free_lockspaces_inactive(void) +{ + struct lockspace *ls, *safe; + + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry_safe(ls, safe, &lockspaces_inactive, list) { + list_del(&ls->list); + free(ls); + } + pthread_mutex_unlock(&lockspaces_mutex); +} + +/* + * Loop through all lockspaces, and: + * - if do_stop is set, stop any that are not stopped + * - if do_free is set, join any that are done stopping (and free ls) + * + * do_stop will not stop an ls with lv locks 
unless force is set. + * + * This function does not block or wait for anything. + * + * do_stop (no do_free): + * returns count of lockspaces that need stop (have locks and no force) + * + * do_free (no do_stop): + * returns count of lockspaces that are stopped and need freeing + * + * do_stop and do_free: + * returns sum of the previous two + */ + +static int for_each_lockspace(int do_stop, int do_free, int do_force) +{ + struct lockspace *ls, *safe; + int need_stop = 0; + int need_free = 0; + int stop_count = 0; + int free_count = 0; + int done; + int stop; + + pthread_mutex_lock(&lockspaces_mutex); + + if (do_stop) { + list_for_each_entry(ls, &lockspaces, list) { + + pthread_mutex_lock(&ls->mutex); + if (ls->thread_stop) { + pthread_mutex_unlock(&ls->mutex); + continue; + } + + if (!do_force && for_each_lock(ls, LOCKS_EXIST_ANY)) { + need_stop++; + } else { + ls->thread_work = 1; + ls->thread_stop = 1; + pthread_cond_signal(&ls->cond); + stop_count++; + } + pthread_mutex_unlock(&ls->mutex); + } + } + + if (do_free) { + list_for_each_entry_safe(ls, safe, &lockspaces, list) { + + pthread_mutex_lock(&ls->mutex); + done = ls->thread_done; + stop = ls->thread_stop; + pthread_mutex_unlock(&ls->mutex); + + /* This ls has locks and force is not set. */ + if (!stop) + continue; + + /* + * Once thread_done is set, we know that the lockspace_thread + * will not be using/touching the ls struct. Any other + * thread touches the ls struct under lockspaces_mutex. 
+ */ + if (done) { + pthread_join(ls->thread, NULL); + list_del(&ls->list); + + /* In future we may need to free ls->actions here */ + free_ls_resources(ls); + list_add(&ls->list, &lockspaces_inactive); + free_count++; + } else { + need_free++; + } + } + } + + if (list_empty(&lockspaces)) { + if (!gl_type_static) { + gl_use_dlm = 0; + gl_use_sanlock = 0; + } + } + pthread_mutex_unlock(&lockspaces_mutex); + + if (stop_count || free_count || need_stop || need_free) { + log_debug("for_each_lockspace do_stop %d do_free %d " + "stop_count %d free_count %d need_stop %d need_free %d", + do_stop, do_free, stop_count, free_count, need_stop, need_free); + } + + return need_stop + need_free; +} + +/* + * This is only called when the daemon is exiting so the sleep/retry + * loop doesn't have any adverse impact. + */ + +static void for_each_lockspace_retry(int do_stop, int do_free, int do_force) +{ + int count; + + while (1) { + count = for_each_lockspace(do_stop, do_free, do_force); + if (!count) + break; + + log_debug("for_each_lockspace_retry remaining %d", count); + sleep(1); + } +} + +static int work_init_vg(struct action *act) +{ + struct lockspace *ls; + char ls_name[MAX_NAME+1]; + int rv = 0; + + memset(ls_name, 0, sizeof(ls_name)); + + vg_ls_name(act->vg_name, ls_name); + + /* + * The max dlm ls name is 64 and the max sanlock ls name is 48. So, + * after the "lvm_" prefix, only the first 60/44 characters of the VG + * name are used for the lockspace name. This will cause a collision + * in the lock manager if two different VG names have the first 60/44 + * chars in common. At the time of vgcreate (here), check if any other + * VG's are known that would collide. If the collision is not detected + * at vgcreate time, it will be detected at start time and add_lockspace + * will fail for the second of the two matching ls names. 
+ */ + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry(ls, &lockspaces, list) { + if ((ls->lm_type == LD_LM_SANLOCK) && !strncmp(ls->name, ls_name, 48)) { + rv = -EEXIST; + break; + } + if ((ls->lm_type == LD_LM_DLM) && !strcmp(ls->name, ls_name)) { + rv = -EEXIST; + break; + } + } + pthread_mutex_unlock(&lockspaces_mutex); + + if (rv == -EEXIST) { + log_error("Existing lockspace name %s matches new %s VG names %s %s", + ls->name, ls_name, ls->vg_name, act->vg_name); + return rv; + } + + if (act->lm_type == LD_LM_SANLOCK) + rv = lm_init_vg_sanlock(ls_name, act->vg_name, act->flags, act->vg_args); + else if (act->lm_type == LD_LM_DLM) + rv = lm_init_vg_dlm(ls_name, act->vg_name, act->flags, act->vg_args); + else + rv = -EINVAL; + + return rv; +} + +static int work_rename_vg(struct action *act) +{ + char ls_name[MAX_NAME+1]; + int rv = 0; + + memset(ls_name, 0, sizeof(ls_name)); + + vg_ls_name(act->vg_name, ls_name); + + if (act->lm_type == LD_LM_SANLOCK) + rv = lm_rename_vg_sanlock(ls_name, act->vg_name, act->flags, act->vg_args); + else if (act->lm_type == LD_LM_DLM) + return 0; + else + rv = -EINVAL; + + return rv; +} + +static void work_test_gl(void) +{ + struct lockspace *ls; + int is_enabled = 0; + + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry(ls, &lockspaces, list) { + if (ls->lm_type != LD_LM_SANLOCK) + continue; + + pthread_mutex_lock(&ls->mutex); + if (ls->create_done && !ls->thread_stop) { + is_enabled = lm_gl_is_enabled(ls); + if (is_enabled) { + log_debug("S %s worker found gl_is_enabled", ls->name); + strncpy(gl_lsname_sanlock, ls->name, MAX_NAME); + } + } + pthread_mutex_unlock(&ls->mutex); + + if (is_enabled) + break; + } + + if (!is_enabled) + log_debug("worker found no gl_is_enabled"); + pthread_mutex_unlock(&lockspaces_mutex); +} + +static int work_init_lv(struct action *act) +{ + struct lockspace *ls; + char ls_name[MAX_NAME+1]; + char vg_args[MAX_ARGS]; + char lv_args[MAX_ARGS]; + uint64_t free_offset = 0; + int 
lm_type = 0; + int rv = 0; + + memset(ls_name, 0, sizeof(ls_name)); + memset(vg_args, 0, MAX_ARGS); + memset(lv_args, 0, MAX_ARGS); + + vg_ls_name(act->vg_name, ls_name); + + pthread_mutex_lock(&lockspaces_mutex); + ls = find_lockspace_name(ls_name); + if (ls) { + lm_type = ls->lm_type; + memcpy(vg_args, ls->vg_args, MAX_ARGS); + free_offset = ls->free_lock_offset; + ls->free_lock_offset = 0; + } + pthread_mutex_unlock(&lockspaces_mutex); + + if (!ls) { + lm_type = act->lm_type; + memcpy(vg_args, act->vg_args, MAX_ARGS); + } + + if (act->lm_type != lm_type) { + log_error("init_lv ls_name %s wrong lm_type %d %d", + ls_name, act->lm_type, lm_type); + return -EINVAL; + } + + if (lm_type == LD_LM_SANLOCK) { + rv = lm_init_lv_sanlock(ls_name, act->vg_name, act->lv_uuid, + vg_args, lv_args, free_offset); + + memcpy(act->lv_args, lv_args, MAX_ARGS); + return rv; + + } else if (act->lm_type == LD_LM_DLM) { + return 0; + } else { + log_error("init_lv ls_name %s bad lm_type %d", ls_name, act->lm_type); + return -EINVAL; + } +} + +/* + * When an action is queued for the worker_thread, it is processed right away. + * After processing, some actions need to be retried again in a short while. + * These actions are put on the delayed_list, and the worker_thread will + * process these delayed actions again in SHORT_DELAY_PERIOD. 
+ */ + +#define SHORT_DELAY_PERIOD 2 +#define LONG_DELAY_PERIOD 60 + +static void *worker_thread_main(void *arg_in) +{ + struct list_head delayed_list; + struct timespec ts; + struct action *act, *safe; + uint64_t last_delayed_time = 0; + int delay_sec = LONG_DELAY_PERIOD; + int rv; + + INIT_LIST_HEAD(&delayed_list); + + while (1) { + pthread_mutex_lock(&worker_mutex); + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_sec += delay_sec; + rv = 0; + act = NULL; + + while (list_empty(&worker_list) && !worker_stop && !worker_wake && !rv) { + rv = pthread_cond_timedwait(&worker_cond, &worker_mutex, &ts); + } + worker_wake = 0; + + if (worker_stop) { + pthread_mutex_unlock(&worker_mutex); + goto out; + } + + if (!list_empty(&worker_list)) { + act = list_first_entry(&worker_list, struct action, list); + list_del(&act->list); + } + pthread_mutex_unlock(&worker_mutex); + + /* + * Do new work actions before processing delayed work actions. + */ + + if (!act) + goto delayed_work; + + if (act->op == LD_OP_RUNNING_LM) { + int run_sanlock = lm_is_running_sanlock(); + int run_dlm = lm_is_running_dlm(); + + if (run_sanlock && run_dlm) + act->result = -EXFULL; + else if (!run_sanlock && !run_dlm) + act->result = -ENOLCK; + else if (run_sanlock) + act->result = LD_LM_SANLOCK; + else if (run_dlm) + act->result = LD_LM_DLM; + add_client_result(act); + + } else if ((act->op == LD_OP_LOCK) && (act->flags & LD_AF_SEARCH_LS)) { + /* + * worker_thread used as a helper to search existing + * sanlock vgs for an enabled gl. 
+ */ + log_debug("work search for gl"); + work_test_gl(); + + /* try again to find a gl lockspace for this act */ + rv = add_lock_action(act); + if (rv < 0) { + act->result = rv; + add_client_result(act); + } + + } else if ((act->op == LD_OP_INIT) && (act->rt == LD_RT_VG)) { + log_debug("work init_vg %s", act->vg_name); + act->result = work_init_vg(act); + add_client_result(act); + + } else if ((act->op == LD_OP_INIT) && (act->rt == LD_RT_LV)) { + log_debug("work init_lv %s/%s uuid %s", act->vg_name, act->lv_name, act->lv_uuid); + act->result = work_init_lv(act); + add_client_result(act); + + } else if ((act->op == LD_OP_RENAME_FINAL) && (act->rt == LD_RT_VG)) { + log_debug("work rename_vg %s", act->vg_name); + act->result = work_rename_vg(act); + add_client_result(act); + + } else if (act->op == LD_OP_START_WAIT) { + act->result = count_lockspace_starting(act->client_id); + if (!act->result) + add_client_result(act); + else + list_add(&act->list, &delayed_list); + + } else if (act->op == LD_OP_STOP_ALL) { + act->result = for_each_lockspace(DO_STOP, DO_FREE, (act->flags & LD_AF_FORCE) ? DO_FORCE : NO_FORCE); + if (!act->result || !(act->flags & LD_AF_WAIT)) + add_client_result(act); + else + list_add(&act->list, &delayed_list); + + } else { + log_error("work unknown op %d", act->op); + act->result = -EINVAL; + add_client_result(act); + } + + delayed_work: + /* + * We may want to track retry times per action so that + * we can delay different actions by different amounts. 
+ */ + + if (monotime() - last_delayed_time < SHORT_DELAY_PERIOD) { + delay_sec = 1; + continue; + } + last_delayed_time = monotime(); + + list_for_each_entry_safe(act, safe, &delayed_list, list) { + if (act->op == LD_OP_START_WAIT) { + log_debug("work delayed start_wait for client %u", act->client_id); + act->result = count_lockspace_starting(act->client_id); + if (!act->result) { + list_del(&act->list); + add_client_result(act); + } + + } else if (act->op == LD_OP_STOP_ALL) { + log_debug("work delayed stop_all"); + act->result = for_each_lockspace(DO_STOP, DO_FREE, (act->flags & LD_AF_FORCE) ? DO_FORCE : NO_FORCE); + if (!act->result) { + list_del(&act->list); + act->result = 0; + add_client_result(act); + } + } + } + + /* + * This is not explicitly queued work, and not delayed work, + * but lockspace thread cleanup that's needed when a + * lockspace has been stopped/removed or failed to start. + */ + + for_each_lockspace(NO_STOP, DO_FREE, NO_FORCE); + + if (list_empty(&delayed_list)) + delay_sec = LONG_DELAY_PERIOD; + else + delay_sec = 1; + } +out: + list_for_each_entry_safe(act, safe, &delayed_list, list) { + list_del(&act->list); + free_action(act); + } + + pthread_mutex_lock(&worker_mutex); + list_for_each_entry_safe(act, safe, &worker_list, list) { + list_del(&act->list); + free_action(act); + } + pthread_mutex_unlock(&worker_mutex); + return NULL; +} + +static int setup_worker_thread(void) +{ + int rv; + + INIT_LIST_HEAD(&worker_list); + + pthread_mutex_init(&worker_mutex, NULL); + pthread_cond_init(&worker_cond, NULL); + + rv = pthread_create(&worker_thread, NULL, worker_thread_main, NULL); + if (rv) + return -1; + return 0; +} + +static void close_worker_thread(void) +{ + pthread_mutex_lock(&worker_mutex); + worker_stop = 1; + pthread_cond_signal(&worker_cond); + pthread_mutex_unlock(&worker_mutex); + pthread_join(worker_thread, NULL); +} + +/* client_mutex is locked */ +static struct client *find_client_work(void) +{ + struct client *cl; + + 
list_for_each_entry(cl, &client_list, list) { + if (cl->recv || cl->dead) + return cl; + } + return NULL; +} + +/* client_mutex is locked */ +static struct client *find_client_id(uint32_t id) +{ + struct client *cl; + + list_for_each_entry(cl, &client_list, list) { + if (cl->id == id) + return cl; + } + return NULL; +} + +/* client_mutex is locked */ +static struct client *find_client_pi(int pi) +{ + struct client *cl; + + list_for_each_entry(cl, &client_list, list) { + if (cl->pi == pi) + return cl; + } + return NULL; +} + +/* + * wake up poll() because we have added an fd + * back into pollfd and poll() needs to be restarted + * to recognize it. + */ +static void restart_poll(void) +{ + int rv; + rv = write(restart_fds[1], "w", 1); + if (!rv || rv < 0) + log_debug("restart_poll write %d", errno); +} + +/* poll will take requests from client again, cl->mutex must be held */ +static void client_resume(struct client *cl) +{ + if (cl->dead) + return; + + if (!cl->poll_ignore || cl->fd == -1 || cl->pi == -1) { + /* shouldn't happen */ + log_error("client_resume %d bad state ig %d fd %d pi %d", + cl->id, cl->poll_ignore, cl->fd, cl->pi); + return; + } + + pthread_mutex_lock(&pollfd_mutex); + if (pollfd[cl->pi].fd != POLL_FD_IGNORE) { + log_error("client_resume %d pi %d fd %d not IGNORE", + cl->id, cl->pi, cl->fd); + } + pollfd[cl->pi].fd = cl->fd; + pollfd[cl->pi].events = POLLIN; + pthread_mutex_unlock(&pollfd_mutex); + + restart_poll(); +} + +/* called from client_thread, cl->mutex is held */ +static void client_send_result(struct client *cl, struct action *act) +{ + response res; + char result_flags[128]; + + if (cl->dead) { + log_debug("client send %d skip dead", cl->id); + return; + } + + memset(result_flags, 0, sizeof(result_flags)); + + buffer_init(&res.buffer); + + /* + * EUNATCH is returned when the global lock existed, + * but had been disabled when we tried to lock it, + * so we removed it, and no longer have a gl to lock. 
+ */ + + if (act->result == -EUNATCH) + act->result = -ENOLS; + + /* + * init_vg with dlm|sanlock returns vg_args + * init_lv with sanlock returns lv_args + */ + + if (act->result == -ENOLS) { + /* + * The lockspace could not be found, in which case + * the caller may want to know if any lockspaces exist + * or if lockspaces exist, but not one with the global lock. + * Given this detail, it may be able to procede without + * the lock. + * + * FIXME: it would also help the caller to know if there + * are other sanlock VGs that have not been started. + * If there are, then one of them might have a global + * lock enabled. In that case, vgcreate may not want + * to create a new sanlock vg with gl enabled. + */ + pthread_mutex_lock(&lockspaces_mutex); + if (list_empty(&lockspaces)) + strcat(result_flags, "NO_LOCKSPACES,"); + pthread_mutex_unlock(&lockspaces_mutex); + + if (gl_use_sanlock && !gl_lsname_sanlock[0]) + strcat(result_flags, "NO_GL_LS,"); + else if (gl_use_dlm && !gl_lsname_dlm[0]) + strcat(result_flags, "NO_GL_LS,"); + else + strcat(result_flags, "NO_GL_LS,"); + } + + if (act->flags & LD_AF_DUP_GL_LS) + strcat(result_flags, "DUP_GL_LS,"); + + if (act->flags & LD_AF_INACTIVE_LS) + strcat(result_flags, "INACTIVE_LS,"); + + if (act->flags & LD_AF_ADD_LS_ERROR) + strcat(result_flags, "ADD_LS_ERROR,"); + + if (act->op == LD_OP_INIT) { + /* + * init is a special case where lock args need + * to be passed back to the client. + */ + const char *vg_args = "none"; + const char *lv_args = "none"; + + if (act->vg_args[0]) + vg_args = act->vg_args; + + if (act->lv_args[0]) + lv_args = act->lv_args; + + log_debug("send %s[%d.%u] %s %s rv %d vg_args %s lv_args %s", + cl->name[0] ? cl->name : "client", cl->pid, cl->id, + op_str(act->op), rt_str(act->rt), + act->result, vg_args ? vg_args : "", lv_args ? 
lv_args : ""); + + res = daemon_reply_simple("OK", + "op = %d", act->op, + "op_result = %d", act->result, + "lm_result = %d", act->lm_rv, + "vg_lock_args = %s", vg_args, + "lv_lock_args = %s", lv_args, + "result_flags = %s", result_flags[0] ? result_flags : "none", + NULL); + } else { + /* + * A normal reply. + */ + + log_debug("send %s[%d.%u] %s %s rv %d %s %s", + cl->name[0] ? cl->name : "client", cl->pid, cl->id, + op_str(act->op), rt_str(act->rt), + act->result, (act->result == -ENOLS) ? "ENOLS" : "", result_flags); + + res = daemon_reply_simple("OK", + "op = %d", act->op, + "lock_type = %s", lm_str(act->lm_type), + "op_result = %d", act->result, + "lm_result = %d", act->lm_rv, + "result_flags = %s", result_flags[0] ? result_flags : "none", + NULL); + } + + buffer_write(cl->fd, &res.buffer); + buffer_destroy(&res.buffer); + + client_resume(cl); +} + +/* called from client_thread */ +static void client_purge(struct client *cl) +{ + struct lockspace *ls; + struct action *act; + + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry(ls, &lockspaces, list) { + if (!(act = alloc_action())) + continue; + + act->op = LD_OP_CLOSE; + act->client_id = cl->id; + + pthread_mutex_lock(&ls->mutex); + if (!ls->thread_stop) { + list_add_tail(&act->list, &ls->actions); + ls->thread_work = 1; + pthread_cond_signal(&ls->cond); + } else { + free_action(act); + } + pthread_mutex_unlock(&ls->mutex); + } + pthread_mutex_unlock(&lockspaces_mutex); +} + +static int add_lock_action(struct action *act) +{ + struct lockspace *ls = NULL; + char ls_name[MAX_NAME+1]; + + memset(ls_name, 0, sizeof(ls_name)); + + /* Determine which lockspace this action is for, and set ls_name. 
*/ + + if (act->rt == LD_RT_GL && gl_use_sanlock && + (act->op == LD_OP_ENABLE || act->op == LD_OP_DISABLE)) + vg_ls_name(act->vg_name, ls_name); + else if (act->rt == LD_RT_GL) + gl_ls_name(ls_name); + else + vg_ls_name(act->vg_name, ls_name); + + retry: + pthread_mutex_lock(&lockspaces_mutex); + if (ls_name[0]) + ls = find_lockspace_name(ls_name); + if (!ls) { + int ls_inactive = 0; + int ls_create_fail = 0; + + ls = find_lockspace_inactive(ls_name); + if (ls) { + ls_inactive = 1; + ls_create_fail = ls->create_fail; + ls = NULL; + } + pthread_mutex_unlock(&lockspaces_mutex); + + if (act->op == LD_OP_UPDATE && act->rt == LD_RT_VG) { + log_debug("lockspace not found ignored for vg update"); + return -ENOLS; + + } else if (act->flags & LD_AF_SEARCH_LS) { + /* fail if we've already tried searching for the ls */ + log_error("lockspace search repeated %s", ls_name); + return -ENOLS; + + } else if (act->op == LD_OP_LOCK && act->rt == LD_RT_GL && gl_use_sanlock) { + /* gl may have been enabled in an existing vg */ + log_debug("gl lockspace not found check sanlock vgs"); + act->flags |= LD_AF_SEARCH_LS; + add_work_action(act); + return 0; + + } else if (act->op == LD_OP_LOCK && act->rt == LD_RT_GL && gl_use_dlm) { + log_debug("gl lockspace not found add dlm global"); + act->flags |= LD_AF_SEARCH_LS; + act->flags |= LD_AF_WAIT_STARTING; + add_dlm_global_lockspace(NULL); + gl_ls_name(ls_name); + goto retry; + + } else if (act->op == LD_OP_LOCK && act->mode == LD_LK_UN) { + log_debug("lockspace not found ignored for unlock"); + return -ENOLS; + + } else if (act->op == LD_OP_LOCK && act->rt == LD_RT_VG && ls_inactive) { + /* ls has been stopped or previously failed to start */ + log_debug("lockspace inactive create_fail %d %s", + ls_create_fail, ls_name); + act->flags |= LD_AF_INACTIVE_LS; + if (ls_create_fail) + act->flags |= LD_AF_ADD_LS_ERROR; + return -ENOLS; + + } else { + log_error("lockspace not found %s", ls_name); + return -ENOLS; + } + } + + if (act->lm_type == 
LD_LM_NONE) { + /* return to the command the type we are using */ + act->lm_type = ls->lm_type; + } else if (act->lm_type != ls->lm_type) { + /* should not happen */ + log_error("S %s add_lock_action bad lm_type %d ls %d", + ls_name, act->lm_type, ls->lm_type); + return -EINVAL; + } + + pthread_mutex_lock(&ls->mutex); + if (ls->thread_stop && ls->thread_done) { + log_debug("lockspace is done finish cleanup %s", ls_name); + pthread_join(ls->thread, NULL); + list_del(&ls->list); + pthread_mutex_unlock(&ls->mutex); + free_ls_resources(ls); + free(ls); + pthread_mutex_unlock(&lockspaces_mutex); + goto retry; + } + + if (ls->thread_stop) { + pthread_mutex_unlock(&ls->mutex); + pthread_mutex_unlock(&lockspaces_mutex); + log_error("lockspace is stopping %s", ls_name); + return -ESTALE; + } + + if (!ls->create_fail && !ls->create_done && !(act->flags & LD_AF_WAIT_STARTING)) { + pthread_mutex_unlock(&ls->mutex); + pthread_mutex_unlock(&lockspaces_mutex); + log_debug("lockspace is starting %s", ls_name); + return -ESTARTING; + } + + list_add_tail(&act->list, &ls->actions); + ls->thread_work = 1; + pthread_cond_signal(&ls->cond); + pthread_mutex_unlock(&ls->mutex); + pthread_mutex_unlock(&lockspaces_mutex); + + /* lockspace_thread_main / res_process take it from here */ + + return 0; +} + +static int str_to_op_rt(const char *req_name, int *op, int *rt) +{ + if (!req_name) + goto out; + + if (!strcmp(req_name, "hello")) { + *op = LD_OP_HELLO; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "quit")) { + *op = LD_OP_QUIT; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "info")) { + *op = LD_OP_DUMP_INFO; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "dump")) { + *op = LD_OP_DUMP_LOG; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "init_vg")) { + *op = LD_OP_INIT; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "init_lv")) { + *op = LD_OP_INIT; + *rt = LD_RT_LV; + return 0; + } + if (!strcmp(req_name, "free_vg")) { + *op = LD_OP_FREE; + *rt = LD_RT_VG; + 
return 0; + } + if (!strcmp(req_name, "free_lv")) { + *op = LD_OP_FREE; + *rt = LD_RT_LV; + return 0; + } + if (!strcmp(req_name, "start_vg")) { + *op = LD_OP_START; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "stop_vg")) { + *op = LD_OP_STOP; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "start_wait")) { + *op = LD_OP_START_WAIT; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "stop_all")) { + *op = LD_OP_STOP_ALL; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "lock_gl")) { + *op = LD_OP_LOCK; + *rt = LD_RT_GL; + return 0; + } + if (!strcmp(req_name, "lock_vg")) { + *op = LD_OP_LOCK; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "lock_lv")) { + *op = LD_OP_LOCK; + *rt = LD_RT_LV; + return 0; + } + if (!strcmp(req_name, "vg_update")) { + *op = LD_OP_UPDATE; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "enable_gl")) { + *op = LD_OP_ENABLE; + *rt = LD_RT_GL; + return 0; + } + if (!strcmp(req_name, "disable_gl")) { + *op = LD_OP_DISABLE; + *rt = LD_RT_GL; + return 0; + } + if (!strcmp(req_name, "rename_vg_before")) { + *op = LD_OP_RENAME_BEFORE; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "rename_vg_final")) { + *op = LD_OP_RENAME_FINAL; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "running_lm")) { + *op = LD_OP_RUNNING_LM; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "find_free_lock")) { + *op = LD_OP_FIND_FREE_LOCK; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "forget_vg_name")) { + *op = LD_OP_FORGET_VG_NAME; + *rt = LD_RT_VG; + return 0; + } +out: + return -1; +} + +static int str_to_mode(const char *str) +{ + if (!str) + goto out; + if (!strcmp(str, "un")) + return LD_LK_UN; + if (!strcmp(str, "nl")) + return LD_LK_NL; + if (!strcmp(str, "sh")) + return LD_LK_SH; + if (!strcmp(str, "ex")) + return LD_LK_EX; +out: + return LD_LK_IV; +} + +static int str_to_lm(const char *str) +{ + if (!str || !strcmp(str, "none")) + return LD_LM_NONE; + if (!strcmp(str, 
"sanlock")) + return LD_LM_SANLOCK; + if (!strcmp(str, "dlm")) + return LD_LM_DLM; + return -2; +} + +static uint32_t str_to_opts(const char *str) +{ + uint32_t flags = 0; + + if (!str) + goto out; + if (strstr(str, "persistent")) + flags |= LD_AF_PERSISTENT; + if (strstr(str, "unlock_cancel")) + flags |= LD_AF_UNLOCK_CANCEL; + if (strstr(str, "next_version")) + flags |= LD_AF_NEXT_VERSION; + if (strstr(str, "wait")) + flags |= LD_AF_WAIT; + if (strstr(str, "force")) + flags |= LD_AF_FORCE; + if (strstr(str, "ex_disable")) + flags |= LD_AF_EX_DISABLE; + if (strstr(str, "enable")) + flags |= LD_AF_ENABLE; + if (strstr(str, "disable")) + flags |= LD_AF_DISABLE; +out: + return flags; +} + +/* + * dump info + * client_list: each client struct + * lockspaces: each lockspace struct + * lockspace actions: each action struct + * lockspace resources: each resource struct + * lockspace resource actions: each action struct + * lockspace resource locks: each lock struct + */ + +static int setup_dump_socket(void) +{ + int s; + + s = socket(AF_LOCAL, SOCK_DGRAM, 0); + if (s < 0) + return s; + + memset(&dump_addr, 0, sizeof(dump_addr)); + dump_addr.sun_family = AF_LOCAL; + strcpy(&dump_addr.sun_path[1], DUMP_SOCKET_NAME); + dump_addrlen = sizeof(sa_family_t) + strlen(dump_addr.sun_path+1) + 1; + + return s; +} + +static int send_dump_buf(int fd, int dump_len) +{ + int pos = 0; + int ret; + +retry: + ret = sendto(fd, dump_buf + pos, dump_len - pos, MSG_DONTWAIT | MSG_NOSIGNAL, + (struct sockaddr *)&dump_addr, dump_addrlen); + if (ret <= 0) + return ret; + + pos += ret; + + if (pos < dump_len) + goto retry; + + return 0; +} + +static int print_structs(const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "unused_action_count=%d " + "unused_client_count=%d " + "unused_resource_count=%d " + "unused_lock_count=%d\n", + prefix, + unused_action_count, + unused_client_count, + unused_resource_count, + unused_lock_count); +} + +static int 
print_client(struct client *cl, const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "pid=%d " + "fd=%d " + "pi=%d " + "id=%u " + "name=%s\n", + prefix, + cl->pid, + cl->fd, + cl->pi, + cl->id, + cl->name[0] ? cl->name : "."); +} + +static int print_lockspace(struct lockspace *ls, const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "ls_name=%s " + "vg_name=%s " + "vg_uuid=%s " + "vg_sysid=%s " + "vg_args=%s " + "lm_type=%s " + "host_id=%llu " + "create_fail=%d " + "create_done=%d " + "thread_work=%d " + "thread_stop=%d " + "thread_done=%d " + "sanlock_gl_enabled=%d " + "sanlock_gl_dup=%d\n", + prefix, + ls->name, + ls->vg_name, + ls->vg_uuid, + ls->vg_sysid[0] ? ls->vg_sysid : ".", + ls->vg_args, + lm_str(ls->lm_type), + (unsigned long long)ls->host_id, + ls->create_fail ? 1 : 0, + ls->create_done ? 1 : 0, + ls->thread_work ? 1 : 0, + ls->thread_stop ? 1 : 0, + ls->thread_done ? 1 : 0, + ls->sanlock_gl_enabled ? 1 : 0, + ls->sanlock_gl_dup ? 
1 : 0); +} + +static int print_action(struct action *act, const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "client_id=%u " + "flags=0x%x " + "version=%u " + "op=%s " + "rt=%s " + "mode=%s " + "lm_type=%s " + "result=%d " + "lm_rv=%d\n", + prefix, + act->client_id, + act->flags, + act->version, + op_str(act->op), + rt_str(act->rt), + mode_str(act->mode), + lm_str(act->lm_type), + act->result, + act->lm_rv); +} + +static int print_resource(struct resource *r, const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "name=%s " + "type=%s " + "mode=%s " + "sh_count=%d " + "version=%u\n", + prefix, + r->name, + rt_str(r->type), + mode_str(r->mode), + r->sh_count, + r->version); +} + +static int print_lock(struct lock *lk, const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "mode=%s " + "version=%u " + "flags=0x%x " + "client_id=%u\n", + prefix, + mode_str(lk->mode), + lk->version, + lk->flags, + lk->client_id); +} + +static int dump_info(int *dump_len) +{ + struct client *cl; + struct lockspace *ls; + struct resource *r; + struct lock *lk; + struct action *act; + int len, pos, ret; + int rv = 0; + + memset(dump_buf, 0, sizeof(dump_buf)); + len = sizeof(dump_buf); + pos = 0; + + /* + * memory + */ + + pthread_mutex_lock(&unused_struct_mutex); + ret = print_structs("structs", pos, len); + if (ret >= len - pos) { + pthread_mutex_unlock(&unused_struct_mutex); + return -ENOSPC; + } + pos += ret; + pthread_mutex_unlock(&unused_struct_mutex); + + /* + * clients + */ + + pthread_mutex_lock(&client_mutex); + list_for_each_entry(cl, &client_list, list) { + ret = print_client(cl, "client", pos, len); + if (ret >= len - pos) { + rv = -ENOSPC; + break; + } + pos += ret; + } + pthread_mutex_unlock(&client_mutex); + + if (rv < 0) + return rv; + + /* + * lockspaces with their action/resource/lock info + */ + + 
/*
 * Read and dispatch one request from a connected lvm command.
 * Called from client_thread; cl->mutex is held by the caller.
 *
 * Control-path requests (hello/quit/dump-info/dump-log) are answered
 * synchronously here.  All other requests are turned into a struct
 * action and queued to a lockspace or worker thread; the reply is sent
 * later by client_send_result() when the action completes.
 *
 * Every early-return path must destroy req.cft/req.buffer and call
 * client_resume(cl) so main_loop starts polling this fd again.
 */
static void client_recv_action(struct client *cl)
{
	request req;
	response res;
	struct action *act;
	const char *cl_name;
	const char *vg_name;
	const char *vg_uuid;
	const char *vg_sysid;
	const char *str;
	int64_t val;
	uint32_t opts = 0;
	int result = 0;
	int cl_pid;
	int op, rt, lm, mode;
	int rv;

	buffer_init(&req.buffer);

	/* NOTE(review): buffer_read appears to return false/0 on failure
	   with the cause in errno — confirm against daemon-io. */
	rv = buffer_read(cl->fd, &req.buffer);
	if (!rv) {
		if (errno == ECONNRESET) {
			/* peer went away; flag for cleanup in client_thread_main */
			log_debug("client recv %d ECONNRESET", cl->id);
			cl->dead = 1;
		} else {
			log_error("client recv %d buffer_read error %d", cl->id, errno);
		}
		buffer_destroy(&req.buffer);
		client_resume(cl);
		return;
	}

	req.cft = dm_config_from_string(req.buffer.mem);
	if (!req.cft) {
		log_error("client recv %d config_from_string error", cl->id);
		buffer_destroy(&req.buffer);
		client_resume(cl);
		return;
	}

	str = daemon_request_str(req, "request", NULL);
	rv = str_to_op_rt(str, &op, &rt);
	if (rv < 0) {
		log_error("client recv %d bad request name \"%s\"", cl->id, str ? str : "");
		dm_config_destroy(req.cft);
		buffer_destroy(&req.buffer);
		client_resume(cl);
		return;
	}

	if (op == LD_OP_HELLO || op == LD_OP_QUIT ||
	    op == LD_OP_DUMP_INFO || op == LD_OP_DUMP_LOG) {

		/*
		 * FIXME: add the client command name to the hello messages
		 * so it can be saved in cl->name here.
		 */

		result = 0;

		if (op == LD_OP_QUIT) {
			/* only allow shutdown when no lockspaces remain */
			log_debug("op quit");
			pthread_mutex_lock(&lockspaces_mutex);
			if (list_empty(&lockspaces)) {
				daemon_quit = 1;
			} else {
				result = -EBUSY;
			}
			pthread_mutex_unlock(&lockspaces_mutex);
		}

		buffer_init(&res.buffer);

		if (op == LD_OP_DUMP_INFO || op == LD_OP_DUMP_LOG) {
			int dump_len = 0;
			int fd;

			fd = setup_dump_socket();
			if (fd < 0)
				result = fd;
			else if (op == LD_OP_DUMP_INFO)
				result = dump_info(&dump_len);
			else if (op == LD_OP_DUMP_LOG)
				result = dump_log(&dump_len);
			else
				result = -EINVAL;

			res = daemon_reply_simple("OK",
						  "result = %d", result,
						  "dump_len = %d", dump_len,
						  NULL);
			/* dump data itself goes out-of-band over the dump socket */
			if (fd >= 0) {
				send_dump_buf(fd, dump_len);
				close(fd);
			}

		} else {
			res = daemon_reply_simple("OK",
						  "result = %d", result,
						  "protocol = %s", lvmlockd_protocol,
						  "version = %d", lvmlockd_protocol_version,
						  NULL);
		}

		buffer_write(cl->fd, &res.buffer);
		buffer_destroy(&res.buffer);
		dm_config_destroy(req.cft);
		buffer_destroy(&req.buffer);
		client_resume(cl);
		return;
	}

	/* Lock/op requests: pull the common fields out of the request. */

	cl_name = daemon_request_str(req, "cmd", NULL);
	cl_pid = daemon_request_int(req, "pid", 0);
	vg_name = daemon_request_str(req, "vg_name", NULL);
	vg_uuid = daemon_request_str(req, "vg_uuid", NULL);
	vg_sysid = daemon_request_str(req, "vg_sysid", NULL);
	str = daemon_request_str(req, "mode", NULL);
	mode = str_to_mode(str);
	str = daemon_request_str(req, "opts", NULL);
	opts = str_to_opts(str);
	str = daemon_request_str(req, "vg_lock_type", NULL);
	lm = str_to_lm(str);

	if (cl_pid && cl_pid != cl->pid)
		log_error("client recv bad message pid %d client %d", cl_pid, cl->pid);

	/* FIXME: do this in hello message instead */
	if (!cl->name[0] && cl_name)
		strncpy(cl->name, cl_name, MAX_NAME);

	/*
	 * First lockd-type VG seen decides which lock manager provides
	 * the global lock, until one is chosen explicitly.
	 */
	if (!gl_use_dlm && !gl_use_sanlock && (lm > 0)) {
		if (lm == LD_LM_DLM)
			gl_use_dlm = 1;
		else if (lm == LD_LM_SANLOCK)
			gl_use_sanlock = 1;

		log_debug("set gl_use_%s", lm_str(lm));
	}

	if (!(act = alloc_action())) {
		log_error("No memory for action");
		dm_config_destroy(req.cft);
		buffer_destroy(&req.buffer);
		client_resume(cl);
		return;
	}

	act->client_id = cl->id;
	act->op = op;
	act->rt = rt;
	act->mode = mode;
	act->flags = opts;
	act->lm_type = lm;

	/* "none" is the wire encoding for an absent string field */

	if (vg_name && strcmp(vg_name, "none"))
		strncpy(act->vg_name, vg_name, MAX_NAME);

	if (vg_uuid && strcmp(vg_uuid, "none"))
		strncpy(act->vg_uuid, vg_uuid, 64);

	if (vg_sysid && strcmp(vg_sysid, "none"))
		strncpy(act->vg_sysid, vg_sysid, MAX_NAME);

	str = daemon_request_str(req, "lv_name", NULL);
	if (str && strcmp(str, "none"))
		strncpy(act->lv_name, str, MAX_NAME);

	str = daemon_request_str(req, "lv_uuid", NULL);
	if (str && strcmp(str, "none"))
		strncpy(act->lv_uuid, str, MAX_NAME);

	val = daemon_request_int(req, "version", 0);
	if (val)
		act->version = (uint32_t)val;

	str = daemon_request_str(req, "vg_lock_args", NULL);
	if (str && strcmp(str, "none"))
		strncpy(act->vg_args, str, MAX_ARGS);

	str = daemon_request_str(req, "lv_lock_args", NULL);
	if (str && strcmp(str, "none"))
		strncpy(act->lv_args, str, MAX_ARGS);

	/* start_vg will include lvmlocal.conf local/host_id here */
	val = daemon_request_int(req, "host_id", 0);
	if (val)
		act->host_id = val;

	act->max_retries = daemon_request_int(req, "max_retries", DEFAULT_MAX_RETRIES);

	dm_config_destroy(req.cft);
	buffer_destroy(&req.buffer);

	log_debug("recv %s[%d.%u] %s %s \"%s\" mode %s flags %x",
		  cl->name[0] ? cl->name : "client", cl->pid, cl->id,
		  op_str(act->op), rt_str(act->rt), act->vg_name, mode_str(act->mode), opts);

	/* Route the action to the appropriate thread/queue. */

	switch (act->op) {
	case LD_OP_START:
		rv = add_lockspace(act);
		break;
	case LD_OP_STOP:
		rv = rem_lockspace(act);
		break;
	case LD_OP_INIT:
	case LD_OP_START_WAIT:
	case LD_OP_STOP_ALL:
	case LD_OP_RENAME_FINAL:
	case LD_OP_RUNNING_LM:
		add_work_action(act);
		rv = 0;
		break;
	case LD_OP_LOCK:
	case LD_OP_UPDATE:
	case LD_OP_ENABLE:
	case LD_OP_DISABLE:
	case LD_OP_FREE:
	case LD_OP_RENAME_BEFORE:
	case LD_OP_FIND_FREE_LOCK:
		rv = add_lock_action(act);
		break;
	case LD_OP_FORGET_VG_NAME:
		act->result = forget_lockspace_inactive(act->vg_name);
		add_client_result(act);
		break;
	default:
		rv = -EINVAL;
	};

	if (rv < 0) {
		/* queueing failed; report the error back to the client */
		act->result = rv;
		add_client_result(act);
	}
}
cl->name : "client", cl->pid, cl->id, + op_str(act->op), rt_str(act->rt), act->vg_name, mode_str(act->mode), opts); + + switch (act->op) { + case LD_OP_START: + rv = add_lockspace(act); + break; + case LD_OP_STOP: + rv = rem_lockspace(act); + break; + case LD_OP_INIT: + case LD_OP_START_WAIT: + case LD_OP_STOP_ALL: + case LD_OP_RENAME_FINAL: + case LD_OP_RUNNING_LM: + add_work_action(act); + rv = 0; + break; + case LD_OP_LOCK: + case LD_OP_UPDATE: + case LD_OP_ENABLE: + case LD_OP_DISABLE: + case LD_OP_FREE: + case LD_OP_RENAME_BEFORE: + case LD_OP_FIND_FREE_LOCK: + rv = add_lock_action(act); + break; + case LD_OP_FORGET_VG_NAME: + act->result = forget_lockspace_inactive(act->vg_name); + add_client_result(act); + break; + default: + rv = -EINVAL; + }; + + if (rv < 0) { + act->result = rv; + add_client_result(act); + } +} + +static void *client_thread_main(void *arg_in) +{ + struct client *cl; + struct action *act; + + while (1) { + pthread_mutex_lock(&client_mutex); + while (!client_work && list_empty(&client_results)) { + if (client_stop) { + pthread_mutex_unlock(&client_mutex); + goto out; + } + pthread_cond_wait(&client_cond, &client_mutex); + } + + /* + * Send outgoing results back to clients + */ + + if (!list_empty(&client_results)) { + act = list_first_entry(&client_results, struct action, list); + list_del(&act->list); + cl = find_client_id(act->client_id); + pthread_mutex_unlock(&client_mutex); + + if (cl) { + pthread_mutex_lock(&cl->mutex); + client_send_result(cl, act); + pthread_mutex_unlock(&cl->mutex); + } else { + log_debug("no client for result"); + } + free_action(act); + continue; + } + + /* + * Queue incoming actions for lockspace threads + */ + + if (client_work) { + cl = find_client_work(); + if (!cl) + client_work = 0; + pthread_mutex_unlock(&client_mutex); + + if (!cl) + continue; + + pthread_mutex_lock(&cl->mutex); + + if (cl->recv) { + cl->recv = 0; + client_recv_action(cl); + } + + if (cl->dead) { + /* + log_debug("client rem %d pi %d fd 
%d ig %d", + cl->id, cl->pi, cl->fd, cl->poll_ignore); + */ + /* + * If cl->dead was set in main_loop, then the + * fd has already been closed and the pollfd + * entry is already unused. + * main_loop set dead=1, ignore=0, pi=-1, fd=-1 + * + * if cl->dead was not set in main_loop, but + * set in client_recv_action, then the main_loop + * should be ignoring this client fd. + * main_loop set ignore=1 + */ + + if (cl->poll_ignore) { + log_debug("client close %d pi %d fd %d", + cl->id, cl->pi, cl->fd); + /* assert cl->pi != -1 */ + /* assert pollfd[pi].fd == FD_IGNORE */ + close(cl->fd); + rem_pollfd(cl->pi); + cl->pi = -1; + cl->fd = -1; + cl->poll_ignore = 0; + } else { + /* main thread should have closed */ + if (cl->pi != -1 || cl->fd != -1) { + log_error("client %d bad state pi %d fd %d", + cl->id, cl->pi, cl->fd); + } + } + pthread_mutex_unlock(&cl->mutex); + + pthread_mutex_lock(&client_mutex); + list_del(&cl->list); + pthread_mutex_unlock(&client_mutex); + + client_purge(cl); + + free_client(cl); + } else { + pthread_mutex_unlock(&cl->mutex); + } + } + pthread_mutex_unlock(&client_mutex); + } +out: + return NULL; +} + +static int setup_client_thread(void) +{ + int rv; + + INIT_LIST_HEAD(&client_list); + INIT_LIST_HEAD(&client_results); + + pthread_mutex_init(&client_mutex, NULL); + pthread_cond_init(&client_cond, NULL); + + rv = pthread_create(&client_thread, NULL, client_thread_main, NULL); + if (rv) + return -1; + return 0; +} + +static void close_client_thread(void) +{ + pthread_mutex_lock(&client_mutex); + client_stop = 1; + pthread_cond_signal(&client_cond); + pthread_mutex_unlock(&client_mutex); + pthread_join(client_thread, NULL); +} + +/* + * Get a list of all VGs with a lockd type (sanlock|dlm) from lvmetad. + * We'll match this list against a list of existing lockspaces that are + * found in the lock manager. + * + * For each of these VGs, also create a struct resource on ls->resources to + * represent each LV in the VG that uses a lock. 
/*
 * Get a list of all VGs with a lockd type (sanlock|dlm) from lvmetad.
 * We'll match this list against a list of existing lockspaces that are
 * found in the lock manager.
 *
 * For each of these VGs, also create a struct resource on ls->resources to
 * represent each LV in the VG that uses a lock.  For each of these LVs
 * that are active, we'll attempt to adopt a lock.
 *
 * Returns 0 on success with the lockd-type lockspaces appended to
 * *vg_lockd (caller owns/frees them); negative errno on failure.
 * Holds lvmetad_mutex across all lvmetad round-trips.
 */
static int get_lockd_vgs(struct list_head *vg_lockd)
{
	struct list_head update_vgs;
	daemon_reply reply;
	struct dm_config_node *cn;
	struct dm_config_node *metadata;
	struct dm_config_node *md_cn;
	struct dm_config_node *lv_cn;
	struct lockspace *ls, *safe;
	struct resource *r;
	const char *vg_name;
	const char *vg_uuid;
	const char *lv_uuid;
	const char *lock_type;
	const char *lock_args;
	char find_str_path[PATH_MAX];
	int mutex_unlocked = 0;
	int rv = 0;

	INIT_LIST_HEAD(&update_vgs);

	pthread_mutex_lock(&lvmetad_mutex);
	/* "token = skip" bypasses lvmetad's token/cache-validity check */
	reply = daemon_send_simple(lvmetad_handle, "vg_list",
				   "token = %s", "skip",
				   NULL);

	if (reply.error || strcmp(daemon_reply_str(reply, "response", ""), "OK")) {
		log_error("vg_list from lvmetad failed %d", reply.error);
		rv = -EINVAL;
		goto destroy;
	}

	if (!(cn = dm_config_find_node(reply.cft->root, "volume_groups"))) {
		log_error("get_lockd_vgs no vgs");
		rv = -EINVAL;
		goto destroy;
	}

	/* create an update_vgs list of all vg uuids */

	for (cn = cn->child; cn; cn = cn->sib) {
		vg_uuid = cn->key;

		if (!(ls = alloc_lockspace())) {
			rv = -ENOMEM;
			break;
		}

		/* NOTE(review): not explicitly NUL-terminated if the uuid is
		   64+ chars; assumes alloc_lockspace zeroes the struct and the
		   uuid is a fixed 32-char lvm uuid — confirm. */
		strncpy(ls->vg_uuid, vg_uuid, 64);
		list_add_tail(&ls->list, &update_vgs);
		log_debug("get_lockd_vgs %s", vg_uuid);
	}
 destroy:
	daemon_reply_destroy(reply);

	if (rv < 0)
		goto out;

	/* get vg_name and lock_type for each vg uuid entry in update_vgs */

	list_for_each_entry(ls, &update_vgs, list) {
		reply = daemon_send_simple(lvmetad_handle, "vg_lookup",
					   "token = %s", "skip",
					   "uuid = %s", ls->vg_uuid,
					   NULL);

		if (reply.error || strcmp(daemon_reply_str(reply, "response", ""), "OK")) {
			log_error("vg_lookup from lvmetad failed %d", reply.error);
			rv = -EINVAL;
			goto next;
		}

		vg_name = daemon_reply_str(reply, "name", NULL);
		if (!vg_name) {
			log_error("get_lockd_vgs %s no name", ls->vg_uuid);
			rv = -EINVAL;
			goto next;
		}

		strncpy(ls->vg_name, vg_name, MAX_NAME);

		metadata = dm_config_find_node(reply.cft->root, "metadata");
		if (!metadata) {
			log_error("get_lockd_vgs %s name %s no metadata",
				  ls->vg_uuid, ls->vg_name);
			rv = -EINVAL;
			goto next;
		}

		lock_type = dm_config_find_str(metadata, "metadata/lock_type", NULL);
		ls->lm_type = str_to_lm(lock_type);

		/* non-lockd VGs stay on update_vgs and are freed below */
		if ((ls->lm_type != LD_LM_SANLOCK) && (ls->lm_type != LD_LM_DLM)) {
			log_debug("get_lockd_vgs %s not lockd type", ls->vg_name);
			continue;
		}

		lock_args = dm_config_find_str(metadata, "metadata/lock_args", NULL);
		if (lock_args)
			strncpy(ls->vg_args, lock_args, MAX_ARGS);

		log_debug("get_lockd_vgs %s lock_type %s lock_args %s",
			  ls->vg_name, lock_type, lock_args ?: "none");

		/*
		 * Make a record (struct resource) of each lv that uses a lock.
		 * For any lv that uses a lock, we'll check if the lv is active
		 * and if so try to adopt a lock for it.
		 */

		for (md_cn = metadata->child; md_cn; md_cn = md_cn->sib) {
			if (strcmp(md_cn->key, "logical_volumes"))
				continue;

			for (lv_cn = md_cn->child; lv_cn; lv_cn = lv_cn->sib) {
				snprintf(find_str_path, PATH_MAX, "%s/lock_type", lv_cn->key);
				lock_type = dm_config_find_str(lv_cn, find_str_path, NULL);

				if (!lock_type)
					continue;

				snprintf(find_str_path, PATH_MAX, "%s/lock_args", lv_cn->key);
				lock_args = dm_config_find_str(lv_cn, find_str_path, NULL);

				snprintf(find_str_path, PATH_MAX, "%s/id", lv_cn->key);
				lv_uuid = dm_config_find_str(lv_cn, find_str_path, NULL);

				if (!lv_uuid) {
					log_error("get_lock_vgs no lv id for name %s", lv_cn->key);
					continue;
				}

				if (!(r = alloc_resource())) {
					rv = -ENOMEM;
					goto next;
				}

				/* the resource is keyed by the LV uuid, not the LV name */
				r->type = LD_RT_LV;
				strncpy(r->name, lv_uuid, MAX_NAME);
				if (lock_args)
					strncpy(r->lv_args, lock_args, MAX_ARGS);
				list_add_tail(&r->list, &ls->resources);
				log_debug("get_lockd_vgs %s lv %s %s (name %s)",
					  ls->vg_name, r->name, lock_args ? lock_args : "", lv_cn->key);
			}
		}
 next:
		daemon_reply_destroy(reply);

		if (rv < 0)
			break;
	}
	pthread_mutex_unlock(&lvmetad_mutex);
	mutex_unlocked = 1;
out:
	/* Return lockd VG's on the vg_lockd list. */

	list_for_each_entry_safe(ls, safe, &update_vgs, list) {
		list_del(&ls->list);

		if ((ls->lm_type == LD_LM_SANLOCK) || (ls->lm_type == LD_LM_DLM))
			list_add_tail(&ls->list, vg_lockd);
		else
			free(ls);
	}

	/* the error path from the first loop arrives here still holding the mutex */
	if (!mutex_unlocked)
		pthread_mutex_unlock(&lvmetad_mutex);

	return rv;
}
lock_args : "", lv_cn->key); + } + } + next: + daemon_reply_destroy(reply); + + if (rv < 0) + break; + } + pthread_mutex_unlock(&lvmetad_mutex); + mutex_unlocked = 1; +out: + /* Return lockd VG's on the vg_lockd list. */ + + list_for_each_entry_safe(ls, safe, &update_vgs, list) { + list_del(&ls->list); + + if ((ls->lm_type == LD_LM_SANLOCK) || (ls->lm_type == LD_LM_DLM)) + list_add_tail(&ls->list, vg_lockd); + else + free(ls); + } + + if (!mutex_unlocked) + pthread_mutex_unlock(&lvmetad_mutex); + + return rv; +} + +static char _dm_uuid[64]; + +static char *get_dm_uuid(char *dm_name) +{ + struct dm_info info; + struct dm_task *dmt; + const char *uuid; + + if (!(dmt = dm_task_create(DM_DEVICE_INFO))) + goto fail_out; + + if (!dm_task_set_name(dmt, dm_name)) + goto fail; + + if (!dm_task_run(dmt)) + goto fail; + + if (!dm_task_get_info(dmt, &info)) + goto fail; + + if (!info.exists) + goto fail; + + uuid = dm_task_get_uuid(dmt); + if (!uuid) { + log_error("Failed to get uuid for device %s", dm_name); + goto fail; + } + + if (strncmp(uuid, "LVM", 3)) { + log_debug("dm device %s is not from LVM", dm_name); + goto fail; + } + + memset(_dm_uuid, 0, sizeof(_dm_uuid)); + strcpy(_dm_uuid, uuid); + dm_task_destroy(dmt); + return _dm_uuid; + +fail: + dm_task_destroy(dmt); +fail_out: + return NULL; +} + +/* + * dm reports the LV uuid as: + * LVM-ydpRIdDWBDX25upmj2k0D4deat6oxH8er03T0f4xM8rPIV8XqIhwv3h8Y7xRWjMr + * + * the lock name for the LV is: + * r03T0f-4xM8-rPIV-8XqI-hwv3-h8Y7-xRWjMr + * + * This function formats both as: + * r03T0f4xM8rPIV8XqIhwv3h8Y7xRWjMr + * + * and returns 1 if they match. 
+ */ + +static int match_dm_uuid(char *dm_uuid, char *lv_lock_uuid) +{ + char buf1[64]; + char buf2[64]; + int i, j; + + memset(buf1, 0, sizeof(buf1)); + memset(buf2, 0, sizeof(buf2)); + + for (i = 0, j = 0; i < strlen(lv_lock_uuid); i++) { + if (lv_lock_uuid[i] == '-') + continue; + buf1[j] = lv_lock_uuid[i]; + j++; + } + + for (i = 36, j = 0; i < 69; i++) { + buf2[j] = dm_uuid[i]; + j++; + } + + if (!strcmp(buf1, buf2)) + return 1; + return 0; +} + +/* + * All LVs with a lock_type are on ls->resources. + * Remove any that are not active. The remaining + * will have locks adopted. + */ + +static int remove_inactive_lvs(struct list_head *vg_lockd) +{ + struct lockspace *ls; + struct resource *r, *rsafe; + struct dm_names *names; + struct dm_task *dmt; + char *dm_uuid; + char *vgname, *lvname, *layer; + char namebuf[MAX_NAME+1]; + unsigned next = 0; + int rv = 0; + + if (!(dmt = dm_task_create(DM_DEVICE_LIST))) + return -1; + + if (!dm_task_run(dmt)) { + log_error("Failed to get dm devices"); + rv = -1; + goto ret; + } + + if (!(names = dm_task_get_names(dmt))) { + log_error("Failed to get dm names"); + rv = -1; + goto ret; + } + + if (!names->dev) { + log_debug("dm names none found"); + goto out; + } + + /* + * For each dm name, compare it to each lv in each lockd vg. + */ + + do { + names = (struct dm_names *)((char *) names + next); + + dm_uuid = get_dm_uuid(names->name); + if (!dm_uuid) + goto next_dmname; + + vgname = NULL; + lvname = NULL; + layer = NULL; + + memset(namebuf, 0, sizeof(namebuf)); + strncpy(namebuf, names->name, MAX_NAME); + vgname = namebuf; + + dm_split_lvm_name(NULL, NULL, &vgname, &lvname, &layer); + + log_debug("adopt remove_inactive dm name %s dm uuid %s vgname %s lvname %s", + names->name, dm_uuid, vgname, lvname); + + if (!vgname || !lvname) { + log_debug("dm name %s invalid split vg %s lv %s layer %s", + names->name, vgname ? vgname : "", lvname ? lvname : "", layer ? 
layer : ""); + goto next_dmname; + } + + list_for_each_entry(ls, vg_lockd, list) { + if (strcmp(vgname, ls->vg_name)) + continue; + + if (!strcmp(lvname, "lvmlock")) + continue; + + list_for_each_entry(r, &ls->resources, list) { + if (!match_dm_uuid(dm_uuid, r->name)) + continue; + + /* Found an active LV in a lockd VG. */ + log_debug("dm device %s adopt in vg %s lv %s", + names->name, ls->vg_name, r->name); + r->adopt = 1; + goto next_dmname; + } + } +next_dmname: + next = names->next; + } while (next); + +out: + /* Remove any struct resources that do not need locks adopted. */ + list_for_each_entry(ls, vg_lockd, list) { + list_for_each_entry_safe(r, rsafe, &ls->resources, list) { + if (r->adopt) { + r->adopt = 0; + } else { + log_debug("lockd vg %s remove inactive lv %s", ls->vg_name, r->name); + list_del(&r->list); + free_resource(r); + } + } + } +ret: + dm_task_destroy(dmt); + return rv; +} + +static void adopt_locks(void) +{ + struct list_head ls_found; + struct list_head vg_lockd; + struct list_head to_unlock; + struct lockspace *ls, *lsafe; + struct lockspace *ls1, *l1safe; + struct lockspace *ls2, *l2safe; + struct resource *r, *rsafe; + struct action *act, *asafe; + int count_start = 0, count_start_done = 0, count_start_fail = 0; + int count_adopt = 0, count_adopt_done = 0, count_adopt_fail = 0; + int found, rv; + + INIT_LIST_HEAD(&adopt_results); + + INIT_LIST_HEAD(&ls_found); + INIT_LIST_HEAD(&vg_lockd); + INIT_LIST_HEAD(&to_unlock); + + /* + * Get list of lockspaces from lock managers. + * Get list of VGs from lvmetad with a lockd type. + * Get list of active lockd type LVs from /dev. + * + * ECONNREFUSED means the lock manager is not running. + * This is expected for at least one of them. 
/*
 * At startup (with --adopt), try to rejoin lockspaces and re-acquire the
 * locks left behind by a previous instance of lvmlockd, instead of
 * forcing a host reset.
 *
 * Phases:
 *  1. collect existing lockspaces from the lock managers (ls_found),
 *     lockd VGs from lvmetad (vg_lockd), and active LVs from dm;
 *  2. merge/reconcile the two lists;
 *  3. queue start actions to rejoin each lockspace and wait;
 *  4. queue lock-adopt actions for LV locks (ex first, then sh on
 *     -EUCLEAN) and GL/VG locks (sh first, then ex) and wait;
 *  5. unlock the adopted GL/VG locks and wait.
 *
 * All queued actions carry LD_AF_ADOPT and client_id ADOPT_CLIENT_ID so
 * their results come back on adopt_results (protected by client_mutex),
 * which this function polls with sleep(1).
 *
 * On any unrecoverable error it only logs "reset host"; recovery/
 * fencing is outside this function.  NOTE(review): the fail: paths do
 * not free partially-built lists/actions — startup-only leak, verify
 * acceptable.
 */
static void adopt_locks(void)
{
	struct list_head ls_found;
	struct list_head vg_lockd;
	struct list_head to_unlock;
	struct lockspace *ls, *lsafe;
	struct lockspace *ls1, *l1safe;
	struct lockspace *ls2, *l2safe;
	struct resource *r, *rsafe;
	struct action *act, *asafe;
	int count_start = 0, count_start_done = 0, count_start_fail = 0;
	int count_adopt = 0, count_adopt_done = 0, count_adopt_fail = 0;
	int found, rv;

	INIT_LIST_HEAD(&adopt_results);

	INIT_LIST_HEAD(&ls_found);
	INIT_LIST_HEAD(&vg_lockd);
	INIT_LIST_HEAD(&to_unlock);

	/*
	 * Get list of lockspaces from lock managers.
	 * Get list of VGs from lvmetad with a lockd type.
	 * Get list of active lockd type LVs from /dev.
	 *
	 * ECONNREFUSED means the lock manager is not running.
	 * This is expected for at least one of them.
	 */

	rv = lm_get_lockspaces_dlm(&ls_found);
	if ((rv < 0) && (rv != -ECONNREFUSED))
		goto fail;

	rv = lm_get_lockspaces_sanlock(&ls_found);
	if ((rv < 0) && (rv != -ECONNREFUSED))
		goto fail;

	if (list_empty(&ls_found)) {
		log_debug("No lockspaces found to adopt");
		return;
	}

	/*
	 * Adds a struct lockspace to vg_lockd for each lockd VG.
	 * Adds a struct resource to ls->resources for each LV.
	 */
	rv = get_lockd_vgs(&vg_lockd);
	if (rv < 0) {
		log_error("adopt_locks get_lockd_vgs failed");
		goto fail;
	}

	/*
	 * For each resource on each lockspace, check if the
	 * corresponding LV is active.  If so, leave the
	 * resource struct, if not free the resource struct.
	 * The remaining entries need to have locks adopted.
	 */
	rv = remove_inactive_lvs(&vg_lockd);
	if (rv < 0) {
		log_error("adopt_locks remove_inactive_lvs failed");
		goto fail;
	}

	list_for_each_entry(ls, &ls_found, list) {
		if (ls->lm_type == LD_LM_DLM)
			gl_use_dlm = 1;

		log_debug("adopt %s lockspace %s vg %s",
			  lm_str(ls->lm_type), ls->name, ls->vg_name);
	}

	if (!gl_use_dlm)
		gl_use_sanlock = 1;

	list_for_each_entry(ls, &vg_lockd, list) {
		log_debug("adopt lvmetad vg %s lock_type %s lock_args %s",
			  ls->vg_name, lm_str(ls->lm_type), ls->vg_args);

		list_for_each_entry(r, &ls->resources, list)
			log_debug("adopt lv %s %s", ls->vg_name, r->name);
	}

	/*
	 * Compare and merge the list of lockspaces in ls_found
	 * and the list of lockd VGs in vg_lockd.
	 *
	 * An ls from ls_found may not have had any active lvs when
	 * previous lvmlockd died, but the ls should still be joined,
	 * and checked for GL/VG locks.
	 *
	 * An ls from vg_lockd with active lvs should be in ls_found.
	 * If it's not then we might want to join the ls and acquire locks
	 * for the active lvs (as opposed to adopting orphans for them.)
	 * The orphan lock in the ls should have prevented the ls in
	 * the lock manager from going away.
	 *
	 * If an ls in vg_lockd has no active lvs and does not have
	 * a matching entry in ls_found, then skip it.
	 *
	 * An ls in ls_found should always have a matching ls in
	 * vg_lockd.  If it doesn't, then maybe the vg has been
	 * removed even though the lockspace for the vg is still
	 * in the lock manager.  Just leave the ls in the lm
	 * alone, and skip the ls_found entry.
	 */

	list_for_each_entry_safe(ls1, l1safe, &ls_found, list) {

		/* The dlm global lockspace is special and doesn't match a VG. */
		if (!strcmp(ls1->name, gl_lsname_dlm)) {
			list_del(&ls1->list);
			free(ls1);
			continue;
		}

		found = 0;

		list_for_each_entry_safe(ls2, l2safe, &vg_lockd, list) {
			if (strcmp(ls1->vg_name, ls2->vg_name))
				continue;

			/*
			 * LS in both ls_found and vg_lockd.
			 * Move the lvmetad details (uuid/args/resources)
			 * onto the lock-manager entry and drop ls2.
			 */
			log_debug("ls %s matches vg %s", ls1->name, ls2->vg_name);
			memcpy(ls1->vg_uuid, ls2->vg_uuid, 64);
			memcpy(ls1->vg_args, ls2->vg_args, MAX_ARGS);
			list_for_each_entry_safe(r, rsafe, &ls2->resources, list) {
				list_del(&r->list);
				list_add(&r->list, &ls1->resources);
			}
			list_del(&ls2->list);
			free(ls2);
			found = 1;
			break;
		}

		/*
		 * LS in ls_found, not in vg_lockd.
		 * An lvm lockspace found in the lock manager has no
		 * corresponding VG in lvmetad.  This shouldn't usually
		 * happen, but it's possible the VG could have been removed
		 * while the orphaned lockspace from it was still around.
		 * Report an error and leave the ls in the lm alone.
		 */
		if (!found) {
			log_error("No VG %s found for lockspace %s %s",
				  ls1->vg_name, ls1->name, lm_str(ls1->lm_type));
			list_del(&ls1->list);
			free(ls1);
		}
	}

	/*
	 * LS in vg_lockd, not in ls_found.
	 * lockd vgs from lvmetad that do not have an existing lockspace.
	 * This wouldn't be unusual; we just skip the vg.
	 * But, if the vg has active lvs, then it should have had locks
	 * and a lockspace.  Should we attempt to join the lockspace and
	 * acquire (not adopt) locks for these LVs?
	 */

	list_for_each_entry_safe(ls, lsafe, &vg_lockd, list) {
		if (!list_empty(&ls->resources)) {
			/* We should have found a lockspace. */
			/* add this ls and acquire locks for ls->resources? */
			log_error("No lockspace %s %s found for VG %s with active LVs",
				  ls->name, lm_str(ls->lm_type), ls->vg_name);
		} else {
			/* The VG wasn't started in the previous lvmlockd. */
			log_debug("No ls found for vg %s", ls->vg_name);
		}

		list_del(&ls->list);
		free(ls);
	}

	/*
	 * Create and queue start actions to add lockspaces.
	 */

	if (gl_use_dlm) {
		if (!(act = alloc_action()))
			goto fail;
		log_debug("adopt add dlm global lockspace");
		act->op = LD_OP_START;
		act->flags = (LD_AF_ADOPT | LD_AF_WAIT);
		act->rt = LD_RT_GL;
		act->lm_type = LD_LM_DLM;
		act->client_id = ADOPT_CLIENT_ID;
		add_dlm_global_lockspace(act);
		count_start++;
	}

	list_for_each_entry_safe(ls, lsafe, &ls_found, list) {
		if (!(act = alloc_action()))
			goto fail;
		act->op = LD_OP_START;
		act->flags = (LD_AF_ADOPT | LD_AF_WAIT);
		act->rt = LD_RT_VG;
		act->lm_type = ls->lm_type;
		act->client_id = ADOPT_CLIENT_ID;
		strncpy(act->vg_name, ls->vg_name, MAX_NAME);
		memcpy(act->vg_uuid, ls->vg_uuid, 64);
		memcpy(act->vg_args, ls->vg_args, MAX_ARGS);
		act->host_id = ls->host_id;

		/* set act->version from lvmetad data? */

		log_debug("adopt add %s vg lockspace %s", lm_str(act->lm_type), act->vg_name);

		rv = add_lockspace_thread(ls->name, act->vg_name, act->vg_uuid,
					  act->lm_type, act->vg_args, act);
		if (rv < 0) {
			log_error("Failed to create lockspace thread for VG %s", ls->vg_name);
			list_del(&ls->list);
			free(ls);
			free_action(act);
			count_start_fail++;
			continue;
		}

		/*
		 * When the lockspace_thread is done with the start act,
		 * it will see the act ADOPT flag and move the act onto
		 * the adopt_results list for us to collect below.
		 */
		count_start++;
	}

	log_debug("adopt starting %d lockspaces", count_start);

	/*
	 * Wait for all start/rejoin actions to complete.  Each start action
	 * queued above will appear on the adopt_results list when finished.
	 */

	while (count_start_done < count_start) {
		sleep(1);
		act = NULL;

		pthread_mutex_lock(&client_mutex);
		if (!list_empty(&adopt_results)) {
			act = list_first_entry(&adopt_results, struct action, list);
			list_del(&act->list);
		}
		pthread_mutex_unlock(&client_mutex);

		if (!act)
			continue;

		if (act->result < 0) {
			log_error("adopt add lockspace failed vg %s %d", act->vg_name, act->result);
			count_start_fail++;
		}

		free_action(act);
		count_start_done++;
	}

	log_debug("adopt started %d lockspaces done %d fail %d",
		  count_start, count_start_done, count_start_fail);

	/*
	 * Create lock-adopt actions for active LVs (ls->resources),
	 * and GL/VG locks (we don't know if these locks were held
	 * and orphaned by the last lvmlockd, so try to adopt them
	 * to see.)
	 *
	 * A proper struct lockspace now exists on the lockspaces list
	 * for each ls in ls_found.  Lock ops for one of those
	 * lockspaces can be done as OP_LOCK actions queued using
	 * add_lock_action();
	 *
	 * Start by attempting to adopt the lock in the most likely
	 * mode it was left in (ex for lvs, sh for vg/gl).  If
	 * the mode is wrong, the lm will return an error and we
	 * try again with the other mode.
	 */

	list_for_each_entry(ls, &ls_found, list) {

		/*
		 * Adopt orphan LV locks.
		 */

		list_for_each_entry(r, &ls->resources, list) {
			if (!(act = alloc_action()))
				goto fail;
			act->op = LD_OP_LOCK;
			act->rt = LD_RT_LV;
			act->mode = LD_LK_EX;
			act->flags = (LD_AF_ADOPT | LD_AF_PERSISTENT);
			act->client_id = ADOPT_CLIENT_ID;
			act->lm_type = ls->lm_type;
			strncpy(act->vg_name, ls->vg_name, MAX_NAME);
			strncpy(act->lv_uuid, r->name, MAX_NAME);
			strncpy(act->lv_args, r->lv_args, MAX_ARGS);

			log_debug("adopt lock for lv %s %s", act->vg_name, act->lv_uuid);

			rv = add_lock_action(act);
			if (rv < 0) {
				log_error("adopt add_lock_action lv %s %s error %d", act->vg_name, act->lv_uuid, rv);
				count_adopt_fail++;
				free_action(act);
			} else {
				count_adopt++;
			}
		}

		/*
		 * Adopt orphan VG lock.
		 */

		if (!(act = alloc_action()))
			goto fail;
		act->op = LD_OP_LOCK;
		act->rt = LD_RT_VG;
		act->mode = LD_LK_SH;
		act->flags = LD_AF_ADOPT;
		act->client_id = ADOPT_CLIENT_ID;
		act->lm_type = ls->lm_type;
		strncpy(act->vg_name, ls->vg_name, MAX_NAME);

		log_debug("adopt lock for vg %s", act->vg_name);

		rv = add_lock_action(act);
		if (rv < 0) {
			log_error("adopt add_lock_action vg %s error %d", act->vg_name, rv);
			count_adopt_fail++;
			free_action(act);
		} else {
			count_adopt++;
		}
	}

	/*
	 * Adopt orphan GL lock.
	 */

	if (!(act = alloc_action()))
		goto fail;
	act->op = LD_OP_LOCK;
	act->rt = LD_RT_GL;
	act->mode = LD_LK_SH;
	act->flags = LD_AF_ADOPT;
	act->client_id = ADOPT_CLIENT_ID;
	act->lm_type = (gl_use_sanlock ? LD_LM_SANLOCK : LD_LM_DLM);

	log_debug("adopt lock for gl");

	rv = add_lock_action(act);
	if (rv < 0) {
		log_error("adopt add_lock_action gl %s error %d", act->vg_name, rv);
		count_adopt_fail++;
		free_action(act);
	} else {
		count_adopt++;
	}

	/*
	 * Wait for lock-adopt actions to complete.  The completed
	 * actions are passed back here via the adopt_results list.
	 */

	while (count_adopt_done < count_adopt) {
		sleep(1);
		act = NULL;

		pthread_mutex_lock(&client_mutex);
		if (!list_empty(&adopt_results)) {
			act = list_first_entry(&adopt_results, struct action, list);
			list_del(&act->list);
		}
		pthread_mutex_unlock(&client_mutex);

		if (!act)
			continue;

		/*
		 * lock adopt results
		 */

		if (act->result == -EUCLEAN) {
			/*
			 * Adopt failed because the orphan has a different mode
			 * than initially requested.  Repeat the lock-adopt operation
			 * with the other mode.  N.B. this logic depends on first
			 * trying sh then ex for GL/VG locks, and ex then sh for
			 * LV locks.
			 */

			if ((act->rt != LD_RT_LV) && (act->mode == LD_LK_SH)) {
				/* GL/VG locks: attempt to adopt ex after sh failed. */
				act->mode = LD_LK_EX;
				rv = add_lock_action(act);

			} else if ((act->rt == LD_RT_LV) && (act->mode == LD_LK_EX)) {
				/* LV locks: attempt to adopt sh after ex failed. */
				act->mode = LD_LK_SH;
				rv = add_lock_action(act);

			} else {
				/* both modes already tried; give up on this lock */
				log_error("Failed to adopt %s lock in vg %s error %d",
					  rt_str(act->rt), act->vg_name, act->result);
				count_adopt_fail++;
				count_adopt_done++;
				free_action(act);
				rv = 0;
			}

			if (rv < 0) {
				log_error("adopt add_lock_action again %s", act->vg_name);
				count_adopt_fail++;
				count_adopt_done++;
				free_action(act);
			}

		} else if (act->result == -ENOENT) {
			/*
			 * No orphan lock exists.  This is common for GL/VG locks
			 * because they may not have been held when lvmlockd exited.
			 * It's also expected for LV types that do not use a lock.
			 */

			if (act->rt == LD_RT_LV) {
				/* Unexpected, we should have found an orphan. */
				log_error("Failed to adopt LV lock for %s %s error %d",
					  act->vg_name, act->lv_uuid, act->result);
				count_adopt_fail++;
			} else {
				/* Normal, no GL/VG lock was orphaned. */
				log_debug("Did not adopt %s lock in vg %s error %d",
					  rt_str(act->rt), act->vg_name, act->result);
			}

			count_adopt_done++;
			free_action(act);

		} else if (act->result < 0) {
			/*
			 * Some unexpected error.
			 */

			log_error("adopt lock rt %s vg %s lv %s error %d",
				  rt_str(act->rt), act->vg_name, act->lv_uuid, act->result);
			count_adopt_fail++;
			count_adopt_done++;
			free_action(act);

		} else {
			/*
			 * Adopt success.  GL/VG lock actions are kept on
			 * to_unlock and reused below to release the locks.
			 */

			if (act->rt == LD_RT_LV) {
				log_debug("adopt success lv %s %s %s", act->vg_name, act->lv_uuid, mode_str(act->mode));
				free_action(act);
			} else if (act->rt == LD_RT_VG) {
				log_debug("adopt success vg %s %s", act->vg_name, mode_str(act->mode));
				list_add_tail(&act->list, &to_unlock);
			} else if (act->rt == LD_RT_GL) {
				log_debug("adopt success gl %s %s", act->vg_name, mode_str(act->mode));
				list_add_tail(&act->list, &to_unlock);
			}
			count_adopt_done++;
		}
	}

	/*
	 * Release adopted GL/VG locks.
	 * The to_unlock actions were the ones used to lock-adopt the GL/VG locks;
	 * now use them to do the unlocks.  These actions will again be placed
	 * on adopt_results for us to collect because they have the ADOPT flag set.
	 */

	count_adopt = 0;
	count_adopt_done = 0;

	list_for_each_entry_safe(act, asafe, &to_unlock, list) {
		list_del(&act->list);

		if (act->mode == LD_LK_EX) {
			/*
			 * FIXME: we probably want to check somehow that
			 * there's no lvm command still running that's
			 * using this ex lock and changing things.
			 */
			log_warn("adopt releasing ex %s lock %s",
				 rt_str(act->rt), act->vg_name);
		}

		act->mode = LD_LK_UN;

		log_debug("adopt unlock for %s %s", rt_str(act->rt), act->vg_name);

		rv = add_lock_action(act);
		if (rv < 0) {
			log_error("adopt unlock add_lock_action error %d", rv);
			free_action(act);
		} else {
			count_adopt++;
		}
	}

	/* Wait for the unlocks to complete. */

	while (count_adopt_done < count_adopt) {
		sleep(1);
		act = NULL;

		pthread_mutex_lock(&client_mutex);
		if (!list_empty(&adopt_results)) {
			act = list_first_entry(&adopt_results, struct action, list);
			list_del(&act->list);
		}
		pthread_mutex_unlock(&client_mutex);

		if (!act)
			continue;

		if (act->result < 0)
			log_error("adopt unlock error %d", act->result);

		count_adopt_done++;
		free_action(act);
	}


	/* FIXME: purge any remaining orphan locks in each rejoined ls? */

	if (count_start_fail || count_adopt_fail)
		goto fail;

	log_debug("adopt_locks done");
	return;

fail:
	log_error("adopt_locks failed, reset host");
}
*/ + + while (count_adopt_done < count_adopt) { + sleep(1); + act = NULL; + + pthread_mutex_lock(&client_mutex); + if (!list_empty(&adopt_results)) { + act = list_first_entry(&adopt_results, struct action, list); + list_del(&act->list); + } + pthread_mutex_unlock(&client_mutex); + + if (!act) + continue; + + if (act->result < 0) + log_error("adopt unlock error %d", act->result); + + count_adopt_done++; + free_action(act); + } + + + /* FIXME: purge any remaining orphan locks in each rejoined ls? */ + + if (count_start_fail || count_adopt_fail) + goto fail; + + log_debug("adopt_locks done"); + return; + +fail: + log_error("adopt_locks failed, reset host"); +} + +static int get_peer_pid(int fd) +{ + struct ucred cred; + unsigned int len = sizeof(cred); + + if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cred, &len) != 0) + return -1; + + return cred.pid; +} + +static void process_listener(int poll_fd) +{ + struct client *cl; + int fd, pi; + + /* assert poll_fd == listen_fd */ + + fd = accept(listen_fd, NULL, NULL); + if (fd < 0) + return; + + if (!(cl = alloc_client())) + return; + + pi = add_pollfd(fd); + if (pi < 0) { + log_error("process_listener add_pollfd error %d", pi); + free_client(cl); + return; + } + + cl->pi = pi; + cl->fd = fd; + cl->pid = get_peer_pid(fd); + + pthread_mutex_init(&cl->mutex, NULL); + + pthread_mutex_lock(&client_mutex); + client_ids++; + + if (client_ids == ADOPT_CLIENT_ID) + client_ids++; + if (!client_ids) + client_ids++; + + cl->id = client_ids; + list_add_tail(&cl->list, &client_list); + pthread_mutex_unlock(&client_mutex); + + log_debug("client add id %d pi %d fd %d", cl->id, cl->pi, cl->fd); +} + +/* + * main loop polls on pipe[0] so that a thread can + * restart the poll by writing to pipe[1]. 
 */
static int setup_restart(void)
{
	if (pipe(restart_fds)) {
		log_error("setup_restart pipe error %d", errno);
		return -1;
	}

	/* poll() watches the read end; threads write to restart_fds[1]
	   to wake the main loop after modifying pollfd. */
	restart_pi = add_pollfd(restart_fds[0]);
	if (restart_pi < 0)
		return restart_pi;

	return 0;
}

/*
 * thread wrote 'w' to restart_fds[1] to restart poll()
 * after adding an fd back into pollfd.
 */
static void process_restart(int fd)
{
	char wake[1];
	int rv;

	/* assert fd == restart_fds[0] */

	/* Drain one wakeup byte; its value is meaningless. */
	rv = read(restart_fds[0], wake, 1);
	if (!rv || rv < 0)
		log_debug("process_restart error %d", errno);
}

/* SIGTERM handler: request an orderly shutdown; main_loop acts on
   the flag when poll() is interrupted. */
static void sigterm_handler(int sig __attribute__((unused)))
{
	daemon_quit = 1;
}

/*
 * Daemon main loop.  Initializes global state, starts the client and
 * worker threads, optionally adopts locks left by a previous lvmlockd
 * instance, then polls the listening socket, the restart pipe, and
 * all connected client fds, handing readable/dead clients off to
 * client_thread.  Returns 0 after an orderly shutdown, or negative
 * if initial allocation fails.
 */
static int main_loop(daemon_state *ds_arg)
{
	struct client *cl;
	int i, rv, is_recv, is_dead;

	signal(SIGTERM, &sigterm_handler);

	rv = setup_structs();
	if (rv < 0) {
		log_error("Can't allocate memory");
		return rv;
	}

	/* The dlm global lockspace name is a fixed constant. */
	strcpy(gl_lsname_dlm, S_NAME_GL_DLM);

	INIT_LIST_HEAD(&lockspaces);
	INIT_LIST_HEAD(&lockspaces_inactive);
	pthread_mutex_init(&lockspaces_mutex, NULL);
	pthread_mutex_init(&pollfd_mutex, NULL);
	pthread_mutex_init(&log_mutex, NULL);

	openlog("lvmlockd", LOG_CONS | LOG_PID, LOG_DAEMON);
	log_warn("lvmlockd started");

	/* The listening socket was created by the daemon framework. */
	listen_fd = ds_arg->socket_fd;
	listen_pi = add_pollfd(listen_fd);

	setup_client_thread();
	setup_worker_thread();
	setup_restart();

	pthread_mutex_init(&lvmetad_mutex, NULL);
	lvmetad_handle = lvmetad_open(NULL);
	if (lvmetad_handle.error || lvmetad_handle.socket_fd < 0)
		log_error("lvmetad_open error %d", lvmetad_handle.error);
	else
		lvmetad_connected = 1;

	/*
	 * Attempt to rejoin lockspaces and adopt locks from a previous
	 * instance of lvmlockd that left behind lockspaces/locks.
	 */
	if (adopt_opt)
		adopt_locks();

	while (1) {
		/* Block until an fd is ready or a signal interrupts. */
		rv = poll(pollfd, pollfd_maxi + 1, -1);
		if (rv == -1 && errno == EINTR) {
			if (daemon_quit) {
				int count;
				/* first sigterm would trigger stops, and
				   second sigterm may finish the joins. */
				count = for_each_lockspace(DO_STOP, DO_FREE, NO_FORCE);
				if (!count)
					break;
				log_debug("ignore shutdown for %d lockspaces", count);
				daemon_quit = 0;
			}
			continue;
		}
		if (rv < 0) {
			log_error("poll errno %d", errno);
			break;
		}

		for (i = 0; i <= pollfd_maxi; i++) {
			if (pollfd[i].fd < 0)
				continue;

			is_recv = 0;
			is_dead = 0;

			if (pollfd[i].revents & POLLIN)
				is_recv = 1;
			if (pollfd[i].revents & (POLLERR | POLLHUP | POLLNVAL))
				is_dead = 1;

			if (!is_recv && !is_dead)
				continue;

			if (i == listen_pi) {
				/* New connection from an lvm command. */
				process_listener(pollfd[i].fd);
				continue;
			}

			if (i == restart_pi) {
				/* A thread asked us to re-read pollfd. */
				process_restart(pollfd[i].fd);
				continue;
			}

			/*
			log_debug("poll pi %d fd %d revents %x",
				  i, pollfd[i].fd, pollfd[i].revents);
			*/

			pthread_mutex_lock(&client_mutex);
			cl = find_client_pi(i);
			if (cl) {
				pthread_mutex_lock(&cl->mutex);

				if (cl->recv) {
					/* should not happen */
					log_error("main client %d already recv", cl->id);

				} else if (cl->dead) {
					/* should not happen */
					log_error("main client %d already dead", cl->id);

				} else if (is_dead) {
					/* Connection closed/errored: mark the
					   client dead and release its poll slot;
					   client_thread does the final cleanup. */
					log_debug("close %s[%d.%u] fd %d",
						  cl->name[0] ? cl->name : "client",
						  cl->pid, cl->id, cl->fd);
					cl->dead = 1;
					cl->pi = -1;
					cl->fd = -1;
					cl->poll_ignore = 0;
					close(pollfd[i].fd);
					pollfd[i].fd = POLL_FD_UNUSED;
					pollfd[i].events = 0;
					pollfd[i].revents = 0;

				} else if (is_recv) {
					/* Hand the fd to client_thread and stop
					   polling it until the request is done
					   (client_thread restores it via the
					   restart pipe). */
					cl->recv = 1;
					cl->poll_ignore = 1;
					pollfd[i].fd = POLL_FD_IGNORE;
					pollfd[i].events = 0;
					pollfd[i].revents = 0;
				}

				pthread_mutex_unlock(&cl->mutex);

				client_work = 1;
				pthread_cond_signal(&client_cond);

				/* client_thread will pick up and work on any
				   client with cl->recv or cl->dead set */

			} else {
				/* don't think this can happen */
				log_error("no client for index %d fd %d",
					  i, pollfd[i].fd);
				close(pollfd[i].fd);
				pollfd[i].fd = POLL_FD_UNUSED;
				pollfd[i].events = 0;
				pollfd[i].revents = 0;
			}
			pthread_mutex_unlock(&client_mutex);

			/* After set_dead, should we scan pollfd for
			   last unused slot and reduce pollfd_maxi? */
		}
	}

	/* Shutdown: force-stop any remaining lockspaces, then tear down
	   the helper threads and external connections. */
	for_each_lockspace_retry(DO_STOP, DO_FREE, DO_FORCE);
	free_lockspaces_inactive();
	close_worker_thread();
	close_client_thread();
	closelog();
	daemon_close(lvmetad_handle);
	return 0;
}

/* Print command line help for lvmlockd to the given stream. */
static void usage(char *prog, FILE *file)
{
	fprintf(file, "Usage:\n");
	fprintf(file, "%s [options]\n\n", prog);
	fprintf(file, " --help | -h\n");
	fprintf(file, " Show this help information.\n");
	fprintf(file, " --version | -V\n");
	fprintf(file, " Show version of lvmlockd.\n");
	fprintf(file, " --test | -T\n");
	fprintf(file, " Test mode, do not call lock manager.\n");
	fprintf(file, " --foreground | -f\n");
	fprintf(file, " Don't fork.\n");
	fprintf(file, " --daemon-debug | -D\n");
	fprintf(file, " Don't fork and print debugging to stdout.\n");
	fprintf(file, " --pid-file | -p <path>\n");
	fprintf(file, " Set path to the pid file. [%s]\n", LVMLOCKD_PIDFILE);
	fprintf(file, " --socket-path | -s <path>\n");
	fprintf(file, " Set path to the socket to listen on. 
[%s]\n", LVMLOCKD_SOCKET); + fprintf(file, " --syslog-priority | -S err|warning|debug\n"); + fprintf(file, " Write log messages from this level up to syslog. [%s]\n", _syslog_num_to_name(LOG_SYSLOG_PRIO)); + fprintf(file, " --gl-type | -g <str>\n"); + fprintf(file, " Set global lock type to be dlm|sanlock.\n"); + fprintf(file, " --host-id | -i <num>\n"); + fprintf(file, " Set the local sanlock host id.\n"); + fprintf(file, " --host-id-file | -F <path>\n"); + fprintf(file, " A file containing the local sanlock host_id.\n"); + fprintf(file, " --sanlock-timeout | -o <seconds>\n"); + fprintf(file, " Set the sanlock lockspace I/O timeout.\n"); + fprintf(file, " --adopt | -A 0|1\n"); + fprintf(file, " Adopt locks from a previous instance of lvmlockd.\n"); +} + +int main(int argc, char *argv[]) +{ + daemon_state ds; + + ds.daemon_main = main_loop; + ds.daemon_init = NULL; + ds.daemon_fini = NULL; + ds.pidfile = getenv("LVM_LVMLOCKD_PIDFILE"); + ds.socket_path = getenv("LVM_LVMLOCKD_SOCKET"); + ds.protocol = lvmlockd_protocol; + ds.protocol_version = lvmlockd_protocol_version; + ds.name = "lvmlockd"; + + static struct option long_options[] = { + {"help", no_argument, 0, 'h' }, + {"version", no_argument, 0, 'V' }, + {"test", no_argument, 0, 'T' }, + {"foreground", no_argument, 0, 'f' }, + {"daemon-debug", no_argument, 0, 'D' }, + {"pid-file", required_argument, 0, 'p' }, + {"socket-path", required_argument, 0, 's' }, + {"gl-type", required_argument, 0, 'g' }, + {"host-id", required_argument, 0, 'i' }, + {"host-id-file", required_argument, 0, 'F' }, + {"adopt", required_argument, 0, 'A' }, + {"syslog-priority", required_argument, 0, 'S' }, + {"sanlock-timeout", required_argument, 0, 'o' }, + {0, 0, 0, 0 } + }; + + while (1) { + int c; + int lm; + int option_index = 0; + + c = getopt_long(argc, argv, "hVTfDp:s:l:g:S:I:A:o:", + long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case '0': + break; + case 'h': + usage(argv[0], stdout); + exit(EXIT_SUCCESS); 
+ case 'V': + printf("lvmlockd version: " LVM_VERSION "\n"); + exit(EXIT_SUCCESS); + case 'T': + daemon_test = 1; + break; + case 'f': + ds.foreground = 1; + break; + case 'D': + ds.foreground = 1; + daemon_debug = 1; + break; + case 'p': + ds.pidfile = strdup(optarg); + break; + case 's': + ds.socket_path = strdup(optarg); + break; + case 'g': + lm = str_to_lm(optarg); + if (lm == LD_LM_DLM) + gl_use_dlm = 1; + else if (lm == LD_LM_SANLOCK) + gl_use_sanlock = 1; + else { + fprintf(stderr, "invalid gl-type option"); + exit(EXIT_FAILURE); + } + break; + case 'i': + daemon_host_id = atoi(optarg); + break; + case 'F': + daemon_host_id_file = strdup(optarg); + break; + case 'o': + sanlock_io_timeout = atoi(optarg); + break; + case 'A': + adopt_opt = atoi(optarg); + break; + case 'S': + syslog_priority = _syslog_name_to_num(optarg); + break; + case '?': + default: + usage(argv[0], stdout); + exit(EXIT_FAILURE); + } + } + + if (!ds.pidfile) + ds.pidfile = LVMLOCKD_PIDFILE; + + if (!ds.socket_path) + ds.socket_path = LVMLOCKD_SOCKET; + + /* runs daemon_main/main_loop */ + daemon_start(ds); + + return 0; +} |