diff options
author | David Teigland <teigland@redhat.com> | 2014-12-02 14:05:49 -0600 |
---|---|---|
committer | David Teigland <teigland@redhat.com> | 2015-03-11 13:31:15 -0500 |
commit | 9e2ace1b541c05d82f1a99201991bfebdbbea82b (patch) | |
tree | f97ab71896115d4bbec1f7c0529b5a48795fbbbe | |
parent | 619ca175ba4b89de909e5db30f33ca3ae4fa2e22 (diff) | |
download | lvm2-9e2ace1b541c05d82f1a99201991bfebdbbea82b.tar.gz |
lvmlockd: add daemon
-rw-r--r-- | configure.in | 1 | ||||
-rw-r--r-- | daemons/Makefile.in | 8 | ||||
-rw-r--r-- | daemons/lvmlockd/Makefile.in | 52 | ||||
-rw-r--r-- | daemons/lvmlockd/lvmlock.c | 742 | ||||
-rw-r--r-- | daemons/lvmlockd/lvmlockd-client.h | 4 | ||||
-rw-r--r-- | daemons/lvmlockd/lvmlockd-core.c | 6184 | ||||
-rw-r--r-- | daemons/lvmlockd/lvmlockd-dlm.c | 641 | ||||
-rw-r--r-- | daemons/lvmlockd/lvmlockd-internal.h | 387 | ||||
-rw-r--r-- | daemons/lvmlockd/lvmlockd-sanlock.c | 1475 | ||||
-rw-r--r-- | man/lvmlockd.8.in | 801 |
10 files changed, 10292 insertions, 3 deletions
diff --git a/configure.in b/configure.in index ea0b1a735..a6da24652 100644 --- a/configure.in +++ b/configure.in @@ -1848,6 +1848,7 @@ daemons/dmeventd/plugins/mirror/Makefile daemons/dmeventd/plugins/snapshot/Makefile daemons/dmeventd/plugins/thin/Makefile daemons/lvmetad/Makefile +daemons/lvmlockd/Makefile conf/Makefile conf/example.conf conf/lvmlocal.conf diff --git a/daemons/Makefile.in b/daemons/Makefile.in index 9a7351681..ba9c489a2 100644 --- a/daemons/Makefile.in +++ b/daemons/Makefile.in @@ -15,7 +15,7 @@ srcdir = @srcdir@ top_srcdir = @top_srcdir@ top_builddir = @top_builddir@ -.PHONY: dmeventd clvmd cmirrord lvmetad +.PHONY: dmeventd clvmd cmirrord lvmetad lvmlockd ifneq ("@CLVMD@", "none") SUBDIRS += clvmd @@ -36,8 +36,12 @@ ifeq ("@BUILD_LVMETAD@", "yes") SUBDIRS += lvmetad endif +ifeq ("@BUILD_LVMLOCKD@", "yes") + SUBDIRS += lvmlockd +endif + ifeq ($(MAKECMDGOALS),distclean) - SUBDIRS = clvmd cmirrord dmeventd lvmetad + SUBDIRS = clvmd cmirrord dmeventd lvmetad lvmlockd endif include $(top_builddir)/make.tmpl diff --git a/daemons/lvmlockd/Makefile.in b/daemons/lvmlockd/Makefile.in new file mode 100644 index 000000000..12a85db3a --- /dev/null +++ b/daemons/lvmlockd/Makefile.in @@ -0,0 +1,52 @@ +# +# Copyright (C) 2011-2012 Red Hat, Inc. +# +# This file is part of LVM2. +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions +# of the GNU Lesser General Public License v.2.1. 
+# +# You should have received a copy of the GNU Lesser General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +top_builddir = @top_builddir@ + +SOURCES = \ + lvmlockd-core.c \ + lvmlockd-sanlock.c \ + lvmlockd-dlm.c + +TARGETS = lvmlockd lvmlock + +.PHONY: install_lvmlockd + +include $(top_builddir)/make.tmpl + +INCLUDES += -I$(top_srcdir)/libdaemon/server +LVMLIBS = -ldaemonserver $(LVMINTERNAL_LIBS) -ldevmapper + +LIBS += $(PTHREAD_LIBS) -ldlm_lt -lsanlock + +LDFLAGS += -L$(top_builddir)/libdaemon/server +CLDFLAGS += -L$(top_builddir)/libdaemon/server -D_GNU_SOURCE + +lvmlockd: $(OBJECTS) $(top_builddir)/libdaemon/client/libdaemonclient.a \ + $(top_builddir)/libdaemon/server/libdaemonserver.a + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJECTS) \ + $(DL_LIBS) $(LVMLIBS) $(LIBS) -rdynamic + +lvmlock: lvmlock.o $(top_builddir)/libdaemon/client/libdaemonclient.a \ + $(top_builddir)/libdaemon/server/libdaemonserver.a + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ lvmlock.o \ + $(DL_LIBS) $(LVMLIBS) -rdynamic + +install_lvmlockd: lvmlockd + $(INSTALL_PROGRAM) -D $< $(sbindir)/$(<F) + +install_lvm2: install_lvmlockd + +install: install_lvm2 diff --git a/daemons/lvmlockd/lvmlock.c b/daemons/lvmlockd/lvmlock.c new file mode 100644 index 000000000..d9472b685 --- /dev/null +++ b/daemons/lvmlockd/lvmlock.c @@ -0,0 +1,742 @@ +#define _GNU_SOURCE +#include "configure.h" +#include "lvmlockd-client.h" + +#include <stdio.h> +#include <stdint.h> +#include <stddef.h> +#include <stdlib.h> +#include <unistd.h> +#include <getopt.h> +#include <string.h> +#include <signal.h> +#include <errno.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/socket.h> +#include <sys/un.h> + +static int quit; +static int info; +static int dump; +static int wait_opt; +static int force_opt; +static int gl_enable; +static int gl_disable; 
+static int stop_lockspaces; +static char *able_vg_name; + +static int cmd_pipe[2]; +static int cmd_argc; +static char *cmd_name; +static char **cmd_argv; + +#define DUMP_SOCKET_NAME "lvmlockd-dump.sock" +#define DUMP_BUF_SIZE (1024 * 1024) +static char dump_buf[DUMP_BUF_SIZE]; +static int dump_len; +static struct sockaddr_un dump_addr; +static socklen_t dump_addrlen; + +daemon_handle _lvmlockd; + +#define log_debug(fmt, args...) \ +do { \ + printf(fmt "\n", ##args); \ +} while (0) + +#define log_error(fmt, args...) \ +do { \ + printf(fmt "\n", ##args); \ +} while (0) + +#define MAX_LINE 512 + +/* copied from lvmlockd-internal.h */ +#define MAX_NAME 64 +#define MAX_ARGS 64 + +/* + * lvmlockd dumps the client info before the lockspaces, + * so we can look up client info when printing lockspace info. + */ + +#define MAX_CLIENTS 100 + +struct client_info { + uint32_t client_id; + int pid; + char name[MAX_NAME+1]; +}; + +static struct client_info clients[MAX_CLIENTS]; +static int num_clients; + +static void save_client_info(char *line) +{ + uint32_t pid = 0; + int fd = 0; + int pi = 0; + uint32_t client_id = 0; + char name[MAX_NAME+1] = { 0 }; + + sscanf(line, "info=client pid=%u fd=%d pi=%d id=%u name=%s", + &pid, &fd, &pi, &client_id, name); + + clients[num_clients].client_id = client_id; + clients[num_clients].pid = pid; + strcpy(clients[num_clients].name, name); + num_clients++; +} + +static void find_client_info(uint32_t client_id, uint32_t *pid, char *cl_name) +{ + int i; + + for (i = 0; i < num_clients; i++) { + if (clients[i].client_id == client_id) { + *pid = clients[i].pid; + strcpy(cl_name, clients[i].name); + return; + } + } +} + +static void format_info_local_vg(char *line) +{ + char vg_name[MAX_NAME+1] = { 0 }; + char vg_uuid[MAX_NAME+1] = { 0 }; + char vg_sysid[MAX_NAME+1] = { 0 }; + + sscanf(line, "info=local_vg vg_name=%s vg_uuid=%s vg_sysid=%s", + vg_name, vg_uuid, vg_sysid); + + if (strlen(vg_sysid) == 1 && vg_sysid[0] == '.') + strcpy(vg_sysid, 
"none"); + + printf("VG %s system_id=%s %s\n", vg_name, vg_sysid, vg_uuid); +} + +static void format_info_ls(char *line) +{ + char ls_name[MAX_NAME+1] = { 0 }; + char vg_name[MAX_NAME+1] = { 0 }; + char vg_uuid[MAX_NAME+1] = { 0 }; + char vg_sysid[MAX_NAME+1] = { 0 }; + char lock_args[MAX_ARGS+1] = { 0 }; + char lock_type[MAX_NAME+1] = { 0 }; + + sscanf(line, "info=ls ls_name=%s vg_name=%s vg_uuid=%s vg_sysid=%s vg_args=%s lm_type=%s", + ls_name, vg_name, vg_uuid, vg_sysid, lock_args, lock_type); + + printf("\n"); + + printf("VG %s lock_type=%s %s\n", vg_name, lock_type, vg_uuid); + + printf("LS %s %s\n", lock_type, ls_name); +} + +static void format_info_ls_action(char *line) +{ + uint32_t client_id = 0; + char flags[MAX_NAME+1] = { 0 }; + char version[MAX_NAME+1] = { 0 }; + char op[MAX_NAME+1] = { 0 }; + uint32_t pid = 0; + char cl_name[MAX_NAME+1] = { 0 }; + + sscanf(line, "info=ls_action client_id=%u %s %s op=%s", + &client_id, flags, version, op); + + find_client_info(client_id, &pid, cl_name); + + printf("OP %s pid %u (%s)", op, pid, cl_name); +} + +static void format_info_r(char *line, char *r_name_out, char *r_type_out) +{ + char r_name[MAX_NAME+1] = { 0 }; + char r_type[4] = { 0 }; + char mode[4] = { 0 }; + char sh_count[MAX_NAME+1] = { 0 }; + uint32_t ver = 0; + + sscanf(line, "info=r name=%s type=%s mode=%s %s version=%u", + r_name, r_type, mode, sh_count, &ver); + + /* when mode is not un, wait and print each lk line */ + + if (strcmp(mode, "un")) { + strcpy(r_name_out, r_name); + strcpy(r_type_out, r_type); + return; + } + + /* when mode is un, there will be no lk lines, so print now */ + + if (!strcmp(r_type, "gl")) { + printf("LK GL un ver %4u\n", ver); + + } else if (!strcmp(r_type, "vg")) { + printf("LK VG un ver %4u\n", ver); + + } else if (!strcmp(r_type, "lv")) { + printf("LK LV un %s\n", r_name); + } +} + +static void format_info_lk(char *line, char *r_name, char *r_type) +{ + char mode[4] = { 0 }; + uint32_t ver = 0; + char flags[MAX_NAME+1] = 
{ 0 }; + uint32_t client_id = 0; + uint32_t pid = 0; + char cl_name[MAX_NAME+1] = { 0 }; + + if (!r_name[0] || !r_type[0]) { + printf("format_info_lk error r_name %s r_type %s\n", r_name, r_type); + printf("%s\n", line); + return; + } + + sscanf(line, "info=lk mode=%s version=%u %s client_id=%u", + mode, &ver, flags, &client_id); + + find_client_info(client_id, &pid, cl_name); + + if (!strcmp(r_type, "gl")) { + printf("LK GL %s ver %4u pid %u (%s)\n", mode, ver, pid, cl_name); + + } else if (!strcmp(r_type, "vg")) { + printf("LK VG %s ver %4u pid %u (%s)\n", mode, ver, pid, cl_name); + + } else if (!strcmp(r_type, "lv")) { + printf("LK LV %s %s\n", mode, r_name); + } +} + +static void format_info_r_action(char *line, char *r_name, char *r_type) +{ + uint32_t client_id = 0; + char flags[MAX_NAME+1] = { 0 }; + char version[MAX_NAME+1] = { 0 }; + char op[MAX_NAME+1] = { 0 }; + char rt[4] = { 0 }; + char mode[4] = { 0 }; + char lm[MAX_NAME+1] = { 0 }; + char result[MAX_NAME+1] = { 0 }; + char lm_rv[MAX_NAME+1] = { 0 }; + uint32_t pid = 0; + char cl_name[MAX_NAME+1] = { 0 }; + + if (!r_name[0] || !r_type[0]) { + printf("format_info_r_action error r_name %s r_type %s\n", r_name, r_type); + printf("%s\n", line); + return; + } + + sscanf(line, "info=r_action client_id=%u %s %s op=%s rt=%s mode=%s %s %s %s", + &client_id, flags, version, op, rt, mode, lm, result, lm_rv); + + find_client_info(client_id, &pid, cl_name); + + if (strcmp(op, "lock")) { + printf("OP %s pid %u (%s)", op, pid, cl_name); + return; + } + + if (!strcmp(r_type, "gl")) { + printf("LW GL %s ver %4u pid %u (%s)\n", mode, 0, pid, cl_name); + + } else if (!strcmp(r_type, "vg")) { + printf("LW VG %s ver %4u pid %u (%s)\n", mode, 0, pid, cl_name); + + } else if (!strcmp(r_type, "lv")) { + printf("LW LV %s %s\n", mode, r_name); + } +} + +static void format_info_line(char *line) +{ + char r_name[MAX_NAME+1]; + char r_type[MAX_NAME+1]; + + if (!strncmp(line, "info=client ", strlen("info=client "))) { + 
save_client_info(line); + + } else if (!strncmp(line, "info=local_vg ", strlen("info=local_vg "))) { + format_info_local_vg(line); + + } else if (!strncmp(line, "info=ls ", strlen("info=ls "))) { + format_info_ls(line); + + } else if (!strncmp(line, "info=ls_action ", strlen("info=ls_action "))) { + format_info_ls_action(line); + + } else if (!strncmp(line, "info=r ", strlen("info=r "))) { + memset(r_name, 0, sizeof(r_name)); + memset(r_type, 0, sizeof(r_type)); + format_info_r(line, r_name, r_type); + + } else if (!strncmp(line, "info=lk ", strlen("info=lk "))) { + /* will use info from previous r */ + format_info_lk(line, r_name, r_type); + + } else if (!strncmp(line, "info=r_action ", strlen("info=r_action "))) { + /* will use info from previous r */ + format_info_r_action(line, r_name, r_type); + } else { + printf("UN %s\n", line); + } +} + +static void format_info(void) +{ + char line[MAX_LINE]; + int i, j; + + j = 0; + memset(line, 0, sizeof(line)); + + for (i = 0; i < dump_len; i++) { + line[j++] = dump_buf[i]; + + if ((line[j-1] == '\n') || (line[j-1] == '\0')) { + format_info_line(line); + j = 0; + memset(line, 0, sizeof(line)); + } + } +} + + +static daemon_reply _lvmlockd_send(const char *req_name, ...) 
+{ + va_list ap; + daemon_reply repl; + daemon_request req; + + req = daemon_request_make(req_name); + + va_start(ap, req_name); + daemon_request_extend_v(req, ap); + va_end(ap); + + repl = daemon_send(_lvmlockd, req); + + daemon_request_destroy(req); + + return repl; +} + +static int _lvmlockd_result(daemon_reply reply, int *result) +{ + int reply_result; + const char *reply_flags; + const char *lock_type; + + if (reply.error) { + log_error("lvmlockd_result reply error %d", reply.error); + return 0; + } + + if (strcmp(daemon_reply_str(reply, "response", ""), "OK")) { + log_error("lvmlockd_result bad response"); + return 0; + } + + /* FIXME: using -1000 is dumb */ + + reply_result = daemon_reply_int(reply, "op_result", -1000); + if (reply_result == -1000) { + log_error("lvmlockd_result no op_result"); + return 0; + } + + /* The lock_type that lvmlockd used for locking. */ + lock_type = daemon_reply_str(reply, "lock_type", "none"); + + *result = reply_result; + + reply_flags = daemon_reply_str(reply, "result_flags", NULL); + + log_debug("lvmlockd_result %d %s lm %s", reply_result, reply_flags, lock_type); + return 1; +} + +static int do_quit(void) +{ + daemon_reply reply; + int rv = 0; + + reply = daemon_send_simple(_lvmlockd, "quit", NULL); + + if (reply.error) { + log_error("reply error %d", reply.error); + rv = reply.error; + } + + daemon_reply_destroy(reply); + return rv; +} + +static int setup_dump_socket(void) +{ + int s, rv; + + s = socket(AF_LOCAL, SOCK_DGRAM, 0); + if (s < 0) + return s; + + memset(&dump_addr, 0, sizeof(dump_addr)); + dump_addr.sun_family = AF_LOCAL; + strcpy(&dump_addr.sun_path[1], DUMP_SOCKET_NAME); + dump_addrlen = sizeof(sa_family_t) + strlen(dump_addr.sun_path+1) + 1; + + rv = bind(s, (struct sockaddr *) &dump_addr, dump_addrlen); + if (rv < 0) + return rv; + + return s; +} + +static int do_dump(const char *req_name) +{ + daemon_reply reply; + int result; + int fd, rv; + + fd = setup_dump_socket(); + if (fd < 0) { + log_error("socket 
error %d", fd); + return fd; + } + + reply = daemon_send_simple(_lvmlockd, req_name, NULL); + + if (reply.error) { + log_error("reply error %d", reply.error); + rv = reply.error; + goto out; + } + + result = daemon_reply_int(reply, "result", 0); + dump_len = daemon_reply_int(reply, "dump_len", 0); + + daemon_reply_destroy(reply); + + if (result < 0) + log_error("result %d", result); + + if (!dump_len) + goto out; + + memset(dump_buf, 0, sizeof(dump_buf)); + + rv = recvfrom(fd, dump_buf, dump_len, MSG_WAITALL, + (struct sockaddr *)&dump_addr, &dump_addrlen); + if (rv < 0) { + log_error("recvfrom error %d %d", rv, errno); + rv = -errno; + goto out; + } + + rv = 0; + if ((info && dump) || !strcmp(req_name, "dump")) + printf("%s\n", dump_buf); + else + format_info(); +out: + close(fd); + return rv; +} + +static int do_able(const char *req_name) +{ + daemon_reply reply; + int result; + int rv; + + reply = _lvmlockd_send(req_name, + "cmd = %s", "lvmlock", + "pid = %d", getpid(), + "vg_name = %s", able_vg_name, + NULL); + + if (!_lvmlockd_result(reply, &result)) { + log_error("lvmlockd result %d", result); + rv = result; + } else { + rv = 0; + } + + daemon_reply_destroy(reply); + return rv; +} + +static int do_stop_lockspaces(void) +{ + daemon_reply reply; + char opts[32]; + int result; + int rv; + + memset(opts, 0, sizeof(opts)); + + if (wait_opt) + strcat(opts, "wait "); + if (force_opt) + strcat(opts, "force "); + + reply = _lvmlockd_send("stop_all", + "cmd = %s", "lvmlock", + "pid = %d", getpid(), + "opts = %s", opts[0] ? 
opts : "none", + NULL); + + if (!_lvmlockd_result(reply, &result)) { + log_error("lvmlockd result %d", result); + rv = result; + } else { + rv = 0; + } + + daemon_reply_destroy(reply); + return rv; +} + +static void print_usage(void) +{ + printf("lvmlock options\n"); + printf("Options:\n"); + printf("--help | -h\n"); + printf(" Show this help information.\n"); + printf("--quit | -q\n"); + printf(" Tell lvmlockd to quit.\n"); + printf("--info | -i\n"); + printf(" Print lock state information from lvmlockd.\n"); + printf("--dump | -d\n"); + printf(" Print log buffer from lvmlockd.\n"); + printf("--wait | -w 0|1\n"); + printf(" Wait option for other commands.\n"); + printf("--force | -f 0|1>\n"); + printf(" Force option for other commands.\n"); + printf("--stop-lockspaces | -S\n"); + printf(" Stop all lockspaces.\n"); + printf("--gl-enable <vg_name>\n"); + printf(" Tell lvmlockd to enable the global lock in a sanlock vg.\n"); + printf("--gl-disable <vg_name>\n"); + printf(" Tell lvmlockd to disable the global lock in a sanlock vg.\n"); +} + +static int read_options(int argc, char *argv[]) +{ + int option_index = 0; + int i, j, c, len; + + static struct option long_options[] = { + {"help", no_argument, 0, 'h' }, + {"quit", no_argument, 0, 'q' }, + {"info", no_argument, 0, 'i' }, + {"dump", no_argument, 0, 'd' }, + {"wait", required_argument, 0, 'w' }, + {"force", required_argument, 0, 'f' }, + {"gl-enable", required_argument, 0, 'E' }, + {"gl-disable", required_argument, 0, 'D' }, + {"stop-lockspaces", no_argument, 0, 'S' }, + {"sleep", required_argument, 0, 's' }, + {"command", required_argument, 0, 'c' }, + {0, 0, 0, 0 } + }; + + /* + if (argc == 1) { + print_usage(); + exit(0); + } + */ + + while (1) { + c = getopt_long(argc, argv, "hqidE:D:s:c:w:f:S", long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'h': + /* --help */ + print_usage(); + exit(0); + case 'q': + /* --quit */ + quit = 1; + break; + case 'i': + /* --info */ + info = 1; + 
break; + case 'd': + /* --dump */ + dump = 1; + break; + case 'w': + wait_opt = atoi(optarg); + break; + case 'E': + gl_enable = 1; + able_vg_name = strdup(optarg); + break; + case 'D': + gl_disable = 1; + able_vg_name = strdup(optarg); + break; + case 'S': + stop_lockspaces = 1; + break; + case 'c': + /* --command path args */ + cmd_name = strdup(optarg); + break; + default: + print_usage(); + exit(1); + } + + if (cmd_name) + break; + } + + if (cmd_name) { + /* + * optind is the index in argv of the first argv element that + * is not an option. + */ + + cmd_argc = argc - optind + 1; /* +1 for cmd_name */ + + len = (cmd_argc + 1) * sizeof(char *); /* +1 for final NULL */ + cmd_argv = malloc(len); + if (!cmd_argv) + return -ENOMEM; + memset(cmd_argv, 0, len); + + j = 0; + cmd_argv[j++] = cmd_name; + + for (i = optind; i < argc; i++) { + cmd_argv[j++] = strdup(argv[i]); + if (!cmd_argv[j-1]) + return -ENOMEM; + } + } + + return 0; +} + +static void run_command(void) +{ + char go[1]; + int rv; + + while (1) { + /* wait for parent to tell us to go */ + rv = read(cmd_pipe[0], go, 1); + if (rv == -1 && errno == EINTR) + continue; + if (rv == 1 && go[0] == 'g') + break; + else + exit(-1); + } + + execv(cmd_name, cmd_argv); + log_error("execv failed"); +} + +int main(int argc, char **argv) +{ + int status; + int pid = 0; + int rv = 0; + + rv = read_options(argc, argv); + if (rv < 0) + return rv; + + /* + * fork child for command before acquiring locks, + * exec command in child after acquiring locks, + * release locks after child exits. 
+ */ + + if (cmd_name) { + if (pipe(cmd_pipe)) { + log_error("pipe error"); + return -1; + } + pid = fork(); + if (pid < 0) { + log_error("cannot fork"); + return -1; + } + if (!pid) { + run_command(); + return -1; + } + } + + _lvmlockd = lvmlockd_open(NULL); + + if (_lvmlockd.socket_fd < 0 || _lvmlockd.error) { + log_error("lvmlockd open error %d", _lvmlockd.error); + goto out_pid; + } + + if (quit) { + rv = do_quit(); + goto out; + } + + if (info) { + rv = do_dump("info"); + goto out; + } + + if (dump) { + rv = do_dump("dump"); + goto out; + } + + if (gl_enable) { + rv = do_able("enable_gl"); + goto out; + } + + if (gl_disable) { + rv = do_able("disable_gl"); + goto out; + } + + if (stop_lockspaces) { + rv = do_stop_lockspaces(); + goto out; + } + + if (pid) { + /* tell child to exec */ + write(cmd_pipe[1], "g", 1); + waitpid(pid, &status, 0); + pid = 0; + } +out: + lvmlockd_close(_lvmlockd); +out_pid: + if (pid) { + kill(pid, SIGKILL); + waitpid(pid, &status, 0); + } + + return rv; +} + diff --git a/daemons/lvmlockd/lvmlockd-client.h b/daemons/lvmlockd/lvmlockd-client.h index dcae38468..7010078a2 100644 --- a/daemons/lvmlockd/lvmlockd-client.h +++ b/daemons/lvmlockd/lvmlockd-client.h @@ -13,13 +13,15 @@ #include "daemon-client.h" +#define LVMLOCKD_SOCKET DEFAULT_RUN_DIR "/lvmlockd.socket" + /* Wrappers to open/close connection */ static inline daemon_handle lvmlockd_open(const char *socket) { daemon_info lvmlockd_info = { .path = "lvmlockd", - .socket = socket ?: DEFAULT_RUN_DIR "/lvmlockd.socket", + .socket = socket ?: LVMLOCKD_SOCKET, .protocol = "lvmlockd", .protocol_version = 1, .autostart = 0 diff --git a/daemons/lvmlockd/lvmlockd-core.c b/daemons/lvmlockd/lvmlockd-core.c new file mode 100644 index 000000000..79d81b347 --- /dev/null +++ b/daemons/lvmlockd/lvmlockd-core.c @@ -0,0 +1,6184 @@ +/* + * Copyright (C) 2014 Red Hat, Inc. + * + * This file is part of LVM2. 
+ * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU Lesser General Public License v.2.1. + */ + +#define _XOPEN_SOURCE 500 /* pthread */ +#define _ISOC99_SOURCE +#define _GNU_SOURCE + +#include "configure.h" +#include "daemon-io.h" +#include "daemon-server.h" +#include "daemon-log.h" +#include "config-util.h" +#include "lvm-version.h" +#include "lvmetad-client.h" +#include "lvmlockd-client.h" + +#include <assert.h> +#include <pthread.h> +#include <stdint.h> +#include <stddef.h> +#include <stdlib.h> +#include <unistd.h> +#include <poll.h> +#include <errno.h> +#include <signal.h> +#include <getopt.h> +#include <syslog.h> +#include <dirent.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/utsname.h> +#include <sys/un.h> + +#define EXTERN +#include "lvmlockd-internal.h" + +/* + * Basic operation of lvmlockd + * + * lvmlockd main process runs main_loop() which uses poll(). + * poll listens for new connections from lvm commands and for + * messages from existing connected lvm commands. + * + * lvm command starts and connects to lvmlockd. + * + * lvmlockd receives a connection request from command and adds a + * 'struct client' to keep track of the connection to the command. + * The client's fd is added to the set of fd's in poll(). + * + * lvm command sends a lock request to lvmlockd. The lock request + * can be for the global lock, a vg lock, or an lv lock. + * + * lvmlockd main_loop/poll sees a message from an existing client. + * It sets client.recv = 1, then wakes up client_thread_main. + * + * client_thread_main iterates through client structs (cl), looking + * for any that need processing, finds the one with cl->recv set, + * and calls client_recv_action(cl). 
+ * client_recv_action(cl) reads the message/request from the client, + * allocates a new 'struct action' (act) to represent the request, + * sets the act with what is found in the request, then looks at + * the specific operation in act->op (LD_OP_FOO) to decide what to + * do with the action: + * + * . If the action is to start a lockspace, create a new thread + * to manage that lockspace: add_lockspace(act). + * + * . If the action is a lock request, pass the act to the thread + * that is managing that lockspace: add_lock_action(act). + * + * . If the action is to add/remove a local VG to the list of + * local VGs, do that directly: add_local_vg()/rem_local_vg(). + * + * . Other misc actions are passed to the worker_thread: + * add_work_action(act). + * + * Once the client_thread has passed the action off to another + * thread to process, it goes back to waiting for more client + * handling work to do. + * + * The thread that was given the action by the client_thread + * now processes that action according to the operation, act->op. + * This is either a lockspace_thread (for lock ops or ops that + * add/rem a lockspace), or the worker_thread. See below for + * how these ops are processed by these threads. When the + * given thread is done processing the action, the result is + * set in act->result, and the act struct for the completed action + * is passed back to the client_thread (client_results list). + * + * The client_thread takes completed actions (from client_results + * list), and sends the result back to the client that sent the + * request represented by the action. The act struct is then freed. + * + * This completes the cycle of work between lvm commands (clients) + * and lvmlockd. 
In summary: + * + * - main process polls for new client connections and new requests + * from lvm commands + * - client_thread reads requests from clients + * - client_thread creates an action struct for each request + * - client_thread passes the act to another thread for processing + * - other threads pass completed act structs back to client_thread + * - client_thread sends the act result back to the client and frees the act + * + * + * Lockspace threads: + * Each lockd VG has its own lockspace that contains locks for that VG. + * Each 'struct lockspace' is managed by a separate lockspace_thread. + * When the lockspace_thread is first created, the first thing it does + * is join the lockspace in the lock manager. This can take a long time. + * If the join fails, the thread exits. After the join, the thread + * enters a loop waiting for lock actions to perform in the lockspace. + * + * The request to remove/leave a lockspace causes a flag to be set in + * the lockspace struct. When the lockspace_thread sees this flag + * set, it leaves the lockspace, and exits. + * + * When the client_thread passes a new action to a lockspace_thread, + * i.e. a new lock request, the lockspace_thread identifies which resource + * is being locked (GL, VG, LV), and gets the 'struct resource' (r) for it. + * r->type will be LD_RT_GL, LD_RT_VG, or LD_RT_LV. r->name is the + * resource name, and is fixed for GL and VG resources, but is based on + * the LV name for LV resources. The act is added to the resource's + * list of actions: r->actions, i.e. outstanding lock requests on the + * resource. + * + * The lockspace thread then iterates through each resource in the + * lockspace, processing any outstanding actions on each: res_process(ls, r). + * + * res_process() compares the outstanding actions/requests in r->actions + * against any existing locks on the resource in r->locks. If the + * action is blocked by existing locks, it's left on r->actions. 
If not, + * the action/request is passed to the lock manager. If the result from + * the lock manager is success, a new 'struct lock' is created for the + * action and saved on r->locks. The result is set in act->result and + * the act is passed back to the client_thread to be returned to the client. + */ + +static const char *lvmlockd_protocol = "lvmlockd"; +static const int lvmlockd_protocol_version = 1; +static int daemon_quit; +static char *our_system_id; +static int adopt_opt; + +static daemon_handle lvmetad_handle; +static pthread_mutex_t lvmetad_mutex; +static int lvmetad_connected; + +/* + * We use a separate socket for dumping daemon info. + * This will not interfere with normal operations, and allows + * free-form debug data to be dumped instead of the libdaemon + * protocol that wants all data in the cft format. + * 1MB should fit all the info we need to dump. + */ +#define DUMP_SOCKET_NAME "lvmlockd-dump.sock" +#define DUMP_BUF_SIZE (1024 * 1024) +static char dump_buf[DUMP_BUF_SIZE]; +static struct sockaddr_un dump_addr; +static socklen_t dump_addrlen; + +/* + * Main program polls client connections, adds new clients, + * adds work for client thread. + * + * pollfd_mutex is used for adding vs removing entries, + * and for resume vs realloc. + */ +#define POLL_FD_UNUSED -1 /* slot if free */ +#define POLL_FD_IGNORE -2 /* slot is used but ignore in poll */ +#define ADD_POLL_SIZE 16 /* increment slots by this amount */ + +static pthread_mutex_t pollfd_mutex; +static struct pollfd *pollfd; +static int pollfd_size; +static int pollfd_maxi; +static int listen_pi; +static int listen_fd; +static int restart_pi; +static int restart_fds[2]; + +/* + * Each lockspace has its own thread to do locking. + * The lockspace thread makes synchronous lock requests to dlm/sanlock. + * Every vg with a lockd type, i.e. "dlm", "sanlock", should be on this list. + * + * lockspaces_inactive holds old ls structs for vgs that have been + * stopped, or for vgs that failed to start. 
The old ls structs + * are removed from the inactive list and freed when a new ls with + * the same name is started and added to the standard lockspaces list. + * Keeping this bit of "history" for the ls allows us to return a + * more informative error message if a vg lock request is made for + * an ls that has been stopped or failed to start. + */ +static pthread_mutex_t lockspaces_mutex; +static struct list_head lockspaces; +static struct list_head lockspaces_inactive; + +/* + * This flag is set to 1 if we see multiple vgs with the global + * lock enabled. While this is set, we return a special flag + * with the vg lock result indicating to the lvm command that + * there is a duplicate gl in the vg which should be resolved. + * While this is set, find_lockspace_name has the side job of + * counting the number of lockspaces with enabled gl's so that + * this can be set back to zero when the duplicates are disabled. + */ +static int sanlock_gl_dup; + +/* + * VG's that do not have a lockd type are on the local_vgs list. + * Every vg on the system should be in either the lockspaces + * list or the local_vgs list. + * + * lvm commands send lock requests to lvmlockd for local vgs + * because at the point locks are acquired in the command, + * the vg has not been read, so the command does not know if + * the vg's lock_type is local and the locks can be skipped. + * So lvmlockd keeps track of which vg's are local so it can + * quickly check if a vg lock request can be skipped. (Rather + * than having to look up the lock_type in lvmetad for every + * operation on a local vg.) + * + * When local_thread_also is set, lvmlockd's local_thread is + * used to manage locks for local pids on vgs from local_vgs. + * (In addition to standard locking for lockd type vgs.) + * + * When local_thread_only is set, lvmlockd is only used to + * manage locks for local pids on vgs from local_vgs, and + * not to manage lockd type vgs. 
+ * + * local locking: + * + * lock_gl: only do local_thread locking for gl when local_thread_only + * is set. local_thread_only means that no standard lockd lockspaces + * are being used, and lvmlockd is used only for inter-pid locking. + * When local_thread_only is not set (meaning both local and shared vgs + * are expected), then the standard gl lockspace works for both local + * (between local pids) and remote (between pids on different nodes). + * + * lock_vg: only do local_thread locking for local, non-lockd, vgs in + * the local_vgs list. When the vg is a lockd-type, then the standard + * lockspace thread works for locking between pids also. + * + * local_thread_only=1 local_thread_also=1 + * Use lvmlockd for locking only between local pids, both gl and vg locks. + * No shared disks or lockd type vgs should exist. + * + * local_thread_only=0 local_thread_also=1 + * Use lvmlockd for locking between local pids for local vgs, + * and use lvmlockd for distributed locking for lockd-type vgs. + * Use global lock from a lockd-type vgs. A local-only gl does + * not make sense here. + * + * local_thread_only=0 local_thread_also=0 + * Do not use lvmlockd for locking between local pids. + * No shared disks or lockd type vgs should exist. + * (lvmlockd should probably not be run at all in this case.) + * + * local_thread_only=1 local_thread_also=0 + * Not allowed. + */ +static pthread_t local_thread; +static pthread_mutex_t local_thread_mutex; +static pthread_cond_t local_thread_cond; +static struct list_head local_thread_actions; +static struct list_head local_vgs; +static struct lockspace *local_thread_gls; +static int local_thread_also; +static int local_thread_only; +static int local_thread_stop; +static int local_thread_work; + +/* + * Client thread reads client requests and writes client results. 
+ */ +static pthread_t client_thread; +static pthread_mutex_t client_mutex; +static pthread_cond_t client_cond; +static struct list_head client_list; /* connected clients */ +static struct list_head client_results; /* actions to send back to clients */ +static uint32_t client_ids; /* 0 and ADOPT_CLIENT_ID are skipped */ +static int client_stop; /* stop the thread */ +static int client_work; /* a client on client_list has work to do */ + +#define ADOPT_CLIENT_ID 0xFFFFFFFF /* special client_id for adopt actions */ +static struct list_head adopt_results; /* special start actions from adopt_locks() */ + +/* + * Worker thread performs misc non-locking actions, e.g. init/free. + */ +static pthread_t worker_thread; +static pthread_mutex_t worker_mutex; +static pthread_cond_t worker_cond; +static struct list_head worker_list; /* actions for worker_thread */ +static int worker_stop; /* stop the thread */ +static int worker_wake; /* wake the thread without adding work */ + +/* + * The content of every log_foo() statement is saved in the + * circular buffer, which can be dumped to a client and printed. + */ +#define LOG_LINE_SIZE 256 +#define LOG_DUMP_SIZE DUMP_BUF_SIZE +static char log_dump[LOG_DUMP_SIZE]; +static unsigned int log_point; +static unsigned int log_wrap; +static pthread_mutex_t log_mutex; +static int syslog_priority = LOG_WARNING; + +/* + * Structure pools to avoid repeated malloc/free. 
+ */ +#define MAX_UNUSED_ACTION 64 +#define MAX_UNUSED_CLIENT 64 +#define MAX_UNUSED_RESOURCE 64 +#define MAX_UNUSED_LOCK 64 +static pthread_mutex_t unused_struct_mutex; +static struct list_head unused_action; +static struct list_head unused_client; +static struct list_head unused_resource; +static struct list_head unused_lock; +static int unused_action_count; +static int unused_client_count; +static int unused_resource_count; +static int unused_lock_count; +static int resource_lm_data_size; /* max size of lm_data from sanlock|dlm */ + +#define DO_STOP 1 +#define NO_STOP 0 +#define DO_FREE 1 +#define NO_FREE 0 +#define DO_FORCE 1 +#define NO_FORCE 0 + +static int add_lock_action(struct action *act); +static int str_to_lm(const char *str); +static void clear_lockspace_inactive(char *name); + +static uint64_t monotime(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec; +} + +static void log_save_line(int len, char *line, + char *log_buf, unsigned int *point, unsigned int *wrap) +{ + unsigned int p = *point; + unsigned int w = *wrap; + int i; + + if (len < LOG_DUMP_SIZE - p) { + memcpy(log_buf + p, line, len); + p += len; + + if (p == LOG_DUMP_SIZE) { + p = 0; + w = 1; + } + goto out; + } + + for (i = 0; i < len; i++) { + log_buf[p++] = line[i]; + + if (p == LOG_DUMP_SIZE) { + p = 0; + w = 1; + } + } + out: + *point = p; + *wrap = w; +} + +void log_level(int level, const char *fmt, ...) 
+{ + char line[LOG_LINE_SIZE]; + va_list ap; + int len = LOG_LINE_SIZE - 1; + int ret, pos = 0; + + memset(line, 0, sizeof(line)); + + ret = snprintf(line, len, "%llu ", (unsigned long long)time(NULL)); + pos += ret; + + va_start(ap, fmt); + ret = vsnprintf(line + pos, len - pos, fmt, ap); + va_end(ap); + + if (ret >= len - pos) + pos = len - 1; + else + pos += ret; + + line[pos++] = '\n'; + line[pos++] = '\0'; + + pthread_mutex_lock(&log_mutex); + log_save_line(pos - 1, line, log_dump, &log_point, &log_wrap); + pthread_mutex_unlock(&log_mutex); + + if (level <= syslog_priority) + syslog(level, "%s", line); + + if (daemon_debug) + fprintf(stderr, "%s", line); +} + +static int dump_log(int *dump_len) +{ + int tail_len; + + pthread_mutex_lock(&log_mutex); + + if (!log_wrap && !log_point) { + *dump_len = 0; + } else if (log_wrap) { + tail_len = LOG_DUMP_SIZE - log_point; + memcpy(dump_buf, log_dump+log_point, tail_len); + if (log_point) + memcpy(dump_buf+tail_len, log_dump, log_point); + *dump_len = LOG_DUMP_SIZE; + } else { + memcpy(dump_buf, log_dump, log_point-1); + *dump_len = log_point-1; + } + pthread_mutex_unlock(&log_mutex); + + return 0; +} + +struct lockspace *alloc_lockspace(void) +{ + struct lockspace *ls; + + if (!(ls = malloc(sizeof(struct lockspace)))) { + log_error("out of memory for lockspace"); + return NULL; + } + + memset(ls, 0, sizeof(struct lockspace)); + INIT_LIST_HEAD(&ls->actions); + INIT_LIST_HEAD(&ls->resources); + pthread_mutex_init(&ls->mutex, NULL); + pthread_cond_init(&ls->cond, NULL); + return ls; +} + +static struct action *alloc_action(void) +{ + struct action *act; + + pthread_mutex_lock(&unused_struct_mutex); + if (!unused_action_count) { + act = malloc(sizeof(struct action)); + } else { + act = list_first_entry(&unused_action, struct action, list); + list_del(&act->list); + unused_action_count--; + } + pthread_mutex_unlock(&unused_struct_mutex); + if (act) + memset(act, 0, sizeof(struct action)); + else + log_error("out of memory 
for action"); + return act; +} + +static struct client *alloc_client(void) +{ + struct client *cl; + + pthread_mutex_lock(&unused_struct_mutex); + if (!unused_client_count) { + cl = malloc(sizeof(struct client)); + } else { + cl = list_first_entry(&unused_client, struct client, list); + list_del(&cl->list); + unused_client_count--; + } + pthread_mutex_unlock(&unused_struct_mutex); + if (cl) + memset(cl, 0, sizeof(struct client)); + else + log_error("out of memory for client"); + return cl; +} + +static struct resource *alloc_resource(void) +{ + struct resource *r; + + pthread_mutex_lock(&unused_struct_mutex); + if (!unused_resource_count) { + r = malloc(sizeof(struct resource) + resource_lm_data_size); + } else { + r = list_first_entry(&unused_resource, struct resource, list); + list_del(&r->list); + unused_resource_count--; + } + pthread_mutex_unlock(&unused_struct_mutex); + if (r) { + memset(r, 0, sizeof(struct resource) + resource_lm_data_size); + INIT_LIST_HEAD(&r->locks); + INIT_LIST_HEAD(&r->actions); + } else { + log_error("out of memory for resource"); + } + return r; +} + +static struct lock *alloc_lock(void) +{ + struct lock *lk; + + pthread_mutex_lock(&unused_struct_mutex); + if (!unused_lock_count) { + lk = malloc(sizeof(struct lock)); + } else { + lk = list_first_entry(&unused_lock, struct lock, list); + list_del(&lk->list); + unused_lock_count--; + } + pthread_mutex_unlock(&unused_struct_mutex); + if (lk) + memset(lk, 0, sizeof(struct lock)); + else + log_error("out of memory for lock"); + return lk; +} + +static void free_action(struct action *act) +{ + pthread_mutex_lock(&unused_struct_mutex); + if (unused_action_count >= MAX_UNUSED_ACTION) { + free(act); + } else { + list_add_tail(&act->list, &unused_action); + unused_action_count++; + } + pthread_mutex_unlock(&unused_struct_mutex); +} + +static void free_client(struct client *cl) +{ + pthread_mutex_lock(&unused_struct_mutex); + if (unused_client_count >= MAX_UNUSED_CLIENT) { + free(cl); + } else { 
+ list_add_tail(&cl->list, &unused_client); + unused_client_count++; + } + pthread_mutex_unlock(&unused_struct_mutex); +} + +static void free_resource(struct resource *r) +{ + pthread_mutex_lock(&unused_struct_mutex); + if (unused_resource_count >= MAX_UNUSED_RESOURCE) { + free(r); + } else { + list_add_tail(&r->list, &unused_resource); + unused_resource_count++; + } + pthread_mutex_unlock(&unused_struct_mutex); +} + +static void free_lock(struct lock *lk) +{ + pthread_mutex_lock(&unused_struct_mutex); + if (unused_lock_count >= MAX_UNUSED_LOCK) { + free(lk); + } else { + list_add_tail(&lk->list, &unused_lock); + unused_lock_count++; + } + pthread_mutex_unlock(&unused_struct_mutex); +} + +static int setup_structs(void) +{ + struct action *act; + struct client *cl; + struct resource *r; + struct lock *lk; + int data_san = lm_data_size_sanlock(); + int data_dlm = lm_data_size_dlm(); + int i; + + resource_lm_data_size = data_san > data_dlm ? data_san : data_dlm; + + pthread_mutex_init(&unused_struct_mutex, NULL); + INIT_LIST_HEAD(&unused_action); + INIT_LIST_HEAD(&unused_client); + INIT_LIST_HEAD(&unused_resource); + INIT_LIST_HEAD(&unused_lock); + + for (i = 0; i < MAX_UNUSED_ACTION/2; i++) { + if (!(act = alloc_action())) + goto fail; + free_action(act); + } + + for (i = 0; i < MAX_UNUSED_CLIENT/2; i++) { + if (!(cl = alloc_client())) + goto fail; + free_client(cl); + } + + for (i = 0; i < MAX_UNUSED_RESOURCE/2; i++) { + if (!(r = alloc_resource())) + goto fail; + free_resource(r); + } + + for (i = 0; i < MAX_UNUSED_LOCK/2; i++) { + if (!(lk = alloc_lock())) + goto fail; + free_lock(lk); + } + return 0; +fail: + return -ENOMEM; +} + +static int add_pollfd(int fd) +{ + int i, new_size; + + pthread_mutex_lock(&pollfd_mutex); + for (i = 0; i < pollfd_size; i++) { + if (pollfd[i].fd != POLL_FD_UNUSED) + continue; + + pollfd[i].fd = fd; + pollfd[i].events = POLLIN; + pollfd[i].revents = 0; + + if (i > pollfd_maxi) + pollfd_maxi = i; + + 
pthread_mutex_unlock(&pollfd_mutex); + return i; + } + + new_size = pollfd_size + ADD_POLL_SIZE; + + pollfd = realloc(pollfd, new_size * sizeof(struct pollfd)); + if (!pollfd) { + log_error("can't alloc new size %d for pollfd", new_size); + return -ENOMEM; + } + + for (i = pollfd_size; i < new_size; i++) { + pollfd[i].fd = POLL_FD_UNUSED; + pollfd[i].events = 0; + pollfd[i].revents = 0; + } + + i = pollfd_size; + pollfd[i].fd = fd; + pollfd[i].events = POLLIN; + pollfd[i].revents = 0; + pollfd_maxi = i; + + pollfd_size = new_size; + + pthread_mutex_unlock(&pollfd_mutex); + return i; +} + +static void rem_pollfd(int pi) +{ + if (pi < 0) { + log_error("rem_pollfd %d", pi); + return; + } + pthread_mutex_lock(&pollfd_mutex); + pollfd[pi].fd = POLL_FD_UNUSED; + pollfd[pi].events = 0; + pollfd[pi].revents = 0; + pthread_mutex_unlock(&pollfd_mutex); +} + +static const char *lm_str(int x) +{ + switch (x) { + case LD_LM_NONE: + return "none"; + case LD_LM_DLM: + return "dlm"; + case LD_LM_SANLOCK: + return "sanlock"; + default: + return "lm_unknown"; + } +} + +static const char *rt_str(int x) +{ + switch (x) { + case LD_RT_GL: + return "gl"; + case LD_RT_VG: + return "vg"; + case LD_RT_LV: + return "lv"; + default: + return "."; + }; +} + +static const char *op_str(int x) +{ + switch (x) { + case LD_OP_INIT: + return "init"; + case LD_OP_FREE: + return "free"; + case LD_OP_START: + return "start"; + case LD_OP_STOP: + return "stop"; + case LD_OP_LOCK: + return "lock"; + case LD_OP_UPDATE: + return "update"; + case LD_OP_CLOSE: + return "close"; + case LD_OP_ENABLE: + return "enable"; + case LD_OP_DISABLE: + return "disable"; + case LD_OP_ADD_LOCAL: + return "add_local"; + case LD_OP_REM_LOCAL: + return "rem_local"; + case LD_OP_UPDATE_LOCAL: + return "update_local"; + case LD_OP_START_WAIT: + return "start_wait"; + case LD_OP_STOP_ALL: + return "stop_all"; + default: + return "op_unknown"; + }; +} + +static const char *mode_str(int x) +{ + switch (x) { + case LD_LK_IV: + 
return "iv"; + case LD_LK_UN: + return "un"; + case LD_LK_NL: + return "nl"; + case LD_LK_SH: + return "sh"; + case LD_LK_EX: + return "ex"; + default: + return "."; + }; +} + +int last_string_from_args(char *args_in, char *last) +{ + const char *args = args_in; + const char *colon, *str = NULL; + + while (1) { + if (!args || (*args == '\0')) + break; + colon = strstr(args, ":"); + if (!colon) + break; + str = colon; + args = colon + 1; + } + + if (str) { + snprintf(last, MAX_ARGS, "%s", str + 1); + return 0; + } + return -1; +} + +int version_from_args(char *args, unsigned int *major, unsigned int *minor, unsigned int *patch) +{ + char version[MAX_ARGS]; + char *major_str, *minor_str, *patch_str; + char *n, *d1, *d2; + + strncpy(version, args, MAX_ARGS); + + n = strstr(version, ":"); + if (n) + *n = '\0'; + + d1 = strstr(version, "."); + if (!d1) + return -1; + + d2 = strstr(d1 + 1, "."); + if (!d2) + return -1; + + major_str = version; + minor_str = d1 + 1; + patch_str = d2 + 1; + + *d1 = '\0'; + *d2 = '\0'; + + if (major) + *major = atoi(major_str); + if (minor) + *minor = atoi(minor_str); + if (patch) + *patch = atoi(patch_str); + + return 0; +} + +/* + * These are few enough that arrays of function pointers can + * be avoided. 
+ */ + +static int lm_add_lockspace(struct lockspace *ls, struct action *act, int adopt) +{ + int rv; + + if (ls->lm_type == LD_LM_DLM) + rv = lm_add_lockspace_dlm(ls, adopt); + else if (ls->lm_type == LD_LM_SANLOCK) + rv = lm_add_lockspace_sanlock(ls, adopt); + else + return -1; + + if (act) + act->lm_rv = rv; + return rv; +} + +static int lm_rem_lockspace(struct lockspace *ls, struct action *act, int free_vg) +{ + int rv; + + if (ls->lm_type == LD_LM_DLM) + rv = lm_rem_lockspace_dlm(ls, free_vg); + else if (ls->lm_type == LD_LM_SANLOCK) + rv = lm_rem_lockspace_sanlock(ls, free_vg); + else + return -1; + + if (act) + act->lm_rv = rv; + return rv; +} + +static int lm_lock(struct lockspace *ls, struct resource *r, int mode, struct action *act, + uint32_t *r_version, uint32_t *n_version, int *retry, int adopt) +{ + int rv; + + if (ls->lm_type == LD_LM_DLM) + rv = lm_lock_dlm(ls, r, mode, r_version, n_version, adopt); + else if (ls->lm_type == LD_LM_SANLOCK) + rv = lm_lock_sanlock(ls, r, mode, r_version, n_version, retry, adopt); + else + return -1; + + if (act) + act->lm_rv = rv; + return rv; +} + +static int lm_convert(struct lockspace *ls, struct resource *r, + int mode, struct action *act, uint32_t r_version) +{ + int rv; + + if (ls->lm_type == LD_LM_DLM) + rv = lm_convert_dlm(ls, r, mode, r_version); + else if (ls->lm_type == LD_LM_SANLOCK) + rv = lm_convert_sanlock(ls, r, mode, r_version); + else + return -1; + + if (act) + act->lm_rv = rv; + return rv; +} + +static int lm_unlock(struct lockspace *ls, struct resource *r, struct action *act, + uint32_t r_version, uint32_t n_version, uint32_t lmu_flags) +{ + int rv; + + if (ls->lm_type == LD_LM_DLM) + return lm_unlock_dlm(ls, r, r_version, n_version, lmu_flags); + else if (ls->lm_type == LD_LM_SANLOCK) + return lm_unlock_sanlock(ls, r, r_version, n_version, lmu_flags); + else + return -1; + + if (act) + act->lm_rv = rv; + return rv; +} + +static int lm_hosts(struct lockspace *ls, int notify) +{ + if (ls->lm_type 
== LD_LM_DLM) + return 0; + else if (ls->lm_type == LD_LM_SANLOCK) + return lm_hosts_sanlock(ls, notify); + return -1; +} + +static void lm_rem_resource(struct lockspace *ls, struct resource *r) +{ + if (ls->lm_type == LD_LM_DLM) + lm_rem_resource_dlm(ls, r); + else if (ls->lm_type == LD_LM_SANLOCK) + lm_rem_resource_sanlock(ls, r); +} + +/* + * While adopting locks, actions originate from the adopt_locks() + * function, not from a client. So, these actions (flagged ADOPT), + * should be passed back to the adopt_locks() function through the + * adopt_results list, and not be sent back to a client via the + * client_list/client_thread. + */ + +static void add_client_result(struct action *act) +{ + pthread_mutex_lock(&client_mutex); + if (act->flags & LD_AF_ADOPT) + list_add_tail(&act->list, &adopt_results); + else + list_add_tail(&act->list, &client_results); + pthread_cond_signal(&client_cond); + pthread_mutex_unlock(&client_mutex); +} + +static struct lock *find_lock_client(struct resource *r, uint32_t client_id) +{ + struct lock *lk; + + list_for_each_entry(lk, &r->locks, list) { + if (lk->client_id == client_id) + return lk; + } + return NULL; +} + +static struct lock *find_lock_persistent(struct resource *r) +{ + struct lock *lk; + + list_for_each_entry(lk, &r->locks, list) { + if (lk->flags & LD_LF_PERSISTENT) + return lk; + } + return NULL; +} + +static struct action *find_action_client(struct resource *r, uint32_t client_id) +{ + struct action *act; + + list_for_each_entry(act, &r->actions, list) { + if (act->client_id != client_id) + continue; + return act; + } + return NULL; +} + +static void add_work_action(struct action *act) +{ + pthread_mutex_lock(&worker_mutex); + if (!worker_stop) { + list_add_tail(&act->list, &worker_list); + pthread_cond_signal(&worker_cond); + } + pthread_mutex_unlock(&worker_mutex); +} + +static void create_work_action(int op) +{ + struct action *act; + + if (!(act = alloc_action())) + return; + act->op = op; + 
	add_work_action(act);
}

/*
 * Acquire (or add a shared holder to) resource r on behalf of act.
 * Returns 0 on success, -EAGAIN if the caller should retry, -ENOMEM,
 * or a negative lock-manager error.  *retry may be set by the lock
 * manager (sanlock) to request a retry.
 */
static int res_lock(struct lockspace *ls, struct resource *r, struct action *act, int *retry)
{
	struct lock *lk;
	uint32_t r_version = 0;
	uint32_t n_version = 0;
	int rv;

	log_debug("S %s R %s res_lock mode %s", ls->name, r->name, mode_str(act->mode));

	/* sh + sh: no lock-manager call needed, just add a holder record */
	if (r->mode == LD_LK_SH && act->mode == LD_LK_SH)
		goto add_lk;

	if (r->type == LD_RT_LV && act->lv_args[0])
		memcpy(r->lv_args, act->lv_args, MAX_ARGS);

	rv = lm_lock(ls, r, act->mode, act, &r_version, &n_version, retry, act->flags & LD_AF_ADOPT);
	if (rv == -EAGAIN)
		return rv;
	if (rv < 0) {
		log_error("S %s R %s res_lock lm error %d", ls->name, r->name, rv);
		return rv;
	}

	log_debug("S %s R %s res_lock lm done r_version %u n_version %u",
		  ls->name, r->name, r_version, n_version);

	/* lm_lock() reads new r_version and n_version */

	if (r_version > r->version) {
		/*
		 * New r_version of the lock: another host changed data
		 * protected by this lock since we last held it, so any
		 * local cache of that data must be invalidated and
		 * reread from disk.
		 */
		r->version = r_version;

		/*
		 * r is vglk: tell lvmetad to set the vg invalid flag if
		 * its cached vg seqno is older than r_version; commands
		 * seeing the flag reread the vg from disk and refresh
		 * the lvmetad copy.
		 *
		 * r is global: tell lvmetad to set the global invalid
		 * flag; commands seeing it reread metadata from disk,
		 * update the lvmetad caches, and clear the flag.
		 */

		if ((r->type == LD_RT_VG) && lvmetad_connected) {
			daemon_reply reply;
			char *uuid;

			log_debug("S %s R %s res_lock set lvmetad vg version %u",
				  ls->name, r->name, r_version);

			/* fall back to the vg name when no uuid is known */
			if (!ls->vg_uuid[0] || !strcmp(ls->vg_uuid, "none"))
				uuid = ls->name;
			else
				uuid = ls->vg_uuid;

			pthread_mutex_lock(&lvmetad_mutex);
			reply = daemon_send_simple(lvmetad_handle, "set_vg_info",
						   "token = %s", "skip",
						   "uuid = %s", uuid,
						   "version = %d", (int)r_version,
						   NULL);
			pthread_mutex_unlock(&lvmetad_mutex);

			if (reply.error || strcmp(daemon_reply_str(reply, "response", ""), "OK"))
				log_error("set_vg_info in lvmetad failed %d", reply.error);
			daemon_reply_destroy(reply);
		}

		if ((r->type == LD_RT_GL) && lvmetad_connected) {
			daemon_reply reply;

			log_debug("S %s R %s res_lock set lvmetad global invalid",
				  ls->name, r->name);

			pthread_mutex_lock(&lvmetad_mutex);
			reply = daemon_send_simple(lvmetad_handle, "set_global_info",
						   "token = %s", "skip",
						   "global_invalid = %d", 1,
						   NULL);
			pthread_mutex_unlock(&lvmetad_mutex);

			if (reply.error || strcmp(daemon_reply_str(reply, "response", ""), "OK"))
				log_error("set_global_info in lvmetad failed %d", reply.error);
			daemon_reply_destroy(reply);
		}
	}

	if ((r->type == LD_RT_GL) && (n_version > ls->names_version)) {
		/*
		 * Set a flag so update_local_vgs runs when the gl is
		 * unlocked (via a queued UPDATE_LOCAL action).  It must
		 * happen on unlock because lvmetad needs to be updated
		 * by the command before an updated vg list can be read.
		 */
		log_debug("S %s gl res_lock set update_local_vgs", ls->name);
		ls->update_local_vgs = 1;
		ls->names_version = n_version;
	}

	if ((r->type == LD_RT_GL) && (act->flags & LD_AF_UPDATE_NAMES_VERSION)) {
		/*
		 * Set a flag so ls->names_version is incremented and
		 * written to the gl lvb n_version on unlock.  Other
		 * hosts will see the new n_version when they take the
		 * gl and run update_local_vgs.
		 */
		log_debug("S %s gl res_lock set update_names_version", ls->name);
		ls->update_names_version = 1;
	}

	r->mode = act->mode;

add_lk:
	if (r->mode == LD_LK_SH)
		r->sh_count++;

	if (!(lk = alloc_lock()))
		return -ENOMEM;

	lk->client_id = act->client_id;
	lk->mode = act->mode;

	/* persistent locks are unowned: client_id 0 */
	if (act->flags & LD_AF_PERSISTENT) {
		lk->flags |= LD_LF_PERSISTENT;
		lk->client_id = 0;
	}

	list_add_tail(&lk->list, &r->locks);

	return 0;
}

/*
 * Convert the mode of an existing lock lk on r to act->mode.
 * Returns 0 on success, -EAGAIN when other sh holders block an ex
 * conversion, or a negative lock-manager error.
 */
static int res_convert(struct lockspace *ls, struct resource *r,
		       struct lock *lk, struct action *act)
{
	uint32_t r_version;
	int rv;

	log_debug("S %s R %s res_convert mode %d", ls->name, r->name, act->mode);

	/* cannot go sh->ex while other clients also hold sh */
	if (act->mode == LD_LK_EX && lk->mode == LD_LK_SH && r->sh_count > 1)
		return -EAGAIN;

	/*
	 * lm_convert() writes the new version (from ex),
	 * same as lm_unlock().
	 */

	if ((r->type == LD_RT_GL) && (r->mode == LD_LK_EX)) {
		r->version++;
		lk->version = r->version;
		r_version = r->version;
		log_debug("S %s R %s res_convert r_version inc %u",
			  ls->name, r->name, r_version);

	} else if ((r->type == LD_RT_VG) && (r->mode == LD_LK_EX) && (lk->version > r->version)) {
		r->version = lk->version;
		r_version = r->version;
		log_debug("S %s R %s res_convert r_version new %u", ls->name, r->name, r_version);
	} else {
		r_version = 0;
	}

	rv = lm_convert(ls, r, act->mode, act, r_version);
	if (rv < 0) {
		log_error("S %s R %s res_convert lm error %d", ls->name, r->name, rv);
		return rv;
	}

	log_debug("S %s R %s res_convert lm done", ls->name, r->name);

	if (lk->mode == LD_LK_EX && act->mode == LD_LK_SH) {
		r->sh_count = 1;
	} else if (lk->mode == LD_LK_SH && act->mode == LD_LK_EX) {
		r->sh_count = 0;
	} else {
		/* should not be possible */
		log_error("S %s R %s res_convert invalid modes %d %d",
			  ls->name, r->name, lk->mode, act->mode);
		return -1;
	}

	r->mode = act->mode;
+ lk->mode = act->mode; + + return 0; +} + +static int res_cancel(struct lockspace *ls, struct resource *r, + struct action *act) +{ + struct action *cact; + + /* + * a client can cancel its own non-persistent lock requests, + * when could this happen? + * + * a client can cancel other client's persistent lock requests, + * when could this happen? + */ + + if (act->flags & LD_AF_PERSISTENT) { + list_for_each_entry(cact, &r->actions, list) { + if (!(cact->flags & LD_AF_PERSISTENT)) + continue; + goto do_cancel; + } + } else { + cact = find_action_client(r, act->client_id); + if (cact) + goto do_cancel; + } + + return -ENOENT; + +do_cancel: + log_debug("S %s R %s res_cancel client %d", ls->name, r->name, cact->client_id); + cact->result = -ECANCELED; + list_del(&cact->list); + add_client_result(cact); + + return -ECANCELED; +} + +/* + * lm_unlock() writes new a r_version (from ex) + * + * The r_version of the vg resource is incremented if + * an "update" was received for the vg lock. The update + * contains the new vg seqno from the vg metadata which is + * used as the r_version. + * + * The r_version of the global resource is automatically + * incremented when it is unlocked from ex mode. + * + * For the global resource, n_version is used in addition + * to r_version: + * + * r_version is incremented every time a command releases + * the global lock from ex. + * + * n_version is incremented every time a command that + * changes the list of vg names releases the global lock from ex. + * + * Changes to n_version are used by hosts to detect that other + * hosts have added/removed/renamed local (non-lockd) vgs which + * can be seen by multiple hosts, so the local_vgs list probably + * needs to be updated. lvmlockd knows about changes to lockd-type + * vgs through their locks, but local vgs do not have locks, + * so the n_version change is the only way to know that the + * local_vgs list should be updated. 
 */

/*
 * persistent locks will not be unlocked for OP_CLOSE/act_close
 * because act_close->flags does not have the PERSISTENT flag
 * set, and a persistent lk->client_id is zero, which will not
 * match the client in act_close->client_id.
 */

/*
 * Release the lock on r matching act (persistent or per-client).
 * Writes the new r_version/n_version to the lock manager on the
 * final release from ex.  Returns 0, -ENOENT when no matching lock
 * is held, or a negative lock-manager error.
 */
static int res_unlock(struct lockspace *ls, struct resource *r,
		      struct action *act)
{
	struct lock *lk;
	uint32_t r_version;
	uint32_t n_version = 0;
	int rv;

	if (act->flags & LD_AF_PERSISTENT) {
		lk = find_lock_persistent(r);
		if (lk)
			goto do_unlock;
	} else {
		lk = find_lock_client(r, act->client_id);
		if (lk)
			goto do_unlock;
	}

	/* a close implicitly unlocks everything; holding nothing is normal */
	if (act->op != LD_OP_CLOSE)
		log_error("S %s R %s res_unlock no locks", ls->name, r->name);
	return -ENOENT;

do_unlock:
	log_debug("S %s R %s res_unlock %s", ls->name, r->name,
		  (act->op == LD_OP_CLOSE) ? "from close" : "");

	/* send unlock to lm when last sh lock is unlocked */
	if (lk->mode == LD_LK_SH) {
		r->sh_count--;
		if (r->sh_count > 0)
			goto rem_lk;
	}

	if ((r->type == LD_RT_GL) && (r->mode == LD_LK_EX)) {
		/* gl r_version is auto-incremented on every ex release */
		r->version++;
		lk->version = r->version;
		r_version = r->version;

		log_debug("S %s R %s res_unlock r_version inc %u", ls->name, r->name, r_version);

		if (ls->update_names_version) {
			/* vg name list changed: bump n_version for other hosts */
			ls->names_version++;
			n_version = ls->names_version;
			log_debug("S %s gl res_unlock got update_names_version %u",
				  ls->name, n_version);
		}

	} else if ((r->type == LD_RT_VG) && (r->mode == LD_LK_EX) && (lk->version > r->version)) {
		/* vg r_version comes from an LD_OP_UPDATE (new vg seqno) */
		r->version = lk->version;
		r_version = r->version;

		log_debug("S %s R %s res_unlock r_version new %u",
			  ls->name, r->name, r_version);
	} else {
		r_version = 0;
	}

	rv = lm_unlock(ls, r, act, r_version, n_version, 0);
	if (rv < 0) {
		/* should never happen, retry? */
		log_error("S %s R %s res_unlock lm error %d", ls->name, r->name, rv);
		return rv;
	}

	log_debug("S %s R %s res_unlock lm done", ls->name, r->name);

	if ((r->type == LD_RT_GL) && (ls->update_local_vgs || ls->update_names_version)) {
		log_debug("S %s gl res_unlock got update_local_vgs %d update_names_version %d",
			  ls->name, ls->update_local_vgs, ls->update_names_version);
		ls->update_local_vgs = 0;
		ls->update_names_version = 0;
		create_work_action(LD_OP_UPDATE_LOCAL);
	}

rem_lk:
	list_del(&lk->list);
	free_lock(lk);

	/* no holders left: resource is unlocked */
	if (list_empty(&r->locks))
		r->mode = LD_LK_UN;

	return 0;
}

/*
 * Record a new version on the caller's ex lock; the version is
 * written to the lock manager by the eventual unlock.  Returns 0,
 * -ENOENT when the client holds no lock, -EINVAL when the lock is
 * not held ex.
 */
static int res_update(struct lockspace *ls, struct resource *r,
		      struct action *act)
{
	struct lock *lk;

	lk = find_lock_client(r, act->client_id);
	if (!lk) {
		log_error("S %s R %s res_update client %u lock not found",
			  ls->name, r->name, act->client_id);
		return -ENOENT;
	}

	if (r->mode != LD_LK_EX) {
		log_error("S %s R %s res_update version on non-ex lock",
			  ls->name, r->name);
		return -EINVAL;
	}

	/* lk version will be written to lm by unlock */

	if (act->flags & LD_AF_NEXT_VERSION)
		lk->version = r->version + 1;
	else
		lk->version = act->version;

	log_debug("S %s R %s res_update lk version to %u", ls->name, r->name, lk->version);

	return 0;
}

/* Release the on-disk lease area for a removed LV (sanlock only; dlm has none). */
static int free_lv(struct lockspace *ls, struct resource *r)
{
	if (ls->lm_type == LD_LM_SANLOCK)
		return lm_free_lv_sanlock(ls, r);
	else if (ls->lm_type == LD_LM_DLM)
		return 0;
	else
		return -EINVAL;
}

/*
 * NB. we can't do this if sanlock is holding any locks on
 * the resource; we'd be rewriting the resource from under
 * sanlock and would confuse or break it badly. We don't
 * know what another host is doing, so these must be used
 * very carefully.
+ */ + +static int res_able(struct lockspace *ls, struct resource *r, + struct action *act) +{ + int rv; + + if (ls->lm_type != LD_LM_SANLOCK) { + log_error("enable/disable only applies to sanlock"); + return -EINVAL; + } + + if (r->type != LD_RT_GL) { + log_error("enable/disable only applies to global lock"); + return -EINVAL; + } + + if (r->mode != LD_LK_UN) { + log_error("enable/disable only allowed on unlocked resource"); + return -EINVAL; + } + + if (act->op == LD_OP_ENABLE && gl_lsname_sanlock[0]) { + log_error("disable global lock in %s before enable in %s", + gl_lsname_sanlock, ls->name); + return -EINVAL; + } + + if ((act->op == LD_OP_DISABLE) && (act->flags & LD_AF_EX_DISABLE)) { + rv = lm_ex_disable_gl_sanlock(ls); + goto out; + } + + rv = lm_able_gl_sanlock(ls, act->op == LD_OP_ENABLE); +out: + return rv; +} + +/* + * Go through queued actions, and make lock/unlock calls on the resource + * based on the actions and the existing lock state. + * + * All lock operations sent to the lock manager are non-blocking. + * This is because sanlock does not support lock queueing. + * Eventually we could enhance this to take advantage of lock + * queueing when available (i.e. for the dlm). + * + * act_close_list: list of CLOSE actions, identifying clients that have + * closed/terminated their lvmlockd connection, and whose locks should + * be released. Do not remove these actions from act_close_list. + * + * retry_out: set to 1 if the lock manager said we should retry, + * meaning we should call res_process() again in a short while to retry. 
+ */ + +static void res_process(struct lockspace *ls, struct resource *r, + struct list_head *act_close_list, int *retry_out) +{ + struct action *act, *safe, *act_close; + struct lock *lk; + int lm_retry; + int rv; + + /* + * handle version updates for ex locks + * (new version will be written by unlock) + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if (act->op == LD_OP_UPDATE) { + rv = res_update(ls, r, act); + act->result = rv; + list_del(&act->list); + add_client_result(act); + } + } + + /* + * handle explicit unlock actions + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if ((act->op == LD_OP_LOCK) && + (act->mode == LD_LK_IV || act->mode == LD_LK_NL)) { + act->result = -EINVAL; + list_del(&act->list); + add_client_result(act); + } + + if (act->op == LD_OP_LOCK && act->mode == LD_LK_UN) { + rv = res_unlock(ls, r, act); + + if (rv == -ENOENT && (act->flags & LD_AF_UNLOCK_CANCEL)) + rv = res_cancel(ls, r, act); + + /* + * possible unlock results: + * 0: unlock succeeded + * -ECANCELED: cancel succeeded + * -ENOENT: nothing to unlock or cancel + */ + + act->result = rv; + list_del(&act->list); + add_client_result(act); + } + } + + /* + * handle implicit unlocks due to client exit, + * also clear any outstanding actions for the client + */ + + list_for_each_entry(act_close, act_close_list, list) { + res_unlock(ls, r, act_close); + res_cancel(ls, r, act_close); + } + + /* + * handle freeing a lock for an lv that has been removed + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if (act->op == LD_OP_FREE && act->rt == LD_RT_LV) { + log_debug("S %s R %s free_lv", ls->name, r->name); + rv = free_lv(ls, r); + act->result = rv; + list_del(&act->list); + add_client_result(act); + goto r_free; + + } + } + + /* + * handle enable/disable + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if (act->op == LD_OP_ENABLE || act->op == LD_OP_DISABLE) { + rv = res_able(ls, r, act); + act->result = rv; + 
list_del(&act->list); + add_client_result(act); + } + + if (!rv && act->op == LD_OP_DISABLE) { + log_debug("S %s R %s free disabled", ls->name, r->name); + goto r_free; + } + } + + /* + * transient requests on existing transient locks + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if (act->flags & LD_AF_PERSISTENT) + continue; + + lk = find_lock_client(r, act->client_id); + if (!lk) + continue; + + if (lk->mode != act->mode) { + /* convert below */ + /* + act->result = -EEXIST; + list_del(&act->list); + add_client_result(act); + */ + continue; + } else { + /* success */ + act->result = -EALREADY; + list_del(&act->list); + add_client_result(act); + } + } + + /* + * persistent requests on existing persistent locks + * + * persistent locks are not owned by a client, so any + * existing with matching mode satisfies a request. + * only one persistent lock is kept on a resource. + * a single "unowned" persistent lock satisfies + * any/multiple client requests for a persistent lock. + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if (!(act->flags & LD_AF_PERSISTENT)) + continue; + + lk = find_lock_persistent(r); + if (!lk) + continue; + + if (lk->mode != act->mode) { + /* convert below */ + /* + act->result = -EEXIST; + list_del(&act->list); + add_client_result(act); + */ + continue; + } else { + /* success */ + act->result = -EALREADY; + list_del(&act->list); + add_client_result(act); + } + } + + /* + * transient requests with existing persistent locks + * + * Just grant the transient request and do not + * keep a record of it. Assume that the persistent + * lock will not go away while the transient lock + * is needed. + * + * This would be used when an ex, persistent lv lock + * exists from activation, and then something like + * lvextend asks for a transient ex lock to change + * the lv. The lv could not be unlocked by deactivation + * while the lvextend was running. 
+ * + * The logic here for mixing T/P locks is not general + * support; there are a number of cases where it will + * not work: updating version number (lv locks have + * none), ex locks from multiple clients will not + * conflict, explicit un of the transient lock will fail. + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if (act->flags & LD_AF_PERSISTENT) + continue; + + lk = find_lock_persistent(r); + if (!lk) + continue; + + if ((lk->mode == LD_LK_EX) || + (lk->mode == LD_LK_SH && act->mode == LD_LK_SH)) { + act->result = 0; + list_del(&act->list); + add_client_result(act); + } else { + /* persistent lock is sh, transient request is ex */ + /* TODO: can we remove this case? do a convert here? */ + log_debug("res_process %s existing persistent lock new transient", r->name); + act->result = -EEXIST; + list_del(&act->list); + add_client_result(act); + } + } + + /* + * persistent requests with existing transient locks + * + * If a client requests a P (persistent) lock for a T (transient) + * lock it already holds, we can just change T to P. Fail if the + * same happens for locks from different clients. Changing + * another client's lock from T to P may cause problems + * if that client tries to unlock or update version. + * + * I don't think this P/T combination will be used. + * It might be used if a command was able to take a P + * vg lock, in which case the T vg lock would already + * be held for reading. If the T lock was sh, it would + * be converted to P ex. If the T/P modes matched, the + * lock could just be changed from T to P. + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if (!(act->flags & LD_AF_PERSISTENT)) + continue; + + lk = find_lock_client(r, act->client_id); + if (!lk) + continue; + + if (lk->mode != act->mode) { + /* TODO: convert and change to persistent? 
*/ + log_debug("res_process %s existing transient lock new persistent", r->name); + act->result = -EEXIST; + list_del(&act->list); + add_client_result(act); + } else { + lk->flags |= LD_LF_PERSISTENT; + lk->client_id = 0; + act->result = 0; + list_del(&act->list); + add_client_result(act); + } + } + + /* + * convert mode of existing locks + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if (act->flags & LD_AF_PERSISTENT) + lk = find_lock_persistent(r); + else + lk = find_lock_client(r, act->client_id); + if (!lk) + continue; + + if (lk->mode == act->mode) { + /* should never happen, should be found above */ + log_error("convert same mode"); + continue; + } + + /* convert fails immediately, no EAGAIN retry */ + rv = res_convert(ls, r, lk, act); + act->result = rv; + list_del(&act->list); + add_client_result(act); + } + + /* + * Cases above are all requests addressed by existing locks. + * Below handles the rest. Transient and persistent are + * handled the same, except + * - if mode of existing lock is incompat with requested, + * leave the act on r->actions + * - if r mode is EX, any lock action is blocked, just quit + * + * Retry a lock request that fails due to a lock conflict (-EAGAIN): + * if we have not exceeded max retries and lm sets lm_retry (sanlock + * transient conflicts from shared lock implementation), or r type + * is gl or vg (transient real conflicts we want to hide from command). + * lv lock conflicts won't be transient so don't retry them. 
+ */ + + + if (r->mode == LD_LK_EX) + return; + + /* + * r mode is SH or UN, pass lock-sh actions to lm + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + /* grant in order, so break here */ + if (act->op == LD_OP_LOCK && act->mode == LD_LK_EX) + break; + + if (act->op == LD_OP_LOCK && act->mode == LD_LK_SH) { + lm_retry = 0; + + rv = res_lock(ls, r, act, &lm_retry); + if ((rv == -EAGAIN) && + (act->retries <= act->max_retries) && + (lm_retry || (r->type != LD_RT_LV))) { + /* leave act on list */ + log_debug("S %s R %s res_lock EAGAIN retry", ls->name, r->name); + act->retries++; + *retry_out = 1; + } else { + act->result = rv; + list_del(&act->list); + add_client_result(act); + } + if (rv == -EUNATCH) + goto r_free; + } + } + + /* + * r mode is SH, any ex lock action is blocked, just quit + */ + + if (r->mode == LD_LK_SH) + return; + + /* + * r mode is UN, pass lock-ex action to lm + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if (act->op == LD_OP_LOCK && act->mode == LD_LK_EX) { + lm_retry = 0; + + rv = res_lock(ls, r, act, &lm_retry); + if ((rv == -EAGAIN) && + (act->retries <= act->max_retries) && + (lm_retry || (r->type != LD_RT_LV))) { + /* leave act on list */ + log_debug("S %s R %s res_lock EAGAIN retry", ls->name, r->name); + act->retries++; + *retry_out = 1; + } else { + act->result = rv; + list_del(&act->list); + add_client_result(act); + } + if (rv == -EUNATCH) + goto r_free; + break; + } + } + + return; + +r_free: + /* For the EUNATCH case it may be possible there are queued actions? 
*/ + list_for_each_entry_safe(act, safe, &r->actions, list) { + log_error("S %s R %s res_process r_free cancel %s client %d", + ls->name, r->name, op_str(act->op), act->client_id); + act->result = -ECANCELED; + list_del(&act->list); + add_client_result(act); + } + log_debug("S %s R %s res_process free", ls->name, r->name); + lm_rem_resource(ls, r); + list_del(&r->list); + free_resource(r); +} + +#define LOCKS_EXIST_ANY 1 +#define LOCKS_EXIST_GL 2 +#define LOCKS_EXIST_VG 3 +#define LOCKS_EXIST_LV 4 + +static int for_each_lock(struct lockspace *ls, int locks_do) +{ + struct resource *r; + struct lock *lk; + + list_for_each_entry(r, &ls->resources, list) { + list_for_each_entry(lk, &r->locks, list) { + if (locks_do == LOCKS_EXIST_ANY) + return 1; + + if (locks_do == LOCKS_EXIST_GL && r->type == LD_RT_GL) + return 1; + + if (locks_do == LOCKS_EXIST_VG && r->type == LD_RT_VG) + return 1; + + if (locks_do == LOCKS_EXIST_LV && r->type == LD_RT_LV) + return 1; + } + } + + return 0; +} + +static int clear_locks(struct lockspace *ls, int free_vg) +{ + struct resource *r, *r_safe; + struct lock *lk, *lk_safe; + struct action *act, *act_safe; + uint32_t lk_version; + uint32_t r_version; + int lk_count = 0; + int rv; + + list_for_each_entry_safe(r, r_safe, &ls->resources, list) { + lk_version = 0; + + list_for_each_entry_safe(lk, lk_safe, &r->locks, list) { + lk_count++; + + if (lk->flags & LD_LF_PERSISTENT) + log_error("S %s R %s clear lock persistent", ls->name, r->name); + else + log_error("S %s R %s clear lock client %d", ls->name, r->name, lk->client_id); + + if (lk->version > lk_version) + lk_version = lk->version; + + list_del(&lk->list); + free_lock(lk); + } + + if (r->mode == LD_LK_UN) + goto r_free; + + if ((r->type == LD_RT_GL) && (r->mode == LD_LK_EX)) { + r->version++; + r_version = r->version; + log_debug("S %s R %s clear_locks r_version inc %u", + ls->name, r->name, r_version); + + } else if ((r->type == LD_RT_VG) && (r->mode == LD_LK_EX) && (lk_version > 
r->version)) { + r->version = lk_version; + r_version = r->version; + log_debug("S %s R %s clear_locks r_version new %u", + ls->name, r->name, r_version); + + } else { + r_version = 0; + } + + rv = lm_unlock(ls, r, NULL, r_version, 0, free_vg ? LMUF_FREE_VG : 0); + if (rv < 0) { + /* should never happen */ + log_error("S %s R %s clear_locks free %d lm unlock error %d", + ls->name, r->name, free_vg, rv); + } + + list_for_each_entry_safe(act, act_safe, &r->actions, list) { + log_error("S %s R %s clear_locks cancel %s client %d", + ls->name, r->name, op_str(act->op), act->client_id); + act->result = -ECANCELED; + list_del(&act->list); + add_client_result(act); + } + r_free: + log_debug("S %s R %s free", ls->name, r->name); + lm_rem_resource(ls, r); + list_del(&r->list); + free_resource(r); + } + + return lk_count; +} + +/* + * find and return the resource that is referenced by the action + * - there is a single gl resource per lockspace + * - there is a single vg resource per lockspace + * - there can be many lv resources per lockspace, compare names + */ + +static struct resource *find_resource_act(struct lockspace *ls, + struct action *act, + int nocreate) +{ + struct resource *r; + + list_for_each_entry(r, &ls->resources, list) { + if (r->type != act->rt) + continue; + + if (r->type == LD_RT_GL && act->rt == LD_RT_GL) + return r; + + if (r->type == LD_RT_VG && act->rt == LD_RT_VG) + return r; + + if (r->type == LD_RT_LV && act->rt == LD_RT_LV && + !strcmp(r->name, act->lv_name)) + return r; + } + + if (nocreate) + return NULL; + + if (!(r = alloc_resource())) + return NULL; + + r->type = act->rt; + + r->mode = LD_LK_UN; + + if (r->type == LD_RT_GL) + strncpy(r->name, R_NAME_GL, MAX_NAME); + else if (r->type == LD_RT_VG) + strncpy(r->name, R_NAME_VG, MAX_NAME); + else if (r->type == LD_RT_LV) + strncpy(r->name, act->lv_name, MAX_NAME); + + list_add_tail(&r->list, &ls->resources); + + return r; +} + +static void free_ls_resources(struct lockspace *ls) +{ + struct 
resource *r, *r_safe; + + list_for_each_entry_safe(r, r_safe, &ls->resources, list) { + lm_rem_resource(ls, r); + list_del(&r->list); + free_resource(r); + } +} + +/* + * Process actions queued for this lockspace by + * client_recv_action / add_lock_action. + * + * The lockspace_thread can touch its own ls struct without holding + * lockspaces_mutex until it sets ls->thread_done, after which it + * cannot touch ls without holding lockspaces_mutex. + */ + +#define LOCK_RETRY_MS 1000 /* milliseconds to delay between retry */ + +static void *lockspace_thread_main(void *arg_in) +{ + struct lockspace *ls = arg_in; + struct resource *r, *r2; + struct action *add_act, *act, *safe; + struct list_head tmp_act; + struct list_head act_close; + int free_vg = 0; + int error = 0; + int adopt_flag = 0; + int wait_flag = 0; + int retry; + int rv; + + INIT_LIST_HEAD(&act_close); + + /* first action may be client add */ + pthread_mutex_lock(&ls->mutex); + act = NULL; + add_act = NULL; + if (!list_empty(&ls->actions)) { + act = list_first_entry(&ls->actions, struct action, list); + if (act->op == LD_OP_START) { + add_act = act; + list_del(&add_act->list); + + if (add_act->flags & LD_AF_WAIT) + wait_flag = 1; + if (add_act->flags & LD_AF_ADOPT) + adopt_flag = 1; + } + } + pthread_mutex_unlock(&ls->mutex); + + log_debug("S %s lm_add_lockspace %s wait %d adopt %d", + ls->name, lm_str(ls->lm_type), wait_flag, adopt_flag); + + if (add_act && !wait_flag) { + /* send partial join result back to client */ + add_act->result = 0; + add_client_result(add_act); + add_act = NULL; + } + + /* the lm join can take a while */ + + error = lm_add_lockspace(ls, add_act, adopt_flag); + + log_debug("S %s lm_add_lockspace done %d", ls->name, error); + + if (ls->sanlock_gl_enabled && gl_lsname_sanlock[0] && + strcmp(ls->name, gl_lsname_sanlock)) + sanlock_gl_dup = 1; + + if (add_act) { + /* send final join result back to client */ + add_act->result = error; + add_client_result(add_act); + } + + 
pthread_mutex_lock(&ls->mutex); + if (error) { + ls->thread_stop = 1; + ls->create_fail = 1; + } else { + ls->create_done = 1; + } + pthread_mutex_unlock(&ls->mutex); + + if (error) + goto out_act; + + while (1) { + pthread_mutex_lock(&ls->mutex); + while (!ls->thread_work) { + if (ls->thread_stop) { + pthread_mutex_unlock(&ls->mutex); + goto out_rem; + } + pthread_cond_wait(&ls->cond, &ls->mutex); + } + + /* client thread queues actions on ls->actions, we move + ls->actions to r->actions, then process the resources */ + + while (1) { + if (list_empty(&ls->actions)) { + ls->thread_work = 0; + break; + } + + act = list_first_entry(&ls->actions, struct action, list); + + if (sanlock_gl_dup && ls->sanlock_gl_enabled) + act->flags |= LD_AF_DUP_GL_LS; + + if (act->op == LD_OP_STOP) { + ls->thread_work = 0; + break; + } + + if (act->op == LD_OP_FREE && act->rt == LD_RT_VG) { + /* vgremove */ + log_debug("S %s checking for lockspace hosts", ls->name); + rv = lm_hosts(ls, 1); + if (rv) { + /* + * Checking for hosts here in addition to after the + * main loop allows vgremove to fail and be rerun + * after the ls is stopped on other hosts. + */ + log_error("S %s lockspace hosts %d", ls->name, rv); + list_del(&act->list); + act->result = -EBUSY; + add_client_result(act); + continue; + } + ls->thread_work = 0; + ls->thread_stop = 1; + free_vg = 1; + break; + } + + list_del(&act->list); + + /* applies to all resources */ + if (act->op == LD_OP_CLOSE) { + list_add(&act->list, &act_close); + continue; + } + + /* + * Find the specific resource this action refers to; + * creates resource if not found. + */ + + r = find_resource_act(ls, act, (act->op == LD_OP_FREE) ? 1 : 0); + if (!r) { + act->result = (act->op == LD_OP_FREE) ? 
-ENOENT : -ENOMEM; + add_client_result(act); + continue; + } + + list_add_tail(&act->list, &r->actions); + + log_debug("S %s R %s action %s %s", ls->name, r->name, + op_str(act->op), mode_str(act->mode)); + } + pthread_mutex_unlock(&ls->mutex); + + retry = 0; + + list_for_each_entry_safe(r, r2, &ls->resources, list) + res_process(ls, r, &act_close, &retry); + + list_for_each_entry_safe(act, safe, &act_close, list) { + list_del(&act->list); + free_action(act); + } + + if (retry) { + ls->thread_work = 1; + usleep(LOCK_RETRY_MS * 1000); + } + } + +out_rem: + log_debug("S %s stopping", ls->name); + + /* + * For sanlock, we need to unlock any existing locks + * before removing the lockspace, otherwise the sanlock + * daemon will kill us when the lockspace goes away. + * For dlm, we leave with force, so all locks will + * automatically be dropped when we leave the lockspace, + * so unlocking all before leaving could be skipped. + * + * Blindly dropping all existing locks must only be + * allowed in emergency/force situations, otherwise it's + * obviously dangerous, since the lock holders are still + * operating under the assumption that they hold the lock. + * + * For vgremove of a sanlock vg, the vg lock will be held, + * and possibly the gl lock if this vg holds the gl. + * sanlock vgremove wants to unlock-rename these locks. + */ + + log_debug("S %s clearing locks", ls->name); + + rv = clear_locks(ls, free_vg); + + /* + * Tell any other hosts in the lockspace to leave it + * before we remove it (for vgremove). We do this + * before leaving the lockspace ourself because we + * need to be in the lockspace to see others. + */ + + if (free_vg) { + log_debug("S %s checking for lockspace hosts", ls->name); + rv = lm_hosts(ls, 1); + if (rv) + log_error("S %s other lockspace hosts %d", ls->name, rv); + } + + /* + * Leave the lockspace. 
+ */ + + rv = lm_rem_lockspace(ls, NULL, free_vg); + + log_debug("S %s rem_lockspace done %d", ls->name, rv); + +out_act: + /* + * Move remaining actions to results; this will usually (always?) + * be only the stop action. + */ + INIT_LIST_HEAD(&tmp_act); + + pthread_mutex_lock(&ls->mutex); + list_for_each_entry_safe(act, safe, &ls->actions, list) { + if (act->op == LD_OP_FREE) + act->result = 0; + else if (act->op == LD_OP_STOP) + act->result = 0; + else + act->result = -ENOLS; + list_del(&act->list); + list_add_tail(&act->list, &tmp_act); + } + pthread_mutex_unlock(&ls->mutex); + + pthread_mutex_lock(&client_mutex); + list_for_each_entry_safe(act, safe, &tmp_act, list) { + list_del(&act->list); + list_add_tail(&act->list, &client_results); + } + pthread_cond_signal(&client_cond); + pthread_mutex_unlock(&client_mutex); + + pthread_mutex_lock(&lockspaces_mutex); + ls->thread_done = 1; + pthread_mutex_unlock(&lockspaces_mutex); + + /* + * worker_thread will join this thread, and move the + * ls struct from lockspaces list to lockspaces_inactive. 
+ */ + pthread_mutex_lock(&worker_mutex); + worker_wake = 1; + pthread_cond_signal(&worker_cond); + pthread_mutex_unlock(&worker_mutex); + + return NULL; +} + +static void process_local_ls(struct lockspace *ls) +{ + struct resource *r = list_first_entry(&ls->resources, struct resource, list); + struct action *act, *act_safe; + struct lock *lk; + int prev_mode; + int result; + + list_for_each_entry_safe(act, act_safe, &ls->actions, list) { + if (act->op != LD_OP_LOCK) + continue; + if (act->mode != LD_LK_UN) + continue; + + result = -ENOENT; + + list_for_each_entry(lk, &r->locks, list) { + if (lk->client_id != act->client_id) + continue; + list_del(&lk->list); + free_lock(lk); + result = 0; + break; + } + + act->result = result; + list_del(&act->list); + add_client_result(act); + } + + prev_mode = LD_LK_UN; + + if (!list_empty(&r->locks)) { + lk = list_first_entry(&r->locks, struct lock, list); + if (lk->mode == LD_LK_EX) + return; + + /* sanity check */ + if (lk->mode != LD_LK_SH) { + log_error("process_local_ls bad lk mode %d", lk->mode); + return; + } + + prev_mode = LD_LK_SH; + } + + /* grant lock requests until we reach one that's one not compat with prev_mode */ + + list_for_each_entry_safe(act, act_safe, &ls->actions, list) { + + if (act->mode == LD_LK_EX && prev_mode == LD_LK_UN) { + /* grant it and return because no more can be granted */ + + if (!(lk = alloc_lock())) + return; + + lk->client_id = act->client_id; + lk->mode = LD_LK_EX; + list_add(&lk->list, &r->locks); + + act->result = 0; + list_del(&act->list); + add_client_result(act); + return; + + } else if (act->mode == LD_LK_EX && prev_mode == LD_LK_SH) { + + /* we'll process this act and try to grant it the + next we come through here. 
*/ + + return; + + } else if (act->mode == LD_LK_SH) { + prev_mode = LD_LK_SH; + + /* grant it and continue */ + + if (!(lk = alloc_lock())) + return; + + lk->client_id = act->client_id; + lk->mode = LD_LK_SH; + list_add_tail(&lk->list, &r->locks); + + act->result = 0; + list_del(&act->list); + add_client_result(act); + } + } +} + +static void purge_local_client(uint32_t client_id) +{ + struct lockspace *ls; + struct resource *r; + struct lock *lk, *lk_safe; + struct action *act, *act_safe; + + list_for_each_entry(ls, &local_vgs, list) { + r = list_first_entry(&ls->resources, struct resource, list); + + list_for_each_entry_safe(lk, lk_safe, &r->locks, list) { + if (lk->client_id != client_id) + continue; + list_del(&lk->list); + free_lock(lk); + } + + list_for_each_entry_safe(act, act_safe, &ls->actions, list) { + if (act->client_id != client_id) + continue; + list_del(&act->list); + free_action(act); + } + } +} + +static void *local_thread_main(void *arg_in) +{ + struct lockspace *ls; + struct action *act, *act_safe; + + while (1) { + pthread_mutex_lock(&local_thread_mutex); + while (!local_thread_work) { + if (local_thread_stop) { + pthread_mutex_unlock(&local_thread_mutex); + goto out; + } + pthread_cond_wait(&local_thread_cond, &local_thread_mutex); + } + + /* close actions: clear all locks and actions in all lockspaces for client */ + list_for_each_entry_safe(act, act_safe, &local_thread_actions, list) { + if (act->op != LD_OP_CLOSE) + continue; + purge_local_client(act->client_id); + list_del(&act->list); + free_action(act); + } + + list_for_each_entry(ls, &local_vgs, list) { + if (list_empty(&ls->actions)) + continue; + process_local_ls(ls); + } + + local_thread_work = 0; + pthread_mutex_unlock(&local_thread_mutex); + } +out: + return NULL; +} + +int lockspaces_empty(void) +{ + int rv; + pthread_mutex_lock(&lockspaces_mutex); + rv = list_empty(&lockspaces); + pthread_mutex_unlock(&lockspaces_mutex); + return rv; +} + +/* + * lockspaces_mutex is locked + * + 
* When duplicate sanlock global locks have been seen, + * this function has a secondary job of counting the + * number of lockspaces that exist with the gl enabled, + * with the side effect of setting sanlock_gl_dup back to + * zero when the duplicates have been removed/disabled. + */ + +static struct lockspace *find_lockspace_name(char *ls_name) +{ + struct lockspace *ls_found = NULL; + struct lockspace *ls; + int gl_count = 0; + + list_for_each_entry(ls, &lockspaces, list) { + if (!strcmp(ls->name, ls_name)) + ls_found = ls; + + if (!sanlock_gl_dup && ls_found) + return ls_found; + + if (sanlock_gl_dup && ls->sanlock_gl_enabled) + gl_count++; + } + + /* this is the side effect we want from this function */ + if (sanlock_gl_dup && gl_count < 2) + sanlock_gl_dup = 0; + + return ls_found; +} + +/* local_thread_mutex is locked */ +static struct lockspace *find_local_vg(const char *name, const char *uuid) +{ + struct lockspace *ls; + + list_for_each_entry(ls, &local_vgs, list) { + if (name && name[0] && !strcmp(ls->vg_name, name)) + return ls; + if (uuid && uuid[0] && !strcmp(ls->vg_uuid, uuid)) + return ls; + } + return NULL; +} + +/* + * vgcreate/vgremove of local vgs do add_local/rem_local which + * updates local_vgs on the local host. Other hosts' local_vgs + * are updated with these changes asynchronously when they see + * the n_version change in the global lock lvb, and do + * update_local_vgs. + * + * So, the global lock n_version and update_local_vgs is about + * asyncronous propagation of add_local/rem_local to other hosts. + * Because these are local vgs, they are not used concurrently + * by multiple hosts, but will be used only by the host in the + * vg's system_id, which is doing the add_local/rem_local. + * + * A local vg created on host1 does not need to be immediately + * usable on host2, and is not locked between hosts anyway. + * So, returning a not found error on host2 for a while will + * be ok. 
Once node2 asynchronously updates local_vgs, it + * would know about a new local vg created on host1. Then + * lockd_vg on this vg would change from "not found" ENOLS + * (as above) to -EOTHERVG (or ELOCALVG if no sysid is set, + * but hosts shouldn't be actively sharing a vg with no + * lock_type, so an async delay in this case is not a problem.) + */ + +/* local_thread_mutex is locked */ +static void add_local_vg(const char *vg_name, const char *vg_uuid, const char *vg_sysid) +{ + struct lockspace *ls; + struct resource *r; + + /* not really a lockspace, we're just reusing the struct */ + + if (!vg_name || !vg_uuid || !vg_name[0] || !vg_uuid[0]) { + log_error("add_local_vg incomplete %s %s", + vg_name ? vg_name : "no-name", + vg_uuid ? vg_uuid : "no-uuid"); + + return; + } + + if ((ls = find_local_vg(vg_name, vg_uuid))) { + if (vg_sysid && ls->vg_sysid[0] && !strcmp(vg_sysid, "none")) { + log_debug("add_local_vg %s %s clear sysid", vg_name, vg_uuid); + memset(&ls->vg_sysid, 0, MAX_NAME); + } else if (vg_sysid && strcmp(ls->vg_sysid, vg_sysid)) { + log_debug("add_local_vg %s %s update %s", vg_name, vg_uuid, vg_sysid); + strncpy(ls->vg_sysid, vg_sysid, MAX_NAME); + } + return; + } + + if (!(ls = alloc_lockspace())) + return; + + if (!(r = alloc_resource())) { + free(ls); + return; + } + + strncpy(ls->vg_name, vg_name, MAX_NAME); + strncpy(ls->vg_uuid, vg_uuid, 64); + strncpy(ls->vg_sysid, vg_sysid, MAX_NAME); + + r->type = LD_RT_VG; + r->mode = LD_LK_UN; + strncpy(r->name, R_NAME_VG, MAX_NAME); + list_add_tail(&r->list, &ls->resources); + + list_add(&ls->list, &local_vgs); + + log_debug("add_local_vg %s %s %s", vg_name, vg_uuid, vg_sysid ?: ""); +} + +/* local_thread_mutex is locked */ +static void rem_local_vg(const char *vg_name, const char *vg_uuid) +{ + struct lockspace *ls; + struct resource *r; + struct lock *lk, *lk_safe; + struct action *act, *act_safe; + + log_debug("rem_local_vg %s %s", vg_name, vg_uuid); + + if (!(ls = find_local_vg(vg_name, vg_uuid))) + 
return; + + r = list_first_entry(&ls->resources, struct resource, list); + + list_for_each_entry_safe(lk, lk_safe, &r->locks, list) { + list_del(&lk->list); + free_lock(lk); + } + + list_del(&r->list); + free_resource(r); + + list_for_each_entry_safe(act, act_safe, &ls->actions, list) { + list_del(&act->list); + free_action(act); + } + + list_del(&ls->list); + free(ls); +} + +static struct lockspace *find_update_vg(struct list_head *head, const char *name, const char *uuid) +{ + struct lockspace *ls; + + list_for_each_entry(ls, head, list) { + if (!strcmp(ls->vg_name, name) && !strcmp(ls->vg_uuid, uuid)) + return ls; + } + return NULL; +} + +/* + * called by worker_thread. the work action is queued when we see that another + * host has changed the global lock n_version, which means they have changed the + * global vg name list, so our local_vgs list may need updating. + * + * Handle the issue where a lot of devices all appear together, + * pvscan is run for each of them to populate lvmetad, each pvscan + * triggers an update_local, and we end up calling this function many + * times in a row. We only really need/want one update_local when all + * the pvscans are done, and this is a rough approximation of that. + * If we're asked to do update_local within one second of the previous run, + * then push it off to the delayed work list, so it will be called in a + * couple seconds. Ignore more update_local actions while a delayed + * update_local action exists. IOW, if we see two quick back to back + * update_local actions, delay the second one for a couple seconds in + * an attempt to buffer more of them which can be eliminated. 
+ */ + +static uint64_t last_update_local; + +static int work_update_local_vgs(void) +{ + struct list_head update_vgs; + daemon_reply reply; + struct dm_config_node *cn; + struct dm_config_node *metadata; + struct lockspace *lls, *uls, *safe; + const char *vg_name; + const char *vg_uuid; + const char *lock_type; + const char *system_id; + int mutex_unlocked = 0; + int rv; + + INIT_LIST_HEAD(&update_vgs); + + if (monotime() - last_update_local <= 1) + return -EAGAIN; + + last_update_local = monotime(); + + /* get a list of all vg uuids from lvmetad */ + + pthread_mutex_lock(&lvmetad_mutex); + reply = daemon_send_simple(lvmetad_handle, "vg_list", + "token = %s", "skip", + NULL); + if (reply.error) { + log_error("vg_list from lvmetad error %d", reply.error); + rv = -EINVAL; + goto destroy; + } + + if (!(cn = dm_config_find_node(reply.cft->root, "volume_groups"))) { + log_error("work_update_local no vgs"); + rv = -EINVAL; + goto destroy; + } + + /* create an update_vgs list of all vg uuids */ + + for (cn = cn->child; cn; cn = cn->sib) { + vg_uuid = cn->key; + + if (!(uls = alloc_lockspace())) { + rv = -ENOMEM; + goto destroy; + } + + strncpy(uls->vg_uuid, vg_uuid, 64); + list_add_tail(&uls->list, &update_vgs); + log_debug("work_update_local %s", vg_uuid); + } + destroy: + daemon_reply_destroy(reply); + + if (rv < 0) + goto out; + + /* get vg_name and system_id for each vg uuid entry in update_vgs */ + + list_for_each_entry(uls, &update_vgs, list) { + reply = daemon_send_simple(lvmetad_handle, "vg_lookup", + "token = %s", "skip", + "uuid = %s", uls->vg_uuid, + NULL); + if (reply.error) { + log_error("vg_lookup from lvmetad error %d", reply.error); + rv = -EINVAL; + goto next; + } + + vg_name = daemon_reply_str(reply, "name", NULL); + if (!vg_name) { + log_error("work_update_local %s no name", uls->vg_uuid); + rv = -EINVAL; + goto next; + } + + strncpy(uls->vg_name, vg_name, MAX_NAME); + + metadata = dm_config_find_node(reply.cft->root, "metadata"); + if (!metadata) { + 
log_error("work_update_local %s name %s no metadata", + uls->vg_uuid, uls->vg_name); + rv = -EINVAL; + goto next; + } + + lock_type = dm_config_find_str(metadata, "metadata/lock_type", NULL); + uls->lm_type = str_to_lm(lock_type); + + system_id = dm_config_find_str(metadata, "metadata/system_id", NULL); + if (system_id) + strncpy(uls->vg_sysid, system_id, MAX_NAME); + next: + daemon_reply_destroy(reply); + + log_debug("work_update_local %s lock_type %s %d sysid %s %s", + uls->vg_name, lock_type ?: "NULL", uls->lm_type, uls->vg_sysid, uls->vg_uuid); + + if (rv < 0 || !vg_name || !metadata) + goto out; + } + pthread_mutex_unlock(&lvmetad_mutex); + mutex_unlocked = 1; + + /* remove local_vgs entries that no longer exist in update_vgs */ + + pthread_mutex_lock(&local_thread_mutex); + + list_for_each_entry_safe(lls, safe, &local_vgs, list) { + uls = find_update_vg(&update_vgs, lls->vg_name, lls->vg_uuid); + if (!uls) { + log_debug("work_update_local remove local_vg %s %s", + lls->vg_name, lls->vg_uuid); + list_del(&lls->list); + free(lls); + + } else if (uls->lm_type != LD_LM_NONE) { + log_debug("work_update_local remove local_vg %s %s new lm_type %d", + lls->vg_name, lls->vg_uuid, uls->lm_type); + list_del(&lls->list); + free(lls); + } + } + + /* add local_vgs entries for any new non-lockd entries in update_vgs */ + + list_for_each_entry_safe(uls, safe, &update_vgs, list) { + if (uls->lm_type != LD_LM_NONE) + continue; + /* add_local_vg doesn't add any that already exist, it may update sysid */ + add_local_vg(uls->vg_name, uls->vg_uuid, uls->vg_sysid); + } + pthread_mutex_unlock(&local_thread_mutex); +out: + list_for_each_entry_safe(uls, safe, &update_vgs, list) { + list_del(&uls->list); + free(uls); + } + + if (!mutex_unlocked) + pthread_mutex_unlock(&lvmetad_mutex); + + return 0; +} + +/* + * We don't use the reply here, so it would be more + * efficient to send without waiting for a reply. 
+ */ + +static void invalidate_lvmetad_vg(struct lockspace *ls) +{ + daemon_reply reply; + + pthread_mutex_lock(&lvmetad_mutex); + reply = daemon_send_simple(lvmetad_handle, "set_vg_info", + "token = %s", "skip", + "uuid = %s", ls->vg_uuid, + "version = %d", 0, + NULL); + pthread_mutex_unlock(&lvmetad_mutex); + daemon_reply_destroy(reply); +} + +/* + * If lvm_<vg_name> is longer than max lockspace name (64) we just ignore the + * extra characters. For sanlock vgs, the name is shortened further to 48 in + * the sanlock code. + */ + +static int vg_ls_name(const char *vg_name, char *ls_name) +{ + if (strlen(vg_name) + 4 > MAX_NAME) { + log_error("vg name too long %s", vg_name); + return -1; + } + + snprintf(ls_name, MAX_NAME, "%s%s", LVM_LS_PREFIX, vg_name); + return 0; +} + +/* TODO: add mutex for gl_lsname_ ? */ + +static int gl_ls_name(char *ls_name) +{ + if (gl_use_dlm) + memcpy(ls_name, gl_lsname_dlm, MAX_NAME); + else if (gl_use_sanlock) + memcpy(ls_name, gl_lsname_sanlock, MAX_NAME); + else { + log_error("gl_ls_name: global lockspace type unknown"); + return -1; + } + return 0; +} + +/* + * When this function returns an error, the caller needs to deal + * with act (in the cases where act exists). 
+ */ + +static int add_lockspace_thread(const char *ls_name, + const char *vg_name, + const char *vg_uuid, + int lm_type, const char *vg_args, + struct action *act) +{ + struct lockspace *ls, *ls2; + struct resource *r; + uint32_t version = 0; + int rv; + + if (act) + version = act->version; + + log_debug("add_lockspace_thread %s %s version %u", + lm_str(lm_type), ls_name, version); + + if (!(ls = alloc_lockspace())) + return -ENOMEM; + + strncpy(ls->name, ls_name, MAX_NAME); + ls->lm_type = lm_type; + + if (act) + ls->start_client_id = act->client_id; + + if (vg_uuid) + strncpy(ls->vg_uuid, vg_uuid, 64); + + if (vg_name) + strncpy(ls->vg_name, vg_name, MAX_NAME); + + if (vg_args) + strncpy(ls->vg_args, vg_args, MAX_ARGS); + + if (act) + ls->host_id = act->host_id; + + if (!(r = alloc_resource())) { + free(ls); + return -ENOMEM; + } + + r->type = LD_RT_VG; + r->mode = LD_LK_UN; + r->version = version; + strncpy(r->name, R_NAME_VG, MAX_NAME); + list_add_tail(&r->list, &ls->resources); + + pthread_mutex_lock(&lockspaces_mutex); + ls2 = find_lockspace_name(ls->name); + if (ls2) { + if (ls2->thread_stop) + rv = -EAGAIN; + else + rv = -EEXIST; + pthread_mutex_unlock(&lockspaces_mutex); + free_resource(r); + free(ls); + return rv; + } + + /* + * act will be null when this lockspace is added automatically/internally + * and not by an explicit client action that wants a result. + */ + if (act) + list_add(&act->list, &ls->actions); + + clear_lockspace_inactive(ls->name); + + list_add_tail(&ls->list, &lockspaces); + pthread_mutex_unlock(&lockspaces_mutex); + + rv = pthread_create(&ls->thread, NULL, lockspace_thread_main, ls); + if (rv < 0) { + pthread_mutex_lock(&lockspaces_mutex); + list_del(&ls->list); + pthread_mutex_unlock(&lockspaces_mutex); + free_resource(r); + free(ls); + return rv; + } + + return 0; +} + +/* + * There is no add_sanlock_global_lockspace or + * rem_sanlock_global_lockspace because with sanlock, + * the global lockspace is one of the vg lockspaces. 
+ */ + +static int add_dlm_global_lockspace(struct action *act) +{ + int rv; + + if (gl_running_dlm) + return -EEXIST; + + gl_running_dlm = 1; + + /* Keep track of whether we automatically added + the global ls, so we know to automatically + remove it. */ + + if (act) + gl_auto_dlm = 0; + else + gl_auto_dlm = 1; + + /* + * There's a short period after which a previous gl lockspace thread + * has set gl_running_dlm = 0, but before its ls struct has been + * deleted, during which this add_lockspace_thread() can fail with + * -EAGAIN. + */ + + rv = add_lockspace_thread(gl_lsname_dlm, NULL, NULL, LD_LM_DLM, NULL, act); + + if (rv < 0) { + log_error("add_dlm_global_lockspace add_lockspace_thread %d", rv); + gl_running_dlm = 0; + gl_auto_dlm = 0; + } + + return rv; +} + +/* + * If dlm gl lockspace is the only one left, then stop it. + * This is not used for an explicit rem_lockspace action from + * the client, only for auto remove. + */ + +static int rem_dlm_global_lockspace(void) +{ + struct lockspace *ls, *ls_gl = NULL; + int others = 0; + int rv = 0; + + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry(ls, &lockspaces, list) { + if (!strcmp(ls->name, gl_lsname_dlm)) { + ls_gl = ls; + continue; + } + if (ls->thread_stop) + continue; + others++; + break; + } + + if (others) { + rv = -EAGAIN; + goto out; + } + + if (!ls_gl) { + rv = -ENOENT; + goto out; + } + + ls = ls_gl; + pthread_mutex_lock(&ls->mutex); + ls->thread_stop = 1; + ls->thread_work = 1; + pthread_cond_signal(&ls->cond); + pthread_mutex_unlock(&ls->mutex); + rv = 0; +out: + pthread_mutex_unlock(&lockspaces_mutex); + return rv; +} + +/* + * When the first dlm lockspace is added for a vg, + * automatically add a separate dlm lockspace for the + * global lock if it hasn't been done explicitly. + * This is to make the dlm global lockspace work similarly to + * the sanlock global lockspace, which is "automatic" by + * nature of being one of the vg lockspaces. 
+ * + * For sanlock, a separate lockspace is not used for + * the global lock, but the gl lock lives in a vg + * lockspace, (although it's recommended to create a + * special vg dedicated to holding the gl). + * + * N.B. for dlm, if this is an add+WAIT action for a vg + * lockspace, and this triggered the automatic addition + * of the global lockspace, then the action may complete + * for the vg ls add, while the gl ls add is still in + * progress. If the caller wants to ensure that the + * gl ls add is complete, they should explicitly add+WAIT + * the gl ls. + * + * If this function returns and error, the caller + * will queue the act with that error for the client. + */ + +static int add_lockspace(struct action *act) +{ + struct lockspace *ls; + char ls_name[MAX_NAME+1]; + int rv; + + if (local_thread_only) { + log_error("add_lockspace not allowed local_thread_only"); + return -EINVAL; + } + + /* + * This should not generally happen, but does happen when a vg + * lock_type is changed from none to sanlock. 
+ */ + pthread_mutex_lock(&local_thread_mutex); + ls = find_local_vg(act->vg_name, NULL); + if (ls) { + log_error("add_lockspace vg %s remove matching local_vg", act->vg_name); + list_del(&ls->list); + free_ls_resources(ls); + free(ls); + } + pthread_mutex_unlock(&local_thread_mutex); + + memset(ls_name, 0, sizeof(ls_name)); + + if (act->rt == LD_RT_GL) { + if (gl_use_dlm) { + rv = add_dlm_global_lockspace(act); + return rv; + } else { + return -EINVAL; + } + } + + if (act->rt == LD_RT_VG) { + if (gl_use_dlm) { + rv = add_dlm_global_lockspace(NULL); + if (rv < 0 && rv != -EEXIST) + return rv; + } + + vg_ls_name(act->vg_name, ls_name); + + rv = add_lockspace_thread(ls_name, act->vg_name, act->vg_uuid, + act->lm_type, act->vg_args, + act); + + if (rv) + log_error("add_lockspace %s add_lockspace_thread %d", ls_name, rv); + return rv; + } + + log_error("add_lockspace bad type %d", act->rt); + return -1; +} + +/* + * vgchange --lock-stop vgname will lock the vg ex, then send a stop, + * so we exect to find the ex vg lock held here, and will automatically + * unlock it when stopping. + * + * Should we attempt to stop the lockspace containing the gl last? 
+ */ + +static int rem_lockspace(struct action *act) +{ + struct lockspace *ls; + char ls_name[MAX_NAME+1]; + int force = act->flags & LD_AF_FORCE; + int rt = act->rt; + + if (act->rt == LD_RT_GL && act->lm_type != LD_LM_DLM) + return -EINVAL; + + memset(ls_name, 0, sizeof(ls_name)); + + if (act->rt == LD_RT_GL) + gl_ls_name(ls_name); + else + vg_ls_name(act->vg_name, ls_name); + + pthread_mutex_lock(&lockspaces_mutex); + ls = find_lockspace_name(ls_name); + if (!ls) { + pthread_mutex_unlock(&lockspaces_mutex); + return -ENOLS; + } + + pthread_mutex_lock(&ls->mutex); + if (ls->thread_stop) { + pthread_mutex_unlock(&ls->mutex); + pthread_mutex_unlock(&lockspaces_mutex); + return -ESTALE; + } + + if (!force && for_each_lock(ls, LOCKS_EXIST_LV)) { + pthread_mutex_unlock(&ls->mutex); + pthread_mutex_unlock(&lockspaces_mutex); + return -EBUSY; + } + ls->thread_work = 1; + ls->thread_stop = 1; + if (act) + list_add_tail(&act->list, &ls->actions); + pthread_cond_signal(&ls->cond); + pthread_mutex_unlock(&ls->mutex); + pthread_mutex_unlock(&lockspaces_mutex); + + /* + * If the dlm global lockspace was automatically added when + * the first dlm vg lockspace was added, then reverse that + * by automatically removing the dlm global lockspace when + * the last dlm vg lockspace is removed. + */ + + if (rt == LD_RT_VG && gl_use_dlm && gl_auto_dlm) + rem_dlm_global_lockspace(); + + return 0; +} + +/* + * count how many lockspaces started by this client are still starting; + * the client will use this to wait for all its start operations to finish + * (START_WAIT). 
+ */ + +static int count_lockspace_starting(uint32_t client_id) +{ + struct lockspace *ls; + int count = 0; + int done = 0; + int fail = 0; + + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry(ls, &lockspaces, list) { + if (ls->start_client_id != client_id) + continue; + + if (!ls->create_done && !ls->create_fail) { + count++; + continue; + } + + if (ls->create_done) + done++; + if (ls->create_fail) + fail++; + } + pthread_mutex_unlock(&lockspaces_mutex); + + log_debug("count_lockspace_starting client %u count %d done %d fail %d", + client_id, count, done, fail); + + return count; +} + +/* lockspaces_mutex is held */ +static struct lockspace *find_lockspace_inactive(char *ls_name) +{ + struct lockspace *ls; + + list_for_each_entry(ls, &lockspaces_inactive, list) { + if (!strcmp(ls->name, ls_name)) + return ls; + } + + return NULL; +} + +/* lockspaces_mutex is held */ +static void clear_lockspace_inactive(char *ls_name) +{ + struct lockspace *ls; + + ls = find_lockspace_inactive(ls_name); + if (ls) { + list_del(&ls->list); + free(ls); + } +} + +static void free_lockspaces_inactive(void) +{ + struct lockspace *ls, *safe; + + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry_safe(ls, safe, &lockspaces_inactive, list) { + list_del(&ls->list); + free(ls); + } + pthread_mutex_unlock(&lockspaces_mutex); +} + +/* + * Loop through all lockspaces, and: + * - if do_stop is set, stop any that are not stopped + * - if do_free is set, join any that are done stopping (and free ls) + * + * do_stop will not stop an ls with lv locks unless force is set. + * + * This function does not block or wait for anything. 
+ * + * do_stop (no do_free): + * returns count of lockspaces that need stop (have locks and no force) + * + * do_free (no do_stop): + * returns count of lockspaces that are stopped and need freeing + * + * do_stop and do_free: + * returns sum of the previous two + */ + +static int for_each_lockspace(int do_stop, int do_free, int do_force) +{ + struct lockspace *ls, *safe; + int need_stop = 0; + int need_free = 0; + int stop_count = 0; + int free_count = 0; + int done; + int stop; + + pthread_mutex_lock(&lockspaces_mutex); + + if (do_stop) { + list_for_each_entry(ls, &lockspaces, list) { + + pthread_mutex_lock(&ls->mutex); + if (ls->thread_stop) { + pthread_mutex_unlock(&ls->mutex); + continue; + } + + if (!do_force && for_each_lock(ls, LOCKS_EXIST_ANY)) { + need_stop++; + } else { + ls->thread_work = 1; + ls->thread_stop = 1; + pthread_cond_signal(&ls->cond); + stop_count++; + } + pthread_mutex_unlock(&ls->mutex); + } + } + + if (do_free) { + list_for_each_entry_safe(ls, safe, &lockspaces, list) { + + pthread_mutex_lock(&ls->mutex); + done = ls->thread_done; + stop = ls->thread_stop; + pthread_mutex_unlock(&ls->mutex); + + /* This ls has locks and force is not set. */ + if (!stop) + continue; + + /* + * Once thread_done is set, we know that the lockspace_thread + * will not be using/touching the ls struct. Any other + * thread touches the ls struct under lockspaces_mutex. 
+ */ + if (done) { + pthread_join(ls->thread, NULL); + list_del(&ls->list); + + /* TODO: remove this if unneeded */ + if (!list_empty(&ls->actions)) + log_error("TODO: free ls actions"); + + free_ls_resources(ls); + list_add(&ls->list, &lockspaces_inactive); + free_count++; + } else { + need_free++; + } + } + } + + if (list_empty(&lockspaces)) { + if (!gl_type_static) { + gl_use_dlm = 0; + gl_use_sanlock = 0; + } + } + pthread_mutex_unlock(&lockspaces_mutex); + + if (stop_count || free_count || need_stop || need_free) { + log_debug("for_each_lockspace do_stop %d do_free %d " + "stop_count %d free_count %d need_stop %d need_free %d", + do_stop, do_free, stop_count, free_count, need_stop, need_free); + } + + return need_stop + need_free; +} + +/* + * This is only called when the daemon is exiting so the sleep/retry + * loop doesn't have any adverse impact. + */ + +static void for_each_lockspace_retry(int do_stop, int do_free, int do_force) +{ + int count; + + while (1) { + count = for_each_lockspace(do_stop, do_free, do_force); + if (!count) + break; + + log_debug("for_each_lockspace_retry remaining %d", count); + sleep(1); + } +} + +static int work_init_vg(struct action *act) +{ + struct lockspace *ls; + char ls_name[MAX_NAME+1]; + int rv = 0; + + memset(ls_name, 0, sizeof(ls_name)); + + vg_ls_name(act->vg_name, ls_name); + + /* + * The max dlm ls name is 64 and the max sanlock ls name is 48. So, + * after the "lvm_" prefix, only the first 60/44 characters of the VG + * name are used for the lockspace name. This will cause a collision + * in the lock manager if two different VG names have the first 60/44 + * chars in common. At the time of vgcreate (here), check if any other + * VG's are known that would collide. If the collision is not detected + * at vgcreate time, it will be detected at start time and add_lockspace + * will fail for the second of the two matching ls names. 
+ */ + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry(ls, &lockspaces, list) { + if ((ls->lm_type == LD_LM_SANLOCK) && !strncmp(ls->name, ls_name, 48)) { + rv = -EEXIST; + break; + } + if ((ls->lm_type == LD_LM_DLM) && !strcmp(ls->name, ls_name)) { + rv = -EEXIST; + break; + } + } + pthread_mutex_unlock(&lockspaces_mutex); + + if (rv == -EEXIST) { + log_error("Existing lockspace name %s matches new %s VG names %s %s", + ls->name, ls_name, ls->vg_name, act->vg_name); + return rv; + } + + if (act->lm_type == LD_LM_SANLOCK) + rv = lm_init_vg_sanlock(ls_name, act->vg_name, act->flags, act->vg_args); + else if (act->lm_type == LD_LM_DLM) + rv = lm_init_vg_dlm(ls_name, act->vg_name, act->flags, act->vg_args); + else + rv = -EINVAL; + + return rv; +} + +static void work_test_gl(void) +{ + struct lockspace *ls; + int is_enabled = 0; + + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry(ls, &lockspaces, list) { + if (ls->lm_type != LD_LM_SANLOCK) + continue; + + pthread_mutex_lock(&ls->mutex); + if (ls->create_done && !ls->thread_stop) { + is_enabled = lm_gl_is_enabled(ls); + if (is_enabled) { + log_debug("S %s worker found gl_is_enabled", ls->name); + strncpy(gl_lsname_sanlock, ls->name, MAX_NAME); + } + } + pthread_mutex_unlock(&ls->mutex); + + if (is_enabled) + break; + } + + if (!is_enabled) + log_debug("worker found no gl_is_enabled"); + pthread_mutex_unlock(&lockspaces_mutex); +} + +static int work_init_lv(struct action *act) +{ + struct lockspace *ls; + char ls_name[MAX_NAME+1]; + char vg_args[MAX_ARGS]; + char lv_args[MAX_ARGS]; + int lm_type = 0; + int rv = 0; + + memset(ls_name, 0, sizeof(ls_name)); + memset(vg_args, 0, MAX_ARGS); + memset(lv_args, 0, MAX_ARGS); + + vg_ls_name(act->vg_name, ls_name); + + pthread_mutex_lock(&lockspaces_mutex); + ls = find_lockspace_name(ls_name); + if (ls) { + lm_type = ls->lm_type; + memcpy(vg_args, ls->vg_args, MAX_ARGS); + } + pthread_mutex_unlock(&lockspaces_mutex); + + if (!ls) { + lm_type = 
act->lm_type; + memcpy(vg_args, act->vg_args, MAX_ARGS); + } + + if (act->lm_type != lm_type) { + log_error("init_lv ls_name %s wrong lm_type %d %d", + ls_name, act->lm_type, lm_type); + return -EINVAL; + } + + if (lm_type == LD_LM_SANLOCK) { + rv = lm_init_lv_sanlock(ls_name, act->vg_name, act->lv_name, + vg_args, lv_args); + + memcpy(act->lv_args, lv_args, MAX_ARGS); + return rv; + + } else if (act->lm_type == LD_LM_DLM) { + return 0; + } else { + log_error("init_lv ls_name %s bad lm_type %d", ls_name, act->lm_type); + return -EINVAL; + } +} + +/* + * When an action is queued for the worker_thread, it is processed right away. + * After processing, some actions need to be retried again in a short while. + * These actions are put on the delayed_list, and the worker_thread will + * process these delayed actions again in SHORT_DELAY_PERIOD. + */ + +#define SHORT_DELAY_PERIOD 2 +#define LONG_DELAY_PERIOD 60 + +static void *worker_thread_main(void *arg_in) +{ + struct list_head delayed_list; + struct timespec ts; + struct action *act, *safe; + uint64_t last_delayed_time = 0; + int delayed_update_local = 0; + int delay_sec = LONG_DELAY_PERIOD; + int rv; + + INIT_LIST_HEAD(&delayed_list); + + while (1) { + pthread_mutex_lock(&worker_mutex); + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_sec += delay_sec; + rv = 0; + act = NULL; + + while (list_empty(&worker_list) && !worker_stop && !worker_wake && !rv) { + rv = pthread_cond_timedwait(&worker_cond, &worker_mutex, &ts); + } + worker_wake = 0; + + if (worker_stop) { + pthread_mutex_unlock(&worker_mutex); + goto out; + } + + if (!list_empty(&worker_list)) { + act = list_first_entry(&worker_list, struct action, list); + list_del(&act->list); + } + pthread_mutex_unlock(&worker_mutex); + + /* + * Do new work actions before processing delayed work actions. 
+ */ + + if (!act) + goto delayed_work; + + if ((act->op == LD_OP_LOCK) && (act->flags & LD_AF_SEARCH_LS)) { + /* + * worker_thread used as a helper to search existing + * sanlock vgs for an enabled gl. + */ + log_debug("work search for gl"); + work_test_gl(); + + /* try again to find a gl lockspace for this act */ + rv = add_lock_action(act); + if (rv < 0) { + act->result = rv; + add_client_result(act); + } + + } else if ((act->op == LD_OP_INIT) && (act->rt == LD_RT_VG)) { + log_debug("work init_vg %s", act->vg_name); + act->result = work_init_vg(act); + add_client_result(act); + + } else if ((act->op == LD_OP_INIT) && (act->rt == LD_RT_LV)) { + log_debug("work init_lv %s/%s", act->vg_name, act->lv_name); + act->result = work_init_lv(act); + add_client_result(act); + + } else if (act->op == LD_OP_UPDATE_LOCAL) { + if (delayed_update_local) { + log_debug("work update_local ignore repeat"); + act->result = 0; + add_client_result(act); + } else { + log_debug("work update_local"); + rv = work_update_local_vgs(); + if (rv == -EAGAIN) { + delayed_update_local = 1; + list_add(&act->list, &delayed_list); + } else { + act->result = 0; + add_client_result(act); + } + } + + } else if (act->op == LD_OP_START_WAIT) { + act->result = count_lockspace_starting(act->client_id); + if (!act->result) + add_client_result(act); + else + list_add(&act->list, &delayed_list); + + } else if (act->op == LD_OP_STOP_ALL) { + act->result = for_each_lockspace(DO_STOP, DO_FREE, (act->flags & LD_AF_FORCE) ? DO_FORCE : NO_FORCE); + if (!act->result || !(act->flags & LD_AF_WAIT)) + add_client_result(act); + else + list_add(&act->list, &delayed_list); + + } else { + log_error("work unknown op %d", act->op); + act->result = -EINVAL; + add_client_result(act); + } + + delayed_work: + /* + * We may want to track retry times per action so that + * we can delay different actions by different amounts. 
+ */ + + if (monotime() - last_delayed_time < SHORT_DELAY_PERIOD) { + delay_sec = 1; + continue; + } + last_delayed_time = monotime(); + + list_for_each_entry_safe(act, safe, &delayed_list, list) { + if (act->op == LD_OP_START_WAIT) { + log_debug("work delayed start_wait for client %u", act->client_id); + act->result = count_lockspace_starting(act->client_id); + if (!act->result) { + list_del(&act->list); + add_client_result(act); + } + + } else if (act->op == LD_OP_UPDATE_LOCAL) { + log_debug("work delayed update_local"); + rv = work_update_local_vgs(); + if (rv == -EAGAIN) + continue; + act->result = 0; + list_del(&act->list); + add_client_result(act); + delayed_update_local = 0; + + } else if (act->op == LD_OP_STOP_ALL) { + log_debug("work delayed stop_all"); + act->result = for_each_lockspace(DO_STOP, DO_FREE, (act->flags & LD_AF_FORCE) ? DO_FORCE : NO_FORCE); + if (!act->result) { + list_del(&act->list); + act->result = 0; + add_client_result(act); + } + } + } + + /* + * This is not explicitly queued work, and not delayed work, + * but lockspace thread cleanup that's needed when a + * lockspace has been stopped/removed or failed to start. 
+ */ + + for_each_lockspace(NO_STOP, DO_FREE, NO_FORCE); + + if (list_empty(&delayed_list)) + delay_sec = LONG_DELAY_PERIOD; + else + delay_sec = 1; + } +out: + list_for_each_entry_safe(act, safe, &delayed_list, list) { + list_del(&act->list); + free_action(act); + } + + pthread_mutex_lock(&worker_mutex); + list_for_each_entry_safe(act, safe, &worker_list, list) { + list_del(&act->list); + free_action(act); + } + pthread_mutex_unlock(&worker_mutex); + return NULL; +} + +static int setup_worker_thread(void) +{ + int rv; + + INIT_LIST_HEAD(&worker_list); + + pthread_mutex_init(&worker_mutex, NULL); + pthread_cond_init(&worker_cond, NULL); + + rv = pthread_create(&worker_thread, NULL, worker_thread_main, NULL); + if (rv) + return -1; + return 0; +} + +static void close_worker_thread(void) +{ + pthread_mutex_lock(&worker_mutex); + worker_stop = 1; + pthread_cond_signal(&worker_cond); + pthread_mutex_unlock(&worker_mutex); + pthread_join(worker_thread, NULL); +} + +/* client_mutex is locked */ +static struct client *find_client_work(void) +{ + struct client *cl; + + list_for_each_entry(cl, &client_list, list) { + if (cl->recv || cl->dead) + return cl; + } + return NULL; +} + +/* client_mutex is locked */ +static struct client *find_client_id(uint32_t id) +{ + struct client *cl; + + list_for_each_entry(cl, &client_list, list) { + if (cl->id == id) + return cl; + } + return NULL; +} + +/* client_mutex is locked */ +static struct client *find_client_pi(int pi) +{ + struct client *cl; + + list_for_each_entry(cl, &client_list, list) { + if (cl->pi == pi) + return cl; + } + return NULL; +} + +/* + * wake up poll() because we have added an fd + * back into pollfd and poll() needs to be restarted + * to recognize it. 
+ */ +static void restart_poll(void) +{ + write(restart_fds[1], "w", 1); +} + +/* poll will take requests from client again, cl->mutex must be held */ +static void client_resume(struct client *cl) +{ + if (cl->dead) + return; + + if (!cl->poll_ignore || cl->fd == -1 || cl->pi == -1) { + /* shouldn't happen */ + log_error("client_resume %d bad state ig %d fd %d pi %d", + cl->id, cl->poll_ignore, cl->fd, cl->pi); + return; + } + + pthread_mutex_lock(&pollfd_mutex); + if (pollfd[cl->pi].fd != POLL_FD_IGNORE) { + log_error("client_resume %d pi %d fd %d not IGNORE", + cl->id, cl->pi, cl->fd); + } + pollfd[cl->pi].fd = cl->fd; + pollfd[cl->pi].events = POLLIN; + pthread_mutex_unlock(&pollfd_mutex); + + restart_poll(); +} + +/* called from client_thread, cl->mutex is held */ +static void client_send_result(struct client *cl, struct action *act) +{ + response res; + char result_flags[128]; + + if (cl->dead) { + log_debug("client send %d skip dead", cl->id); + return; + } + + memset(result_flags, 0, sizeof(result_flags)); + + buffer_init(&res.buffer); + + /* + * EUNATCH is returned when the global lock existed, + * but had been disabled when we tried to lock it, + * so we removed it, and no longer have a gl to lock. + */ + + if (act->result == -EUNATCH) + act->result = -ENOLS; + + /* + * init_vg with dlm|sanlock returns vg_args + * init_lv with sanlock returns lv_args + */ + + if (act->result == -ENOLS) { + /* + * The lockspace could not be found, in which case + * the caller may want to know if any lockspaces exist + * or if lockspaces exist, but not one with the global lock. + * Given this detail, it may be able to procede without + * the lock. 
+ */ + pthread_mutex_lock(&lockspaces_mutex); + if (list_empty(&lockspaces)) + strcat(result_flags, "NO_LOCKSPACES,"); + pthread_mutex_unlock(&lockspaces_mutex); + + if (gl_use_sanlock && !gl_lsname_sanlock[0]) + strcat(result_flags, "NO_GL_LS,"); + else if (gl_use_dlm && !gl_lsname_dlm[0]) + strcat(result_flags, "NO_GL_LS,"); + else + strcat(result_flags, "NO_GL_LS,"); + } + + if (act->flags & LD_AF_LOCAL_LS) + strcat(result_flags, "LOCAL_LS,"); + + if (act->flags & LD_AF_DUP_GL_LS) + strcat(result_flags, "DUP_GL_LS,"); + + if (act->flags & LD_AF_INACTIVE_LS) + strcat(result_flags, "INACTIVE_LS,"); + + if (act->flags & LD_AF_ADD_LS_ERROR) + strcat(result_flags, "ADD_LS_ERROR,"); + + if (act->op == LD_OP_INIT) { + /* + * init is a special case where lock args need + * to be passed back to the client. + */ + const char *vg_args = "none"; + const char *lv_args = "none"; + + if (act->vg_args[0]) + vg_args = act->vg_args; + + if (act->lv_args[0]) + lv_args = act->lv_args; + + log_debug("send %s[%d.%u] %s %s rv %d vg_args %s lv_args %s", + cl->name[0] ? cl->name : "client", cl->pid, cl->id, + op_str(act->op), rt_str(act->rt), + act->result, vg_args ? vg_args : "", lv_args ? lv_args : ""); + + res = daemon_reply_simple("OK", + "op = %d", act->op, + "op_result = %d", act->result, + "lm_result = %d", act->lm_rv, + "vg_lock_args = %s", vg_args, + "lv_lock_args = %s", lv_args, + "result_flags = %s", result_flags[0] ? result_flags : "none", + NULL); + } else { + /* + * A normal reply. + */ + + log_debug("send %s[%d.%u] %s %s rv %d %s %s", + cl->name[0] ? cl->name : "client", cl->pid, cl->id, + op_str(act->op), rt_str(act->rt), + act->result, (act->result == -ENOLS) ? "ENOLS" : "", result_flags); + + res = daemon_reply_simple("OK", + "op = %d", act->op, + "lock_type = %s", lm_str(act->lm_type), + "op_result = %d", act->result, + "lm_result = %d", act->lm_rv, + "result_flags = %s", result_flags[0] ? 
result_flags : "none", + NULL); + } + + buffer_write(cl->fd, &res.buffer); + buffer_destroy(&res.buffer); + + client_resume(cl); +} + +/* called from client_thread */ +static void client_purge(struct client *cl) +{ + struct lockspace *ls; + struct action *act; + + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry(ls, &lockspaces, list) { + if (!(act = alloc_action())) + continue; + + act->op = LD_OP_CLOSE; + act->client_id = cl->id; + + pthread_mutex_lock(&ls->mutex); + if (!ls->thread_stop) { + list_add_tail(&act->list, &ls->actions); + ls->thread_work = 1; + pthread_cond_signal(&ls->cond); + } else { + free_action(act); + } + pthread_mutex_unlock(&ls->mutex); + } + pthread_mutex_unlock(&lockspaces_mutex); + + if (local_thread_also) { + if (!(act = alloc_action())) + return; + + act->op = LD_OP_CLOSE; + act->client_id = cl->id; + + pthread_mutex_lock(&local_thread_mutex); + list_add_tail(&act->list, &local_thread_actions); + local_thread_work = 1; + pthread_cond_signal(&local_thread_cond); + pthread_mutex_unlock(&local_thread_mutex); + } +} + +static int add_lock_action(struct action *act) +{ + struct lockspace *ls = NULL; + char ls_name[MAX_NAME+1]; + + memset(ls_name, 0, sizeof(ls_name)); + + /* Determine which lockspace this action is for, and set ls_name. 
*/ + + if (act->rt == LD_RT_GL && gl_use_sanlock && + (act->op == LD_OP_ENABLE || act->op == LD_OP_DISABLE)) + vg_ls_name(act->vg_name, ls_name); + else if (act->rt == LD_RT_GL) + gl_ls_name(ls_name); + else + vg_ls_name(act->vg_name, ls_name); + + retry: + pthread_mutex_lock(&lockspaces_mutex); + if (ls_name[0]) + ls = find_lockspace_name(ls_name); + if (!ls) { + int ls_inactive = 0; + int ls_create_fail = 0; + + ls = find_lockspace_inactive(ls_name); + if (ls) { + ls_inactive = 1; + ls_create_fail = ls->create_fail; + ls = NULL; + } + pthread_mutex_unlock(&lockspaces_mutex); + + if (act->op == LD_OP_UPDATE && act->rt == LD_RT_VG) { + log_debug("lockspace not found ignored for vg update"); + return -ENOLS; + + } else if (act->flags & LD_AF_SEARCH_LS) { + /* fail if we've already tried searching for the ls */ + log_error("lockspace search repeated %s", ls_name); + return -ENOLS; + + } else if (act->op == LD_OP_LOCK && act->rt == LD_RT_GL && gl_use_sanlock) { + /* gl may have been enabled in an existing vg */ + log_debug("gl lockspace not found check sanlock vgs"); + act->flags |= LD_AF_SEARCH_LS; + add_work_action(act); + return 0; + + } else if (act->op == LD_OP_LOCK && act->rt == LD_RT_GL && gl_use_dlm) { + log_debug("gl lockspace not found add dlm global"); + act->flags |= LD_AF_SEARCH_LS; + act->flags |= LD_AF_WAIT_STARTING; + add_dlm_global_lockspace(NULL); + gl_ls_name(ls_name); + goto retry; + + } else if (act->op == LD_OP_LOCK && act->mode == LD_LK_UN) { + log_debug("lockspace not found ignored for unlock"); + return -ENOLS; + + } else if (act->op == LD_OP_LOCK && act->rt == LD_RT_VG && ls_inactive) { + /* ls has been stopped or previously failed to start */ + log_debug("lockspace inactive create_fail %d %s", + ls_create_fail, ls_name); + act->flags |= LD_AF_INACTIVE_LS; + if (ls_create_fail) + act->flags |= LD_AF_ADD_LS_ERROR; + return -ENOLS; + + } else { + log_error("lockspace not found %s", ls_name); + return -ENOLS; + } + } + + if (act->lm_type == 
LD_LM_NONE) { + /* return to the command the type we are using */ + act->lm_type = ls->lm_type; + } else if (act->lm_type != ls->lm_type) { + /* should not happen */ + log_error("S %s add_lock_action bad lm_type %d ls %d", + ls_name, act->lm_type, ls->lm_type); + return -EINVAL; + } + + pthread_mutex_lock(&ls->mutex); + if (ls->thread_stop) { + pthread_mutex_unlock(&ls->mutex); + pthread_mutex_unlock(&lockspaces_mutex); + log_error("lockspace is stopping %s", ls_name); + return -ESTALE; + } + + if (!ls->create_fail && !ls->create_done && !(act->flags & LD_AF_WAIT_STARTING)) { + pthread_mutex_unlock(&ls->mutex); + pthread_mutex_unlock(&lockspaces_mutex); + log_debug("lockspace is starting %s", ls_name); + return -ESTARTING; + } + + list_add_tail(&act->list, &ls->actions); + ls->thread_work = 1; + pthread_cond_signal(&ls->cond); + pthread_mutex_unlock(&ls->mutex); + pthread_mutex_unlock(&lockspaces_mutex); + + /* lockspace_thread_main / res_process take it from here */ + + return 0; +} + +static int add_local_lock_action(struct lockspace *ls, struct action *act) +{ + act->flags |= LD_AF_LOCAL_LS; + pthread_mutex_lock(&local_thread_mutex); + if (!ls && local_thread_only) + list_add_tail(&act->list, &local_thread_gls->actions); + else if (ls) + list_add_tail(&act->list, &ls->actions); + local_thread_work = 1; + pthread_cond_signal(&local_thread_cond); + pthread_mutex_unlock(&local_thread_mutex); + return 0; +} + +static int str_to_op_rt(const char *req_name, int *op, int *rt) +{ + if (!req_name) + goto out; + + if (!strcmp(req_name, "hello")) { + *op = LD_OP_HELLO; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "quit")) { + *op = LD_OP_QUIT; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "info")) { + *op = LD_OP_DUMP_INFO; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "dump")) { + *op = LD_OP_DUMP_LOG; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "init_vg")) { + *op = LD_OP_INIT; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "init_lv")) { 
+ *op = LD_OP_INIT; + *rt = LD_RT_LV; + return 0; + } + if (!strcmp(req_name, "free_vg")) { + *op = LD_OP_FREE; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "free_lv")) { + *op = LD_OP_FREE; + *rt = LD_RT_LV; + return 0; + } + if (!strcmp(req_name, "start_vg")) { + *op = LD_OP_START; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "stop_vg")) { + *op = LD_OP_STOP; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "start_wait")) { + *op = LD_OP_START_WAIT; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "stop_all")) { + *op = LD_OP_STOP_ALL; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "lock_gl")) { + *op = LD_OP_LOCK; + *rt = LD_RT_GL; + return 0; + } + if (!strcmp(req_name, "lock_vg")) { + *op = LD_OP_LOCK; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "lock_lv")) { + *op = LD_OP_LOCK; + *rt = LD_RT_LV; + return 0; + } + if (!strcmp(req_name, "vg_update")) { + *op = LD_OP_UPDATE; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "enable_gl")) { + *op = LD_OP_ENABLE; + *rt = LD_RT_GL; + return 0; + } + if (!strcmp(req_name, "disable_gl")) { + *op = LD_OP_DISABLE; + *rt = LD_RT_GL; + return 0; + } + if (!strcmp(req_name, "add_local")) { + *op = LD_OP_ADD_LOCAL; + return 0; + } + if (!strcmp(req_name, "rem_local")) { + *op = LD_OP_REM_LOCAL; + return 0; + } + if (!strcmp(req_name, "update_local")) { + *op = LD_OP_UPDATE_LOCAL; + return 0; + } +out: + return -1; +} + +static int str_to_mode(const char *str) +{ + if (!str) + goto out; + if (!strcmp(str, "un")) + return LD_LK_UN; + if (!strcmp(str, "nl")) + return LD_LK_NL; + if (!strcmp(str, "sh")) + return LD_LK_SH; + if (!strcmp(str, "ex")) + return LD_LK_EX; +out: + return LD_LK_IV; +} + +static int str_to_lm(const char *str) +{ + if (!str || !strcmp(str, "none")) + return LD_LM_NONE; + if (!strcmp(str, "sanlock")) + return LD_LM_SANLOCK; + if (!strcmp(str, "dlm")) + return LD_LM_DLM; + return -2; +} + +static uint32_t str_to_opts(const char *str) +{ + 
uint32_t flags = 0; + + if (!str) + goto out; + if (strstr(str, "persistent")) + flags |= LD_AF_PERSISTENT; + if (strstr(str, "unlock_cancel")) + flags |= LD_AF_UNLOCK_CANCEL; + if (strstr(str, "next_version")) + flags |= LD_AF_NEXT_VERSION; + if (strstr(str, "wait")) + flags |= LD_AF_WAIT; + if (strstr(str, "force")) + flags |= LD_AF_FORCE; + if (strstr(str, "ex_disable")) + flags |= LD_AF_EX_DISABLE; + if (strstr(str, "enable")) + flags |= LD_AF_ENABLE; + if (strstr(str, "disable")) + flags |= LD_AF_DISABLE; + if (strstr(str, "update_names")) + flags |= LD_AF_UPDATE_NAMES_VERSION; +out: + return flags; +} + +static int is_other_sysid(struct lockspace *lls) +{ + if (!our_system_id || !lls->vg_sysid[0]) + return 0; + if (!strcmp(lls->vg_sysid, our_system_id)) + return 0; + return 1; +} + + +/* + * dump info + * client_list: each client struct + * local_vgs: each lockspace struct (representing a local vg) + * lockspaces: each lockspace struct + * lockspace actions: each action struct + * lockspace resources: each resource struct + * lockspace resource actions: each action struct + * lockspace resource locks: each lock struct + */ + +static int setup_dump_socket(void) +{ + int s; + + s = socket(AF_LOCAL, SOCK_DGRAM, 0); + if (s < 0) + return s; + + memset(&dump_addr, 0, sizeof(dump_addr)); + dump_addr.sun_family = AF_LOCAL; + strcpy(&dump_addr.sun_path[1], DUMP_SOCKET_NAME); + dump_addrlen = sizeof(sa_family_t) + strlen(dump_addr.sun_path+1) + 1; + + return s; +} + +static int send_dump_buf(int fd, int dump_len) +{ + int pos = 0; + int ret; + +retry: + ret = sendto(fd, dump_buf + pos, dump_len - pos, MSG_DONTWAIT | MSG_NOSIGNAL, + (struct sockaddr *)&dump_addr, dump_addrlen); + if (ret <= 0) + return ret; + + pos += ret; + + if (pos < dump_len) + goto retry; + + return 0; +} + +static int print_client(struct client *cl, const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "pid=%d " + "fd=%d " + "pi=%d " + "id=%u " + 
"name=%s\n", + prefix, + cl->pid, + cl->fd, + cl->pi, + cl->id, + cl->name[0] ? cl->name : "."); +} + +static int print_local_vg(struct lockspace *ls, const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "vg_name=%s " + "vg_uuid=%s " + "vg_sysid=%s\n", + prefix, + ls->vg_name, + ls->vg_uuid, + ls->vg_sysid[0] ? ls->vg_sysid : "."); +} + +static int print_lockspace(struct lockspace *ls, const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "ls_name=%s " + "vg_name=%s " + "vg_uuid=%s " + "vg_sysid=%s " + "vg_args=%s " + "lm_type=%s " + "host_id=%llu " + "names_version=%u " + "create_fail=%d " + "create_done=%d " + "thread_work=%d " + "thread_stop=%d " + "thread_done=%d " + "update_local_vgs=%d " + "update_names_version=%d " + "sanlock_gl_enabled=%d " + "sanlock_gl_dup=%d\n", + prefix, + ls->name, + ls->vg_name, + ls->vg_uuid, + ls->vg_sysid[0] ? ls->vg_sysid : ".", + ls->vg_args, + lm_str(ls->lm_type), + (unsigned long long)ls->host_id, + ls->names_version, + ls->create_fail ? 1 : 0, + ls->create_done ? 1 : 0, + ls->thread_work ? 1 : 0, + ls->thread_stop ? 1 : 0, + ls->thread_done ? 1 : 0, + ls->update_local_vgs ? 1 : 0, + ls->update_names_version ? 1 : 0, + ls->sanlock_gl_enabled ? 1 : 0, + ls->sanlock_gl_dup ? 
1 : 0); +} + +static int print_action(struct action *act, const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "client_id=%u " + "flags=0x%x " + "version=%u " + "op=%s " + "rt=%s " + "mode=%s " + "lm_type=%s " + "result=%d " + "lm_rv=%d\n", + prefix, + act->client_id, + act->flags, + act->version, + op_str(act->op), + rt_str(act->rt), + mode_str(act->mode), + lm_str(act->lm_type), + act->result, + act->lm_rv); +} + +static int print_resource(struct resource *r, const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "name=%s " + "type=%s " + "mode=%s " + "sh_count=%d " + "version=%u\n", + prefix, + r->name, + rt_str(r->type), + mode_str(r->mode), + r->sh_count, + r->version); +} + +static int print_lock(struct lock *lk, const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "mode=%s " + "version=%u " + "flags=0x%x " + "client_id=%u\n", + prefix, + mode_str(lk->mode), + lk->version, + lk->flags, + lk->client_id); +} + +static int dump_info(int *dump_len) +{ + struct client *cl; + struct lockspace *ls; + struct resource *r; + struct lock *lk; + struct action *act; + int len, pos, ret; + int rv = 0; + + memset(dump_buf, 0, sizeof(dump_buf)); + len = sizeof(dump_buf); + pos = 0; + + /* + * clients + */ + + pthread_mutex_lock(&client_mutex); + list_for_each_entry(cl, &client_list, list) { + ret = print_client(cl, "client", pos, len); + if (ret >= len - pos) { + rv = -ENOSPC; + break; + } + pos += ret; + } + pthread_mutex_unlock(&client_mutex); + + if (rv < 0) + return rv; + + /* + * local vgs + */ + + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry(ls, &local_vgs, list) { + ret = print_local_vg(ls, "local_vg", pos, len); + if (ret >= len - pos) { + rv = -ENOSPC; + break; + } + pos += ret; + } + pthread_mutex_unlock(&lockspaces_mutex); + + if (rv < 0) + return rv; + + /* + * lockspaces with their action/resource/lock info 
+ */ + + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry(ls, &lockspaces, list) { + + ret = print_lockspace(ls, "ls", pos, len); + if (ret >= len - pos) { + rv = -ENOSPC; + goto out; + } + pos += ret; + + list_for_each_entry(act, &ls->actions, list) { + ret = print_action(act, "ls_action", pos, len); + if (ret >= len - pos) { + rv = -ENOSPC; + goto out; + } + pos += ret; + } + + list_for_each_entry(r, &ls->resources, list) { + ret = print_resource(r, "r", pos, len); + if (ret >= len - pos) { + rv = -ENOSPC; + goto out; + } + pos += ret; + + list_for_each_entry(lk, &r->locks, list) { + ret = print_lock(lk, "lk", pos, len); + if (ret >= len - pos) { + rv = -ENOSPC; + goto out; + } + pos += ret; + } + + list_for_each_entry(act, &r->actions, list) { + ret = print_action(act, "r_action", pos, len); + if (ret >= len - pos) { + rv = -ENOSPC; + goto out; + } + pos += ret; + } + } + } +out: + pthread_mutex_unlock(&lockspaces_mutex); + + *dump_len = pos; + + return rv; +} + +/* called from client_thread, cl->mutex is held */ +static void client_recv_action(struct client *cl) +{ + request req; + response res; + struct lockspace *lls = NULL; + struct action *act; + const char *cl_name; + const char *vg_name; + const char *vg_uuid; + const char *vg_sysid; + const char *str; + int64_t val; + uint32_t opts = 0; + int result = 0; + int cl_pid; + int op, rt, lm, mode; + int rv; + + buffer_init(&req.buffer); + + rv = buffer_read(cl->fd, &req.buffer); + if (!rv) { + if (errno == ECONNRESET) { + log_debug("client recv %d ECONNRESET", cl->id); + cl->dead = 1; + } else { + log_error("client recv %d buffer_read error %d", cl->id, errno); + } + buffer_destroy(&req.buffer); + client_resume(cl); + return; + } + + req.cft = dm_config_from_string(req.buffer.mem); + if (!req.cft) { + log_error("client recv %d config_from_string error", cl->id); + buffer_destroy(&req.buffer); + client_resume(cl); + return; + } + + str = daemon_request_str(req, "request", NULL); + rv = 
str_to_op_rt(str, &op, &rt); + if (rv < 0) { + log_error("client recv %d bad request name \"%s\"", cl->id, str ? str : ""); + dm_config_destroy(req.cft); + buffer_destroy(&req.buffer); + client_resume(cl); + return; + } + + if (op == LD_OP_HELLO || op == LD_OP_QUIT || + op == LD_OP_DUMP_INFO || op == LD_OP_DUMP_LOG) { + + /* + * TODO: add the client command name to the hello messages + * so it can be saved in cl->name here. + */ + + result = 0; + + if (op == LD_OP_QUIT) { + log_debug("op quit"); + pthread_mutex_lock(&lockspaces_mutex); + if (list_empty(&lockspaces)) { + daemon_quit = 1; + } else { + result = -EBUSY; + } + pthread_mutex_unlock(&lockspaces_mutex); + } + + buffer_init(&res.buffer); + + if (op == LD_OP_DUMP_INFO || op == LD_OP_DUMP_LOG) { + int dump_len = 0; + int fd; + + fd = setup_dump_socket(); + if (fd < 0) + result = fd; + else if (op == LD_OP_DUMP_INFO) + result = dump_info(&dump_len); + else if (op == LD_OP_DUMP_LOG) + result = dump_log(&dump_len); + else + result = -EINVAL; + + res = daemon_reply_simple("OK", + "result = %d", result, + "dump_len = %d", dump_len, + NULL); + if (fd >= 0) { + send_dump_buf(fd, dump_len); + close(fd); + } + + } else { + res = daemon_reply_simple("OK", + "result = %d", result, + "protocol = %s", lvmlockd_protocol, + "version = %d", lvmlockd_protocol_version, + NULL); + } + + buffer_write(cl->fd, &res.buffer); + buffer_destroy(&res.buffer); + dm_config_destroy(req.cft); + buffer_destroy(&req.buffer); + client_resume(cl); + return; + } + + cl_name = daemon_request_str(req, "cmd", NULL); + cl_pid = daemon_request_int(req, "pid", 0); + vg_name = daemon_request_str(req, "vg_name", NULL); + vg_uuid = daemon_request_str(req, "vg_uuid", NULL); + vg_sysid = daemon_request_str(req, "vg_sysid", NULL); + str = daemon_request_str(req, "mode", NULL); + mode = str_to_mode(str); + str = daemon_request_str(req, "opts", NULL); + opts = str_to_opts(str); + str = daemon_request_str(req, "vg_lock_type", NULL); + lm = str_to_lm(str); + + 
if (cl_pid && cl_pid != cl->pid) + log_error("client recv bad message pid %d client %d", cl_pid, cl->pid); + + /* TODO: do this in hello message instead */ + if (!cl->name[0] && cl_name) + strncpy(cl->name, cl_name, MAX_NAME-1); + + if (!our_system_id) { + str = daemon_request_str(req, "our_system_id", NULL); + if (str && strcmp(str, "none")) + our_system_id = strdup(str); + } + + /* + * Detect the common case of a lock op on a local vg and queue + * a reply immediately without going through a thread. + */ + + if (rt == LD_RT_VG && op == LD_OP_LOCK) { + pthread_mutex_lock(&local_thread_mutex); + lls = find_local_vg(vg_name, vg_uuid); + pthread_mutex_unlock(&local_thread_mutex); + if (lls) + result = is_other_sysid(lls) ? -EOTHERVG : -ELOCALVG; + } + + /* + * A local vg with no sysid, accessible from multiple hosts, can be + * modified without coordination if a user is not careful. The best we + * can do is disable the lvmetad cache for these vgs so any problems are + * detected earlier, and not masked by lvmetad caching. + */ + + if (lls && (result == -ELOCALVG) && !lls->vg_sysid[0]) + invalidate_lvmetad_vg(lls); + + if ((result == -EOTHERVG) || (result == -ELOCALVG && !local_thread_also)) { + const char *sysid = lls->vg_sysid[0] ? lls->vg_sysid : "none"; + + log_debug("local vg %s result %d %s sysid %s", vg_name, result, + (result == -EOTHERVG) ? 
"other" : "local", sysid); + + buffer_init(&res.buffer); + res = daemon_reply_simple("OK", + "op_result = %d", result, + "vg_sysid = %s", sysid, + "lock_type = %s", "none", + "result_flags = %s", "LOCAL_LS", + NULL); + buffer_write(cl->fd, &res.buffer); + buffer_destroy(&res.buffer); + dm_config_destroy(req.cft); + buffer_destroy(&req.buffer); + client_resume(cl); + return; + } + + if (!gl_use_dlm && !gl_use_sanlock && (lm > 0)) { + if (lm == LD_LM_DLM) + gl_use_dlm = 1; + else if (lm == LD_LM_SANLOCK) + gl_use_sanlock = 1; + + log_debug("set gl_use_%s", lm_str(lm)); + } + + if (!(act = alloc_action())) { + log_error("No memory for action"); + dm_config_destroy(req.cft); + buffer_destroy(&req.buffer); + client_resume(cl); + return; + } + + act->client_id = cl->id; + act->op = op; + act->rt = rt; + act->mode = mode; + act->flags = opts; + act->lm_type = lm; + + if (vg_name && strcmp(vg_name, "none")) + strncpy(act->vg_name, vg_name, MAX_NAME); + + if (vg_uuid && strcmp(vg_uuid, "none")) + strncpy(act->vg_uuid, vg_uuid, 64); + + if (vg_sysid && strcmp(vg_sysid, "none")) + strncpy(act->vg_sysid, vg_sysid, MAX_NAME); + + str = daemon_request_str(req, "lv_name", NULL); + if (str && strcmp(str, "none")) + strncpy(act->lv_name, str, MAX_NAME); + + val = daemon_request_int(req, "version", 0); + if (val) + act->version = (uint32_t)val; + + str = daemon_request_str(req, "vg_lock_args", NULL); + if (str && strcmp(str, "none")) + strncpy(act->vg_args, str, MAX_ARGS); + + str = daemon_request_str(req, "lv_lock_args", NULL); + if (str && strcmp(str, "none")) + strncpy(act->lv_args, str, MAX_ARGS); + + /* start_vg will include lvmlocal.conf local/host_id here */ + val = daemon_request_int(req, "host_id", 0); + if (val) + act->host_id = val; + + act->max_retries = daemon_request_int(req, "max_retries", DEFAULT_MAX_RETRIES); + + dm_config_destroy(req.cft); + buffer_destroy(&req.buffer); + + log_debug("recv %s[%d.%u] %s %s \"%s\" mode %s flags %x", + cl->name[0] ? 
cl->name : "client", cl->pid, cl->id, + op_str(act->op), rt_str(act->rt), act->vg_name, mode_str(act->mode), opts); + + /* + * local lock on local vg (lls) is done when local locking is enabled. + * local lock on gl is done when local locking is enabled and lockd is not. + */ + if ((local_thread_also && lls) || + (local_thread_only && rt == LD_RT_GL && op == LD_OP_LOCK)) { + add_local_lock_action(lls, act); + return; + } + + switch (act->op) { + case LD_OP_START: + rv = add_lockspace(act); + break; + case LD_OP_STOP: + rv = rem_lockspace(act); + break; + case LD_OP_INIT: + case LD_OP_UPDATE_LOCAL: + case LD_OP_START_WAIT: + case LD_OP_STOP_ALL: + add_work_action(act); + rv = 0; + break; + case LD_OP_LOCK: + case LD_OP_UPDATE: + case LD_OP_ENABLE: + case LD_OP_DISABLE: + case LD_OP_FREE: + rv = add_lock_action(act); + break; + case LD_OP_ADD_LOCAL: + pthread_mutex_lock(&local_thread_mutex); + add_local_vg(act->vg_name, act->vg_uuid, act->vg_sysid); + pthread_mutex_unlock(&local_thread_mutex); + act->result = 0; + add_client_result(act); + rv = 0; + break; + case LD_OP_REM_LOCAL: + pthread_mutex_lock(&local_thread_mutex); + rem_local_vg(act->vg_name, act->vg_uuid); + pthread_mutex_unlock(&local_thread_mutex); + act->result = 0; + add_client_result(act); + rv = 0; + break; + default: + rv = -EINVAL; + }; + + if (rv < 0) { + act->result = rv; + add_client_result(act); + } +} + +static void *client_thread_main(void *arg_in) +{ + struct client *cl; + struct action *act; + + while (1) { + pthread_mutex_lock(&client_mutex); + while (!client_work && list_empty(&client_results)) { + if (client_stop) { + pthread_mutex_unlock(&client_mutex); + goto out; + } + pthread_cond_wait(&client_cond, &client_mutex); + } + + /* + * Send outgoing results back to clients + */ + + if (!list_empty(&client_results)) { + act = list_first_entry(&client_results, struct action, list); + list_del(&act->list); + cl = find_client_id(act->client_id); + pthread_mutex_unlock(&client_mutex); + + if (cl) 
{ + pthread_mutex_lock(&cl->mutex); + client_send_result(cl, act); + pthread_mutex_unlock(&cl->mutex); + } else { + log_debug("no client for result"); + } + free_action(act); + continue; + } + + /* + * Queue incoming actions for lockspace threads + */ + + if (client_work) { + cl = find_client_work(); + if (!cl) + client_work = 0; + pthread_mutex_unlock(&client_mutex); + + if (!cl) + continue; + + pthread_mutex_lock(&cl->mutex); + + if (cl->recv) { + cl->recv = 0; + client_recv_action(cl); + } + + if (cl->dead) { + /* + log_debug("client rem %d pi %d fd %d ig %d", + cl->id, cl->pi, cl->fd, cl->poll_ignore); + */ + /* + * If cl->dead was set in main_loop, then the + * fd has already been closed and the pollfd + * entry is already unused. + * main_loop set dead=1, ignore=0, pi=-1, fd=-1 + * + * if cl->dead was not set in main_loop, but + * set in client_recv_action, then the main_loop + * should be ignoring this client fd. + * main_loop set ignore=1 + */ + + if (cl->poll_ignore) { + log_debug("client close %d pi %d fd %d", + cl->id, cl->pi, cl->fd); + /* assert cl->pi != -1 */ + /* assert pollfd[pi].fd == FD_IGNORE */ + close(cl->fd); + rem_pollfd(cl->pi); + cl->pi = -1; + cl->fd = -1; + cl->poll_ignore = 0; + } else { + /* main thread should have closed */ + if (cl->pi != -1 || cl->fd != -1) { + log_error("client %d bad state pi %d fd %d", + cl->id, cl->pi, cl->fd); + } + } + pthread_mutex_unlock(&cl->mutex); + + pthread_mutex_lock(&client_mutex); + list_del(&cl->list); + pthread_mutex_unlock(&client_mutex); + + client_purge(cl); + + free_client(cl); + } else { + pthread_mutex_unlock(&cl->mutex); + } + } + pthread_mutex_unlock(&client_mutex); + } +out: + return NULL; +} + +static int setup_client_thread(void) +{ + int rv; + + INIT_LIST_HEAD(&client_list); + INIT_LIST_HEAD(&client_results); + + pthread_mutex_init(&client_mutex, NULL); + pthread_cond_init(&client_cond, NULL); + + rv = pthread_create(&client_thread, NULL, client_thread_main, NULL); + if (rv) + return 
-1; + return 0; +} + +static void close_client_thread(void) +{ + pthread_mutex_lock(&client_mutex); + client_stop = 1; + pthread_cond_signal(&client_cond); + pthread_mutex_unlock(&client_mutex); + pthread_join(client_thread, NULL); +} + +static int setup_local_thread(void) +{ + struct lockspace *ls; + struct resource *r; + int rv; + + if (!local_thread_also) + return 0; + + if (local_thread_only) { + if (!(ls = alloc_lockspace())) + return -ENOMEM; + + if (!(r = alloc_resource())) { + free(ls); + return -ENOMEM; + } + + strcpy(ls->name, "local_thread_gls"); + + r->type = LD_RT_GL; + r->mode = LD_LK_UN; + strncpy(r->name, R_NAME_GL, MAX_NAME); + list_add_tail(&r->list, &ls->resources); + + list_add(&ls->list, &local_vgs); + local_thread_gls = ls; + } + + rv = pthread_create(&local_thread, NULL, local_thread_main, NULL); + if (rv) + return -1; + + return 0; +} + +static void close_local_thread(void) +{ + if (!local_thread_also) + return; + + pthread_mutex_lock(&local_thread_mutex); + local_thread_stop = 1; + pthread_cond_signal(&local_thread_cond); + pthread_mutex_unlock(&local_thread_mutex); + pthread_join(local_thread, NULL); +} + +#if 0 +static void setup_listener(void) +{ + struct sockaddr_un addr; + int rv, fd, ci; + + rv = lvmlockd_socket_address(&addr); + if (rv < 0) + return rv; + + fd = socket(AF_LOCAL, SOCK_STREAM, 0); + if (fd < 0) + return fd; + + unlink(addr.sun_path); + rv = bind(fd, (struct sockaddr *) &addr, sizeof(struct sockaddr_un)); + if (rv < 0) + goto exit_fail; + + rv = chmod(addr.sun_path, DEFAULT_SOCKET_MODE); + if (rv < 0) + goto exit_fail; + + rv = chown(addr.sun_path, com.uid, com.gid); + if (rv < 0) { + log_error("could not set socket %s permissions: %s", + addr.sun_path, strerror(errno)); + goto exit_fail; + } + + rv = listen(fd, 5); + if (rv < 0) + goto exit_fail; + + fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK); + + listen_pi = add_pollfd(fd); + + return 0; + +exit_fail: + close(fd); + return -1; +} +#endif + +/* + * Get a 
list of all VGs with a lockd type (sanlock|dlm) from lvmetad. + * We'll match this list against a list of existing lockspaces that are + * found in the lock manager. + * + * For each of these VGs, also create a struct resource on ls->resources to + * represent each LV in the VG that uses a lock. For each of these LVs + * that are active, we'll attempt to adopt a lock. + */ + +static int get_lockd_vgs(struct list_head *vg_lockd) +{ + struct list_head update_vgs; + daemon_reply reply; + struct dm_config_node *cn; + struct dm_config_node *metadata; + struct dm_config_node *md_cn; + struct dm_config_node *lv_cn; + struct lockspace *ls, *safe; + struct resource *r; + const char *vg_name; + const char *vg_uuid; + const char *lock_type; + const char *lock_args; + const char *system_id; + char lv_lock_path[PATH_MAX]; + int mutex_unlocked = 0; + int rv = 0; + + INIT_LIST_HEAD(&update_vgs); + + pthread_mutex_lock(&lvmetad_mutex); + reply = daemon_send_simple(lvmetad_handle, "vg_list", + "token = %s", "skip", + NULL); + + if (reply.error || strcmp(daemon_reply_str(reply, "response", ""), "OK")) { + log_error("vg_list from lvmetad failed %d", reply.error); + rv = -EINVAL; + goto destroy; + } + + if (!(cn = dm_config_find_node(reply.cft->root, "volume_groups"))) { + log_error("get_lockd_vgs no vgs"); + rv = -EINVAL; + goto destroy; + } + + /* create an update_vgs list of all vg uuids */ + + for (cn = cn->child; cn; cn = cn->sib) { + vg_uuid = cn->key; + + if (!(ls = alloc_lockspace())) { + rv = -ENOMEM; + break; + } + + strncpy(ls->vg_uuid, vg_uuid, 64); + list_add_tail(&ls->list, &update_vgs); + log_debug("get_lockd_vgs %s", vg_uuid); + } + destroy: + daemon_reply_destroy(reply); + + if (rv < 0) + goto out; + + /* get vg_name and lock_type for each vg uuid entry in update_vgs */ + + list_for_each_entry(ls, &update_vgs, list) { + reply = daemon_send_simple(lvmetad_handle, "vg_lookup", + "token = %s", "skip", + "uuid = %s", ls->vg_uuid, + NULL); + + if (reply.error || 
strcmp(daemon_reply_str(reply, "response", ""), "OK")) { + log_error("vg_lookup from lvmetad failed %d", reply.error); + rv = -EINVAL; + goto next; + } + + vg_name = daemon_reply_str(reply, "name", NULL); + if (!vg_name) { + log_error("get_lockd_vgs %s no name", ls->vg_uuid); + rv = -EINVAL; + goto next; + } + + strncpy(ls->vg_name, vg_name, MAX_NAME); + + metadata = dm_config_find_node(reply.cft->root, "metadata"); + if (!metadata) { + log_error("get_lockd_vgs %s name %s no metadata", + ls->vg_uuid, ls->vg_name); + rv = -EINVAL; + goto next; + } + + lock_type = dm_config_find_str(metadata, "metadata/lock_type", NULL); + ls->lm_type = str_to_lm(lock_type); + + if ((ls->lm_type != LD_LM_SANLOCK) && (ls->lm_type != LD_LM_DLM)) + continue; + + lock_args = dm_config_find_str(metadata, "metadata/lock_args", NULL); + if (lock_args) + strncpy(ls->vg_args, lock_args, MAX_ARGS); + + system_id = dm_config_find_str(metadata, "metadata/system_id", NULL); + if (system_id) + strncpy(ls->vg_sysid, system_id, MAX_NAME); + + /* + * Make a record (struct resource) of each lv that uses a lock. + * For any lv that uses a lock, we'll check if the lv is active + * and if so try to adopt a lock for it. 
+ */ + + for (md_cn = metadata->child; md_cn; md_cn = md_cn->sib) { + if (strcmp(md_cn->key, "logical_volumes")) + continue; + + for (lv_cn = md_cn->child; lv_cn; lv_cn = lv_cn->sib) { + snprintf(lv_lock_path, PATH_MAX, "%s/lock_type", lv_cn->key); + lock_type = dm_config_find_str(lv_cn, lv_lock_path, NULL); + + if (!lock_type) + continue; + + snprintf(lv_lock_path, PATH_MAX, "%s/lock_args", lv_cn->key); + lock_args = dm_config_find_str(lv_cn, lv_lock_path, NULL); + + if (!(r = alloc_resource())) { + rv = -ENOMEM; + goto next; + } + + r->type = LD_RT_LV; + strncpy(r->name, lv_cn->key, MAX_NAME); + strncpy(r->lv_args, lock_args, MAX_ARGS); + list_add_tail(&r->list, &ls->resources); + } + } + next: + daemon_reply_destroy(reply); + + log_debug("get_lockd_vgs %s lock_type %s lock_args %s", + ls->vg_name, lock_type, lock_args); + + if (rv < 0) + break; + } + pthread_mutex_unlock(&lvmetad_mutex); + mutex_unlocked = 1; +out: + /* Return lockd VG's on the vg_lockd list. */ + + list_for_each_entry_safe(ls, safe, &update_vgs, list) { + list_del(&ls->list); + + if ((ls->lm_type == LD_LM_SANLOCK) || (ls->lm_type == LD_LM_DLM)) + list_add_tail(&ls->list, vg_lockd); + else + free(ls); + } + + if (!mutex_unlocked) + pthread_mutex_unlock(&lvmetad_mutex); + + return rv; +} + +/* + * For each lockd VG found in lvmetad, check if any + * of it's LV's are active. If there's a device node + * for the LV in /dev/<vg>/ then we conclude it's active + * and will go on to adopt a lock for it. + * If not, then we don't need to adopt an LV lock for it + * and remove the struct resource that represents the LV. + */ + +static int get_active_lvs(struct list_head *vg_lockd) +{ + struct lockspace *ls; + struct resource *r, *rsafe; + struct list_head tmp_resources; + char vg_dir_path[PATH_MAX]; + struct dirent *de; + DIR *vg_dir; + + INIT_LIST_HEAD(&tmp_resources); + + /* Get the subset of lockd vgs with active lvs. 
*/ + + list_for_each_entry(ls, vg_lockd, list) { + + /* Add these resources back as LV device nodes are found for them. */ + list_for_each_entry_safe(r, rsafe, &ls->resources, list) { + list_del(&r->list); + list_add(&r->list, &tmp_resources); + } + + snprintf(vg_dir_path, PATH_MAX-1, "/dev/%s/", ls->vg_name); + + if ((vg_dir = opendir(vg_dir_path))) { + while ((de = readdir(vg_dir))) { + if (de->d_name[0] == '.') + continue; + + /* put the struct resource back on the ls */ + list_for_each_entry(r, &tmp_resources, list) { + if (strncmp(r->name, de->d_name, MAX_NAME)) + continue; + + log_debug("lockd vg %s has active lv %s", ls->vg_name, r->name); + list_del(&r->list); + list_add_tail(&r->list, &ls->resources); + break; + } + } + closedir(vg_dir); + } + + /* Remove remaining tmp_resources for LVs that are not active. */ + list_for_each_entry_safe(r, rsafe, &tmp_resources, list) { + log_debug("lockd vg %s ignore inactive lv %s", ls->vg_name, r->name); + list_del(&r->list); + free_resource(r); + } + } + + return 0; +} + +static void adopt_locks(void) +{ + struct list_head ls_found; + struct list_head vg_lockd; + struct list_head to_unlock; + struct lockspace *ls, *lsafe; + struct lockspace *ls1, *l1safe; + struct lockspace *ls2, *l2safe; + struct resource *r, *rsafe; + struct action *act, *asafe; + int count_start = 0, count_start_done = 0, count_start_fail = 0; + int count_adopt = 0, count_adopt_done = 0, count_adopt_fail = 0; + int found, rv; + + INIT_LIST_HEAD(&adopt_results); + + INIT_LIST_HEAD(&ls_found); + INIT_LIST_HEAD(&vg_lockd); + INIT_LIST_HEAD(&to_unlock); + + /* + * Get list of lockspaces from lock managers. + * Get list of VGs from lvmetad with a lockd type. + * Get list of active lockd type LVs from /dev. + * + * ECONNREFUSED means the lock manager is not running. + * This is expected for at least one of them. 
+ */ + + rv = lm_get_lockspaces_dlm(&ls_found); + if ((rv < 0) && (rv != -ECONNREFUSED)) + goto fail; + + rv = lm_get_lockspaces_sanlock(&ls_found); + if ((rv < 0) && (rv != -ECONNREFUSED)) + goto fail; + + if (list_empty(&ls_found)) { + log_debug("No lockspaces found to adopt"); + return; + } + + rv = get_lockd_vgs(&vg_lockd); + if (rv < 0) + goto fail; + + rv = get_active_lvs(&vg_lockd); + if (rv < 0) + goto fail; + + list_for_each_entry(ls, &ls_found, list) { + if (ls->lm_type == LD_LM_DLM) + gl_use_dlm = 1; + + log_debug("adopt %s lockspace %s vg %s", + lm_str(ls->lm_type), ls->name, ls->vg_name); + } + + if (!gl_use_dlm) + gl_use_sanlock = 1; + + list_for_each_entry(ls, &vg_lockd, list) { + log_debug("adopt lvmetad vg %s lock_type %s lock_args %s", + ls->vg_name, lm_str(ls->lm_type), ls->vg_args); + + list_for_each_entry(r, &ls->resources, list) + log_debug("adopt device lv %s/%s", ls->vg_name, r->name); + } + + /* + * Compare and merge the list of lockspaces in ls_found + * and the list of lockd VGs in vg_lockd. + * + * An ls from ls_found may not have had any active lvs when + * previous lvmlockd died, but the ls should still be joined, + * and checked for GL/VG locks. + * + * An ls from vg_lockd with active lvs should be in ls_found. + * If it's not then we might want to join the ls and acquire locks + * for the active lvs (as opposed to adopting orphans for them.) + * The orphan lock in the ls should have prevented the ls in + * the lock manager from going away. + * + * If an ls in vg_lockd has no active lvs and does not have + * a matching entry in ls_found, then skip it. + * + * An ls in ls_found should always have a matching ls in + * vg_lockd. If it doesn't, then maybe the vg has been + * removed even though the lockspace for the vg is still + * in the lock manager. Just leave the ls in the lm + * alone, and skip the ls_found entry. 
+ */ + + list_for_each_entry_safe(ls1, l1safe, &ls_found, list) { + + /* The dlm global lockspace is special and doesn't match a VG. */ + if (!strcmp(ls1->name, gl_lsname_dlm)) { + list_del(&ls1->list); + free(ls1); + } + + found = 0; + + list_for_each_entry_safe(ls2, l2safe, &vg_lockd, list) { + if (strcmp(ls1->vg_name, ls2->vg_name)) + continue; + + /* + * LS in both ls_found and vg_lockd. + */ + log_debug("ls %s matches vg %s", ls1->name, ls2->vg_name); + memcpy(ls1->vg_uuid, ls2->vg_uuid, 64); + memcpy(ls1->vg_args, ls2->vg_args, MAX_ARGS); + list_for_each_entry_safe(r, rsafe, &ls2->resources, list) { + list_del(&r->list); + list_add(&r->list, &ls1->resources); + } + list_del(&ls2->list); + free(ls2); + found = 1; + break; + } + + /* + * LS in ls_found, not in vg_lockd. + * An lvm lockspace found in the lock manager has no + * corresponding VG in lvmetad. This shouldn't usually + * happen, but it's possible the VG could have been removed + * while the orphaned lockspace from it was still around. + * Report an error and leave the ls in the lm alone. + */ + if (!found) { + log_error("No VG %s found for lockspace %s %s", + ls1->vg_name, ls1->name, lm_str(ls1->lm_type)); + list_del(&ls1->list); + free(ls1); + } + } + + /* + * LS in vg_lockd, not in ls_found. + * lockd vgs from lvmetad that do not have an existing lockspace. + * This wouldn't be unusual; we just skip the vg. + * But, if the vg has active lvs, then it should have had locks + * and a lockspace. Should we attempt to join the lockspace and + * acquire (not adopt) locks for these LVs? + */ + + list_for_each_entry_safe(ls, lsafe, &vg_lockd, list) { + if (!list_empty(&ls->resources)) { + /* We should have found a lockspace. */ + /* TODO: add this ls and acquire locks for ls->resources? */ + log_error("No lockspace %s %s found for VG %s with active LVs", + ls->name, lm_str(ls->lm_type), ls->vg_name); + } else { + /* The VG wasn't started in the previous lvmlockd. 
*/ + log_debug("No ls found for vg %s", ls->vg_name); + } + + list_del(&ls->list); + free(ls); + } + + /* + * Create and queue start actions to add lockspaces. + */ + + if (gl_use_dlm) { + if (!(act = alloc_action())) + goto fail; + log_debug("adopt add dlm global lockspace"); + act->op = LD_OP_START; + act->flags = (LD_AF_ADOPT | LD_AF_WAIT); + act->rt = LD_RT_GL; + act->lm_type = LD_LM_DLM; + act->client_id = ADOPT_CLIENT_ID; + add_dlm_global_lockspace(act); + count_start++; + } + + list_for_each_entry_safe(ls, lsafe, &ls_found, list) { + if (!(act = alloc_action())) + goto fail; + act->op = LD_OP_START; + act->flags = (LD_AF_ADOPT | LD_AF_WAIT); + act->rt = LD_RT_VG; + act->lm_type = ls->lm_type; + act->client_id = ADOPT_CLIENT_ID; + strncpy(act->vg_name, ls->vg_name, MAX_NAME); + memcpy(act->vg_uuid, ls->vg_uuid, 64); + memcpy(act->vg_args, ls->vg_args, MAX_ARGS); + act->host_id = ls->host_id; + + /* set act->version from lvmetad data? */ + + log_debug("adopt add %s vg lockspace %s", lm_str(act->lm_type), act->vg_name); + + rv = add_lockspace_thread(ls->name, act->vg_name, act->vg_uuid, + act->lm_type, act->vg_args, act); + if (rv < 0) { + log_error("Failed to create lockspace thread for VG %s", ls->vg_name); + list_del(&ls->list); + free(ls); + free_action(act); + count_start_fail++; + continue; + } + + /* + * When the lockspace_thread is done with the start act, + * it will see the act ADOPT flag and move the act onto + * the adopt_results list for us to collect below. + */ + count_start++; + } + + log_debug("adopt starting %d lockspaces", count_start); + + /* + * Wait for all start/rejoin actions to complete. Each start action + * queued above will appear on the adopt_results list when finished. 
+ */ + + while (count_start_done < count_start) { + sleep(1); + act = NULL; + + pthread_mutex_lock(&client_mutex); + if (!list_empty(&adopt_results)) { + act = list_first_entry(&adopt_results, struct action, list); + list_del(&act->list); + } + pthread_mutex_unlock(&client_mutex); + + if (!act) + continue; + + if (act->result < 0) { + log_error("adopt add lockspace failed vg %s %d", act->vg_name, act->result); + count_start_fail++; + } + + free_action(act); + count_start_done++; + } + + log_debug("adopt started %d lockspaces done %d fail %d", + count_start, count_start_done, count_start_fail); + + /* + * Create lock-adopt actions for active LVs (ls->resources), + * and GL/VG locks (we don't know if these locks were held + * and orphaned by the last lvmlockd, so try to adopt them + * to see.) + * + * A proper struct lockspace now exists on the lockspaces list + * for each ls in ls_found. Lock ops for one of those + * lockspaces can be done as OP_LOCK actions queued using + * add_lock_action(); + * + * Start by attempting to adopt the lock in the most likely + * mode it was left in (ex for lvs, sh for vg/gl). If + * the mode is wrong, the lm will return an error and we + * try again with the other mode. + */ + + list_for_each_entry(ls, &ls_found, list) { + + /* + * Adopt orphan LV locks. 
+ */ + + list_for_each_entry(r, &ls->resources, list) { + if (!(act = alloc_action())) + goto fail; + act->op = LD_OP_LOCK; + act->rt = LD_RT_LV; + act->mode = LD_LK_EX; + act->flags = (LD_AF_ADOPT | LD_AF_PERSISTENT); + act->client_id = ADOPT_CLIENT_ID; + act->lm_type = ls->lm_type; + strncpy(act->vg_name, ls->vg_name, MAX_NAME); + strncpy(act->lv_name, r->name, MAX_NAME); + strncpy(act->lv_args, r->lv_args, MAX_ARGS); + + log_debug("adopt lock for lv %s/%s", act->vg_name, act->lv_name); + + rv = add_lock_action(act); + if (rv < 0) { + log_error("adopt add_lock_action lv %s/%s error %d", act->vg_name, act->lv_name, rv); + count_adopt_fail++; + free_action(act); + } else { + count_adopt++; + } + } + + /* + * Adopt orphan VG lock. + */ + + if (!(act = alloc_action())) + goto fail; + act->op = LD_OP_LOCK; + act->rt = LD_RT_VG; + act->mode = LD_LK_SH; + act->flags = LD_AF_ADOPT; + act->client_id = ADOPT_CLIENT_ID; + act->lm_type = ls->lm_type; + strncpy(act->vg_name, ls->vg_name, MAX_NAME); + + log_debug("adopt lock for vg %s", act->vg_name); + + rv = add_lock_action(act); + if (rv < 0) { + log_error("adopt add_lock_action vg %s error %d", act->vg_name, rv); + count_adopt_fail++; + free_action(act); + } else { + count_adopt++; + } + } + + /* + * Adopt orphan GL lock. + */ + + if (!(act = alloc_action())) + goto fail; + act->op = LD_OP_LOCK; + act->rt = LD_RT_GL; + act->mode = LD_LK_SH; + act->flags = LD_AF_ADOPT; + act->client_id = ADOPT_CLIENT_ID; + act->lm_type = (gl_use_sanlock ? LD_LM_SANLOCK : LD_LM_DLM); + + log_debug("adopt lock for gl"); + + rv = add_lock_action(act); + if (rv < 0) { + log_error("adopt add_lock_action gl %s error %d", act->vg_name, rv); + count_adopt_fail++; + free_action(act); + } else { + count_adopt++; + } + + /* + * Wait for lock-adopt actions to complete. The completed + * actions are passed back here via the adopt_results list. 
+ */ + + while (count_adopt_done < count_adopt) { + sleep(1); + act = NULL; + + pthread_mutex_lock(&client_mutex); + if (!list_empty(&adopt_results)) { + act = list_first_entry(&adopt_results, struct action, list); + list_del(&act->list); + } + pthread_mutex_unlock(&client_mutex); + + if (!act) + continue; + + /* + * lock adopt results + */ + + if (act->result == -EUCLEAN) { + /* + * Adopt failed because the orphan has a different mode + * than initially requested. Repeat the lock-adopt operation + * with the other mode. N.B. this logic depends on first + * trying sh then ex for GL/VG locks, and ex then sh for + * LV locks. + */ + + if ((act->rt != LD_RT_LV) && (act->mode == LD_LK_SH)) { + /* GL/VG locks: attempt to adopt ex after sh failed. */ + act->mode = LD_LK_EX; + rv = add_lock_action(act); + + } else if ((act->rt == LD_RT_LV) && (act->mode == LD_LK_EX)) { + /* LV locks: attempt to adopt sh after ex failed. */ + act->mode = LD_LK_SH; + rv = add_lock_action(act); + + } else { + log_error("Failed to adopt %s lock in vg %s error %d", + rt_str(act->rt), act->vg_name, act->result); + count_adopt_fail++; + count_adopt_done++; + free_action(act); + rv = 0; + } + + if (rv < 0) { + log_error("adopt add_lock_action again %s", act->vg_name); + count_adopt_fail++; + count_adopt_done++; + free_action(act); + } + + } else if (act->result == -ENOENT) { + /* + * No orphan lock exists. This is common for GL/VG locks + * because they may not have been held when lvmlockd exited. + * It's also expected for LV types that do not use a lock. + */ + + if (act->rt == LD_RT_LV) { + /* Unexpected, we should have found an orphan. */ + log_error("Failed to adopt LV lock for %s/%s error %d", + act->vg_name, act->lv_name, act->result); + count_adopt_fail++; + } else { + /* Normal, no GL/VG lock was orphaned. 
*/ + log_debug("Did not adopt %s lock in vg %s error %d", + rt_str(act->rt), act->vg_name, act->result); + } + + count_adopt_done++; + free_action(act); + + } else if (act->result < 0) { + /* + * Some unexpected error. + */ + + log_error("adopt lock rt %s vg %s lv %s error %d", + rt_str(act->rt), act->vg_name, act->lv_name, act->result); + count_adopt_fail++; + count_adopt_done++; + free_action(act); + + } else { + /* + * Adopt success. + */ + + if (act->rt == LD_RT_LV) { + log_debug("adopt success lv %s/%s %s", act->vg_name, act->lv_name, mode_str(act->mode)); + free_action(act); + } else if (act->rt == LD_RT_VG) { + log_debug("adopt success vg %s %s", act->vg_name, mode_str(act->mode)); + list_add_tail(&act->list, &to_unlock); + } else if (act->rt == LD_RT_GL) { + log_debug("adopt success gl %s %s", act->vg_name, mode_str(act->mode)); + list_add_tail(&act->list, &to_unlock); + } + count_adopt_done++; + } + } + + /* + * Release adopted GL/VG locks. + * The to_unlock actions were the ones used to lock-adopt the GL/VG locks; + * now use them to do the unlocks. These actions will again be placed + * on adopt_results for us to collect because they have the ADOPT flag set. + */ + + count_adopt = 0; + count_adopt_done = 0; + + list_for_each_entry_safe(act, asafe, &to_unlock, list) { + list_del(&act->list); + + if (act->mode == LD_LK_EX) { + /* + * TODO: we probably want to check somehow that + * there's no lvm command still running that's + * using this ex lock and changing things. + */ + log_warn("adopt releasing ex %s lock %s", + rt_str(act->rt), act->vg_name); + } + + act->mode = LD_LK_UN; + + log_debug("adopt unlock for %s %s", rt_str(act->rt), act->vg_name); + + rv = add_lock_action(act); + if (rv < 0) { + log_error("adopt unlock add_lock_action error %d", rv); + free_action(act); + } else { + count_adopt++; + } + } + + /* Wait for the unlocks to complete. 
*/ + + while (count_adopt_done < count_adopt) { + sleep(1); + act = NULL; + + pthread_mutex_lock(&client_mutex); + if (!list_empty(&adopt_results)) { + act = list_first_entry(&adopt_results, struct action, list); + list_del(&act->list); + } + pthread_mutex_unlock(&client_mutex); + + if (!act) + continue; + + if (act->result < 0) + log_error("adopt unlock error %d", act->result); + + count_adopt_done++; + free_action(act); + } + + + /* TODO: purge any remaining orphan locks in each rejoined ls? */ + + if (count_start_fail || count_adopt_fail) + goto fail; + + log_debug("adopt_locks done"); + return; + +fail: + log_error("adopt_locks failed, reset host"); +} + +static int get_peer_pid(int fd) +{ + struct ucred cred; + unsigned int len = sizeof(cred); + + if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cred, &len) != 0) + return -1; + + return cred.pid; +} + +static void process_listener(int poll_fd) +{ + struct client *cl; + int fd, pi; + + /* assert poll_fd == listen_fd */ + + fd = accept(listen_fd, NULL, NULL); + if (fd < 0) + return; + + if (!(cl = alloc_client())) + return; + + pi = add_pollfd(fd); + if (pi < 0) { + log_error("process_listener add_pollfd error %d", pi); + free_client(cl); + return; + } + + cl->pi = pi; + cl->fd = fd; + cl->pid = get_peer_pid(fd); + + pthread_mutex_init(&cl->mutex, NULL); + + pthread_mutex_lock(&client_mutex); + client_ids++; + + if (client_ids == ADOPT_CLIENT_ID) + client_ids++; + if (!client_ids) + client_ids++; + + cl->id = client_ids; + list_add_tail(&cl->list, &client_list); + pthread_mutex_unlock(&client_mutex); + + log_debug("client add id %d pi %d fd %d", cl->id, cl->pi, cl->fd); +} + +/* + * main loop polls on pipe[0] so that a thread can + * restart the poll by writing to pipe[1]. 
+ */ +static int setup_restart(void) +{ + if (pipe(restart_fds)) { + log_error("setup_restart pipe error %d", errno); + return -1; + } + + restart_pi = add_pollfd(restart_fds[0]); + if (restart_pi < 0) + return restart_pi; + + return 0; +} + +/* + * thread wrote 'w' to restart_fds[1] to restart poll() + * after adding an fd back into pollfd. + */ +static void process_restart(int fd) +{ + char wake[1]; + /* assert fd == restart_fds[0] */ + read(restart_fds[0], wake, 1); +} + +static void sigterm_handler(int sig __attribute__((unused))) +{ + daemon_quit = 1; +} + +static int main_loop(daemon_state *ds_arg) +{ + struct client *cl; + int i, rv, is_recv, is_dead; + + signal(SIGTERM, &sigterm_handler); + + rv = setup_structs(); + if (rv < 0) { + log_error("Can't allocate memory"); + return rv; + } + + strcpy(gl_lsname_dlm, S_NAME_GL_DLM); + + INIT_LIST_HEAD(&local_vgs); + INIT_LIST_HEAD(&local_thread_actions); + pthread_mutex_init(&local_thread_mutex, NULL); + pthread_cond_init(&local_thread_cond, NULL); + INIT_LIST_HEAD(&lockspaces); + INIT_LIST_HEAD(&lockspaces_inactive); + pthread_mutex_init(&lockspaces_mutex, NULL); + pthread_mutex_init(&pollfd_mutex, NULL); + pthread_mutex_init(&log_mutex, NULL); + + openlog("lvmlockd", LOG_CONS | LOG_PID, LOG_DAEMON); + log_warn("lvmlockd started"); + + listen_fd = ds_arg->socket_fd; + listen_pi = add_pollfd(listen_fd); + + setup_client_thread(); + setup_worker_thread(); + setup_local_thread(); + setup_restart(); + + pthread_mutex_init(&lvmetad_mutex, NULL); + lvmetad_handle = lvmetad_open(NULL); + if (lvmetad_handle.error || lvmetad_handle.socket_fd < 0) + log_error("lvmetad_open error %d", lvmetad_handle.error); + else + lvmetad_connected = 1; + + /* add entries to local_vgs */ + create_work_action(LD_OP_UPDATE_LOCAL); + + /* + * Attempt to rejoin lockspaces and adopt locks from a previous + * instance of lvmlockd that left behind lockspaces/locks. 
+ */ + if (adopt_opt) + adopt_locks(); + + while (1) { + rv = poll(pollfd, pollfd_maxi + 1, -1); + if (rv == -1 && errno == EINTR) { + if (daemon_quit) { + int count; + /* first sigterm would trigger stops, and + second sigterm may finish the joins. */ + count = for_each_lockspace(DO_STOP, DO_FREE, NO_FORCE); + if (!count) + break; + log_debug("ignore shutdown for %d lockspaces", count); + daemon_quit = 0; + } + continue; + } + if (rv < 0) { + log_error("poll errno %d", errno); + break; + } + + for (i = 0; i <= pollfd_maxi; i++) { + if (pollfd[i].fd < 0) + continue; + + is_recv = 0; + is_dead = 0; + + if (pollfd[i].revents & POLLIN) + is_recv = 1; + if (pollfd[i].revents & (POLLERR | POLLHUP | POLLNVAL)) + is_dead = 1; + + if (!is_recv && !is_dead) + continue; + + if (i == listen_pi) { + process_listener(pollfd[i].fd); + continue; + } + + if (i == restart_pi) { + process_restart(pollfd[i].fd); + continue; + } + + /* + log_debug("poll pi %d fd %d revents %x", + i, pollfd[i].fd, pollfd[i].revents); + */ + + pthread_mutex_lock(&client_mutex); + cl = find_client_pi(i); + if (cl) { + pthread_mutex_lock(&cl->mutex); + + if (cl->recv) { + /* should not happen */ + log_error("main client %d already recv", cl->id); + + } else if (cl->dead) { + /* should not happen */ + log_error("main client %d already dead", cl->id); + + } else if (is_dead) { + log_debug("close %s[%d.%u] fd %d", + cl->name[0] ? 
cl->name : "client", + cl->pid, cl->id, cl->fd); + cl->dead = 1; + cl->pi = -1; + cl->fd = -1; + cl->poll_ignore = 0; + close(pollfd[i].fd); + pollfd[i].fd = POLL_FD_UNUSED; + pollfd[i].events = 0; + pollfd[i].revents = 0; + + } else if (is_recv) { + cl->recv = 1; + cl->poll_ignore = 1; + pollfd[i].fd = POLL_FD_IGNORE; + pollfd[i].events = 0; + pollfd[i].revents = 0; + } + + pthread_mutex_unlock(&cl->mutex); + + client_work = 1; + pthread_cond_signal(&client_cond); + + /* client_thread will pick up and work on any + client with cl->recv or cl->dead set */ + + } else { + /* don't think this can happen */ + log_error("no client for index %d fd %d", + i, pollfd[i].fd); + close(pollfd[i].fd); + pollfd[i].fd = POLL_FD_UNUSED; + pollfd[i].events = 0; + pollfd[i].revents = 0; + } + pthread_mutex_unlock(&client_mutex); + + /* TODO?: after set_dead, scan pollfd for last unused + slot and reduce pollfd_maxi */ + } + } + + for_each_lockspace_retry(DO_STOP, DO_FREE, DO_FORCE); + free_lockspaces_inactive(); + close_worker_thread(); + close_client_thread(); + close_local_thread(); + closelog(); + daemon_close(lvmetad_handle); + return 0; +} + +static void usage(char *prog, FILE *file) +{ + fprintf(file, "Usage:\n"); + fprintf(file, "%s [options]\n\n", prog); + fprintf(file, " --help | -h\n"); + fprintf(file, " Show this help information.\n"); + fprintf(file, " --version | -V\n"); + fprintf(file, " Show version of lvmlockd.\n"); + fprintf(file, " --test | -T\n"); + fprintf(file, " Test mode, do not call lock manager.\n"); + fprintf(file, " --foreground | -f\n"); + fprintf(file, " Don't fork.\n"); + fprintf(file, " --daemon-debug | -D\n"); + fprintf(file, " Don't fork and print debugging to stdout.\n"); + fprintf(file, " --pid-file | -p <path>\n"); + fprintf(file, " Set path to the pid file. [%s]\n", LVMLOCKD_PIDFILE); + fprintf(file, " --socket-path | -s <path>\n"); + fprintf(file, " Set path to the socket to listen on. 
[%s]\n", LVMLOCKD_SOCKET); + fprintf(file, " --log-config | -l <str>\n"); + fprintf(file, " Set log config.\n"); + fprintf(file, " --local-also | -a\n"); + fprintf(file, " Manage locks between pids for local vgs.\n"); + fprintf(file, " --local-only | -o\n"); + fprintf(file, " Only manage locks for local vgs, not dlm|sanlock vgs.\n"); + fprintf(file, " --gl-type | -g <str>\n"); + fprintf(file, " Set global lock type to be dlm|sanlock.\n"); + fprintf(file, " --system-id | -y <str>\n"); + fprintf(file, " Set the local system id.\n"); + fprintf(file, " --host-id | -i <num>\n"); + fprintf(file, " Set the local sanlock host id.\n"); + fprintf(file, " --host-id-file | -F <path>\n"); + fprintf(file, " A file containing the local sanlock host_id.\n"); + fprintf(file, " --adopt | -A 0|1\n"); + fprintf(file, " Adopt locks from a previous instance of lvmlockd.\n"); +} + +int main(int argc, char *argv[]) +{ + daemon_state ds; + + ds.daemon_main = main_loop; + ds.daemon_init = NULL; + ds.daemon_fini = NULL; + ds.pidfile = getenv("LVM_LVMLOCKD_PIDFILE"); + ds.socket_path = getenv("LVM_LVMLOCKD_SOCKET"); + ds.protocol = lvmlockd_protocol; + ds.protocol_version = lvmlockd_protocol_version; + ds.name = "lvmlockd"; + + static struct option long_options[] = { + {"help", no_argument, 0, 'h' }, + {"version", no_argument, 0, 'V' }, + {"test", no_argument, 0, 'T' }, + {"foreground", no_argument, 0, 'f' }, + {"daemon-debug",no_argument, 0, 'D' }, + {"pid-file", required_argument, 0, 'p' }, + {"socket-path", required_argument, 0, 's' }, + {"local-also", no_argument, 0, 'a' }, + {"local-only", no_argument, 0, 'o' }, + {"gl-type", required_argument, 0, 'g' }, + {"system-id", required_argument, 0, 'y' }, + {"host-id", required_argument, 0, 'i' }, + {"host-id-file",required_argument, 0, 'F' }, + {"adopt", required_argument, 0, 'A' }, + {0, 0, 0, 0 } + }; + + while (1) { + int c; + int lm; + int option_index = 0; + + c = getopt_long(argc, argv, "hVTfDp:s:l:aog:S:I:A:", + long_options, 
&option_index); + if (c == -1) + break; + + switch (c) { + case '0': + break; + case 'h': + usage(argv[0], stdout); + exit(EXIT_SUCCESS); + case 'V': + printf("lvmlockd version: " LVM_VERSION "\n"); + exit(EXIT_SUCCESS); + case 'T': + daemon_test = 1; + break; + case 'f': + ds.foreground = 1; + break; + case 'D': + ds.foreground = 1; + daemon_debug = 1; + break; + case 'p': + ds.pidfile = strdup(optarg); + break; + case 's': + ds.socket_path = strdup(optarg); + break; + case 'a': + local_thread_also = 1; + break; + case 'o': + local_thread_also = 1; + local_thread_only = 1; + break; + case 'g': + lm = str_to_lm(optarg); + if (lm == LD_LM_DLM) + gl_use_dlm = 1; + else if (lm == LD_LM_SANLOCK) + gl_use_sanlock = 1; + else { + fprintf(stderr, "invalid gl-type option"); + exit(EXIT_FAILURE); + } + break; + case 'y': + our_system_id = strdup(optarg); + break; + case 'i': + daemon_host_id = atoi(optarg); + break; + case 'F': + daemon_host_id_file = strdup(optarg); + break; + case 'A': + adopt_opt = atoi(optarg); + break; + case '?': + default: + usage(argv[0], stdout); + exit(EXIT_FAILURE); + } + } + + if (!ds.pidfile) + ds.pidfile = LVMLOCKD_PIDFILE; + + if (!ds.socket_path) + ds.socket_path = LVMLOCKD_SOCKET; + + /* runs daemon_main/main_loop */ + daemon_start(ds); + + return 0; +} diff --git a/daemons/lvmlockd/lvmlockd-dlm.c b/daemons/lvmlockd/lvmlockd-dlm.c new file mode 100644 index 000000000..a193c726b --- /dev/null +++ b/daemons/lvmlockd/lvmlockd-dlm.c @@ -0,0 +1,641 @@ +/* + * Copyright (C) 2014 Red Hat, Inc. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU Lesser General Public License v.2.1. 
+ */ + +#define _XOPEN_SOURCE 500 /* pthread */ +#define _ISOC99_SOURCE + +#include <assert.h> +#include <pthread.h> +#include <stdint.h> +#include <stddef.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdio.h> +#include <poll.h> +#include <errno.h> +#include <string.h> +#include <endian.h> +#include <fcntl.h> +#include <byteswap.h> +#include <syslog.h> +#include <dirent.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/socket.h> + +#include "configure.h" +#include "daemon-server.h" +#include "daemon-log.h" + +#include "lvmlockd-internal.h" + +/* + * Using synchronous _wait dlm apis so do not define _REENTRANT and + * link with non-threaded version of library, libdlm_lt. + */ +#include "libdlm.h" + +struct lm_dlm { + dlm_lshandle_t *dh; +}; + +struct rd_dlm { + struct dlm_lksb lksb; + struct val_blk *vb; +}; + +int lm_data_size_dlm(void) +{ + return sizeof(struct rd_dlm); +} + +/* + * lock_args format + * + * vg_lock_args format for dlm is + * vg_version_string:undefined:cluster_name + * + * lv_lock_args are not used for dlm + * + * version_string is MAJOR.MINOR.PATCH + * undefined may contain ":" + */ + +#define VG_LOCK_ARGS_MAJOR 1 +#define VG_LOCK_ARGS_MINOR 0 +#define VG_LOCK_ARGS_PATCH 0 + +static int cluster_name_from_args(char *vg_args, char *clustername) +{ + return last_string_from_args(vg_args, clustername); +} + +static int check_args_version(char *vg_args) +{ + unsigned int major = 0; + int rv; + + rv = version_from_args(vg_args, &major, NULL, NULL); + if (rv < 0) { + log_error("check_args_version %s error %d", vg_args, rv); + return rv; + } + + if (major > VG_LOCK_ARGS_MAJOR) { + log_error("check_args_version %s major %d %d", vg_args, major, VG_LOCK_ARGS_MAJOR); + return -1; + } + + return 0; +} + +/* This will be set after dlm_controld is started. 
*/ +#define DLM_CLUSTER_NAME_PATH "/sys/kernel/config/dlm/cluster/cluster_name" + +static int read_cluster_name(char *clustername) +{ + char *n; + int fd; + int rv; + + if (daemon_test) { + sprintf(clustername, "%s", "test"); + return 0; + } + + fd = open(DLM_CLUSTER_NAME_PATH, O_RDONLY); + if (fd < 0) { + log_error("read_cluster_name: open error %d, check dlm_controld", fd); + return fd; + } + + rv = read(fd, clustername, MAX_ARGS - 1); + if (rv < 0) { + log_error("read_cluster_name: cluster name read error %d, check dlm_controld", fd); + close(fd); + return rv; + } + + n = strstr(clustername, "\n"); + if (n) + *n = '\0'; + close(fd); + return 0; +} + +int lm_init_vg_dlm(char *ls_name, char *vg_name, uint32_t flags, char *vg_args) +{ + char clustername[MAX_ARGS]; + char lock_args_version[MAX_ARGS]; + int rv; + + memset(clustername, 0, sizeof(clustername)); + memset(lock_args_version, 0, sizeof(lock_args_version)); + + snprintf(lock_args_version, MAX_ARGS, "%u.%u.%u", + VG_LOCK_ARGS_MAJOR, VG_LOCK_ARGS_MINOR, VG_LOCK_ARGS_PATCH); + + rv = read_cluster_name(clustername); + if (rv < 0) + return rv; + + if (strlen(clustername) + strlen(lock_args_version) + 2 > MAX_ARGS) { + log_error("init_vg_dlm args too long"); + return -ENAMETOOLONG; + } + + snprintf(vg_args, MAX_ARGS, "%s:%s", lock_args_version, clustername); + rv = 0; + + log_debug("init_vg_dlm done %s vg_args %s", ls_name, vg_args); + return rv; +} + +int lm_add_lockspace_dlm(struct lockspace *ls, int adopt) +{ + char sys_clustername[MAX_ARGS]; + char arg_clustername[MAX_ARGS]; + struct lm_dlm *lmd; + int rv; + + memset(sys_clustername, 0, sizeof(sys_clustername)); + memset(arg_clustername, 0, sizeof(arg_clustername)); + + rv = read_cluster_name(sys_clustername); + if (rv < 0) + return rv; + + if (!ls->vg_args[0]) { + /* global lockspace has no vg args */ + goto skip_args; + } + + rv = check_args_version(ls->vg_args); + if (rv < 0) + return rv; + + rv = cluster_name_from_args(ls->vg_args, arg_clustername); + if 
(rv < 0) { + log_error("add_lockspace_dlm %s no cluster name from args %s", ls->name, ls->vg_args); + return rv; + } + + if (strcmp(sys_clustername, arg_clustername)) { + log_error("add_lockspace_dlm %s mismatching cluster names sys %s arg %s", + ls->name, sys_clustername, arg_clustername); + return -1; + } + + skip_args: + lmd = malloc(sizeof(struct lm_dlm)); + if (!lmd) { + rv = -ENOMEM; + goto out; + } + + if (daemon_test) + goto data; + + if (adopt) + lmd->dh = dlm_open_lockspace(ls->name); + else + lmd->dh = dlm_new_lockspace(ls->name, 0600, DLM_LSFL_NEWEXCL); + + if (!lmd->dh) { + free(lmd); + rv = -1; + log_error("add_lockspace_dlm new error %d", rv); + goto out; + } + + data: + ls->lm_data = lmd; + rv = 0; + out: + return rv; +} + +int lm_rem_lockspace_dlm(struct lockspace *ls, int free_vg) +{ + struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data; + int rv; + + if (daemon_test) + goto out; + + /* + * TODO: if free_vg is set, it means we are doing vgremove, + * and we may want to tell any other nodes to leave the lockspace. + * This is not really necessary since there should be no harm in + * having an unused lockspace sitting around. 
+ */ + + rv = dlm_release_lockspace(ls->name, lmd->dh, 1); + if (rv < 0) { + log_error("rem_lockspace_dlm error %d", rv); + return rv; + } + out: + free(lmd); + ls->lm_data = NULL; + + if (!strcmp(ls->name, gl_lsname_dlm)) { + gl_running_dlm = 0; + gl_auto_dlm = 0; + } + + return 0; +} + +static int lm_add_resource_dlm(struct lockspace *ls, struct resource *r, int with_lock_nl) +{ + struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data; + struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data; + uint32_t flags = 0; + char *buf; + int rv; + + if (r->type == LD_RT_GL || r->type == LD_RT_VG) { + buf = malloc(sizeof(struct val_blk) + DLM_LVB_LEN); + if (!buf) + return -ENOMEM; + memset(buf, 0, sizeof(struct val_blk) + DLM_LVB_LEN); + + rdd->vb = (struct val_blk *)buf; + rdd->lksb.sb_lvbptr = buf + sizeof(struct val_blk); + + flags |= LKF_VALBLK; + } + + if (!with_lock_nl) + goto out; + + /* because this is a new NL lock request */ + flags |= LKF_EXPEDITE; + + if (daemon_test) + goto out; + + rv = dlm_ls_lock_wait(lmd->dh, LKM_NLMODE, &rdd->lksb, flags, + r->name, strlen(r->name), + 0, NULL, NULL, NULL); + if (rv < 0) { + log_error("S %s R %s add_resource_dlm lock error %d", ls->name, r->name, rv); + return rv; + } + out: + return 0; +} + +int lm_rem_resource_dlm(struct lockspace *ls, struct resource *r) +{ + struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data; + struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data; + struct dlm_lksb *lksb; + int rv; + + if (daemon_test) { + rv = 0; + goto out; + } + + lksb = &rdd->lksb; + + rv = dlm_ls_unlock_wait(lmd->dh, lksb->sb_lkid, 0, lksb); + if (rv < 0) { + log_error("S %s R %s rem_resource_dlm unlock error %d", ls->name, r->name, rv); + } + out: + if (rdd->vb) + free(rdd->vb); + + memset(rdd, 0, sizeof(struct rd_dlm)); + r->lm_init = 0; + return rv; +} + +static int to_dlm_mode(int ld_mode) +{ + switch (ld_mode) { + case LD_LK_EX: + return LKM_EXMODE; + case LD_LK_SH: + return LKM_PRMODE; + }; + return -1; +} + +static int lm_adopt_dlm(struct 
lockspace *ls, struct resource *r, int ld_mode, + uint32_t *r_version, uint32_t *n_version) +{ + struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data; + struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data; + struct dlm_lksb *lksb; + uint32_t flags = 0; + int mode; + int rv; + + *r_version = 0; + *n_version = 0; + + if (!r->lm_init) { + rv = lm_add_resource_dlm(ls, r, 0); + if (rv < 0) + return rv; + r->lm_init = 1; + } + + lksb = &rdd->lksb; + + flags |= LKF_PERSISTENT; + flags |= LKF_ORPHAN; + + if (rdd->vb) + flags |= LKF_VALBLK; + + mode = to_dlm_mode(ld_mode); + if (mode < 0) { + log_error("adopt_dlm invalid mode %d", ld_mode); + return -EINVAL; + } + + log_debug("S %s R %s adopt_dlm", ls->name, r->name); + + if (daemon_test) + return 0; + + /* + * dlm returns 0 for success, -EAGAIN if an orphan is + * found with another mode, and -ENOENT if no orphan. + */ + + rv = dlm_ls_lock(lmd->dh, mode, lksb, flags, + r->name, strlen(r->name), + 0, NULL, NULL, NULL, NULL); + + if (rv == -EAGAIN) { + log_debug("S %s R %s adopt_dlm adopt mode %d try other mode", + ls->name, r->name, ld_mode); + return -EUCLEAN; + } + if (rv < 0) { + log_debug("S %s R %s adopt_dlm error %d", ls->name, r->name, rv); + return rv; + } + + /* + * TODO: for GL/VG locks we probably want to read the lvb, + * especially if adopting an ex lock, because when we + * release this adopted ex lock we may want to write new + * lvb values based on the current lvb values (at lease + * in the GL case where we increment the current values.) + * + * It should be possible to read the lvb by requesting + * this lock in the same mode it's already in. + */ + + return rv; +} + +/* + * Use PERSISTENT so that if lvmlockd exits while holding locks, + * the locks will remain orphaned in the dlm, still protecting what + * they were acquired to protect. 
+ */ + +int lm_lock_dlm(struct lockspace *ls, struct resource *r, int ld_mode, + uint32_t *r_version, uint32_t *n_version, int adopt) +{ + struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data; + struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data; + struct dlm_lksb *lksb; + struct val_blk vb; + uint32_t flags = 0; + uint16_t vb_version; + int mode; + int rv; + + if (adopt) { + /* When adopting, we don't follow the normal method + of acquiring a NL lock then converting it to the + desired mode. */ + return lm_adopt_dlm(ls, r, ld_mode, r_version, n_version); + } + + if (!r->lm_init) { + rv = lm_add_resource_dlm(ls, r, 1); + if (rv < 0) + return rv; + r->lm_init = 1; + } + + lksb = &rdd->lksb; + + flags |= LKF_CONVERT; + flags |= LKF_NOQUEUE; + flags |= LKF_PERSISTENT; + + if (rdd->vb) + flags |= LKF_VALBLK; + + mode = to_dlm_mode(ld_mode); + if (mode < 0) { + log_error("lock_dlm invalid mode %d", ld_mode); + return -EINVAL; + } + + log_debug("S %s R %s lock_dlm", ls->name, r->name); + + if (daemon_test) { + *r_version = 0; + *n_version = 0; + return 0; + } + + rv = dlm_ls_lock_wait(lmd->dh, mode, lksb, flags, + r->name, strlen(r->name), + 0, NULL, NULL, NULL); + if (rv == -EAGAIN) { + /* TODO: what case is this? what should be done? 
*/ + log_error("S %s R %s lock_dlm mode %d rv EAGAIN", ls->name, r->name, mode); + return -EAGAIN; + } + if (rv < 0) { + log_error("S %s R %s lock_dlm error %d", ls->name, r->name, rv); + return rv; + } + + if (rdd->vb) { + if (lksb->sb_flags & DLM_SBF_VALNOTVALID) { + log_debug("S %s R %s lock_dlm VALNOTVALID", ls->name, r->name); + memset(rdd->vb, 0, sizeof(struct val_blk)); + *r_version = 0; + *n_version = 0; + goto out; + } + + memcpy(&vb, lksb->sb_lvbptr, sizeof(struct val_blk)); + vb_version = le16_to_cpu(vb.version); + + if (vb_version && ((vb_version & 0xFF00) > (VAL_BLK_VERSION & 0xFF00))) { + log_error("S %s R %s lock_dlm ignore vb_version %x", + ls->name, r->name, vb_version); + *r_version = 0; + *n_version = 0; + free(rdd->vb); + rdd->vb = NULL; + lksb->sb_lvbptr = NULL; + goto out; + } + + *r_version = le32_to_cpu(vb.r_version); + *n_version = le32_to_cpu(vb.n_version); + memcpy(rdd->vb, &vb, sizeof(vb)); /* rdd->vb saved as le */ + + log_debug("S %s R %s lock_dlm get r_version %u n_version %u", + ls->name, r->name, *r_version, *n_version); + } +out: + return 0; +} + +int lm_convert_dlm(struct lockspace *ls, struct resource *r, + int ld_mode, uint32_t r_version) +{ + struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data; + struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data; + struct dlm_lksb *lksb = &rdd->lksb; + uint32_t mode; + uint32_t flags = 0; + int rv; + + log_debug("S %s R %s convert_dlm", ls->name, r->name); + + flags |= LKF_CONVERT; + flags |= LKF_NOQUEUE; + flags |= LKF_PERSISTENT; + + if (rdd->vb && r_version && (r->mode == LD_LK_EX)) { + if (!rdd->vb->version) { + /* first time vb has been written */ + rdd->vb->version = cpu_to_le16(VAL_BLK_VERSION); + } + rdd->vb->r_version = cpu_to_le32(r_version); + memcpy(lksb->sb_lvbptr, rdd->vb, sizeof(struct val_blk)); + + log_debug("S %s R %s convert_dlm set r_version %u", + ls->name, r->name, r_version); + + flags |= LKF_VALBLK; + } + + mode = to_dlm_mode(ld_mode); + + if (daemon_test) + return 0; + + 
rv = dlm_ls_lock_wait(lmd->dh, mode, lksb, flags, + r->name, strlen(r->name), + 0, NULL, NULL, NULL); + if (rv == -EAGAIN) { + /* TODO: what case is this? what should be done? */ + log_error("S %s R %s convert_dlm mode %d rv EAGAIN", ls->name, r->name, mode); + return -EAGAIN; + } + if (rv < 0) { + log_error("S %s R %s convert_dlm error %d", ls->name, r->name, rv); + } + return rv; +} + +int lm_unlock_dlm(struct lockspace *ls, struct resource *r, + uint32_t r_version, uint32_t n_version, uint32_t lmuf_flags) +{ + struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data; + struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data; + struct dlm_lksb *lksb = &rdd->lksb; + uint32_t flags = 0; + int rv; + + log_debug("S %s R %s unlock_dlm r_version %u flags %x", + ls->name, r->name, r_version, lmuf_flags); + + /* + * Do not set PERSISTENT, because we don't need an orphan + * NL lock to protect anything. + */ + + flags |= LKF_CONVERT; + + if (rdd->vb && r_version && (r->mode == LD_LK_EX)) { + if (!rdd->vb->version) { + /* first time vb has been written */ + rdd->vb->version = cpu_to_le16(VAL_BLK_VERSION); + } + if (r_version) + rdd->vb->r_version = cpu_to_le32(r_version); + if (n_version) + rdd->vb->n_version = cpu_to_le32(n_version); + memcpy(lksb->sb_lvbptr, rdd->vb, sizeof(struct val_blk)); + + log_debug("S %s R %s unlock_dlm set r_version %u n_version %u", + ls->name, r->name, r_version, n_version); + + flags |= LKF_VALBLK; + } + + if (daemon_test) + return 0; + + rv = dlm_ls_lock_wait(lmd->dh, LKM_NLMODE, lksb, flags, + r->name, strlen(r->name), + 0, NULL, NULL, NULL); + if (rv < 0) { + log_error("S %s R %s unlock_dlm error %d", ls->name, r->name, rv); + } + + return rv; +} + +/* + * This list could be read from dlm_controld via libdlmcontrol, + * but it's simpler to get it from sysfs. 
+ */ + +#define DLM_LOCKSPACES_PATH "/sys/kernel/config/dlm/cluster/spaces" + +int lm_get_lockspaces_dlm(struct list_head *ls_rejoin) +{ + struct lockspace *ls; + struct dirent *de; + DIR *ls_dir; + + if (!(ls_dir = opendir(DLM_LOCKSPACES_PATH))) + return -ECONNREFUSED; + + while ((de = readdir(ls_dir))) { + if (de->d_name[0] == '.') + continue; + + if (!(ls = alloc_lockspace())) { + closedir(ls_dir); + return -ENOMEM; + } + + ls->lm_type = LD_LM_DLM; + strncpy(ls->name, de->d_name, MAX_NAME); + strncpy(ls->vg_name, ls->name + strlen(LVM_LS_PREFIX), MAX_NAME); + list_add_tail(&ls->list, ls_rejoin); + } + + closedir(ls_dir); + return 0; +} + diff --git a/daemons/lvmlockd/lvmlockd-internal.h b/daemons/lvmlockd/lvmlockd-internal.h new file mode 100644 index 000000000..8c6382636 --- /dev/null +++ b/daemons/lvmlockd/lvmlockd-internal.h @@ -0,0 +1,387 @@ +/* + * Copyright (C) 2014 Red Hat, Inc. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU Lesser General Public License v.2.1. 
+ */ + +#ifndef _LVM_LVMLOCKD_INTERNAL_H +#define _LVM_LVMLOCKD_INTERNAL_H + +/* TODO: figure out real restraints/requirements for these */ +#define MAX_NAME 64 +#define MAX_ARGS 64 + +#define R_NAME_GL_DISABLED "_GLLK_disabled" +#define R_NAME_GL "GLLK" +#define R_NAME_VG "VGLK" +#define S_NAME_GL_DLM "lvm_global" +#define LVM_LS_PREFIX "lvm_" /* ls name is prefix + vg_name */ +/* global lockspace name for sanlock is a vg name */ + +/* lock manager types */ +enum { + LD_LM_NONE = 0, + LD_LM_UNUSED = 1, /* place holder so values match lib/locking/lvmlockd.h */ + LD_LM_DLM = 2, + LD_LM_SANLOCK = 3, +}; + +/* operation types */ +enum { + LD_OP_HELLO = 1, + LD_OP_QUIT, + LD_OP_INIT, + LD_OP_FREE, + LD_OP_START, + LD_OP_STOP, + LD_OP_LOCK, + LD_OP_UPDATE, + LD_OP_CLOSE, + LD_OP_ENABLE, + LD_OP_DISABLE, + LD_OP_ADD_LOCAL, + LD_OP_REM_LOCAL, + LD_OP_UPDATE_LOCAL, + LD_OP_START_WAIT, + LD_OP_STOP_ALL, + LD_OP_DUMP_INFO, + LD_OP_DUMP_LOG, +}; + +/* resource types */ +enum { + LD_RT_GL = 1, + LD_RT_VG, + LD_RT_LV, +}; + +/* lock modes, more restrictive must be larger value */ +enum { + LD_LK_IV = -1, + LD_LK_UN = 0, + LD_LK_NL = 1, + LD_LK_SH = 2, + LD_LK_EX = 3, +}; + +struct list_head { + struct list_head *next, *prev; +}; + +struct client { + struct list_head list; + pthread_mutex_t mutex; + int pid; + int fd; + int pi; + uint32_t id; + unsigned int recv : 1; + unsigned int dead : 1; + unsigned int poll_ignore : 1; + char name[MAX_NAME]; +}; + +#define LD_AF_PERSISTENT 0x00000001 +#define LD_AF_UNUSED 0x00000002 /* use me */ +#define LD_AF_UNLOCK_CANCEL 0x00000004 +#define LD_AF_NEXT_VERSION 0x00000008 +#define LD_AF_WAIT 0x00000010 +#define LD_AF_FORCE 0x00000020 +#define LD_AF_EX_DISABLE 0x00000040 +#define LD_AF_ENABLE 0x00000080 +#define LD_AF_DISABLE 0x00000100 +#define LD_AF_SEARCH_LS 0x00000200 +#define LD_AF_LOCAL_LS 0x00000400 +#define LD_AF_UPDATE_NAMES_VERSION 0x00000800 +#define LD_AF_WAIT_STARTING 0x00001000 +#define LD_AF_DUP_GL_LS 0x00002000 +#define 
LD_AF_INACTIVE_LS 0x00004000 +#define LD_AF_ADD_LS_ERROR 0x00008000 +#define LD_AF_ADOPT 0x00010000 + +/* + * Number of times to repeat a lock request after + * a lock conflict (-EAGAIN) if unspecified in the + * request. + */ +#define DEFAULT_MAX_RETRIES 4 + +struct action { + struct list_head list; + uint32_t client_id; + uint32_t flags; /* LD_AF_ */ + uint32_t version; + uint64_t host_id; + int8_t op; /* operation type LD_OP_ */ + int8_t rt; /* resource type LD_RT_ */ + int8_t mode; /* lock mode LD_LK_ */ + int8_t lm_type; /* lock manager: LM_DLM, LM_SANLOCK */ + int retries; + int max_retries; + int result; + int lm_rv; /* return value from lm_ function */ + char vg_uuid[64]; + char vg_name[MAX_NAME+1]; + char lv_name[MAX_NAME+1]; + char vg_args[MAX_ARGS]; + char lv_args[MAX_ARGS]; + char vg_sysid[MAX_NAME+1]; +}; + +struct resource { + struct list_head list; /* lockspace.resources */ + char name[MAX_NAME+1]; /* vg name or lv name */ + int8_t type; /* resource type LD_RT_ */ + int8_t mode; + unsigned int sh_count; /* number of sh locks on locks list */ + uint32_t version; + unsigned int lm_init : 1; /* lm_data is initialized */ + struct list_head locks; + struct list_head actions; + struct val_blk *vb; + char lv_args[MAX_ARGS]; + char lm_data[0]; /* lock manager specific data */ +}; + +#define LD_LF_PERSISTENT 0x00000001 + +struct lock { + struct list_head list; /* resource.locks */ + int8_t mode; /* lock mode LD_LK_ */ + uint32_t version; + uint32_t flags; /* LD_LF_ */ + uint32_t client_id; /* may be 0 for persistent or internal locks */ +}; + +struct lockspace { + struct list_head list; /* lockspaces */ + char name[MAX_NAME+1]; + char vg_name[MAX_NAME+1]; + char vg_uuid[64]; + char vg_args[MAX_ARGS]; /* lock manager specific args */ + char vg_sysid[MAX_NAME+1]; + int8_t lm_type; /* lock manager: LM_DLM, LM_SANLOCK */ + void *lm_data; + uint64_t host_id; + uint32_t names_version; /* read/write from/to gl val_blk n_version */ + + uint32_t start_client_id; /* 
client_id that started the lockspace */ + pthread_t thread; /* makes synchronous lock requests */ + pthread_cond_t cond; + pthread_mutex_t mutex; + unsigned int create_fail : 1; + unsigned int create_done : 1; + unsigned int thread_work : 1; + unsigned int thread_stop : 1; + unsigned int thread_done : 1; + unsigned int update_local_vgs : 1; + unsigned int update_names_version: 1; + unsigned int sanlock_gl_enabled: 1; + unsigned int sanlock_gl_dup: 1; + + struct list_head actions; /* new client actions */ + struct list_head resources; /* resource/lock state for gl/vg/lv */ + /* TODO: should probably be tree */ +}; + +#define VAL_BLK_VERSION 0x0101 + +struct val_blk { + uint16_t version; + uint16_t flags; + uint32_t r_version; + uint32_t n_version; +}; + +/* lm_unlock flags */ +#define LMUF_FREE_VG 0x00000001 + +struct lockspace *alloc_lockspace(void); +int lockspaces_empty(void); +int last_string_from_args(char *args_in, char *last); +int version_from_args(char *args, unsigned int *major, unsigned int *minor, unsigned int *patch); + +int lm_init_vg_dlm(char *ls_name, char *vg_name, uint32_t flags, char *vg_args); +int lm_add_lockspace_dlm(struct lockspace *ls, int adopt); +int lm_rem_lockspace_dlm(struct lockspace *ls, int free_vg); +int lm_lock_dlm(struct lockspace *ls, struct resource *r, int ld_mode, + uint32_t *r_version, uint32_t *n_version, int adopt); +int lm_convert_dlm(struct lockspace *ls, struct resource *r, + int ld_mode, uint32_t r_version); +int lm_unlock_dlm(struct lockspace *ls, struct resource *r, + uint32_t r_version, uint32_t n_version, uint32_t lmu_flags); +int lm_rem_resource_dlm(struct lockspace *ls, struct resource *r); +int lm_get_lockspaces_dlm(struct list_head *ls_rejoin); +int lm_data_size_dlm(void); + +int lm_init_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_args); +int lm_init_lv_sanlock(char *ls_name, char *vg_name, char *lv_name, char *vg_args, char *lv_args); +int lm_free_lv_sanlock(struct lockspace *ls, struct 
resource *r); +int lm_rename_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_args); +int lm_add_lockspace_sanlock(struct lockspace *ls, int adopt); +int lm_rem_lockspace_sanlock(struct lockspace *ls, int free_vg); +int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode, + uint32_t *r_version, uint32_t *n_version, int *retry, int adopt); +int lm_convert_sanlock(struct lockspace *ls, struct resource *r, + int ld_mode, uint32_t r_version); +int lm_unlock_sanlock(struct lockspace *ls, struct resource *r, + uint32_t r_version, uint32_t n_version, uint32_t lmu_flags); +int lm_able_gl_sanlock(struct lockspace *ls, int enable); +int lm_ex_disable_gl_sanlock(struct lockspace *ls); +int lm_hosts_sanlock(struct lockspace *ls, int notify); +int lm_rem_resource_sanlock(struct lockspace *ls, struct resource *r); +int lm_gl_is_enabled(struct lockspace *ls); +int lm_get_lockspaces_sanlock(struct list_head *ls_rejoin); +int lm_data_size_sanlock(void); + +#if __BYTE_ORDER == __BIG_ENDIAN +#define le16_to_cpu(x) (bswap_16((x))) +#define le32_to_cpu(x) (bswap_32((x))) +#define le64_to_cpu(x) (bswap_64((x))) +#define cpu_to_le16(x) (bswap_16((x))) +#define cpu_to_le32(x) (bswap_32((x))) +#define cpu_to_le64(x) (bswap_64((x))) +#endif + +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define le16_to_cpu(x) (x) +#define le32_to_cpu(x) (x) +#define le64_to_cpu(x) (x) +#define cpu_to_le16(x) (x) +#define cpu_to_le32(x) (x) +#define cpu_to_le64(x) (x) +#endif + +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +static inline void INIT_LIST_HEAD(struct list_head *list) +{ + list->next = list; + list->prev = list; +} + +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static inline void __list_del(struct list_head 
*prev, struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} + +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) + +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) + +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + + +/* to improve readability */ +#define WAIT 1 +#define NO_WAIT 0 +#define FORCE 1 +#define NO_FORCE 0 + +/* + * global variables + */ + +#ifndef EXTERN +#define EXTERN extern +#define INIT(X) +#else +#undef EXTERN +#define EXTERN +#define INIT(X) =X +#endif + +/* + * gl_type_static and gl_use_ are set by command line or config file + * to specify whether the global lock comes from dlm or sanlock. + * Without a static setting, lvmlockd will figure out where the + * global lock should be (but it could get mixed up in cases where + * both sanlock and dlm vgs exist.) 
+ * + * gl_use_dlm means that the gl should come from lockspace gl_lsname_dlm + * gl_use_sanlock means that the gl should come from lockspace gl_lsname_sanlock + * + * gl_use_dlm has precedence over gl_use_sanlock, so if a node sees both + * dlm and sanlock vgs, it will use the dlm gl. + * + * gl_use_ is set when the first evidence of that lm_type is seen + * in any command. + * + * gl_lsname_sanlock is set when the first vg is seen in which an + * enabled gl is exists, or when init_vg creates a vg with gl enabled, + * or when enable_gl is used. + * + * gl_lsname_sanlock is cleared when free_vg deletes a vg with gl enabled + * or when disable_gl matches. + */ + +EXTERN int gl_type_static; +EXTERN int gl_use_dlm; +EXTERN int gl_use_sanlock; +EXTERN pthread_mutex_t gl_type_mutex; + +EXTERN char gl_lsname_dlm[MAX_NAME+1]; +EXTERN char gl_lsname_sanlock[MAX_NAME+1]; + +EXTERN int gl_running_dlm; +EXTERN int gl_auto_dlm; + +EXTERN int daemon_test; /* run as much as possible without a live lock manager */ +EXTERN int daemon_debug; +EXTERN int daemon_host_id; +EXTERN const char *daemon_host_id_file; + +void log_level(int level, const char *fmt, ...) __attribute__((format(printf, 2, 3))); +#define log_debug(fmt, args...) log_level(LOG_DEBUG, fmt, ##args) +#define log_error(fmt, args...) log_level(LOG_ERR, fmt, ##args) +#define log_warn(fmt, args...) log_level(LOG_WARNING, fmt, ##args) + +#endif diff --git a/daemons/lvmlockd/lvmlockd-sanlock.c b/daemons/lvmlockd/lvmlockd-sanlock.c new file mode 100644 index 000000000..50224c932 --- /dev/null +++ b/daemons/lvmlockd/lvmlockd-sanlock.c @@ -0,0 +1,1475 @@ +/* + * Copyright (C) 2014 Red Hat, Inc. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU Lesser General Public License v.2.1. 
+ */ + +#define _XOPEN_SOURCE 500 /* pthread */ +#define _ISOC99_SOURCE + +#include <assert.h> +#include <pthread.h> +#include <stdint.h> +#include <stddef.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdio.h> +#include <poll.h> +#include <errno.h> +#include <string.h> +#include <syslog.h> +#include <sys/types.h> +#include <sys/socket.h> + +#include "configure.h" +#include "daemon-server.h" +#include "daemon-log.h" + +#include "lvmlockd-internal.h" +#include "lvmlockd-client.h" + +#include "sanlock.h" +#include "sanlock_rv.h" +#include "sanlock_admin.h" +#include "sanlock_resource.h" + +/* + * If access to the pv containing the vg's leases is lost, sanlock cannot renew + * the leases we have acquired for locked LVs. This means that we could soon + * loose the lease to another host which could activate our LV exclusively. We + * do not want to get to the point of two hosts having the same LV active + * exclusively (it obviously violates the purpose of LV locks.) + * + * The default method of preventing this problem is for lvmlockd to do nothing, + * which produces a safe but potentially inconvenient result. Doing nothing + * leads to our LV leases not being released, which leads to sanlock using the + * local watchdog to reset us before another host can acquire our lock. It + * would often be preferrable to avoid the abrupt hard reset from the watchdog. + * + * There are other options to avoid being reset by our watchdog. If we can + * quickly stop using the LVs in question and release the locks for them, then + * we could avoid a reset (there's a certain grace period of about 40 seconds + * in which we can attempt this.) To do this, we can tell sanlock to run a + * specific program when it has lost access to our leases. We could use this + * program to: + * + * 1. Deactivate all lvs in the effected vg. If all the leases are + * deactivated, then our LV locks would be released and sanlock would no longer + * use the watchdog to reset us. 
If file systems are mounted on the active + * lvs, then deactivating them would fail, so this option would be of limited + * usefulness. + * + * 2. Option 1 could be extended to kill pids using the fs on the lv, unmount + * the fs, and deactivate the lv. This is probably out of scope for lvm + * directly, and would likely need the help of another system service. + * + * 3. Use dmsetup suspend to block access to lvs in the effected vg. If this + * was successful, the local host could no longer write to the lvs, we could + * safely release the LV locks, and sanlock would no longer reset us. At this + * point, with suspended lvs, the host would be in a fairly hobbled state, and + * would almost certainly need a manual, forcible reset. + * + * 4. Option 3 could be extended to monitor the lost storage, and if it is + * reconnected, the leases could be reacquired, and the suspended lvs resumed + * (reacquiring leases will fail if another host has acquired them since they + * were released.) This complexity of this option, combined with the fact that + * the error conditions are often not as simple as storage being lost and then + * later connecting, will result in this option being too unreliable. + * + * TODO: add a config option that we could use to select a different behavior + * than the default. Then implement one of the simpler options as a proof of + * concept, which could be extended if needed. + */ + +/* + * Each lockspace thread has its own sanlock daemon connection. + * If they shared one, sanlock acquire/release calls would be + * serialized. Some aspects of sanlock expect a single connection + * from each pid: signals due to a sanlock_request, and + * acquire/release/convert/inquire. The later can probably be + * addressed with a flag to indicate that the pid field should be + * interpretted as 'ci' (which the caller would need to figure + * out somehow.) 
+ */ + +struct lm_sanlock { + struct sanlk_lockspace ss; + int align_size; + int sock; /* sanlock daemon connection */ +}; + +struct rd_sanlock { + union { + struct sanlk_resource rs; + char buf[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; + }; + struct val_blk *vb; +}; + +struct sanlk_resourced { + union { + struct sanlk_resource rs; + char buf[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; + }; +}; + +int lm_data_size_sanlock(void) +{ + return sizeof(struct rd_sanlock); +} + +/* + * lock_args format + * + * vg_lock_args format for sanlock is + * vg_version_string:undefined:lock_lv_name + * + * lv_lock_args format for sanlock is + * lv_version_string:undefined:offset + * + * version_string is MAJOR.MINOR.PATCH + * undefined may contain ":" + * + * If a new version of the lock_args string cannot be + * handled by an old version of lvmlockd, then the + * new lock_args string should contain a larger major number. + */ + +#define VG_LOCK_ARGS_MAJOR 1 +#define VG_LOCK_ARGS_MINOR 0 +#define VG_LOCK_ARGS_PATCH 0 + +#define LV_LOCK_ARGS_MAJOR 1 +#define LV_LOCK_ARGS_MINOR 0 +#define LV_LOCK_ARGS_PATCH 0 + +/* + * offset 0 is lockspace + * offset align_size * 1 is unused + * offset align_size * 2 is unused + * ... + * offset align_size * 64 is unused + * offset align_size * 65 is gl lock + * offset align_size * 66 is vg lock + * offset align_size * 67 is first lv lock + * offset align_size * 68 is second lv lock + * ... 
+ */ + +#define LS_BEGIN 0 +#define GL_LOCK_BEGIN 65 +#define VG_LOCK_BEGIN 66 +#define LV_LOCK_BEGIN 67 + +static int lock_lv_name_from_args(char *vg_args, char *lock_lv_name) +{ + return last_string_from_args(vg_args, lock_lv_name); +} + +static int lock_lv_offset_from_args(char *lv_args, uint64_t *lock_lv_offset) +{ + char offset_str[MAX_ARGS]; + int rv; + + memset(offset_str, 0, sizeof(offset_str)); + + rv = last_string_from_args(lv_args, offset_str); + if (rv < 0) + return rv; + + *lock_lv_offset = strtoull(offset_str, NULL, 10); + return 0; +} + +static int check_args_version(char *args, unsigned int our_major) +{ + unsigned int major = 0; + int rv; + + rv = version_from_args(args, &major, NULL, NULL); + if (rv < 0) { + log_error("check_args_version %s error %d", args, rv); + return rv; + } + + if (major > our_major) { + log_error("check_args_version %s major %u %u", args, major, our_major); + return -1; + } + + return 0; +} + +#define MAX_LINE 64 + +static int read_host_id_file(void) +{ + FILE *file; + char line[MAX_LINE]; + char key_str[MAX_LINE]; + char val_str[MAX_LINE]; + char *key, *val, *sep; + int host_id = 0; + + file = fopen(daemon_host_id_file, "r"); + if (!file) + goto out; + + while (fgets(line, MAX_LINE, file)) { + if (line[0] == '#' || line[0] == '\n') + continue; + + key = line; + sep = strstr(line, "="); + val = sep + 1; + + if (!sep || !val) + continue; + + *sep = '\0'; + memset(key_str, 0, sizeof(key_str)); + memset(val_str, 0, sizeof(val_str)); + sscanf(key, "%s", key_str); + sscanf(val, "%s", val_str); + + if (!strcmp(key_str, "host_id")) { + host_id = atoi(val_str); + break; + } + } + fclose(file); +out: + log_debug("host_id %d from %s", host_id, daemon_host_id_file); + return host_id; +} + +/* + * vgcreate + * + * For init_vg, vgcreate passes the internal lv name as vg_args. + * This constructs the full/proper vg_args format, containing the + * version and lv name, and returns the real lock_args in vg_args. 
+ */ + +int lm_init_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_args) +{ + struct sanlk_lockspace ss; + struct sanlk_resourced rd; + struct sanlk_disk disk; + char lock_lv_name[MAX_ARGS]; + char lock_args_version[MAX_ARGS]; + const char *gl_name = NULL; + uint64_t offset; + int align_size; + int i, rv; + + memset(&ss, 0, sizeof(ss)); + memset(&rd, 0, sizeof(rd)); + memset(&disk, 0, sizeof(disk)); + memset(lock_lv_name, 0, sizeof(lock_lv_name)); + memset(lock_args_version, 0, sizeof(lock_args_version)); + + if (!vg_args || !vg_args[0] || !strcmp(vg_args, "none")) { + log_error("S %s init_vg_san vg_args missing", ls_name); + return -EINVAL; + } + + snprintf(lock_args_version, MAX_ARGS, "%u.%u.%u", + VG_LOCK_ARGS_MAJOR, VG_LOCK_ARGS_MINOR, VG_LOCK_ARGS_PATCH); + + /* see comment above about input vg_args being only lock_lv_name */ + snprintf(lock_lv_name, MAX_ARGS, "%s", vg_args); + + if (strlen(lock_lv_name) + strlen(lock_args_version) + 2 > MAX_ARGS) + return -ENAMETOOLONG; + + snprintf(disk.path, SANLK_PATH_LEN, "/dev/mapper/%s-%s", vg_name, lock_lv_name); + + log_debug("S %s init_vg_san path %s", ls_name, disk.path); + + if (daemon_test) { + if (!gl_lsname_sanlock[0]) + strncpy(gl_lsname_sanlock, ls_name, MAX_NAME); + goto out; + } + + align_size = sanlock_align(&disk); + if (align_size <= 0) { + log_error("S %s init_vg_san bad align size %d %s", + ls_name, align_size, disk.path); + return -EINVAL; + } + + strncpy(ss.name, ls_name, SANLK_NAME_LEN); + memcpy(ss.host_id_disk.path, disk.path, SANLK_PATH_LEN); + ss.host_id_disk.offset = LS_BEGIN * align_size; + + rv = sanlock_write_lockspace(&ss, 0, 0, 0); + if (rv < 0) { + log_error("S %s init_vg_san write_lockspace error %d %s", + ls_name, rv, ss.host_id_disk.path); + return rv; + } + + /* + * We want to create the global lock in the first sanlock vg. + * If other sanlock vgs exist, then one of them must contain + * the gl. 
If gl_lsname_sanlock is not set, then perhaps + * the sanlock vg with the gl has been removed or has not yet + * been seen. (Would vgcreate get this far in that case?) + * If dlm vgs exist, then we choose to use the dlm gl and + * not a sanlock gl. + */ + + if (flags & LD_AF_ENABLE) + gl_name = R_NAME_GL; + else if (flags & LD_AF_DISABLE) + gl_name = R_NAME_GL_DISABLED; + else if (!gl_use_sanlock || gl_lsname_sanlock[0] || !lockspaces_empty()) + gl_name = R_NAME_GL_DISABLED; + else + gl_name = R_NAME_GL; + + memcpy(rd.rs.lockspace_name, ss.name, SANLK_NAME_LEN); + strncpy(rd.rs.name, gl_name, SANLK_NAME_LEN); + memcpy(rd.rs.disks[0].path, disk.path, SANLK_PATH_LEN); + rd.rs.disks[0].offset = align_size * GL_LOCK_BEGIN; + rd.rs.num_disks = 1; + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv < 0) { + log_error("S %s init_vg_san write_resource gl error %d %s", + ls_name, rv, rd.rs.disks[0].path); + return rv; + } + + memcpy(rd.rs.lockspace_name, ss.name, SANLK_NAME_LEN); + strncpy(rd.rs.name, R_NAME_VG, SANLK_NAME_LEN); + memcpy(rd.rs.disks[0].path, disk.path, SANLK_PATH_LEN); + rd.rs.disks[0].offset = align_size * VG_LOCK_BEGIN; + rd.rs.num_disks = 1; + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv < 0) { + log_error("S %s init_vg_san write_resource vg error %d %s", + ls_name, rv, rd.rs.disks[0].path); + return rv; + } + + if (!strcmp(gl_name, R_NAME_GL)) + strncpy(gl_lsname_sanlock, ls_name, MAX_NAME); + out: + snprintf(vg_args, MAX_ARGS, "%s:%s", lock_args_version, lock_lv_name); + + log_debug("S %s init_vg_san done vg_args %s", ls_name, vg_args); + + /* + * Go through all lv resource slots and initialize them with the + * correct lockspace name but a special resource name that indicates + * it is unused. 
+ */ + + memset(&rd, 0, sizeof(rd)); + rd.rs.num_disks = 1; + memcpy(rd.rs.disks[0].path, disk.path, SANLK_PATH_LEN); + strncpy(rd.rs.lockspace_name, ls_name, SANLK_NAME_LEN); + strcpy(rd.rs.name, "#unused"); + + offset = align_size * LV_LOCK_BEGIN; + + log_debug("S %s init_vg_san clearing lv lease areas", ls_name); + + for (i = 0; i < LVMLOCKD_SANLOCK_MAX_LVS_IN_VG; i++) { + rd.rs.disks[0].offset = offset; + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv) { + log_error("clear lv resource area %llu error %d", + (unsigned long long)offset, rv); + break; + } + offset += align_size; + } + + return 0; +} + +/* + * lvcreate + * + * The offset at which the lv lease is written is passed + * all the way back to the lvcreate command so that it + * can be saved in the lv's lock_args in the vg metadata. + */ + +int lm_init_lv_sanlock(char *ls_name, char *vg_name, char *lv_name, + char *vg_args, char *lv_args) +{ + struct sanlk_resourced rd; + char lock_lv_name[MAX_ARGS]; + char lock_args_version[MAX_ARGS]; + uint64_t offset; + int align_size; + int lv_count = 0; + int rv; + + memset(&rd, 0, sizeof(rd)); + memset(lock_lv_name, 0, sizeof(lock_lv_name)); + memset(lock_args_version, 0, sizeof(lock_args_version)); + + rv = lock_lv_name_from_args(vg_args, lock_lv_name); + if (rv < 0) { + log_error("S %s init_lv_san lock_lv_name_from_args error %d %s", + ls_name, rv, vg_args); + return rv; + } + + snprintf(lock_args_version, MAX_ARGS, "%u.%u.%u", + LV_LOCK_ARGS_MAJOR, LV_LOCK_ARGS_MINOR, LV_LOCK_ARGS_PATCH); + + strncpy(rd.rs.lockspace_name, ls_name, SANLK_NAME_LEN); + rd.rs.num_disks = 1; + snprintf(rd.rs.disks[0].path, SANLK_PATH_LEN, "/dev/mapper/%s-%s", vg_name, lock_lv_name); + + align_size = sanlock_align(&rd.rs.disks[0]); + if (align_size <= 0) { + log_error("S %s init_lv_san align error %d", ls_name, align_size); + return -EINVAL; + } + + offset = align_size * LV_LOCK_BEGIN; + rd.rs.disks[0].offset = offset; + + if (daemon_test) { + snprintf(lv_args, MAX_ARGS, 
"%s:%llu", + lock_args_version, (unsigned long long)1111); + return 0; + } + + while (1) { + rd.rs.disks[0].offset = offset; + + memset(rd.rs.name, 0, SANLK_NAME_LEN); + + rv = sanlock_read_resource(&rd.rs, 0); + if (rv) { + log_error("S %s init_lv_san read error %d offset %llu", + ls_name, rv, (unsigned long long)offset); + break; + } + + if (!strncmp(rd.rs.name, lv_name, SANLK_NAME_LEN)) { + log_error("S %s init_lv_san resource name %s already exists at %llu", + ls_name, lv_name, (unsigned long long)offset); + return -EEXIST; + } + + if (!strcmp(rd.rs.name, "#unused")) { + log_debug("S %s init_lv_san %s found unused area at %llu", + ls_name, lv_name, (unsigned long long)offset); + + strncpy(rd.rs.name, lv_name, SANLK_NAME_LEN); + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (!rv) { + snprintf(lv_args, MAX_ARGS, "%s:%llu", + lock_args_version, (unsigned long long)offset); + } else { + log_error("S %s init_lv_san write error %d offset %llu", + ls_name, rv, (unsigned long long)rv); + } + break; + } + + offset += align_size; + + if (lv_count++ >= LVMLOCKD_SANLOCK_MAX_LVS_IN_VG) { + log_error("S %s init_lv_san too many lvs %d", ls_name, lv_count); + rv = -ENOENT; + break; + } + } + + return rv; +} + +/* lvremove */ +int lm_free_lv_sanlock(struct lockspace *ls, struct resource *r) +{ + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + struct sanlk_resource *rs = &rds->rs; + int rv; + + log_debug("S %s R %s free_lv_san", ls->name, r->name); + + if (daemon_test) + return 0; + + strcpy(rs->name, "#unused"); + + rv = sanlock_write_resource(rs, 0, 0, 0); + if (rv < 0) { + log_error("S %s R %s free_lv_san write error %d", + ls->name, r->name, rv); + } + + return rv; +} + +int lm_ex_disable_gl_sanlock(struct lockspace *ls) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct sanlk_resourced rd1; + struct sanlk_resourced rd2; + struct sanlk_resource *rs1; + struct sanlk_resource *rs2; + struct sanlk_resource **rs_args; + int rv; + + 
rs_args = malloc(2 * sizeof(struct sanlk_resource *)); + if (!rs_args) + return -ENOMEM; + + rs1 = &rd1.rs; + rs2 = &rd2.rs; + + memset(&rd1, 0, sizeof(rd1)); + memset(&rd2, 0, sizeof(rd2)); + + strncpy(rd1.rs.lockspace_name, ls->name, SANLK_NAME_LEN); + strncpy(rd1.rs.name, R_NAME_GL, SANLK_NAME_LEN); + + strncpy(rd2.rs.lockspace_name, ls->name, SANLK_NAME_LEN); + strncpy(rd2.rs.name, R_NAME_GL_DISABLED, SANLK_NAME_LEN); + + rd1.rs.num_disks = 1; + strncpy(rd1.rs.disks[0].path, lms->ss.host_id_disk.path, SANLK_PATH_LEN); + rd1.rs.disks[0].offset = lms->align_size * GL_LOCK_BEGIN; + + rv = sanlock_acquire(lms->sock, -1, 0, 1, &rs1, NULL); + if (rv < 0) { + log_error("S %s ex_disable_gl_san acquire error %d", + ls->name, rv); + goto out; + } + + rs_args[0] = rs1; + rs_args[1] = rs2; + + rv = sanlock_release(lms->sock, -1, SANLK_REL_RENAME, 2, rs_args); + if (rv < 0) { + log_error("S %s ex_disable_gl_san release_rename error %d", + ls->name, rv); + } + +out: + free(rs_args); + return rv; +} + +/* + * enable/disable exist because each vg contains a global lock, + * but we only want to use the gl from one of them. The first + * sanlock vg created, has its gl enabled, and subsequent + * sanlock vgs have their gl disabled. If the vg containing the + * gl is removed, the gl from another sanlock vg needs to be + * enabled. Or, if gl in multiple vgs are somehow enabled, we + * want to be able to disable one of them. + * + * Disable works by naming/renaming the gl resource to have a + * name that is different from the predefined name. + * When a host attempts to acquire the gl with its standard + * predefined name, it will fail because the resource's name + * on disk doesn't match. 
+ */ + +int lm_able_gl_sanlock(struct lockspace *ls, int enable) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct sanlk_resourced rd; + const char *gl_name; + int rv; + + if (enable) + gl_name = R_NAME_GL; + else + gl_name = R_NAME_GL_DISABLED; + + memset(&rd, 0, sizeof(rd)); + + strncpy(rd.rs.lockspace_name, ls->name, SANLK_NAME_LEN); + strncpy(rd.rs.name, gl_name, SANLK_NAME_LEN); + + rd.rs.num_disks = 1; + strncpy(rd.rs.disks[0].path, lms->ss.host_id_disk.path, SANLK_PATH_LEN); + rd.rs.disks[0].offset = lms->align_size * GL_LOCK_BEGIN; + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv < 0) { + log_error("S %s able_gl %d write_resource gl error %d %s", + ls->name, enable, rv, rd.rs.disks[0].path); + return rv; + } + + log_debug("S %s able_gl %s", ls->name, gl_name); + + ls->sanlock_gl_enabled = enable; + if (ls->sanlock_gl_dup && !enable) + ls->sanlock_gl_dup = 0; + + if (enable) + strncpy(gl_lsname_sanlock, ls->name, MAX_NAME); + + if (!enable && !strcmp(gl_lsname_sanlock, ls->name)) + memset(gl_lsname_sanlock, 0, sizeof(gl_lsname_sanlock)); + + return 0; +} + +static int gl_is_enabled(struct lockspace *ls, struct lm_sanlock *lms) +{ + char strname[SANLK_NAME_LEN + 1]; + struct sanlk_resourced rd; + uint64_t offset; + int rv; + + memset(&rd, 0, sizeof(rd)); + + strncpy(rd.rs.lockspace_name, ls->name, SANLK_NAME_LEN); + + /* leave rs.name empty, it is what we're checking */ + + rd.rs.num_disks = 1; + strncpy(rd.rs.disks[0].path, lms->ss.host_id_disk.path, SANLK_PATH_LEN); + + offset = lms->align_size * GL_LOCK_BEGIN; + rd.rs.disks[0].offset = offset; + + rv = sanlock_read_resource(&rd.rs, 0); + if (rv < 0) { + log_error("gl_is_enabled read_resource error %d", rv); + return rv; + } + + memset(strname, 0, sizeof(strname)); + memcpy(strname, rd.rs.name, SANLK_NAME_LEN); + + if (!strcmp(strname, R_NAME_GL_DISABLED)) { + return 0; + } + + if (!strcmp(strname, R_NAME_GL)) { + return 1; + } + + log_error("gl_is_enabled invalid gl name 
%s", strname); + return -1; +} + +int lm_gl_is_enabled(struct lockspace *ls) +{ + int rv; + rv = gl_is_enabled(ls, ls->lm_data); + ls->sanlock_gl_enabled = rv; + return rv; +} + +/* + * host A: start_vg/add_lockspace + * host B: vgremove + * + * The global lock cannot always be held around start_vg + * on host A because the gl is in a vg that may not be + * started yet, or may be in the vg we are starting. + * + * If B removes the vg, destroying the delta leases, + * while A is a lockspace member, it will cause A's + * sanlock delta lease renewal to fail, and lockspace + * recovery. + * + * I expect this overlap would usually cause a failure + * in the add_lockspace() on host A when it sees that + * the lockspace structures have been clobbered by B. + * Having add_lockspace() fail should be a fine result. + * + * If add_lockspace was somehow able to finish, the + * subsequent renewal would probably fail instead. + * This should also not create any major problems. + */ + +int lm_add_lockspace_sanlock(struct lockspace *ls, int adopt) +{ + struct stat st; + struct lm_sanlock *lms; + char lock_lv_name[MAX_ARGS]; + char lsname[SANLK_NAME_LEN + 1]; + char disk_path[SANLK_PATH_LEN]; + int rv; + + memset(disk_path, 0, sizeof(disk_path)); + memset(lock_lv_name, 0, sizeof(lock_lv_name)); + + rv = check_args_version(ls->vg_args, VG_LOCK_ARGS_MAJOR); + if (rv < 0) + return rv; + + rv = lock_lv_name_from_args(ls->vg_args, lock_lv_name); + if (rv < 0) { + log_error("S %s add_lockspace_san lock_lv_name_from_args error %d %s", + ls->name, rv, ls->vg_args); + return rv; + } + + snprintf(disk_path, SANLK_PATH_LEN, "/dev/mapper/%s-%s", + ls->vg_name, lock_lv_name); + + /* + * When a vg is started, the internal sanlock lv should be + * activated before lvmlockd is asked to add the lockspace. + * (sanlock needs to use the lv.) + * + * TODO: can we ask something on the system to activate the + * sanlock lv or should we just require that vgchange be used + * to start sanlock vgs? 
+ * Should sanlock lvs be "auto-activated"? + */ + + /* FIXME: remove this, device is not always ready for us here */ + sleep(1); + + rv = stat(disk_path, &st); + if (rv < 0) { + log_error("S %s add_lockspace_san stat error %d disk_path %s", + ls->name, errno, disk_path); + return -1; + } + + if (!ls->host_id) { + if (daemon_host_id) + ls->host_id = daemon_host_id; + else if (daemon_host_id_file) + ls->host_id = read_host_id_file(); + } + + if (!ls->host_id || ls->host_id > 2000) { + log_error("S %s add_lockspace_san invalid host_id %llu", + ls->name, (unsigned long long)ls->host_id); + return -1; + } + + lms = malloc(sizeof(struct lm_sanlock)); + if (!lms) + return -ENOMEM; + + memset(lsname, 0, sizeof(lsname)); + strncpy(lsname, ls->name, SANLK_NAME_LEN); + + memcpy(lms->ss.name, lsname, SANLK_NAME_LEN); + lms->ss.host_id_disk.offset = 0; + lms->ss.host_id = ls->host_id; + strncpy(lms->ss.host_id_disk.path, disk_path, SANLK_PATH_LEN); + + if (daemon_test) { + if (!gl_lsname_sanlock[0]) { + strncpy(gl_lsname_sanlock, lsname, MAX_NAME); + log_debug("S %s add_lockspace_san use global lock", lsname); + } + goto out; + } + + lms->sock = sanlock_register(); + if (lms->sock < 0) { + log_error("S %s add_lockspace_san register error %d", lsname, lms->sock); + free(lms); + return -1; + } + + rv = sanlock_restrict(lms->sock, SANLK_RESTRICT_SIGKILL); + if (rv < 0) { + log_error("S %s restrict error %d", lsname, rv); + } + + lms->align_size = sanlock_align(&lms->ss.host_id_disk); + if (lms->align_size <= 0) { + log_error("S %s add_lockspace_san align error %d", lsname, lms->align_size); + close(lms->sock); + free(lms); + return -1; + } + + rv = gl_is_enabled(ls, lms); + if (rv < 0) { + log_error("S %s add_lockspace_san gl_enabled error %d", lsname, rv); + close(lms->sock); + free(lms); + return rv; + } + + ls->sanlock_gl_enabled = rv; + + if (rv) { + if (gl_use_dlm) { + log_error("S %s add_lockspace_san gl_use_dlm is set", lsname); + } else if (gl_lsname_sanlock[0] && 
strcmp(gl_lsname_sanlock, lsname)) { + log_error("S %s add_lockspace_san multiple sanlock global locks current %s", + lsname, gl_lsname_sanlock); + } else { + strncpy(gl_lsname_sanlock, lsname, MAX_NAME); + log_debug("S %s add_lockspace_san use global lock %s", + lsname, gl_lsname_sanlock); + } + } + + rv = sanlock_add_lockspace(&lms->ss, 0); + if (rv == -EEXIST && adopt) { + /* We could alternatively just skip the sanlock call for adopt. */ + log_debug("S %s add_lockspace_san adopt found ls", lsname); + goto out; + } + if (rv < 0) { + /* retry for some errors? */ + log_error("S %s add_lockspace_san add_lockspace error %d", lsname, rv); + close(lms->sock); + free(lms); + return rv; + } + + /* + * Don't let the lockspace be cleanly released if orphan locks + * exist, because the orphan locks are still protecting resources + * that are being used on the host, e.g. active lvs. If the + * lockspace is cleanly released, another host could acquire the + * orphan leases. + */ + + rv = sanlock_set_config(lsname, 0, SANLK_CONFIG_USED_BY_ORPHANS, NULL); + if (rv < 0) { + log_error("S %s add_lockspace_san set_config error %d", lsname, rv); + sanlock_rem_lockspace(&lms->ss, 0); + close(lms->sock); + free(lms); + return rv; + } + +out: + log_debug("S %s add_lockspace_san done", lsname); + + ls->lm_data = lms; + return 0; +} + +int lm_rem_lockspace_sanlock(struct lockspace *ls, int free_vg) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + int rv; + + if (daemon_test) + goto out; + + rv = sanlock_rem_lockspace(&lms->ss, 0); + if (rv < 0) { + log_error("S %s rem_lockspace_san error %d", ls->name, rv); + return rv; + } + + if (free_vg) { + /* + * Destroy sanlock lockspace (delta leases). Forces failure for any + * other host that is still using or attempts to use this lockspace. + * This shouldn't be generally necessary, but there may some races + * between nodes starting and removing a vg which this could help. 
+ */ + strncpy(lms->ss.name, "#unused", SANLK_NAME_LEN); + + rv = sanlock_write_lockspace(&lms->ss, 0, 0, 0); + if (rv < 0) { + log_error("S %s rem_lockspace free_vg write_lockspace error %d %s", + ls->name, rv, lms->ss.host_id_disk.path); + } + } +out: + close(lms->sock); + + free(lms); + ls->lm_data = NULL; + + /* TODO: should we only clear gl_lsname when doing free_vg? */ + + if (!strcmp(ls->name, gl_lsname_sanlock)) + memset(gl_lsname_sanlock, 0, sizeof(gl_lsname_sanlock)); + + return 0; +} + +#if 0 +static int find_lv_offset(struct lockspace *ls, struct resource *r, + uint64_t *lv_args_offset) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct sanlk_resourced rd; + uint64_t offset; + int align_size; + int lv_count = 0; + int rv; + + memset(&rd, 0, sizeof(rd)); + + strncpy(rd.rs.lockspace_name, ls->name, SANLK_NAME_LEN); + rd.rs.num_disks = 1; + memcpy(rd.rs.disks[0].path, lms->ss.host_id_disk.path, SANLK_PATH_LEN); + + align_size = sanlock_align(&rd.rs.disks[0]); + if (align_size <= 0) { + log_error("find_lv_offset align error %d", align_size); + return -EINVAL; + } + + offset = align_size * LV_LOCK_BEGIN; + + while (1) { + rd.rs.disks[0].offset = offset; + + memset(rd.rs.name, 0, SANLK_NAME_LEN); + + rv = sanlock_read_resource(&rd.rs, 0); + if (rv) { + log_error("S %s find_lv_offset read error %d offset %llu", + ls->name, rv, (unsigned long long)offset); + break; + } + + if (!strncmp(rd.rs.name, r->name, SANLK_NAME_LEN)) { + /* found it */ + *lv_args_offset = offset; + rv = 0; + break; + } + + offset += align_size; + + if (lv_count++ >= LVMLOCKD_SANLOCK_MAX_LVS_IN_VG) { + rv = -EBADSLT; + break; + } + + } + return rv; +} +#endif + +static int lm_add_resource_sanlock(struct lockspace *ls, struct resource *r) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + + strncpy(rds->rs.lockspace_name, ls->name, SANLK_NAME_LEN); + strncpy(rds->rs.name, r->name, 
SANLK_NAME_LEN); + rds->rs.num_disks = 1; + memcpy(rds->rs.disks[0].path, lms->ss.host_id_disk.path, SANLK_PATH_LEN); + + if (r->type == LD_RT_GL) + rds->rs.disks[0].offset = GL_LOCK_BEGIN * lms->align_size; + else if (r->type == LD_RT_VG) + rds->rs.disks[0].offset = VG_LOCK_BEGIN * lms->align_size; + + /* LD_RT_LV offset is set in each lm_lock call from lv_args. */ + + if (r->type == LD_RT_GL || r->type == LD_RT_VG) { + rds->vb = malloc(sizeof(struct val_blk)); + if (!rds->vb) + return -ENOMEM; + memset(rds->vb, 0, sizeof(struct val_blk)); + } + + return 0; +} + +int lm_rem_resource_sanlock(struct lockspace *ls, struct resource *r) +{ + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + + /* TODO: assert r->mode == UN or unlock if it's not? */ + + if (rds->vb) + free(rds->vb); + + memset(rds, 0, sizeof(struct rd_sanlock)); + r->lm_init = 0; + return 0; +} + +int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode, + uint32_t *r_version, uint32_t *n_version, int *retry, int adopt) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + struct sanlk_resource *rs; + uint64_t lock_lv_offset; + uint32_t flags = 0; + struct val_blk vb; + uint16_t vb_version; + int added = 0; + int rv; + + if (!r->lm_init) { + rv = lm_add_resource_sanlock(ls, r); + if (rv < 0) + return rv; + r->lm_init = 1; + added = 1; + } + + rs = &rds->rs; + + if (r->type == LD_RT_LV) { + /* + * The lv may have been removed and recreated with a new lease + * offset, so we need to get the offset from lv_args each time + * instead of reusing the value that we last set in rds->rs. + * act->lv_args is copied to r->lv_args before every lm_lock(). 
+ */ + + rv = check_args_version(r->lv_args, LV_LOCK_ARGS_MAJOR); + if (rv < 0) { + log_error("S %s R %s lock_san wrong lv_args version %s", + ls->name, r->name, r->lv_args); + return rv; + } + + rv = lock_lv_offset_from_args(r->lv_args, &lock_lv_offset); + if (rv < 0) { + log_error("S %s R %s lock_san lv_offset_from_args error %d %s", + ls->name, r->name, rv, r->lv_args); + return rv; + } + + if (!added && (rds->rs.disks[0].offset != lock_lv_offset)) { + log_debug("S %s R %s lock_san offset old %llu new %llu", + ls->name, r->name, + (unsigned long long)rds->rs.disks[0].offset, + (unsigned long long)lock_lv_offset); + } + + rds->rs.disks[0].offset = lock_lv_offset; + } + + if (ld_mode == LD_LK_SH) { + rs->flags |= SANLK_RES_SHARED; + } else if (ld_mode == LD_LK_EX) { + rs->flags &= ~SANLK_RES_SHARED; + } else { + log_error("lock_san invalid mode %d", ld_mode); + return -EINVAL; + } + + /* + * Use PERSISTENT because if lvmlockd exits while holding + * a lock, it's not safe to simply clear/drop the lock while + * a command or lv is using it. + */ + + rs->flags |= SANLK_RES_PERSISTENT; + + log_debug("S %s R %s lock_san acquire %s:%llu", + ls->name, r->name, rs->disks[0].path, + (unsigned long long)rs->disks[0].offset); + + if (daemon_test) { + *r_version = 0; + *n_version = 0; + return 0; + } + + if (rds->vb) + flags |= SANLK_ACQUIRE_LVB; + if (adopt) + flags |= SANLK_ACQUIRE_ORPHAN_ONLY; + + rv = sanlock_acquire(lms->sock, -1, flags, 1, &rs, NULL); + + if (rv == -EAGAIN) { + /* + * It appears that sanlock_acquire returns EAGAIN when we request + * a shared lock but the lock is held ex by another host. + * There's no point in retrying this case, just return an error. + * + * TODO: verify the sanlock behavior here. 
+ */ + log_debug("S %s R %s lock_san acquire mode %d rv EAGAIN", ls->name, r->name, ld_mode); + *retry = 0; + return -EAGAIN; + } + + if (adopt && (rv == -EUCLEAN)) { + /* + * The orphan lock exists but in a different mode than we asked + * for, so the caller should try again with the other mode. + */ + log_debug("S %s R %s lock_san adopt mode %d try other mode", + ls->name, r->name, ld_mode); + *retry = 0; + return -EUCLEAN; + } + + if (adopt && (rv == -ENOENT)) { + /* + * No orphan lock exists. + */ + log_debug("S %s R %s lock_san adopt mode %d no orphan found", + ls->name, r->name, ld_mode); + *retry = 0; + return -ENOENT; + } + + if (rv == SANLK_ACQUIRE_IDLIVE || rv == SANLK_ACQUIRE_OWNED || rv == SANLK_ACQUIRE_OTHER) { + /* + * The lock is held by another host. These failures can + * happen while multiple hosts are concurrently acquiring + * shared locks. We want to retry a couple times in this + * case because we'll probably get the sh lock. + * + * I believe these are also the errors when requesting an + * ex lock that another host holds ex. We want to report + * something like: "lock is held by another host" in this case. + * Retry is pointless here. + * + * We can't distinguish between the two cases above, + * so if requesting a sh lock, retry a couple times, + * otherwise don't. + * + * TODO: verify sanlock behavior here. + */ + log_debug("S %s R %s lock_san acquire mode %d rv %d", ls->name, r->name, ld_mode, rv); + *retry = (ld_mode == LD_LK_SH) ? 
1 : 0; + return -EAGAIN; + } + + if (rv < 0) { + log_error("S %s R %s lock_san acquire error %d", + ls->name, r->name, rv); + + if (added) { + lm_rem_resource_sanlock(ls, r); + return rv; + } + + /* if the gl has been disabled, remove and free the gl resource */ + if ((rv == SANLK_LEADER_RESOURCE) && (r->type == LD_RT_GL)) { + if (!lm_gl_is_enabled(ls)) { + log_error("S %s R %s lock_san gl has been disabled", + ls->name, r->name); + if (!strcmp(gl_lsname_sanlock, ls->name)) + memset(gl_lsname_sanlock, 0, sizeof(gl_lsname_sanlock)); + return -EUNATCH; + } + } + + return rv; + } + + if (rds->vb) { + rv = sanlock_get_lvb(0, rs, (char *)&vb, sizeof(vb)); + if (rv < 0) { + log_error("S %s R %s lock_san get_lvb error %d", ls->name, r->name, rv); + *r_version = 0; + *n_version = 0; + goto out; + } + + vb_version = le16_to_cpu(vb.version); + + if (vb_version && ((vb_version & 0xFF00) > (VAL_BLK_VERSION & 0xFF00))) { + log_error("S %s R %s lock_san ignore vb_version %x", + ls->name, r->name, vb_version); + *r_version = 0; + free(rds->vb); + rds->vb = NULL; + goto out; + } + + *r_version = le32_to_cpu(vb.r_version); + *n_version = le32_to_cpu(vb.n_version); + memcpy(rds->vb, &vb, sizeof(vb)); /* rds->vb saved as le */ + + log_debug("S %s R %s lock_san get r_version %u n_version %u", + ls->name, r->name, *r_version, *n_version); + } +out: + return rv; +} + +int lm_convert_sanlock(struct lockspace *ls, struct resource *r, + int ld_mode, uint32_t r_version) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + struct sanlk_resource *rs = &rds->rs; + struct val_blk vb; + uint32_t flags = 0; + int rv; + + log_debug("S %s R %s convert_san", ls->name, r->name); + + if (daemon_test) + goto rs_flag; + + if (rds->vb && r_version && (r->mode == LD_LK_EX)) { + if (!rds->vb->version) { + /* first time vb has been written */ + rds->vb->version = cpu_to_le16(VAL_BLK_VERSION); + } + if (r_version) + rds->vb->r_version = 
cpu_to_le32(r_version); + memcpy(&vb, rds->vb, sizeof(vb)); + + log_debug("S %s R %s convert_san set r_version %u", + ls->name, r->name, r_version); + + rv = sanlock_set_lvb(0, rs, (char *)&vb, sizeof(vb)); + if (rv < 0) { + log_error("S %s R %s convert_san set_lvb error %d", + ls->name, r->name, rv); + } + } + + rs_flag: + if (ld_mode == LD_LK_SH) + rs->flags |= SANLK_RES_SHARED; + else + rs->flags &= ~SANLK_RES_SHARED; + + if (daemon_test) + return 0; + + rv = sanlock_convert(lms->sock, -1, flags, rs); + if (rv == -EAGAIN) { + /* TODO: what case is this? what should be done? */ + log_error("S %s R %s convert_san EAGAIN", ls->name, r->name); + return -EAGAIN; + } + if (rv < 0) { + log_error("S %s R %s convert_san convert error %d", ls->name, r->name, rv); + } + + return rv; +} + +static int release_rename(struct lockspace *ls, struct resource *r) +{ + struct rd_sanlock rd1; + struct rd_sanlock rd2; + struct sanlk_resource *res1; + struct sanlk_resource *res2; + struct sanlk_resource **res_args; + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + int rv; + + log_debug("S %s R %s release rename", ls->name, r->name); + + res_args = malloc(2 * sizeof(struct sanlk_resource *)); + if (!res_args) + return -ENOMEM; + + memcpy(&rd1, rds, sizeof(struct rd_sanlock)); + memcpy(&rd2, rds, sizeof(struct rd_sanlock)); + + res1 = (struct sanlk_resource *)&rd1; + res2 = (struct sanlk_resource *)&rd2; + + strcpy(res2->name, "invalid_removed"); + + res_args[0] = res1; + res_args[1] = res2; + + rv = sanlock_release(lms->sock, -1, SANLK_REL_RENAME, 2, res_args); + if (rv < 0) { + log_error("S %s R %s unlock_san release rename error %d", ls->name, r->name, rv); + } + + free(res_args); + + return rv; +} + +/* + * rds->vb is stored in le + * + * r_version is r->version + * + * for GL locks lvmlockd just increments this value + * each time the global lock is released from ex. 
+ * + * for VG locks it is the seqno from the vg metadata. + * + * n_version is r->names_version + * + * n_version is only used in gl locks. + * lvmlockd increments this value each time + * the global lock is released from ex by a + * command that changes the list of vgs. + */ + +int lm_unlock_sanlock(struct lockspace *ls, struct resource *r, + uint32_t r_version, uint32_t n_version, uint32_t lmu_flags) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + struct sanlk_resource *rs = &rds->rs; + struct val_blk vb; + int rv; + + log_debug("S %s R %s unlock_san r_version %u flags %x", + ls->name, r->name, r_version, lmu_flags); + + if (daemon_test) + return 0; + + if (rds->vb && r_version && (r->mode == LD_LK_EX)) { + if (!rds->vb->version) { + /* first time vb has been written */ + rds->vb->version = cpu_to_le16(VAL_BLK_VERSION); + } + if (r_version) + rds->vb->r_version = cpu_to_le32(r_version); + if (n_version) + rds->vb->n_version = cpu_to_le32(n_version); + memcpy(&vb, rds->vb, sizeof(vb)); + + log_debug("S %s R %s unlock_san set r_version %u n_version %u", + ls->name, r->name, r_version, n_version); + + rv = sanlock_set_lvb(0, rs, (char *)&vb, sizeof(vb)); + if (rv < 0) { + log_error("S %s R %s unlock_san set_lvb error %d", + ls->name, r->name, rv); + } + } + + /* + * For vgremove (FREE_VG) we unlock-rename the vg and gl locks + * so they cannot be reacquired. 
+ */
+ if ((lmu_flags & LMUF_FREE_VG) &&
+ (r->type == LD_RT_GL || r->type == LD_RT_VG)) {
+ return release_rename(ls, r);
+ }
+
+ rv = sanlock_release(lms->sock, -1, 0, 1, &rs);
+ if (rv < 0) {
+ log_error("S %s R %s unlock_san release error %d", ls->name, r->name, rv);
+ }
+
+ return rv;
+}
+
+int lm_hosts_sanlock(struct lockspace *ls, int notify)
+{
+ struct sanlk_host *hss = NULL;
+ struct sanlk_host *hs;
+ uint32_t state;
+ int hss_count = 0;
+ int found_self = 0;
+ int found_others = 0;
+ int i, rv;
+
+ rv = sanlock_get_hosts(ls->name, 0, &hss, &hss_count, 0);
+ if (rv < 0) {
+ log_error("S %s hosts_san get_hosts error %d", ls->name, rv);
+ return 0;
+ }
+
+ if (!hss || !hss_count) {
+ log_error("S %s hosts_san zero hosts", ls->name);
+ return 0;
+ }
+
+ hs = hss;
+
+ for (i = 0; i < hss_count; i++) {
+ log_debug("S %s hosts_san host_id %llu gen %llu flags %x",
+ ls->name,
+ (unsigned long long)hs->host_id,
+ (unsigned long long)hs->generation,
+ hs->flags);
+
+ if (hs->host_id == ls->host_id) {
+ found_self = 1;
+ hs++;
+ continue;
+ }
+
+ state = hs->flags & SANLK_HOST_MASK;
+ if (state == SANLK_HOST_LIVE)
+ found_others++;
+ hs++;
+ }
+ free(hss);
+
+ if (found_others && notify) {
+#if 0
+ struct sanlk_host_event he;
+ memset(&he, 0, sizeof(he));
+ he.host_id = 1;
+ he.generation = 0;
+ he.event = EVENT_VGSTOP;
+ sanlock_set_event(ls->name, &he, SANLK_SETEV_ALL_HOSTS);
+#endif
+ /*
+ * We'll need to retry for a while before all the hosts see
+ * this event and stop the vg.
+ * We'll need to register for events from the lockspace
+ * and add the registered fd to our poll set.
+ */ + } + + if (!found_self) { + log_error("S %s hosts_san self not found others %d", ls->name, found_others); + return 0; + } + + return found_others; +} + +int lm_get_lockspaces_sanlock(struct list_head *ls_rejoin) +{ + struct sanlk_lockspace *ss_all = NULL; + struct sanlk_lockspace *ss; + struct lockspace *ls; + int ss_count = 0; + int i, rv; + + rv = sanlock_get_lockspaces(&ss_all, &ss_count, 0); + if (rv < 0) + return rv; + + if (!ss_all || !ss_count) + return 0; + + ss = ss_all; + + for (i = 0; i < ss_count; i++) { + + if (strncmp(ss->name, LVM_LS_PREFIX, strlen(LVM_LS_PREFIX))) + continue; + + if (!(ls = alloc_lockspace())) + return -ENOMEM; + + ls->lm_type = LD_LM_SANLOCK; + ls->host_id = ss->host_id; + strncpy(ls->name, ss->name, MAX_NAME); + strncpy(ls->vg_name, ss->name + strlen(LVM_LS_PREFIX), MAX_NAME); + list_add_tail(&ls->list, ls_rejoin); + + ss++; + } + + free(ss_all); + return 0; +} + diff --git a/man/lvmlockd.8.in b/man/lvmlockd.8.in new file mode 100644 index 000000000..f20bfd095 --- /dev/null +++ b/man/lvmlockd.8.in @@ -0,0 +1,801 @@ +.TH "LVMLOCKD" "8" "LVM TOOLS #VERSION#" "Red Hat, Inc" "\"" + +.SH NAME +lvmlockd \(em lvm locking daemon + +.SH DESCRIPTION +lvm commands use lvmlockd to coordinate access to shared storage. +.br +When lvm is used on devices shared by multiple hosts, locks will: + +- coordinate reading and writing of lvm metadata +.br +- validate caching of lvm metadata +.br +- prevent concurrent activation of logical volumes + +lvmlockd uses an external lock manager to perform basic locking. +.br +Lock manager (lock type) options are: + +- sanlock: places locks on disk within lvm storage. +.br +- dlm: uses network communication and a cluster manager. + +.SH OPTIONS + +lvmlockd [options] + +For default settings, see lvmlockd -h. + +.B --help | -h + Show this help information. + +.B --version | -V + Show version of lvmlockd. + +.B --test | -T + Test mode, do not call lock manager. + +.B --foreground | -f + Don't fork. 
+ +.B --daemon-debug | -D + Don't fork and print debugging to stdout. + +.B --pid-file | -p +.I path + Set path to the pid file. + +.B --socket-path | -s +.I path + Set path to the socket to listen on. + +.B --local-also | -a + Manage locks between pids for local VGs. + +.B --local-only | -o + Only manage locks for local VGs, not dlm|sanlock VGs. + +.B --gl-type | -g +.I str + Set global lock type to be dlm|sanlock. + +.B --system-id | -y +.I str + Set the local system id. + +.B --host-id | -i +.I num + Set the local sanlock host id. + +.B --host-id-file | -F +.I path + A file containing the local sanlock host_id. + + +.SH USAGE + +.SS Initial set up + +Using lvm with lvmlockd for the first time includes some one-time set up +steps: + +.SS 1. choose a lock manager + +.I dlm +.br +If dlm (or corosync) are already being used by other cluster +software, then select dlm. dlm uses corosync which requires additional +configuration beyond the scope of this document. See corosync and dlm +documentation for instructions on configuration, setup and usage. + +.I sanlock +.br +Choose sanlock if dlm/corosync are not otherwise required. +sanlock does not depend on any clustering software or configuration. + +.SS 2. configure hosts to use lvmlockd + +On all hosts running lvmlockd, configure lvm.conf: +.nf +locking_type = 1 +use_lvmlockd = 1 +use_lvmetad = 1 +.fi + +.I sanlock +.br +Assign each host a unique host_id in the range 1-2000 by setting +.br +/etc/lvm/lvmlocal.conf local/host_id = <num> + +.SS 3. start lvmlockd + +Use a service/init file if available, or just run "lvmlockd". + +.SS 4. start lock manager + +.I sanlock +.br +systemctl start wdmd sanlock + +.I dlm +.br +Follow external clustering documentation when applicable, otherwise: +.br +systemctl start corosync dlm + +.SS 5. 
create VGs on shared devices + +vgcreate --lock-type sanlock|dlm <vg_name> <devices> + +The vgcreate --lock-type option means that lvm commands will perform +locking for the VG using lvmlockd and the specified lock manager. + +.SS 6. start VGs on all hosts + +vgchange --lock-start + +lvmlockd requires that VGs created with a lock type be "started" before +being used. This is a lock manager operation to start/join the VG +lockspace, and it may take some time. Until the start completes, locks +are not available. Reading and reporting lvm commands are allowed while +start is in progress. +.br +(A service/init file may be used to start VGs.) + +.SS 7. create and activate LVs + +An LV activated exclusively on one host cannot be activated on another. +When multiple hosts need to use the same LV concurrently, the LV can be +activated with a shared lock (see lvchange options -aey vs -asy.) +(Shared locks are disallowed for certain LV types that cannot be used from +multiple hosts.) + +.SS Subsequent start up + +.nf +After initial set up, start up includes: + +- start lvmetad +- start lvmlockd +- start lock manager +- vgchange --lock-start +- activate LVs + +The shut down sequence is the reverse: + +- deactivate LVs +- vgchange --lock-stop +- stop lock manager +- stop lvmlockd +- stop lvmetad +.fi + + +.SH TOPICS + +.SS locking terms + +The following terms are used to distinguish VGs that require locking from +those that do not. Also see +.BR lvmsystemid (7). + +.I "lockd VG" + +A "lockd VG" is a shared VG that has a "lock type" of dlm or sanlock. +Using it requires lvmlockd. These VGs exist on shared storage that is +visible to multiple hosts. lvm commands use lvmlockd to perform locking +for these VGs when they are used. + +If the lock manager for a lock type is not available (e.g. not started or +failed), lvmlockd is not able to acquire locks from it, and lvm commands +are unable to fully use VGs with the given lock type. 
Commands generally +allow reading and reporting in this condition, but changes and activation +are not allowed. Maintaining a properly running lock manager can require +background not covered here. + +.I "local VG" + +A "local VG" is meant to be used by a single host. It has no lock type or +lock type "none". lvm commands and lvmlockd do not perform locking for +these VGs. A local VG typically exists on local (non-shared) devices and +cannot be used concurrently from different hosts. + +If a local VG does exist on shared devices, it should be owned by a single +host by having its system_id set. Only the host with a matching system_id +can then use the local VG. A VG with no lock type and no system_id should +be excluded from all but one host using lvm.conf filters. Without any of +these protections, a local VG on shared devices can be easily damaged or +destroyed. + +.I "clvm VG" + +A "clvm VG" is a shared VG that has the CLUSTERED flag set (and may +optionally have lock type "clvm"). Using it requires clvmd. These VGs +cannot be used by hosts using lvmlockd, only by hosts using clvm. See +below for converting a clvm VG to a lockd VG. + +The term "clustered" is widely used in other documentation, and refers to +clvm VGs. Statements about "clustered" VGs usually do not apply to lockd +VGs. A new set of rules, properties and descriptions apply to lockd VGs, +created with a "lock type", as opposed to clvm VGs, created with the +"clustered" flag. + + +.SS locking activity + +To optimize the use of lvm with lvmlockd, consider the three kinds of lvm +locks and when they are used: + +1. +.I GL lock + +The global lock (GL lock) is associated with global information, which is +information not isolated to a single VG. This is primarily: + +.nf +- the list of all VG names +- the list of PVs not allocated to a VG (orphan PVs) +- properties of orphan PVs, e.g. 
PV size +.fi + +The global lock is used in shared mode by commands that want to read this +information, or in exclusive mode by commands that want to change this +information. + +The vgs command acquires the global lock in shared mode because it reports +the list of all VG names. + +The vgcreate command acquires the global lock in exclusive mode because it +creates a new VG name, and it takes a PV from the list of unused PVs. + +When use_lvmlockd is enabled, many lvm commands attempt to acquire the +global lock even if no lockd VGs exist. For this reason, lvmlockd should +not be enabled unless lockd VGs will be used. + +2. +.I VG lock + +A VG lock is associated with each VG. The VG lock is acquired in shared +mode to read the VG and in exclusive mode to change the VG (write the VG +metadata). This serializes modifications to a VG with all other lvm +commands on the VG. + +The vgs command will not only acquire the GL lock (see above), but will +acquire the VG lock for each VG prior to reading it. + +The "vgs vg_name" command does not acquire the GL lock (it does not need +the list of all VG names), but will acquire the VG lock on each vg_name +listed. + +3. +.I LV lock + +An LV lock is acquired before the LV is activated, and is released after +the LV is deactivated. If the LV lock cannot be acquired, the LV is not +activated. LV locks are persistent and remain in place after the +activation command is done. GL and VG locks are transient, and are held +only while an lvm command is running. + +.I reporting + +Reporting commands can sometimes lead to unexpected and excessive locking +activity. See below for optimizing reporting commands to avoid unwanted +locking. + +If tags are used on the command line, all VGs must be read to search for +matching tags. This implies acquiring the GL lock and each VG lock. + + +.SS locking conflicts + +When a command asks lvmlockd to acquire a lock, lvmlockd submits a +non-blocking lock request to the lock manager. 
This request will fail if +the same lock is held by another host in an incompatible mode. In certain +cases, lvmlockd may retry the request and hide simple transient conflicts +from the command. In other cases, such as LV lock conflicts, the failure +will be returned to the command immediately. The command will fail, +reporting the conflict with another host. + +GL and VG locks are held for short periods, over the course of a single +lvm command, so GL/VG lock conflicts can occur during a small window of +time when two conflicting commands on different hosts happen to overlap +each other. In these cases, retry attempts within lvmlockd will often +mask the transient lock conflicts. + +Another factor that impacts lock conflicts is if lvm commands are +coordinated by a user or program. If commands using conflicting GL/VG +locks are not run concurrently on multiple hosts, they will not encounter +lock conflicts. If no attempt is made to activate LVs exclusively on +multiple hosts, then LV activation will not fail due to lock conflicts. + +Frequent, uncoordinated lvm commands, running concurrently on multiple +hosts, that are making changes to the same lvm resources may occasionally +fail due to locking conflicts. Internal retry attempts could be tuned to +the level necessary to mask these conflicts. Or, retry attempts can be +disabled if all command conflicts should be reported via a command +failure. + +(Commands may report lock failures for reasons other than conflicts. See +below for more cases, e.g. no GL lock exists, locking is not started, +etc.) + +.SS local VGs on shared devices + +When local VGs exist on shared devices, no locking is performed for them +by lvmlockd. The system_id should be set for these VGs to prevent +multiple hosts from using them, or lvm.conf filters should be set to make +the devices visible to only one host. + +The "owner" of a VG is the host with a matching system_id. 
When local VGs +exist on shared devices, only the VG owner can read and write the local +VG. lvm commands on all other hosts will fail to read or write the VG +with an unmatching system_id. + +If a local VG on shared devices has no system_id, and filters are not used +to make the devices visible to a single host, then all hosts are able to +read and write it, which can easily corrupt the VG. + +See +.BR lvmsystemid (7) +for more information. + +.SS lockd VGs from hosts not using lvmlockd + +Only hosts that will use lockd VGs should be configured to run lvmlockd. +However, lockd VGs may be visible from hosts not using lockd VGs and not +running lvmlockd, much like local VGs with foreign system_id's may be +visible. In this case, the lockd VGs are treated in a similar way to a +local VG with an unmatching system_id. + +.SS vgcreate + +Forms of the vgcreate command: + +.B vgcreate <vg_name> <devices> +.br +- creates a local VG +.br +- If lvm.conf system_id_source = "none", the VG will have no system_id. + This is not recommended, especially for VGs on shared devices. +.br +- If lvm.conf system_id_source does not disable the system_id, the VG + will be owned by the host creating the VG. + +.B vgcreate --lock-type sanlock|dlm <vg_name> <devices> +.br +- creates a lockd vg +.br +- lvm commands will request locks from lvmlockd to use the VG +.br +- lvmlockd will obtain locks from the specified lock manager +.br +- this requires lvmlockd to be configured (use_lvmlockd=1) +.br +- run vgchange --lock-start on other hosts to start the new VG + +.B vgcreate -cy <vg_name> <devices> +.br +- creates a clvm VG when clvm is configured +.br +- creates a lockd VG when lvmlockd is configured + (the --lock-type option is preferred in this case) +.br +- this clustered option originally created a clvm VG, + but will be translated to a lock type when appropriate. 
+.br +- if use_lvmlockd=1, -cy is translated to --lock-type <type>, + where <type> comes from lvm.conf:vgcreate_cy_lock_type, + which can be set to either sanlock or dlm. + + +After use_lvmlockd=1 is set, and before the first lockd VG is created, no +global lock will exist, and lvm commands will try and fail to acquire it. +lvm commands will report this error until the first lockd VG is created: +"Skipping global lock: not found". + +lvm commands that only read VGs are allowed to continue in this state, +without the shared GL lock, but commands that attempt to acquire the GL +lock exclusively to make changes will fail. + + +.SS starting and stopping VGs + +Starting a lockd VG (vgchange --lock-start) causes the lock manager to +start or join the lockspace for the VG. This makes locks for the VG +accessible to the host. Stopping the VG leaves the lockspace and makes +locks for the VG inaccessible to the host. + +Lockspaces should be started as early as possible because starting +(joining) a lockspace can take a long time (potentially minutes after a +host failure when using sanlock.) A VG can be started after all the +following are true: + +.nf +- lvmlockd is running +- lock manager is running +- VG is visible to the system +.fi + +All lockd VGs can be started/stopped using: +.br +vgchange --lock-start +.br +vgchange --lock-stop + + +Individual VGs can be started/stopped using: +.br +vgchange --lock-start <vg_name> ... +.br +vgchange --lock-stop <vg_name> ... + +To make vgchange wait for start to complete: +.br +vgchange --lock-start --lock-opt wait +.br +vgchange --lock-start --lock-opt wait <vg_name> + +To stop all lockspaces and wait for all to complete: +.br +lvmlock --stop-lockspaces --wait + +To start only selected lockd VGs, use the lvm.conf +activation/lock_start_list. When defined, only VG names in this list are +started by vgchange. If the list is not defined (the default), all +visible lockd VGs are started. 
To start only "vg1", use the following +lvm.conf configuration: + +.nf +activation { + lock_start_list = [ "vg1" ] + ... +} +.fi + + +.SS automatic starting and automatic activation + +Scripts or programs on a host that automatically start VGs will use the +"auto" option with --lock-start to indicate that the command is being run +automatically by the system: + +vgchange --lock-start --lock-opt auto [vg_name ...] +.br +vgchange --lock-start --lock-opt autowait [vg_name ...] + +By default, the "auto" variations have identical behavior to +--lock-start and '--lock-start --lock-opt wait' options. + +However, when the lvm.conf activation/auto_lock_start_list is defined, the +auto start commands perform an additional filtering phase to all VGs being +started, testing each VG name against the auto_lock_start_list. The +auto_lock_start_list defines lockd VGs that will be started by the auto +start command. Visible lockd VGs not included in the list are ignored by +the auto start command. If the list is undefined, all VG names pass this +filter. (The lock_start_list is also still used to filter all VGs.) + +The auto_lock_start_list allows a user to select certain lockd VGs that +should be automatically started by the system (or indirectly, those that +should not). + +To use auto activation of lockd LVs (see auto_activation_volume_list), +auto starting of the corresponding lockd VGs is necessary. + + +.SS sanlock global lock + +There are some special cases related to the global lock in sanlock VGs. + +The global lock exists in one of the sanlock VGs. The first sanlock VG +created will contain the global lock. Subsequent sanlock VGs will each +contain disabled global locks that can be enabled later if necessary. + +The VG containing the global lock must be visible to all hosts using +sanlock VGs. This can be a reason to create a small sanlock VG, visible +to all hosts, and dedicated to just holding the global lock. 
While not +required, this strategy can help to avoid extra work in the future if VGs +are moved or removed. + +The vgcreate command typically acquires the global lock, but in the case +of the first sanlock VG, there will be no global lock to acquire until the +initial vgcreate is complete. So, creating the first sanlock VG is a +special case that skips the global lock. + +vgcreate for a sanlock VG determines it is the first one to exist if no +other sanlock VGs are visible. It is possible that other sanlock VGs do +exist but are not visible or started on the host running vgcreate. This +raises the possibility of more than one global lock existing. If this +happens, commands will warn of the condition, and it should be manually +corrected. + +If the situation arises where more than one sanlock VG contains a global +lock, the global lock should be manually disabled in all but one of them +with the command: + +lvmlock --gl-disable <vg_name> + +(The one VG with the global lock enabled must be visible to all hosts.) + +An opposite problem can occur if the VG holding the global lock is +removed. In this case, no global lock will exist following the vgremove, +and subsequent lvm commands will fail to acquire it. In this case, the +global lock needs to be manually enabled in one of the remaining sanlock +VGs with the command: + +lvmlock --gl-enable <vg_name> + +A small sanlock VG dedicated to holding the global lock can avoid the case +where the GL lock must be manually enabled after a vgremove. + + +.SS changing lock type + +To change a local VG to a lockd VG: + +vgchange --lock-type sanlock|dlm <vg_name> + +All LVs must be inactive to change the lock type. + +To change a clvm VG to a lockd VG: + +vgchange --lock-type sanlock|dlm <vg_name> + +Changing a lockd VG to a local VG is not yet generally allowed. +(It can be done partially in certain recovery cases.) 
+ + + +.SS limitations of lockd VGs + +Things that do not yet work in lockd VGs: +.br +- old style mirror LVs (only raid1) +.br +- creating a new thin pool and a new thin LV in a single command +.br +- using lvcreate to create cache pools or cache LVs (use lvconvert) +.br +- splitting raid1 mirror LVs +.br +- vgrename +.br +- vgsplit +.br +- vgmerge + +sanlock VGs can contain up to 190 LVs. This limit is due to the size of +the internal lvmlock LV used to hold sanlock leases. + + +.SS vgremove of a sanlock VG + +vgremove of a sanlock VG will fail if other hosts have the VG started. +Run vgchange --lock-stop <vg_name> on all other hosts before vgremove. + +(It may take several seconds before vgremove recognizes that all hosts +have stopped.) + + +.SS shared LVs + +When an LV is used concurrently from multiple hosts (e.g. by a +multi-host/cluster application or file system), the LV can be activated on +multiple hosts concurrently using a shared lock. + +To activate the LV with a shared lock: lvchange -asy vg/lv. + +The default activation mode is always exclusive (-ay defaults to -aey). + +If the LV type does not allow the LV to be used concurrently from multiple +hosts, then a shared activation lock is not allowed and the lvchange +command will report an error. LV types that cannot be used concurrently +from multiple hosts include thin, cache, raid, mirror, and snapshot. + +lvextend on an LV with shared locks is not yet allowed. The LV must be +deactivated, or activated exclusively to run lvextend. + + +.SS recover from lost PV holding sanlock locks + +In a sanlock VG, the locks are stored on a PV within the VG. If this PV +is lost, the locks need to be reconstructed as follows: + +1. Enable the unsafe lock modes option in lvm.conf so that default locking requirements can be overridden. + +\& + +.nf +allow_override_lock_modes = 1 +.fi + +2. Remove missing PVs and partial LVs from the VG. 
+ +\& + +.nf +vgreduce --removemissing --force --lock-gl na --lock-vg na <vg> +.fi + +3. If step 2 does not remove the internal/hidden "lvmlock" lv, it should be removed. + +\& + +.nf +lvremove --lock-vg na --lock-lv na <vg>/lvmlock +.fi + +4. Change the lock type to none. + +\& + +.nf +vgchange --lock-type none --force --lock-gl na --lock-vg na <vg> +.fi + +5. VG space is needed to recreate the locks. If there is not enough space, vgextend the vg. + +6. Change the lock type back to sanlock. This creates a new internal +lvmlock lv, and recreates locks. + +\& + +.nf +vgchange --lock-type sanlock <vg> +.fi + + +.SS locking system failures + +.B lvmlockd failure + +If lvmlockd fails or is killed while holding locks, the locks are orphaned +in the lock manager. lvmlockd can be restarted, and it will adopt the +locks from the lock manager that had been held by the previous instance. + +.B dlm/corosync failure + +If dlm or corosync fail, the clustering system will fence the host using a +method configured within the dlm/corosync clustering environment. + +lvm commands on other hosts will be blocked from acquiring any locks until +the dlm/corosync recovery process is complete. + +.B sanlock lock storage failure + +If access to the device containing the VG's locks is lost, sanlock cannot +renew its leases for locked LVs. This means that the host could soon lose +the lease to another host which could activate the LV exclusively. +sanlock is designed to never reach the point where two hosts hold the +same lease exclusively at once, so the same LV should never be active on +two hosts at once when activated exclusively. + +The current method of handling this involves no action from lvmlockd, +while allowing sanlock to protect the leases itself. This produces a safe +but potentially inconvenient result. 
Doing nothing from lvmlockd leads to +the host's LV locks not being released, which leads to sanlock using the +local watchdog to reset the host before another host can acquire any locks +held by the local host. + +lvm commands on other hosts will be blocked from acquiring locks held by +the failed/reset host until the sanlock recovery time expires (2-4 +minutes). This includes activation of any LVs that were locked by the +failed host. It also includes GL/VG locks held by any lvm commands that +happened to be running on the failed host at the time of the failure. + +(In the future, lvmlockd may have the option to suspend locked LVs in +response to the sanlock leases expiring. This would avoid the need for +sanlock to reset the host.) + +.B sanlock daemon failure + +If the sanlock daemon fails or exits while a lockspace is started, the +local watchdog will reset the host. See previous section for the impact +on other hosts. + + +.SS overriding, disabling, testing locking + +Special options to manually override or disable default locking: + +Disable use_lvmlockd for an individual command. Return success to all +lockd calls without attempting to contact lvmlockd: + +<lvm_command> --config 'global { use_lvmlockd = 0 }' + +Ignore error if lockd call failed to connect to lvmlockd or did not get a +valid response to its request: + +<lvm_command> --sysinit +.br +<lvm_command> --ignorelockingfailure + +Specifying "na" as the lock mode will cause the lockd_xy() call to do +nothing (like the --config): + +<lvm_command> --lock-gl na +.br +<lvm_command> --lock-vg na +.br +<lvm_command> --lock-lv na + +(This is not permitted unless lvm.conf:allow_override_lock_modes=1.) 
+ +Exercise all locking code in client and daemon, for each specific +lock_type, but return success at a step that would fail because the specific +locking system is not running: + +lvmlockd --test + + +.SS locking between local processes + +With the --local-also option, lvmlockd will handle VG locking between +local processes for local VGs. The standard internal lockd_vg calls, +typically used for locking lockd VGs, are applied to local VGs. The +global lock behavior does not change and applies to both lockd VGs and +local VGs as usual. + +The --local-only option extends the --local-also option to include a +special "global lock" for local VGs. This option should be used when only +local VGs exist and no lockd VGs exist. It allows the internal lockd_gl +calls to provide GL locking between local processes. + + +.SS changing dlm cluster name + +When a dlm VG is created, the cluster name is saved in the VG metadata for +the new VG. To use the VG, a host must be in the named cluster. If the +cluster name is changed, or the VG is moved to a different cluster, the +cluster name for the dlm VG must be changed. To do this: + +1. Ensure the VG is not being used by any hosts. + +2. The new cluster must be active on the node making the change. +.br + The current dlm cluster name can be seen by: +.br + cat /sys/kernel/config/dlm/cluster/cluster_name + +3. Change the VG lock type to none: +.br + vgchange --lock-type none --force <vg_name> + +4. Change the VG lock type back to dlm which sets the new cluster name: +.br + vgchange --lock-type dlm <vg_name> + + +.SS clvm comparison + +User visible or command level differences between lockd VGs (with +lvmlockd) and clvm VGs (with clvmd): + +lvmlockd includes the sanlock lock manager option. + +lvmlockd does not require all hosts to see all the same shared devices. + +lvmlockd defaults to the exclusive activation mode in all VGs. + +lvmlockd commands always apply to the local host, and never have an effect +on a remote host. 
(The activation option 'l' is not used.) + +lvmlockd works with lvmetad. + +lvmlockd works with thin and cache pools and LVs. + +lvmlockd allows VG ownership by system id (also works when lvmlockd is not +used). + +lvmlockd saves the cluster name for a lockd VG using dlm. Only hosts in +the matching cluster can use the VG. + +lvmlockd prefers the new vgcreate --lock-type option in place of the +--clustered (-c) option. + +lvmlockd requires starting/stopping lockd VGs with vgchange --lock-start +and --lock-stop. + + |