diff options
author | David Teigland <teigland@redhat.com> | 2015-03-05 14:00:44 -0600 |
---|---|---|
committer | David Teigland <teigland@redhat.com> | 2015-06-24 12:17:40 -0500 |
commit | 94de9dabaebb11c3cb1d27a687f93c75185e2fa7 (patch) | |
tree | fb4d671ff8f73a655bb4391ed6b35a33195427ed | |
parent | 7760665fb8f4a85a244a9751a0a1d1cc46c157dc (diff) | |
download | lvm2-94de9dabaebb11c3cb1d27a687f93c75185e2fa7.tar.gz |
Add lvmlockd
83 files changed, 14680 insertions, 129 deletions
diff --git a/configure.in b/configure.in index d330bd206..0bd6828ed 100644 --- a/configure.in +++ b/configure.in @@ -39,6 +39,7 @@ case "$host_os" in DEVMAPPER=yes LVMETAD=no LVMPOLLD=no + LVMLOCKD=no ODIRECT=yes DM_IOCTLS=yes SELINUX=yes @@ -1133,6 +1134,50 @@ AC_DEFINE_UNQUOTED(DEFAULT_USE_LVMPOLLD, [$DEFAULT_USE_LVMPOLLD], [Use lvmpolld by default.]) ################################################################################ +dnl -- Build lvmlockd +AC_MSG_CHECKING(whether to build lvmlockd) +AC_ARG_ENABLE(lvmlockd, + AC_HELP_STRING([--enable-lvmlockd], + [enable the LVM lock daemon]), + LVMLOCKD=$enableval) +AC_MSG_RESULT($LVMLOCKD) + +BUILD_LVMLOCKD=$LVMLOCKD + +if test "$BUILD_LVMLOCKD" = yes; then + AC_MSG_CHECKING([defaults for use_lvmlockd]) + AC_ARG_ENABLE(use_lvmlockd, + AC_HELP_STRING([--disable-use-lvmlockd], + [disable usage of LVM lock daemon]), + [case ${enableval} in + yes) DEFAULT_USE_LVMLOCKD=1 ;; + *) DEFAULT_USE_LVMLOCKD=0 ;; + esac], DEFAULT_USE_LVMLOCKD=1) + AC_MSG_RESULT($DEFAULT_USE_LVMLOCKD) + AC_DEFINE([LVMLOCKD_SUPPORT], 1, [Define to 1 to include code that uses lvmlockd.]) + + AC_ARG_WITH(lvmlockd-pidfile, + AC_HELP_STRING([--with-lvmlockd-pidfile=PATH], + [lvmlockd pidfile [PID_DIR/lvmlockd.pid]]), + LVMLOCKD_PIDFILE=$withval, + LVMLOCKD_PIDFILE="$DEFAULT_PID_DIR/lvmlockd.pid") + AC_DEFINE_UNQUOTED(LVMLOCKD_PIDFILE, ["$LVMLOCKD_PIDFILE"], + [Path to lvmlockd pidfile.]) +else + DEFAULT_USE_LVMLOCKD=0 +fi +AC_DEFINE_UNQUOTED(DEFAULT_USE_LVMLOCKD, [$DEFAULT_USE_LVMLOCKD], + [Use lvmlockd by default.]) + +################################################################################ +dnl -- Look for sanlock and dlm libraries +if test "$BUILD_LVMLOCKD" = yes; then + PKG_CHECK_MODULES(LOCKD_SANLOCK, libsanlock_client, [HAVE_LOCKD_SANLOCK=yes], $bailout) + PKG_CHECK_MODULES(LOCKD_DLM, libdlm, [HAVE_LOCKD_DLM=yes], $bailout) +fi + +################################################################################ + dnl -- Enable blkid 
wiping functionality AC_MSG_CHECKING(whether to enable libblkid detection of signatures when wiping) AC_ARG_ENABLE(blkid_wiping, @@ -1753,6 +1798,7 @@ AC_SUBST(BUILD_CMIRRORD) AC_SUBST(BUILD_DMEVENTD) AC_SUBST(BUILD_LVMETAD) AC_SUBST(BUILD_LVMPOLLD) +AC_SUBST(BUILD_LVMLOCKD) AC_SUBST(CACHE) AC_SUBST(CFLAGS) AC_SUBST(CFLOW_CMD) @@ -1792,6 +1838,7 @@ AC_SUBST(DEFAULT_SYS_DIR) AC_SUBST(DEFAULT_USE_BLKID_WIPING) AC_SUBST(DEFAULT_USE_LVMETAD) AC_SUBST(DEFAULT_USE_LVMPOLLD) +AC_SUBST(DEFAULT_USE_LVMLOCKD) AC_SUBST(DEVMAPPER) AC_SUBST(DLM_CFLAGS) AC_SUBST(DLM_LIBS) @@ -1869,6 +1916,7 @@ AC_SUBST(WRITE_INSTALL) AC_SUBST(DMEVENTD_PIDFILE) AC_SUBST(LVMETAD_PIDFILE) AC_SUBST(LVMPOLLD_PIDFILE) +AC_SUBST(LVMLOCKD_PIDFILE) AC_SUBST(CLVMD_PIDFILE) AC_SUBST(CMIRRORD_PIDFILE) AC_SUBST(interface) @@ -1903,6 +1951,7 @@ daemons/dmeventd/plugins/snapshot/Makefile daemons/dmeventd/plugins/thin/Makefile daemons/lvmetad/Makefile daemons/lvmpolld/Makefile +daemons/lvmlockd/Makefile conf/Makefile conf/example.conf conf/lvmlocal.conf @@ -1949,6 +1998,8 @@ scripts/lvm2_lvmetad_systemd_red_hat.socket scripts/lvm2_lvmpolld_init_red_hat scripts/lvm2_lvmpolld_systemd_red_hat.service scripts/lvm2_lvmpolld_systemd_red_hat.socket +scripts/lvm2_lvmlockd_systemd_red_hat.service +scripts/lvm2_lvmlocking_systemd_red_hat.service scripts/lvm2_monitoring_init_red_hat scripts/lvm2_monitoring_systemd_red_hat.service scripts/lvm2_pvscan_systemd_red_hat@.service @@ -1976,3 +2027,9 @@ AS_IF([test -n "$CACHE_CONFIGURE_WARN"], AS_IF([test "$ODIRECT" != yes], [AC_MSG_WARN([O_DIRECT disabled: low-memory pvmove may lock up])]) + +AS_IF([test "$BUILD_LVMLOCKD" == yes && test "$BUILD_LVMPOLLD" == no], + [AC_MSG_WARN([lvmlockd requires lvmpolld])]) + +AS_IF([test "$BUILD_LVMLOCKD" == yes && test "$BUILD_LVMETAD" == no], + [AC_MSG_WARN([lvmlockd requires lvmetad])]) diff --git a/daemons/Makefile.in b/daemons/Makefile.in index 8a466b3f8..a2e7094cf 100644 --- a/daemons/Makefile.in +++ b/daemons/Makefile.in @@ -15,7 +15,7 
@@ srcdir = @srcdir@ top_srcdir = @top_srcdir@ top_builddir = @top_builddir@ -.PHONY: dmeventd clvmd cmirrord lvmetad lvmpolld +.PHONY: dmeventd clvmd cmirrord lvmetad lvmpolld lvmlockd ifneq ("@CLVMD@", "none") SUBDIRS += clvmd @@ -40,8 +40,12 @@ ifeq ("@BUILD_LVMPOLLD@", "yes") SUBDIRS += lvmpolld endif +ifeq ("@BUILD_LVMLOCKD@", "yes") + SUBDIRS += lvmlockd +endif + ifeq ($(MAKECMDGOALS),distclean) - SUBDIRS = clvmd cmirrord dmeventd lvmetad lvmpolld + SUBDIRS = clvmd cmirrord dmeventd lvmetad lvmpolld lvmlockd endif include $(top_builddir)/make.tmpl diff --git a/daemons/lvmlockd/Makefile.in b/daemons/lvmlockd/Makefile.in new file mode 100644 index 000000000..fcdce5c50 --- /dev/null +++ b/daemons/lvmlockd/Makefile.in @@ -0,0 +1,53 @@ +# +# Copyright (C) 2014-2015 Red Hat, Inc. +# +# This file is part of LVM2. +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions +# of the GNU Lesser General Public License v.2.1. 
+# +# You should have received a copy of the GNU Lesser General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +top_builddir = @top_builddir@ + +SOURCES = \ + lvmlockd-core.c \ + lvmlockd-sanlock.c \ + lvmlockd-dlm.c + +TARGETS = lvmlockd lvmlockctl + +.PHONY: install_lvmlockd + +include $(top_builddir)/make.tmpl + +INCLUDES += -I$(top_srcdir)/libdaemon/server +LVMLIBS = -ldaemonserver $(LVMINTERNAL_LIBS) -ldevmapper + +LIBS += $(PTHREAD_LIBS) -ldlm_lt -lsanlock_client -lrt + +LDFLAGS += -L$(top_builddir)/libdaemon/server +CLDFLAGS += -L$(top_builddir)/libdaemon/server + +lvmlockd: $(OBJECTS) $(top_builddir)/libdaemon/client/libdaemonclient.a \ + $(top_builddir)/libdaemon/server/libdaemonserver.a + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $(OBJECTS) $(LVMLIBS) $(LIBS) + +lvmlockctl: lvmlockctl.o $(top_builddir)/libdaemon/client/libdaemonclient.a \ + $(top_builddir)/libdaemon/server/libdaemonserver.a + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ lvmlockctl.o $(LVMLIBS) + +install_lvmlockd: lvmlockd + $(INSTALL_PROGRAM) -D $< $(sbindir)/$(<F) + +install_lvmlockctl: lvmlockctl + $(INSTALL_PROGRAM) -D $< $(sbindir)/$(<F) + +install_lvm2: install_lvmlockd install_lvmlockctl + +install: install_lvm2 diff --git a/daemons/lvmlockd/lvmlockctl.c b/daemons/lvmlockd/lvmlockctl.c new file mode 100644 index 000000000..b8ab9ed11 --- /dev/null +++ b/daemons/lvmlockd/lvmlockctl.c @@ -0,0 +1,635 @@ +#define _GNU_SOURCE +#include "configure.h" +#include "lvmlockd-client.h" + +#include <stdio.h> +#include <stdint.h> +#include <stddef.h> +#include <stdlib.h> +#include <unistd.h> +#include <getopt.h> +#include <string.h> +#include <signal.h> +#include <errno.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/socket.h> +#include <sys/un.h> + +static int quit; +static int info; +static int dump; +static int wait_opt; 
+static int force_opt; +static int gl_enable; +static int gl_disable; +static int stop_lockspaces; +static char *able_vg_name; + +#define DUMP_SOCKET_NAME "lvmlockd-dump.sock" +#define DUMP_BUF_SIZE (1024 * 1024) +static char dump_buf[DUMP_BUF_SIZE]; +static int dump_len; +static struct sockaddr_un dump_addr; +static socklen_t dump_addrlen; + +daemon_handle _lvmlockd; + +#define log_debug(fmt, args...) \ +do { \ + printf(fmt "\n", ##args); \ +} while (0) + +#define log_error(fmt, args...) \ +do { \ + printf(fmt "\n", ##args); \ +} while (0) + +#define MAX_LINE 512 + +/* copied from lvmlockd-internal.h */ +#define MAX_NAME 64 +#define MAX_ARGS 64 + +/* + * lvmlockd dumps the client info before the lockspaces, + * so we can look up client info when printing lockspace info. + */ + +#define MAX_CLIENTS 100 + +struct client_info { + uint32_t client_id; + int pid; + char name[MAX_NAME+1]; +}; + +static struct client_info clients[MAX_CLIENTS]; +static int num_clients; + +static void save_client_info(char *line) +{ + uint32_t pid = 0; + int fd = 0; + int pi = 0; + uint32_t client_id = 0; + char name[MAX_NAME+1] = { 0 }; + + sscanf(line, "info=client pid=%u fd=%d pi=%d id=%u name=%s", + &pid, &fd, &pi, &client_id, name); + + clients[num_clients].client_id = client_id; + clients[num_clients].pid = pid; + strcpy(clients[num_clients].name, name); + num_clients++; +} + +static void find_client_info(uint32_t client_id, uint32_t *pid, char *cl_name) +{ + int i; + + for (i = 0; i < num_clients; i++) { + if (clients[i].client_id == client_id) { + *pid = clients[i].pid; + strcpy(cl_name, clients[i].name); + return; + } + } +} + +static void format_info_ls(char *line) +{ + char ls_name[MAX_NAME+1] = { 0 }; + char vg_name[MAX_NAME+1] = { 0 }; + char vg_uuid[MAX_NAME+1] = { 0 }; + char vg_sysid[MAX_NAME+1] = { 0 }; + char lock_args[MAX_ARGS+1] = { 0 }; + char lock_type[MAX_NAME+1] = { 0 }; + + sscanf(line, "info=ls ls_name=%s vg_name=%s vg_uuid=%s vg_sysid=%s vg_args=%s lm_type=%s", + 
ls_name, vg_name, vg_uuid, vg_sysid, lock_args, lock_type); + + printf("\n"); + + printf("VG %s lock_type=%s %s\n", vg_name, lock_type, vg_uuid); + + printf("LS %s %s\n", lock_type, ls_name); +} + +static void format_info_ls_action(char *line) +{ + uint32_t client_id = 0; + char flags[MAX_NAME+1] = { 0 }; + char version[MAX_NAME+1] = { 0 }; + char op[MAX_NAME+1] = { 0 }; + uint32_t pid = 0; + char cl_name[MAX_NAME+1] = { 0 }; + + sscanf(line, "info=ls_action client_id=%u %s %s op=%s", + &client_id, flags, version, op); + + find_client_info(client_id, &pid, cl_name); + + printf("OP %s pid %u (%s)", op, pid, cl_name); +} + +static void format_info_r(char *line, char *r_name_out, char *r_type_out) +{ + char r_name[MAX_NAME+1] = { 0 }; + char r_type[4] = { 0 }; + char mode[4] = { 0 }; + char sh_count[MAX_NAME+1] = { 0 }; + uint32_t ver = 0; + + sscanf(line, "info=r name=%s type=%s mode=%s %s version=%u", + r_name, r_type, mode, sh_count, &ver); + + /* when mode is not un, wait and print each lk line */ + + if (strcmp(mode, "un")) { + strcpy(r_name_out, r_name); + strcpy(r_type_out, r_type); + return; + } + + /* when mode is un, there will be no lk lines, so print now */ + + if (!strcmp(r_type, "gl")) { + printf("LK GL un ver %4u\n", ver); + + } else if (!strcmp(r_type, "vg")) { + printf("LK VG un ver %4u\n", ver); + + } else if (!strcmp(r_type, "lv")) { + printf("LK LV un %s\n", r_name); + } +} + +static void format_info_lk(char *line, char *r_name, char *r_type) +{ + char mode[4] = { 0 }; + uint32_t ver = 0; + char flags[MAX_NAME+1] = { 0 }; + uint32_t client_id = 0; + uint32_t pid = 0; + char cl_name[MAX_NAME+1] = { 0 }; + + if (!r_name[0] || !r_type[0]) { + printf("format_info_lk error r_name %s r_type %s\n", r_name, r_type); + printf("%s\n", line); + return; + } + + sscanf(line, "info=lk mode=%s version=%u %s client_id=%u", + mode, &ver, flags, &client_id); + + find_client_info(client_id, &pid, cl_name); + + if (!strcmp(r_type, "gl")) { + printf("LK GL %s ver %4u 
pid %u (%s)\n", mode, ver, pid, cl_name); + + } else if (!strcmp(r_type, "vg")) { + printf("LK VG %s ver %4u pid %u (%s)\n", mode, ver, pid, cl_name); + + } else if (!strcmp(r_type, "lv")) { + printf("LK LV %s %s\n", mode, r_name); + } +} + +static void format_info_r_action(char *line, char *r_name, char *r_type) +{ + uint32_t client_id = 0; + char flags[MAX_NAME+1] = { 0 }; + char version[MAX_NAME+1] = { 0 }; + char op[MAX_NAME+1] = { 0 }; + char rt[4] = { 0 }; + char mode[4] = { 0 }; + char lm[MAX_NAME+1] = { 0 }; + char result[MAX_NAME+1] = { 0 }; + char lm_rv[MAX_NAME+1] = { 0 }; + uint32_t pid = 0; + char cl_name[MAX_NAME+1] = { 0 }; + + if (!r_name[0] || !r_type[0]) { + printf("format_info_r_action error r_name %s r_type %s\n", r_name, r_type); + printf("%s\n", line); + return; + } + + sscanf(line, "info=r_action client_id=%u %s %s op=%s rt=%s mode=%s %s %s %s", + &client_id, flags, version, op, rt, mode, lm, result, lm_rv); + + find_client_info(client_id, &pid, cl_name); + + if (strcmp(op, "lock")) { + printf("OP %s pid %u (%s)", op, pid, cl_name); + return; + } + + if (!strcmp(r_type, "gl")) { + printf("LW GL %s ver %4u pid %u (%s)\n", mode, 0, pid, cl_name); + + } else if (!strcmp(r_type, "vg")) { + printf("LW VG %s ver %4u pid %u (%s)\n", mode, 0, pid, cl_name); + + } else if (!strcmp(r_type, "lv")) { + printf("LW LV %s %s\n", mode, r_name); + } +} + +static void format_info_line(char *line) +{ + char r_name[MAX_NAME+1]; + char r_type[MAX_NAME+1]; + + if (!strncmp(line, "info=structs ", strlen("info=structs "))) { + printf("%s\n", line); + + } else if (!strncmp(line, "info=client ", strlen("info=client "))) { + save_client_info(line); + + } else if (!strncmp(line, "info=ls ", strlen("info=ls "))) { + format_info_ls(line); + + } else if (!strncmp(line, "info=ls_action ", strlen("info=ls_action "))) { + format_info_ls_action(line); + + } else if (!strncmp(line, "info=r ", strlen("info=r "))) { + memset(r_name, 0, sizeof(r_name)); + memset(r_type, 0, 
sizeof(r_type)); + format_info_r(line, r_name, r_type); + + } else if (!strncmp(line, "info=lk ", strlen("info=lk "))) { + /* will use info from previous r */ + format_info_lk(line, r_name, r_type); + + } else if (!strncmp(line, "info=r_action ", strlen("info=r_action "))) { + /* will use info from previous r */ + format_info_r_action(line, r_name, r_type); + } else { + printf("UN %s\n", line); + } +} + +static void format_info(void) +{ + char line[MAX_LINE]; + int i, j; + + j = 0; + memset(line, 0, sizeof(line)); + + for (i = 0; i < dump_len; i++) { + line[j++] = dump_buf[i]; + + if ((line[j-1] == '\n') || (line[j-1] == '\0')) { + format_info_line(line); + j = 0; + memset(line, 0, sizeof(line)); + } + } +} + + +static daemon_reply _lvmlockd_send(const char *req_name, ...) +{ + va_list ap; + daemon_reply repl; + daemon_request req; + + req = daemon_request_make(req_name); + + va_start(ap, req_name); + daemon_request_extend_v(req, ap); + va_end(ap); + + repl = daemon_send(_lvmlockd, req); + + daemon_request_destroy(req); + + return repl; +} + +/* See the same in lib/locking/lvmlockd.c */ +#define NO_LOCKD_RESULT -1000 + +static int _lvmlockd_result(daemon_reply reply, int *result) +{ + int reply_result; + const char *reply_flags; + const char *lock_type; + + if (reply.error) { + log_error("lvmlockd_result reply error %d", reply.error); + return 0; + } + + if (strcmp(daemon_reply_str(reply, "response", ""), "OK")) { + log_error("lvmlockd_result bad response"); + return 0; + } + + reply_result = daemon_reply_int(reply, "op_result", NO_LOCKD_RESULT); + if (reply_result == -1000) { + log_error("lvmlockd_result no op_result"); + return 0; + } + + /* The lock_type that lvmlockd used for locking. 
*/ + lock_type = daemon_reply_str(reply, "lock_type", "none"); + + *result = reply_result; + + reply_flags = daemon_reply_str(reply, "result_flags", NULL); + + log_debug("lvmlockd_result %d %s lm %s", reply_result, reply_flags, lock_type); + return 1; +} + +static int do_quit(void) +{ + daemon_reply reply; + int rv = 0; + + reply = daemon_send_simple(_lvmlockd, "quit", NULL); + + if (reply.error) { + log_error("reply error %d", reply.error); + rv = reply.error; + } + + daemon_reply_destroy(reply); + return rv; +} + +static int setup_dump_socket(void) +{ + int s, rv; + + s = socket(AF_LOCAL, SOCK_DGRAM, 0); + if (s < 0) + return s; + + memset(&dump_addr, 0, sizeof(dump_addr)); + dump_addr.sun_family = AF_LOCAL; + strcpy(&dump_addr.sun_path[1], DUMP_SOCKET_NAME); + dump_addrlen = sizeof(sa_family_t) + strlen(dump_addr.sun_path+1) + 1; + + rv = bind(s, (struct sockaddr *) &dump_addr, dump_addrlen); + if (rv < 0) + return rv; + + return s; +} + +static int do_dump(const char *req_name) +{ + daemon_reply reply; + int result; + int fd, rv = 0; + + fd = setup_dump_socket(); + if (fd < 0) { + log_error("socket error %d", fd); + return fd; + } + + reply = daemon_send_simple(_lvmlockd, req_name, NULL); + + if (reply.error) { + log_error("reply error %d", reply.error); + rv = reply.error; + goto out; + } + + result = daemon_reply_int(reply, "result", 0); + dump_len = daemon_reply_int(reply, "dump_len", 0); + + daemon_reply_destroy(reply); + + if (result < 0) { + rv = result; + log_error("result %d", result); + } + + if (!dump_len) + goto out; + + memset(dump_buf, 0, sizeof(dump_buf)); + + rv = recvfrom(fd, dump_buf, dump_len, MSG_WAITALL, + (struct sockaddr *)&dump_addr, &dump_addrlen); + if (rv < 0) { + log_error("recvfrom error %d %d", rv, errno); + rv = -errno; + goto out; + } + + rv = 0; + if ((info && dump) || !strcmp(req_name, "dump")) + printf("%s\n", dump_buf); + else + format_info(); +out: + close(fd); + return rv; +} + +static int do_able(const char *req_name) +{ + 
daemon_reply reply; + int result; + int rv; + + reply = _lvmlockd_send(req_name, + "cmd = %s", "lvmlock", + "pid = %d", getpid(), + "vg_name = %s", able_vg_name, + NULL); + + if (!_lvmlockd_result(reply, &result)) { + log_error("lvmlockd result %d", result); + rv = result; + } else { + rv = 0; + } + + daemon_reply_destroy(reply); + return rv; +} + +static int do_stop_lockspaces(void) +{ + daemon_reply reply; + char opts[32]; + int result; + int rv; + + memset(opts, 0, sizeof(opts)); + + if (wait_opt) + strcat(opts, "wait "); + if (force_opt) + strcat(opts, "force "); + + reply = _lvmlockd_send("stop_all", + "cmd = %s", "lvmlock", + "pid = %d", getpid(), + "opts = %s", opts[0] ? opts : "none", + NULL); + + if (!_lvmlockd_result(reply, &result)) { + log_error("lvmlockd result %d", result); + rv = result; + } else { + rv = 0; + } + + daemon_reply_destroy(reply); + return rv; +} + +static void print_usage(void) +{ + printf("lvmlockctl options\n"); + printf("Options:\n"); + printf("--help | -h\n"); + printf(" Show this help information.\n"); + printf("--quit | -q\n"); + printf(" Tell lvmlockd to quit.\n"); + printf("--info | -i\n"); + printf(" Print lock state information from lvmlockd.\n"); + printf("--dump | -d\n"); + printf(" Print log buffer from lvmlockd.\n"); + printf("--wait | -w 0|1\n"); + printf(" Wait option for other commands.\n"); + printf("--force | -f 0|1>\n"); + printf(" Force option for other commands.\n"); + printf("--stop-lockspaces | -S\n"); + printf(" Stop all lockspaces.\n"); + printf("--gl-enable <vg_name>\n"); + printf(" Tell lvmlockd to enable the global lock in a sanlock vg.\n"); + printf("--gl-disable <vg_name>\n"); + printf(" Tell lvmlockd to disable the global lock in a sanlock vg.\n"); +} + +static int read_options(int argc, char *argv[]) +{ + int option_index = 0; + int c; + + static struct option long_options[] = { + {"help", no_argument, 0, 'h' }, + {"quit", no_argument, 0, 'q' }, + {"info", no_argument, 0, 'i' }, + {"dump", no_argument, 
0, 'd' }, + {"wait", required_argument, 0, 'w' }, + {"force", required_argument, 0, 'f' }, + {"gl-enable", required_argument, 0, 'E' }, + {"gl-disable", required_argument, 0, 'D' }, + {"stop-lockspaces", no_argument, 0, 'S' }, + {0, 0, 0, 0 } + }; + + if (argc == 1) { + print_usage(); + exit(0); + } + + while (1) { + c = getopt_long(argc, argv, "hqidE:D:w:S", long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'h': + /* --help */ + print_usage(); + exit(0); + case 'q': + /* --quit */ + quit = 1; + break; + case 'i': + /* --info */ + info = 1; + break; + case 'd': + /* --dump */ + dump = 1; + break; + case 'w': + wait_opt = atoi(optarg); + break; + case 'E': + gl_enable = 1; + able_vg_name = strdup(optarg); + break; + case 'D': + gl_disable = 1; + able_vg_name = strdup(optarg); + break; + case 'S': + stop_lockspaces = 1; + break; + default: + print_usage(); + exit(1); + } + } + + + return 0; +} + +int main(int argc, char **argv) +{ + int rv = 0; + + rv = read_options(argc, argv); + if (rv < 0) + return rv; + + _lvmlockd = lvmlockd_open(NULL); + + if (_lvmlockd.socket_fd < 0 || _lvmlockd.error) { + log_error("lvmlockd open error %d", _lvmlockd.error); + return -1; + } + + if (quit) { + rv = do_quit(); + goto out; + } + + if (info) { + rv = do_dump("info"); + goto out; + } + + if (dump) { + rv = do_dump("dump"); + goto out; + } + + if (gl_enable) { + rv = do_able("enable_gl"); + goto out; + } + + if (gl_disable) { + rv = do_able("disable_gl"); + goto out; + } + + if (stop_lockspaces) { + rv = do_stop_lockspaces(); + goto out; + } + +out: + lvmlockd_close(_lvmlockd); + return rv; +} + diff --git a/daemons/lvmlockd/lvmlockd-client.h b/daemons/lvmlockd/lvmlockd-client.h new file mode 100644 index 000000000..0a3e4b2d2 --- /dev/null +++ b/daemons/lvmlockd/lvmlockd-client.h @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2014 Red Hat, Inc. + * + * This file is part of LVM2. 
+ * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU Lesser General Public License v.2.1. + */ + +#ifndef _LVM_LVMLOCKD_CLIENT_H +#define _LVM_LVMLOCKD_CLIENT_H + +#include "daemon-client.h" + +#define LVMLOCKD_SOCKET DEFAULT_RUN_DIR "/lvmlockd.socket" + +/* Wrappers to open/close connection */ + +static inline daemon_handle lvmlockd_open(const char *sock) +{ + daemon_info lvmlockd_info = { + .path = "lvmlockd", + .socket = sock ?: LVMLOCKD_SOCKET, + .protocol = "lvmlockd", + .protocol_version = 1, + .autostart = 0 + }; + + return daemon_open(lvmlockd_info); +} + +static inline void lvmlockd_close(daemon_handle h) +{ + return daemon_close(h); +} + +/* + * Errors returned as the lvmlockd result value. + */ +#define ENOLS 210 /* lockspace not found */ +#define ESTARTING 211 /* lockspace is starting */ +#define EARGS 212 +#define EHOSTID 213 +#define EMANAGER 214 +#define EPREPARE 215 +#define ELOCKD 216 + +#endif diff --git a/daemons/lvmlockd/lvmlockd-core.c b/daemons/lvmlockd/lvmlockd-core.c new file mode 100644 index 000000000..ade0aac70 --- /dev/null +++ b/daemons/lvmlockd/lvmlockd-core.c @@ -0,0 +1,5665 @@ +/* + * Copyright (C) 2014 Red Hat, Inc. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU Lesser General Public License v.2.1. 
+ */ + +#define _XOPEN_SOURCE 500 /* pthread */ +#define _ISOC99_SOURCE +#define _GNU_SOURCE + +#include "configure.h" +#include "daemon-io.h" +#include "daemon-server.h" +#include "daemon-log.h" +#include "config-util.h" +#include "lvm-version.h" +#include "lvmetad-client.h" +#include "lvmlockd-client.h" + +#include <assert.h> +#include <pthread.h> +#include <stdint.h> +#include <stddef.h> +#include <stdlib.h> +#include <unistd.h> +#include <poll.h> +#include <errno.h> +#include <signal.h> +#include <getopt.h> +#include <syslog.h> +#include <dirent.h> +#include <time.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/utsname.h> +#include <sys/un.h> + +#define EXTERN +#include "lvmlockd-internal.h" + +/* + * Basic operation of lvmlockd + * + * lvmlockd main process runs main_loop() which uses poll(). + * poll listens for new connections from lvm commands and for + * messages from existing connected lvm commands. + * + * lvm command starts and connects to lvmlockd. + * + * lvmlockd receives a connection request from command and adds a + * 'struct client' to keep track of the connection to the command. + * The client's fd is added to the set of fd's in poll(). + * + * lvm command sends a lock request to lvmlockd. The lock request + * can be for the global lock, a vg lock, or an lv lock. + * + * lvmlockd main_loop/poll sees a message from an existing client. + * It sets client.recv = 1, then wakes up client_thread_main. + * + * client_thread_main iterates through client structs (cl), looking + * for any that need processing, finds the one with cl->recv set, + * and calls client_recv_action(cl). + * + * client_recv_action(cl) reads the message/request from the client, + * allocates a new 'struct action' (act) to represent the request, + * sets the act with what is found in the request, then looks at + * the specific operation in act->op (LD_OP_FOO) to decide what to + * do with the action: + * + * . 
If the action is to start a lockspace, create a new thread + * to manage that lockspace: add_lockspace(act). + * + * . If the action is a lock request, pass the act to the thread + * that is managing that lockspace: add_lock_action(act). + * + * . Other misc actions are are passed to the worker_thread: + * add_work_action(act). + * + * Onec the client_thread has passed the action off to another + * thread to process, it goes back to waiting for more client + * handling work to do. + * + * The thread that was given the action by the client_thread + * now processes that action according to the operation, act->op. + * This is either a lockspace_thread (for lock ops or ops that + * add/rem a lockspace), or the worker_thread. See below for + * how these ops are processed by these threads. When the + * given thread is done processing the action, the result is + * set in act->result, and the act struct for the completed action + * is passed back to the client_thread (client_results list). + * + * The client_thread takes completed actions (from client_results + * list), and sends the result back to the client that sent the + * request represented by the action. The act struct is then freed. + * + * This completes the cycle of work between lvm commands (clients) + * and lvmlockd. In summary: + * + * - main process polls for new client connections and new requests + * from lvm commands + * - client_thread reads requests from clients + * - client_thread creates an action struct for each request + * - client_thread passes the act to another thread for processing + * - other threads pass completed act structs back to client_thread + * - client_thread sends the act result back to the client and frees the act + * + * + * Lockspace threads: + * Each lockd VG has its own lockspace that contains locks for that VG. + * Each 'struct lockspace' is managed by a separate lockspace_thread. 
+ * When the lockspace_thread is first created, the first thing it does + * is join the lockspace in the lock manager. This can take a long time. + * If the join fails, the thread exits. After the join, the thread + * enters a loop waiting for lock actions to perform in the lockspace. + * + * The request to remove/leave a lockspace causes a flag to be set in + * the lockspace struct. When the lockspace_thread sees this flag + * set, it leaves the lockspace, and exits. + * + * When the client_thread passes a new action to a lockspace_thread, + * i.e. a new lock request, the lockspace_thread identifies which resource + * is being locked (GL, VG, LV), and gets the 'struct resource' (r) for it. + * r->type will be LD_RT_GL, LD_RT_VG, or LD_RT_LV. r->name is the + * resource name, and is fixed for GL and VG resources, but is based on + * the LV name for LV resources. The act is added to the resource's + * list of actions: r->actions, i.e. outstanding lock requests on the + * resource. + * + * The lockspace thread then iterates through each resource in the + * lockspace, processing any outstanding actions on each: res_process(ls, r). + * + * res_process() compares the outstanding actions/requests in r->actions + * against any existing locks on the resource in r->locks. If the + * action is blocked by existing locks, it's left on r->actions. If not, + * the action/request is passed to the lock manager. If the result from + * the lock manager is success, a new 'struct lock' is created for the + * action and saved on r->locks. The result is set in act->result and + * the act is passed back to the client_thread to be returned to the client. + */ + +static const char *lvmlockd_protocol = "lvmlockd"; +static const int lvmlockd_protocol_version = 1; +static int daemon_quit; +static int adopt_opt; + +static daemon_handle lvmetad_handle; +static pthread_mutex_t lvmetad_mutex; +static int lvmetad_connected; + +/* + * We use a separate socket for dumping daemon info. 
+ * This will not interfere with normal operations, and allows + * free-form debug data to be dumped instead of the libdaemon + * protocol that wants all data in the cft format. + * 1MB should fit all the info we need to dump. + */ +#define DUMP_SOCKET_NAME "lvmlockd-dump.sock" +#define DUMP_BUF_SIZE (1024 * 1024) +static char dump_buf[DUMP_BUF_SIZE]; +static struct sockaddr_un dump_addr; +static socklen_t dump_addrlen; + +/* + * Main program polls client connections, adds new clients, + * adds work for client thread. + * + * pollfd_mutex is used for adding vs removing entries, + * and for resume vs realloc. + */ +#define POLL_FD_UNUSED -1 /* slot if free */ +#define POLL_FD_IGNORE -2 /* slot is used but ignore in poll */ +#define ADD_POLL_SIZE 16 /* increment slots by this amount */ + +static pthread_mutex_t pollfd_mutex; +static struct pollfd *pollfd; +static int pollfd_size; +static int pollfd_maxi; +static int listen_pi; +static int listen_fd; +static int restart_pi; +static int restart_fds[2]; + +/* + * Each lockspace has its own thread to do locking. + * The lockspace thread makes synchronous lock requests to dlm/sanlock. + * Every vg with a lockd type, i.e. "dlm", "sanlock", should be on this list. + * + * lockspaces_inactive holds old ls structs for vgs that have been + * stopped, or for vgs that failed to start. The old ls structs + * are removed from the inactive list and freed when a new ls with + * the same name is started and added to the standard lockspaces list. + * Keeping this bit of "history" for the ls allows us to return a + * more informative error message if a vg lock request is made for + * an ls that has been stopped or failed to start. + */ +static pthread_mutex_t lockspaces_mutex; +static struct list_head lockspaces; +static struct list_head lockspaces_inactive; + +/* + * This flag is set to 1 if we see multiple vgs with the global + * lock enabled. 
While this is set, we return a special flag + * with the vg lock result indicating to the lvm command that + * there is a duplicate gl in the vg which should be resolved. + * While this is set, find_lockspace_name has the side job of + * counting the number of lockspaces with enabled gl's so that + * this can be set back to zero when the duplicates are disabled. + */ +static int sanlock_gl_dup; + +/* + * Client thread reads client requests and writes client results. + */ +static pthread_t client_thread; +static pthread_mutex_t client_mutex; +static pthread_cond_t client_cond; +static struct list_head client_list; /* connected clients */ +static struct list_head client_results; /* actions to send back to clients */ +static uint32_t client_ids; /* 0 and ADOPT_CLIENT_ID are skipped */ +static int client_stop; /* stop the thread */ +static int client_work; /* a client on client_list has work to do */ + +#define ADOPT_CLIENT_ID 0xFFFFFFFF /* special client_id for adopt actions */ +static struct list_head adopt_results; /* special start actions from adopt_locks() */ + +/* + * Worker thread performs misc non-locking actions, e.g. init/free. + */ +static pthread_t worker_thread; +static pthread_mutex_t worker_mutex; +static pthread_cond_t worker_cond; +static struct list_head worker_list; /* actions for worker_thread */ +static int worker_stop; /* stop the thread */ +static int worker_wake; /* wake the thread without adding work */ + +/* + * The content of every log_foo() statement is saved in the + * circular buffer, which can be dumped to a client and printed. + */ +#define LOG_LINE_SIZE 256 +#define LOG_DUMP_SIZE DUMP_BUF_SIZE +#define LOG_SYSLOG_PRIO LOG_WARNING +static char log_dump[LOG_DUMP_SIZE]; +static unsigned int log_point; +static unsigned int log_wrap; +static pthread_mutex_t log_mutex; +static int syslog_priority = LOG_SYSLOG_PRIO; + +/* + * Structure pools to avoid repeated malloc/free. 
+ */ +#define MAX_UNUSED_ACTION 64 +#define MAX_UNUSED_CLIENT 64 +#define MAX_UNUSED_RESOURCE 64 +#define MAX_UNUSED_LOCK 64 +static pthread_mutex_t unused_struct_mutex; +static struct list_head unused_action; +static struct list_head unused_client; +static struct list_head unused_resource; +static struct list_head unused_lock; +static int unused_action_count; +static int unused_client_count; +static int unused_resource_count; +static int unused_lock_count; +static int resource_lm_data_size; /* max size of lm_data from sanlock|dlm */ +static int alloc_new_structs; /* used for initializing in setup_structs */ + +#define DO_STOP 1 +#define NO_STOP 0 +#define DO_FREE 1 +#define NO_FREE 0 +#define DO_FORCE 1 +#define NO_FORCE 0 + +static int add_lock_action(struct action *act); +static int str_to_lm(const char *str); +static void clear_lockspace_inactive(char *name); + +static int _syslog_name_to_num(const char *name) +{ + if (!strcmp(name, "emerg")) + return LOG_EMERG; + if (!strcmp(name, "alert")) + return LOG_ALERT; + if (!strcmp(name, "crit")) + return LOG_CRIT; + if (!strcmp(name, "err") || !strcmp(name, "error")) + return LOG_ERR; + if (!strcmp(name, "warning") || !strcmp(name, "warn")) + return LOG_WARNING; + if (!strcmp(name, "notice")) + return LOG_NOTICE; + if (!strcmp(name, "info")) + return LOG_INFO; + if (!strcmp(name, "debug")) + return LOG_DEBUG; + return LOG_WARNING; +} + +static const char *_syslog_num_to_name(int num) +{ + switch (num) { + case LOG_EMERG: + return "emerg"; + case LOG_ALERT: + return "alert"; + case LOG_CRIT: + return "crit"; + case LOG_ERR: + return "err"; + case LOG_WARNING: + return "warning"; + case LOG_NOTICE: + return "notice"; + case LOG_INFO: + return "info"; + case LOG_DEBUG: + return "debug"; + } + return "unknown"; +} + +static uint64_t monotime(void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec; +} + +static void log_save_line(int len, char *line, + char *log_buf, unsigned int *point, 
unsigned int *wrap) +{ + unsigned int p = *point; + unsigned int w = *wrap; + int i; + + if (len < LOG_DUMP_SIZE - p) { + memcpy(log_buf + p, line, len); + p += len; + + if (p == LOG_DUMP_SIZE) { + p = 0; + w = 1; + } + goto out; + } + + for (i = 0; i < len; i++) { + log_buf[p++] = line[i]; + + if (p == LOG_DUMP_SIZE) { + p = 0; + w = 1; + } + } + out: + *point = p; + *wrap = w; +} + +void log_level(int level, const char *fmt, ...) +{ + char line[LOG_LINE_SIZE]; + va_list ap; + int len = LOG_LINE_SIZE - 1; + int ret, pos = 0; + + memset(line, 0, sizeof(line)); + + ret = snprintf(line, len, "%llu ", (unsigned long long)time(NULL)); + pos += ret; + + va_start(ap, fmt); + ret = vsnprintf(line + pos, len - pos, fmt, ap); + va_end(ap); + + if (ret >= len - pos) + pos = len - 1; + else + pos += ret; + + line[pos++] = '\n'; + line[pos++] = '\0'; + + pthread_mutex_lock(&log_mutex); + log_save_line(pos - 1, line, log_dump, &log_point, &log_wrap); + pthread_mutex_unlock(&log_mutex); + + if (level <= syslog_priority) + syslog(level, "%s", line); + + if (daemon_debug) + fprintf(stderr, "%s", line); +} + +static int dump_log(int *dump_len) +{ + int tail_len; + + pthread_mutex_lock(&log_mutex); + + if (!log_wrap && !log_point) { + *dump_len = 0; + } else if (log_wrap) { + tail_len = LOG_DUMP_SIZE - log_point; + memcpy(dump_buf, log_dump+log_point, tail_len); + if (log_point) + memcpy(dump_buf+tail_len, log_dump, log_point); + *dump_len = LOG_DUMP_SIZE; + } else { + memcpy(dump_buf, log_dump, log_point-1); + *dump_len = log_point-1; + } + pthread_mutex_unlock(&log_mutex); + + return 0; +} + +struct lockspace *alloc_lockspace(void) +{ + struct lockspace *ls; + + if (!(ls = malloc(sizeof(struct lockspace)))) { + log_error("out of memory for lockspace"); + return NULL; + } + + memset(ls, 0, sizeof(struct lockspace)); + INIT_LIST_HEAD(&ls->actions); + INIT_LIST_HEAD(&ls->resources); + pthread_mutex_init(&ls->mutex, NULL); + pthread_cond_init(&ls->cond, NULL); + return ls; +} + 
/*
 * Pop an action from the unused pool, or malloc a new one when the pool
 * is empty (or during setup_structs, which forces fresh allocation to
 * populate the pools).  Returned struct is zeroed; NULL on OOM.
 */
static struct action *alloc_action(void)
{
	struct action *act;

	pthread_mutex_lock(&unused_struct_mutex);
	if (!unused_action_count || alloc_new_structs) {
		act = malloc(sizeof(struct action));
	} else {
		act = list_first_entry(&unused_action, struct action, list);
		list_del(&act->list);
		unused_action_count--;
	}
	pthread_mutex_unlock(&unused_struct_mutex);
	if (act)
		memset(act, 0, sizeof(struct action));
	else
		log_error("out of memory for action");
	return act;
}

/* Same pooling scheme as alloc_action, for client structs. */
static struct client *alloc_client(void)
{
	struct client *cl;

	pthread_mutex_lock(&unused_struct_mutex);
	if (!unused_client_count || alloc_new_structs) {
		cl = malloc(sizeof(struct client));
	} else {
		cl = list_first_entry(&unused_client, struct client, list);
		list_del(&cl->list);
		unused_client_count--;
	}
	pthread_mutex_unlock(&unused_struct_mutex);
	if (cl)
		memset(cl, 0, sizeof(struct client));
	else
		log_error("out of memory for client");
	return cl;
}

/*
 * Same pooling scheme, for resource structs.  Each resource carries
 * resource_lm_data_size trailing bytes of lock-manager private data
 * (sized in setup_structs for the larger of sanlock/dlm).
 */
static struct resource *alloc_resource(void)
{
	struct resource *r;

	pthread_mutex_lock(&unused_struct_mutex);
	if (!unused_resource_count || alloc_new_structs) {
		r = malloc(sizeof(struct resource) + resource_lm_data_size);
	} else {
		r = list_first_entry(&unused_resource, struct resource, list);
		list_del(&r->list);
		unused_resource_count--;
	}
	pthread_mutex_unlock(&unused_struct_mutex);
	if (r) {
		memset(r, 0, sizeof(struct resource) + resource_lm_data_size);
		INIT_LIST_HEAD(&r->locks);
		INIT_LIST_HEAD(&r->actions);
	} else {
		log_error("out of memory for resource");
	}
	return r;
}

/* Same pooling scheme as alloc_action, for lock structs. */
static struct lock *alloc_lock(void)
{
	struct lock *lk;

	pthread_mutex_lock(&unused_struct_mutex);
	if (!unused_lock_count || alloc_new_structs) {
		lk = malloc(sizeof(struct lock));
	} else {
		lk = list_first_entry(&unused_lock, struct lock, list);
		list_del(&lk->list);
		unused_lock_count--;
	}
	pthread_mutex_unlock(&unused_struct_mutex);
	if (lk)
		memset(lk, 0, sizeof(struct lock));
	else
		log_error("out of memory for lock");
	return lk;
}

/*
 * Return an action to the unused pool, or free() it when the pool is
 * already at MAX_UNUSED_ACTION.
 */
static void free_action(struct action *act)
{
	pthread_mutex_lock(&unused_struct_mutex);
	if (unused_action_count >= MAX_UNUSED_ACTION) {
		free(act);
	} else {
		list_add_tail(&act->list, &unused_action);
		unused_action_count++;
	}
	pthread_mutex_unlock(&unused_struct_mutex);
}

/* Return a client to the unused pool (capped at MAX_UNUSED_CLIENT). */
static void free_client(struct client *cl)
{
	pthread_mutex_lock(&unused_struct_mutex);
	if (unused_client_count >= MAX_UNUSED_CLIENT) {
		free(cl);
	} else {
		list_add_tail(&cl->list, &unused_client);
		unused_client_count++;
	}
	pthread_mutex_unlock(&unused_struct_mutex);
}

/* Return a resource to the unused pool (capped at MAX_UNUSED_RESOURCE). */
static void free_resource(struct resource *r)
{
	pthread_mutex_lock(&unused_struct_mutex);
	if (unused_resource_count >= MAX_UNUSED_RESOURCE) {
		free(r);
	} else {
		list_add_tail(&r->list, &unused_resource);
		unused_resource_count++;
	}
	pthread_mutex_unlock(&unused_struct_mutex);
}

/* Return a lock to the unused pool (capped at MAX_UNUSED_LOCK). */
static void free_lock(struct lock *lk)
{
	pthread_mutex_lock(&unused_struct_mutex);
	if (unused_lock_count >= MAX_UNUSED_LOCK) {
		free(lk);
	} else {
		list_add_tail(&lk->list, &unused_lock);
		unused_lock_count++;
	}
	pthread_mutex_unlock(&unused_struct_mutex);
}

/*
 * One-time initialization of the struct pools: size the per-resource
 * lock-manager data area, init the pool lists/mutex, then pre-populate
 * each pool to half its cap via alloc/free pairs.
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
static int setup_structs(void)
{
	struct action *act;
	struct client *cl;
	struct resource *r;
	struct lock *lk;
	int data_san = lm_data_size_sanlock();
	int data_dlm = lm_data_size_dlm();
	int i;

	resource_lm_data_size = data_san > data_dlm ? data_san : data_dlm;

	pthread_mutex_init(&unused_struct_mutex, NULL);
	INIT_LIST_HEAD(&unused_action);
	INIT_LIST_HEAD(&unused_client);
	INIT_LIST_HEAD(&unused_resource);
	INIT_LIST_HEAD(&unused_lock);

	/*
	 * For setup, force the alloc_ functions to alloc new structs instead
	 * of taking them unused.  This allows alloc_struct/free_struct loop to
	 * populate the unused lists.
+ */ + alloc_new_structs = 1; + + for (i = 0; i < MAX_UNUSED_ACTION/2; i++) { + if (!(act = alloc_action())) + goto fail; + free_action(act); + } + + for (i = 0; i < MAX_UNUSED_CLIENT/2; i++) { + if (!(cl = alloc_client())) + goto fail; + free_client(cl); + } + + for (i = 0; i < MAX_UNUSED_RESOURCE/2; i++) { + if (!(r = alloc_resource())) + goto fail; + free_resource(r); + } + + for (i = 0; i < MAX_UNUSED_LOCK/2; i++) { + if (!(lk = alloc_lock())) + goto fail; + free_lock(lk); + } + + alloc_new_structs = 0; + return 0; +fail: + alloc_new_structs = 0; + return -ENOMEM; +} + +static int add_pollfd(int fd) +{ + int i, new_size; + + pthread_mutex_lock(&pollfd_mutex); + for (i = 0; i < pollfd_size; i++) { + if (pollfd[i].fd != POLL_FD_UNUSED) + continue; + + pollfd[i].fd = fd; + pollfd[i].events = POLLIN; + pollfd[i].revents = 0; + + if (i > pollfd_maxi) + pollfd_maxi = i; + + pthread_mutex_unlock(&pollfd_mutex); + return i; + } + + new_size = pollfd_size + ADD_POLL_SIZE; + + pollfd = realloc(pollfd, new_size * sizeof(struct pollfd)); + if (!pollfd) { + log_error("can't alloc new size %d for pollfd", new_size); + return -ENOMEM; + } + + for (i = pollfd_size; i < new_size; i++) { + pollfd[i].fd = POLL_FD_UNUSED; + pollfd[i].events = 0; + pollfd[i].revents = 0; + } + + i = pollfd_size; + pollfd[i].fd = fd; + pollfd[i].events = POLLIN; + pollfd[i].revents = 0; + pollfd_maxi = i; + + pollfd_size = new_size; + + pthread_mutex_unlock(&pollfd_mutex); + return i; +} + +static void rem_pollfd(int pi) +{ + if (pi < 0) { + log_error("rem_pollfd %d", pi); + return; + } + pthread_mutex_lock(&pollfd_mutex); + pollfd[pi].fd = POLL_FD_UNUSED; + pollfd[pi].events = 0; + pollfd[pi].revents = 0; + pthread_mutex_unlock(&pollfd_mutex); +} + +static const char *lm_str(int x) +{ + switch (x) { + case LD_LM_NONE: + return "none"; + case LD_LM_DLM: + return "dlm"; + case LD_LM_SANLOCK: + return "sanlock"; + default: + return "lm_unknown"; + } +} + +static const char *rt_str(int x) +{ + switch 
(x) { + case LD_RT_GL: + return "gl"; + case LD_RT_VG: + return "vg"; + case LD_RT_LV: + return "lv"; + default: + return "."; + }; +} + +static const char *op_str(int x) +{ + switch (x) { + case LD_OP_INIT: + return "init"; + case LD_OP_FREE: + return "free"; + case LD_OP_START: + return "start"; + case LD_OP_STOP: + return "stop"; + case LD_OP_LOCK: + return "lock"; + case LD_OP_UPDATE: + return "update"; + case LD_OP_CLOSE: + return "close"; + case LD_OP_ENABLE: + return "enable"; + case LD_OP_DISABLE: + return "disable"; + case LD_OP_START_WAIT: + return "start_wait"; + case LD_OP_STOP_ALL: + return "stop_all"; + case LD_OP_RENAME_BEFORE: + return "rename_before"; + case LD_OP_RENAME_FINAL: + return "rename_final"; + case LD_OP_RUNNING_LM: + return "running_lm"; + case LD_OP_FIND_FREE_LOCK: + return "find_free_lock"; + default: + return "op_unknown"; + }; +} + +static const char *mode_str(int x) +{ + switch (x) { + case LD_LK_IV: + return "iv"; + case LD_LK_UN: + return "un"; + case LD_LK_NL: + return "nl"; + case LD_LK_SH: + return "sh"; + case LD_LK_EX: + return "ex"; + default: + return "."; + }; +} + +int last_string_from_args(char *args_in, char *last) +{ + const char *args = args_in; + const char *colon, *str = NULL; + + while (1) { + if (!args || (*args == '\0')) + break; + colon = strstr(args, ":"); + if (!colon) + break; + str = colon; + args = colon + 1; + } + + if (str) { + snprintf(last, MAX_ARGS, "%s", str + 1); + return 0; + } + return -1; +} + +int version_from_args(char *args, unsigned int *major, unsigned int *minor, unsigned int *patch) +{ + char version[MAX_ARGS]; + char *major_str, *minor_str, *patch_str; + char *n, *d1, *d2; + + strncpy(version, args, MAX_ARGS); + + n = strstr(version, ":"); + if (n) + *n = '\0'; + + d1 = strstr(version, "."); + if (!d1) + return -1; + + d2 = strstr(d1 + 1, "."); + if (!d2) + return -1; + + major_str = version; + minor_str = d1 + 1; + patch_str = d2 + 1; + + *d1 = '\0'; + *d2 = '\0'; + + if (major) + *major 
= atoi(major_str); + if (minor) + *minor = atoi(minor_str); + if (patch) + *patch = atoi(patch_str); + + return 0; +} + +/* + * These are few enough that arrays of function pointers can + * be avoided. + */ + +static int lm_prepare_lockspace(struct lockspace *ls, struct action *act) +{ + int rv; + + if (ls->lm_type == LD_LM_DLM) + rv = lm_prepare_lockspace_dlm(ls); + else if (ls->lm_type == LD_LM_SANLOCK) + rv = lm_prepare_lockspace_sanlock(ls); + else + return -1; + + if (act) + act->lm_rv = rv; + return rv; +} + +static int lm_add_lockspace(struct lockspace *ls, struct action *act, int adopt) +{ + int rv; + + if (ls->lm_type == LD_LM_DLM) + rv = lm_add_lockspace_dlm(ls, adopt); + else if (ls->lm_type == LD_LM_SANLOCK) + rv = lm_add_lockspace_sanlock(ls, adopt); + else + return -1; + + if (act) + act->lm_rv = rv; + return rv; +} + +static int lm_rem_lockspace(struct lockspace *ls, struct action *act, int free_vg) +{ + int rv; + + if (ls->lm_type == LD_LM_DLM) + rv = lm_rem_lockspace_dlm(ls, free_vg); + else if (ls->lm_type == LD_LM_SANLOCK) + rv = lm_rem_lockspace_sanlock(ls, free_vg); + else + return -1; + + if (act) + act->lm_rv = rv; + return rv; +} + +static int lm_lock(struct lockspace *ls, struct resource *r, int mode, struct action *act, + uint32_t *r_version, int *retry, int adopt) +{ + int rv; + + if (ls->lm_type == LD_LM_DLM) + rv = lm_lock_dlm(ls, r, mode, r_version, adopt); + else if (ls->lm_type == LD_LM_SANLOCK) + rv = lm_lock_sanlock(ls, r, mode, r_version, retry, adopt); + else + return -1; + + if (act) + act->lm_rv = rv; + return rv; +} + +static int lm_convert(struct lockspace *ls, struct resource *r, + int mode, struct action *act, uint32_t r_version) +{ + int rv; + + if (ls->lm_type == LD_LM_DLM) + rv = lm_convert_dlm(ls, r, mode, r_version); + else if (ls->lm_type == LD_LM_SANLOCK) + rv = lm_convert_sanlock(ls, r, mode, r_version); + else + return -1; + + if (act) + act->lm_rv = rv; + return rv; +} + +static int lm_unlock(struct lockspace *ls, 
struct resource *r, struct action *act, + uint32_t r_version, uint32_t lmu_flags) +{ + int rv; + + if (ls->lm_type == LD_LM_DLM) + return lm_unlock_dlm(ls, r, r_version, lmu_flags); + else if (ls->lm_type == LD_LM_SANLOCK) + return lm_unlock_sanlock(ls, r, r_version, lmu_flags); + else + return -1; + + if (act) + act->lm_rv = rv; + return rv; +} + +static int lm_hosts(struct lockspace *ls, int notify) +{ + if (ls->lm_type == LD_LM_DLM) + return 0; + else if (ls->lm_type == LD_LM_SANLOCK) + return lm_hosts_sanlock(ls, notify); + return -1; +} + +static void lm_rem_resource(struct lockspace *ls, struct resource *r) +{ + if (ls->lm_type == LD_LM_DLM) + lm_rem_resource_dlm(ls, r); + else if (ls->lm_type == LD_LM_SANLOCK) + lm_rem_resource_sanlock(ls, r); +} + +static int lm_find_free_lock(struct lockspace *ls, uint64_t *free_offset) +{ + if (ls->lm_type == LD_LM_DLM) + return 0; + else if (ls->lm_type == LD_LM_SANLOCK) + return lm_find_free_lock_sanlock(ls, free_offset); + return -1; +} + +/* + * While adopting locks, actions originate from the adopt_locks() + * function, not from a client. So, these actions (flagged ADOPT), + * should be passed back to the adopt_locks() function through the + * adopt_results list, and not be sent back to a client via the + * client_list/client_thread. 
+ */ + +static void add_client_result(struct action *act) +{ + pthread_mutex_lock(&client_mutex); + if (act->flags & LD_AF_ADOPT) + list_add_tail(&act->list, &adopt_results); + else + list_add_tail(&act->list, &client_results); + pthread_cond_signal(&client_cond); + pthread_mutex_unlock(&client_mutex); +} + +static struct lock *find_lock_client(struct resource *r, uint32_t client_id) +{ + struct lock *lk; + + list_for_each_entry(lk, &r->locks, list) { + if (lk->client_id == client_id) + return lk; + } + return NULL; +} + +static struct lock *find_lock_persistent(struct resource *r) +{ + struct lock *lk; + + list_for_each_entry(lk, &r->locks, list) { + if (lk->flags & LD_LF_PERSISTENT) + return lk; + } + return NULL; +} + +static struct action *find_action_client(struct resource *r, uint32_t client_id) +{ + struct action *act; + + list_for_each_entry(act, &r->actions, list) { + if (act->client_id != client_id) + continue; + return act; + } + return NULL; +} + +static void add_work_action(struct action *act) +{ + pthread_mutex_lock(&worker_mutex); + if (!worker_stop) { + list_add_tail(&act->list, &worker_list); + pthread_cond_signal(&worker_cond); + } + pthread_mutex_unlock(&worker_mutex); +} + +static int res_lock(struct lockspace *ls, struct resource *r, struct action *act, int *retry) +{ + struct lock *lk; + uint32_t r_version = 0; + int rv; + + log_debug("S %s R %s res_lock mode %s", ls->name, r->name, mode_str(act->mode)); + + if (r->mode == LD_LK_SH && act->mode == LD_LK_SH) + goto add_lk; + + if (r->type == LD_RT_LV && act->lv_args[0]) + memcpy(r->lv_args, act->lv_args, MAX_ARGS); + + rv = lm_lock(ls, r, act->mode, act, &r_version, retry, act->flags & LD_AF_ADOPT); + if (rv == -EAGAIN) + return rv; + if (rv < 0) { + log_error("S %s R %s res_lock lm error %d", ls->name, r->name, rv); + return rv; + } + + log_debug("S %s R %s res_lock lm done r_version %u", + ls->name, r->name, r_version); + + /* lm_lock() reads new r_version */ + + if ((r_version > r->version) 
|| (!r->version && !r->version_zero_valid)) { + /* + * New r_version of the lock: means that another + * host has changed data protected by this lock + * since the last time we acquired it. We + * should invalidate any local cache of the data + * protected by this lock and reread it from disk. + */ + r->version = r_version; + + /* + * When a new global lock is enabled in a new vg, + * it will have version zero, and the first time + * we use it we need to validate the global cache + * since we don't have any version history to know + * the state of the cache. The version could remain + * zero for a long time if no global state is changed + * to cause the GL version to be incremented to 1. + */ + r->version_zero_valid = 1; + + /* + * r is vglk: tell lvmetad to set the vg invalid + * flag, and provide the new r_version. If lvmetad finds + * that its cached vg has seqno less than the value + * we send here, it will set the vg invalid flag. + * lvm commands that read the vg from lvmetad, will + * see the invalid flag returned, will reread the + * vg from disk, update the lvmetad copy, and go on. + * + * r is global: tell lvmetad to set the global invalid + * flag. When commands see this flag returned from lvmetad, + * they will reread metadata from disk, update the lvmetad + * caches, and tell lvmetad to set global invalid to 0. 
+ */ + + if ((r->type == LD_RT_VG) && lvmetad_connected) { + daemon_reply reply; + char *uuid; + + log_debug("S %s R %s res_lock set lvmetad vg version %u", + ls->name, r->name, r_version); + + if (!ls->vg_uuid[0] || !strcmp(ls->vg_uuid, "none")) + uuid = ls->name; + else + uuid = ls->vg_uuid; + + pthread_mutex_lock(&lvmetad_mutex); + reply = daemon_send_simple(lvmetad_handle, "set_vg_info", + "token = %s", "skip", + "uuid = %s", uuid, + "version = %d", (int)r_version, + NULL); + pthread_mutex_unlock(&lvmetad_mutex); + + if (reply.error || strcmp(daemon_reply_str(reply, "response", ""), "OK")) + log_error("set_vg_info in lvmetad failed %d", reply.error); + daemon_reply_destroy(reply); + } + + if ((r->type == LD_RT_GL) && lvmetad_connected) { + daemon_reply reply; + + log_debug("S %s R %s res_lock set lvmetad global invalid", + ls->name, r->name); + + pthread_mutex_lock(&lvmetad_mutex); + reply = daemon_send_simple(lvmetad_handle, "set_global_info", + "token = %s", "skip", + "global_invalid = %d", 1, + NULL); + pthread_mutex_unlock(&lvmetad_mutex); + + if (reply.error || strcmp(daemon_reply_str(reply, "response", ""), "OK")) + log_error("set_global_info in lvmetad failed %d", reply.error); + daemon_reply_destroy(reply); + } + } + + r->mode = act->mode; + +add_lk: + if (r->mode == LD_LK_SH) + r->sh_count++; + + if (!(lk = alloc_lock())) + return -ENOMEM; + + lk->client_id = act->client_id; + lk->mode = act->mode; + + if (act->flags & LD_AF_PERSISTENT) { + lk->flags |= LD_LF_PERSISTENT; + lk->client_id = 0; + } + + list_add_tail(&lk->list, &r->locks); + + return 0; +} + +static int res_convert(struct lockspace *ls, struct resource *r, + struct lock *lk, struct action *act) +{ + uint32_t r_version; + int rv; + + log_debug("S %s R %s res_convert mode %d", ls->name, r->name, act->mode); + + if (act->mode == LD_LK_EX && lk->mode == LD_LK_SH && r->sh_count > 1) + return -EAGAIN; + + /* + * lm_convert() writes new version (from ex) + * Same as lm_unlock() + */ + + if 
((r->type == LD_RT_GL) && (r->mode == LD_LK_EX)) {
		/* releasing the gl from ex: bump its version */
		r->version++;
		lk->version = r->version;
		r_version = r->version;
		log_debug("S %s R %s res_convert r_version inc %u",
			  ls->name, r->name, r_version);

	} else if ((r->type == LD_RT_VG) && (r->mode == LD_LK_EX) && (lk->version > r->version)) {
		/* vg lock: adopt the newer version recorded by res_update */
		r->version = lk->version;
		r_version = r->version;
		log_debug("S %s R %s res_convert r_version new %u", ls->name, r->name, r_version);
	} else {
		r_version = 0;
	}

	rv = lm_convert(ls, r, act->mode, act, r_version);
	if (rv < 0) {
		log_error("S %s R %s res_convert lm error %d", ls->name, r->name, rv);
		return rv;
	}

	log_debug("S %s R %s res_convert lm done", ls->name, r->name);

	/* keep sh_count in step with the mode change */
	if (lk->mode == LD_LK_EX && act->mode == LD_LK_SH) {
		r->sh_count = 1;
	} else if (lk->mode == LD_LK_SH && act->mode == LD_LK_EX) {
		r->sh_count = 0;
	} else {
		/* should not be possible */
		log_error("S %s R %s res_convert invalid modes %d %d",
			  ls->name, r->name, lk->mode, act->mode);
		return -1;
	}

	r->mode = act->mode;
	lk->mode = act->mode;

	return 0;
}

/*
 * Cancel a queued (not yet granted) lock action on r.  A persistent
 * request cancels any queued persistent request; a transient request
 * cancels only the caller's own queued action.  The canceled action is
 * completed back to its client with -ECANCELED.
 * Returns -ECANCELED when something was canceled, -ENOENT otherwise.
 */
static int res_cancel(struct lockspace *ls, struct resource *r,
		      struct action *act)
{
	struct action *cact;

	/*
	 * a client can cancel its own non-persistent lock requests,
	 * when could this happen?
	 *
	 * a client can cancel other client's persistent lock requests,
	 * when could this happen?
	 */

	if (act->flags & LD_AF_PERSISTENT) {
		list_for_each_entry(cact, &r->actions, list) {
			if (!(cact->flags & LD_AF_PERSISTENT))
				continue;
			goto do_cancel;
		}
	} else {
		cact = find_action_client(r, act->client_id);
		if (cact)
			goto do_cancel;
	}

	return -ENOENT;

do_cancel:
	log_debug("S %s R %s res_cancel client %d", ls->name, r->name, cact->client_id);
	cact->result = -ECANCELED;
	list_del(&cact->list);
	add_client_result(cact);

	return -ECANCELED;
}

/*
 * lm_unlock() writes new a r_version (from ex)
 *
 * The r_version of the vg resource is incremented if
 * an "update" was received for the vg lock.  The update
 * contains the new vg seqno from the vg metadata which is
 * used as the r_version.
 *
 * The r_version of the global resource is automatically
 * incremented when it is unlocked from ex mode.
 *
 * r_version is incremented every time a command releases
 * the global lock from ex.
 */

/*
 * persistent locks will not be unlocked for OP_CLOSE/act_close
 * because act_close->flags does not have the PERSISTENT flag
 * set, and a persistent lk->client_id is zero, which will not
 * match the client in act_close->client_id.
 */

/*
 * Release the lock on r held for act (persistent or per-client).
 * The lock manager is only called when the last holder goes away;
 * earlier sh releases just drop the holder record.
 * Returns 0, -ENOENT (no matching lock), or a lock-manager error.
 */
static int res_unlock(struct lockspace *ls, struct resource *r,
		      struct action *act)
{
	struct lock *lk;
	uint32_t r_version;
	int rv;

	if (act->flags & LD_AF_PERSISTENT) {
		lk = find_lock_persistent(r);
		if (lk)
			goto do_unlock;
	} else {
		lk = find_lock_client(r, act->client_id);
		if (lk)
			goto do_unlock;
	}

	/* OP_CLOSE probes every resource, so a miss there is expected */
	if (act->op != LD_OP_CLOSE)
		log_error("S %s R %s res_unlock no locks", ls->name, r->name);
	return -ENOENT;

do_unlock:
	log_debug("S %s R %s res_unlock %s", ls->name, r->name,
		  (act->op == LD_OP_CLOSE) ? "from close" : "");

	/* send unlock to lm when last sh lock is unlocked */
	if (lk->mode == LD_LK_SH) {
		r->sh_count--;
		if (r->sh_count > 0)
			goto rem_lk;
	}

	if ((r->type == LD_RT_GL) && (r->mode == LD_LK_EX)) {
		/* gl released from ex: bump the version written by lm_unlock */
		r->version++;
		lk->version = r->version;
		r_version = r->version;

		log_debug("S %s R %s res_unlock r_version inc %u", ls->name, r->name, r_version);

	} else if ((r->type == LD_RT_VG) && (r->mode == LD_LK_EX) && (lk->version > r->version)) {
		/* vg lock: write the seqno recorded by a prior res_update */
		r->version = lk->version;
		r_version = r->version;

		log_debug("S %s R %s res_unlock r_version new %u",
			  ls->name, r->name, r_version);
	} else {
		r_version = 0;
	}

	rv = lm_unlock(ls, r, act, r_version, 0);
	if (rv < 0) {
		/* should never happen, retry? */
		log_error("S %s R %s res_unlock lm error %d", ls->name, r->name, rv);
		return rv;
	}

	log_debug("S %s R %s res_unlock lm done", ls->name, r->name);

rem_lk:
	list_del(&lk->list);
	free_lock(lk);

	if (list_empty(&r->locks))
		r->mode = LD_LK_UN;

	return 0;
}

/*
 * Record a new version on the client's ex lock; the value is written
 * to the lock manager later, by the eventual unlock/convert.
 * Returns 0, -ENOENT (client holds no lock), or -EINVAL (lock not ex).
 */
static int res_update(struct lockspace *ls, struct resource *r,
		      struct action *act)
{
	struct lock *lk;

	lk = find_lock_client(r, act->client_id);
	if (!lk) {
		log_error("S %s R %s res_update client %u lock not found",
			  ls->name, r->name, act->client_id);
		return -ENOENT;
	}

	if (r->mode != LD_LK_EX) {
		log_error("S %s R %s res_update version on non-ex lock",
			  ls->name, r->name);
		return -EINVAL;
	}

	/* lk version will be written to lm by unlock */

	if (act->flags & LD_AF_NEXT_VERSION)
		lk->version = r->version + 1;
	else
		lk->version = act->version;

	log_debug("S %s R %s res_update lk version to %u", ls->name, r->name, lk->version);

	return 0;
}

/*
 * There is nothing to deallocate when freeing a dlm LV, the LV
 * will simply be unlocked by rem_resource.
+ */ + +static int free_lv(struct lockspace *ls, struct resource *r) +{ + if (ls->lm_type == LD_LM_SANLOCK) + return lm_free_lv_sanlock(ls, r); + else if (ls->lm_type == LD_LM_DLM) + return 0; + else + return -EINVAL; +} + +/* + * NB. we can't do this if sanlock is holding any locks on + * the resource; we'd be rewriting the resource from under + * sanlock and would confuse or break it badly. We don't + * know what another host is doing, so these must be used + * very carefully. + */ + +static int res_able(struct lockspace *ls, struct resource *r, + struct action *act) +{ + int rv; + + if (ls->lm_type != LD_LM_SANLOCK) { + log_error("enable/disable only applies to sanlock"); + return -EINVAL; + } + + if (r->type != LD_RT_GL) { + log_error("enable/disable only applies to global lock"); + return -EINVAL; + } + + if (r->mode != LD_LK_UN) { + log_error("enable/disable only allowed on unlocked resource"); + return -EINVAL; + } + + if (act->op == LD_OP_ENABLE && gl_lsname_sanlock[0]) { + log_error("disable global lock in %s before enable in %s", + gl_lsname_sanlock, ls->name); + return -EINVAL; + } + + if ((act->op == LD_OP_DISABLE) && (act->flags & LD_AF_EX_DISABLE)) { + rv = lm_ex_disable_gl_sanlock(ls); + goto out; + } + + rv = lm_able_gl_sanlock(ls, act->op == LD_OP_ENABLE); +out: + return rv; +} + +/* + * Go through queued actions, and make lock/unlock calls on the resource + * based on the actions and the existing lock state. + * + * All lock operations sent to the lock manager are non-blocking. + * This is because sanlock does not support lock queueing. + * Eventually we could enhance this to take advantage of lock + * queueing when available (i.e. for the dlm). + * + * act_close_list: list of CLOSE actions, identifying clients that have + * closed/terminated their lvmlockd connection, and whose locks should + * be released. Do not remove these actions from act_close_list. 
+ * + * retry_out: set to 1 if the lock manager said we should retry, + * meaning we should call res_process() again in a short while to retry. + */ + +static void res_process(struct lockspace *ls, struct resource *r, + struct list_head *act_close_list, int *retry_out) +{ + struct action *act, *safe, *act_close; + struct lock *lk; + int lm_retry; + int rv; + + /* + * handle version updates for ex locks + * (new version will be written by unlock) + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if (act->op == LD_OP_UPDATE) { + rv = res_update(ls, r, act); + act->result = rv; + list_del(&act->list); + add_client_result(act); + } + } + + /* + * handle explicit unlock actions + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if ((act->op == LD_OP_LOCK) && + (act->mode == LD_LK_IV || act->mode == LD_LK_NL)) { + act->result = -EINVAL; + list_del(&act->list); + add_client_result(act); + } + + if (act->op == LD_OP_LOCK && act->mode == LD_LK_UN) { + rv = res_unlock(ls, r, act); + + if (rv == -ENOENT && (act->flags & LD_AF_UNLOCK_CANCEL)) + rv = res_cancel(ls, r, act); + + /* + * possible unlock results: + * 0: unlock succeeded + * -ECANCELED: cancel succeeded + * -ENOENT: nothing to unlock or cancel + */ + + act->result = rv; + list_del(&act->list); + add_client_result(act); + } + } + + /* + * handle implicit unlocks due to client exit, + * also clear any outstanding actions for the client + */ + + list_for_each_entry(act_close, act_close_list, list) { + res_unlock(ls, r, act_close); + res_cancel(ls, r, act_close); + } + + /* + * handle freeing a lock for an lv that has been removed + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if (act->op == LD_OP_FREE && act->rt == LD_RT_LV) { + log_debug("S %s R %s free_lv", ls->name, r->name); + rv = free_lv(ls, r); + act->result = rv; + list_del(&act->list); + add_client_result(act); + goto r_free; + + } + } + + /* + * handle enable/disable + */ + + 
list_for_each_entry_safe(act, safe, &r->actions, list) { + if (act->op == LD_OP_ENABLE || act->op == LD_OP_DISABLE) { + rv = res_able(ls, r, act); + act->result = rv; + list_del(&act->list); + add_client_result(act); + + if (!rv && act->op == LD_OP_DISABLE) { + log_debug("S %s R %s free disabled", ls->name, r->name); + goto r_free; + } + } + } + + /* + * transient requests on existing transient locks + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if (act->flags & LD_AF_PERSISTENT) + continue; + + lk = find_lock_client(r, act->client_id); + if (!lk) + continue; + + if (lk->mode != act->mode) { + /* convert below */ + /* + act->result = -EEXIST; + list_del(&act->list); + add_client_result(act); + */ + continue; + } else { + /* success */ + act->result = -EALREADY; + list_del(&act->list); + add_client_result(act); + } + } + + /* + * persistent requests on existing persistent locks + * + * persistent locks are not owned by a client, so any + * existing with matching mode satisfies a request. + * only one persistent lock is kept on a resource. + * a single "unowned" persistent lock satisfies + * any/multiple client requests for a persistent lock. + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if (!(act->flags & LD_AF_PERSISTENT)) + continue; + + lk = find_lock_persistent(r); + if (!lk) + continue; + + if (lk->mode != act->mode) { + /* convert below */ + /* + act->result = -EEXIST; + list_del(&act->list); + add_client_result(act); + */ + continue; + } else { + /* success */ + act->result = -EALREADY; + list_del(&act->list); + add_client_result(act); + } + } + + /* + * transient requests with existing persistent locks + * + * Just grant the transient request and do not + * keep a record of it. Assume that the persistent + * lock will not go away while the transient lock + * is needed. 
+ * + * This would be used when an ex, persistent lv lock + * exists from activation, and then something like + * lvextend asks for a transient ex lock to change + * the lv. The lv could not be unlocked by deactivation + * while the lvextend was running. + * + * The logic here for mixing T/P locks is not general + * support; there are a number of cases where it will + * not work: updating version number (lv locks have + * none), ex locks from multiple clients will not + * conflict, explicit un of the transient lock will fail. + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if (act->flags & LD_AF_PERSISTENT) + continue; + + lk = find_lock_persistent(r); + if (!lk) + continue; + + if ((lk->mode == LD_LK_EX) || + (lk->mode == LD_LK_SH && act->mode == LD_LK_SH)) { + act->result = 0; + list_del(&act->list); + add_client_result(act); + } else { + /* persistent lock is sh, transient request is ex */ + /* FIXME: can we remove this case? do a convert here? */ + log_debug("res_process %s existing persistent lock new transient", r->name); + act->result = -EEXIST; + list_del(&act->list); + add_client_result(act); + } + } + + /* + * persistent requests with existing transient locks + * + * If a client requests a P (persistent) lock for a T (transient) + * lock it already holds, we can just change T to P. Fail if the + * same happens for locks from different clients. Changing + * another client's lock from T to P may cause problems + * if that client tries to unlock or update version. + * + * I don't think this P/T combination will be used. + * It might be used if a command was able to take a P + * vg lock, in which case the T vg lock would already + * be held for reading. If the T lock was sh, it would + * be converted to P ex. If the T/P modes matched, the + * lock could just be changed from T to P. 
+ */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if (!(act->flags & LD_AF_PERSISTENT)) + continue; + + lk = find_lock_client(r, act->client_id); + if (!lk) + continue; + + if (lk->mode != act->mode) { + /* FIXME: convert and change to persistent? */ + log_debug("res_process %s existing transient lock new persistent", r->name); + act->result = -EEXIST; + list_del(&act->list); + add_client_result(act); + } else { + lk->flags |= LD_LF_PERSISTENT; + lk->client_id = 0; + act->result = 0; + list_del(&act->list); + add_client_result(act); + } + } + + /* + * convert mode of existing locks + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if (act->flags & LD_AF_PERSISTENT) + lk = find_lock_persistent(r); + else + lk = find_lock_client(r, act->client_id); + if (!lk) + continue; + + if (lk->mode == act->mode) { + /* should never happen, should be found above */ + log_error("convert same mode"); + continue; + } + + /* convert fails immediately, no EAGAIN retry */ + rv = res_convert(ls, r, lk, act); + act->result = rv; + list_del(&act->list); + add_client_result(act); + } + + /* + * Cases above are all requests addressed by existing locks. + * Below handles the rest. Transient and persistent are + * handled the same, except + * - if mode of existing lock is incompat with requested, + * leave the act on r->actions + * - if r mode is EX, any lock action is blocked, just quit + * + * Retry a lock request that fails due to a lock conflict (-EAGAIN): + * if we have not exceeded max retries and lm sets lm_retry (sanlock + * transient conflicts from shared lock implementation), or r type + * is gl or vg (transient real conflicts we want to hide from command). + * lv lock conflicts won't be transient so don't retry them. 
+ */ + + + if (r->mode == LD_LK_EX) + return; + + /* + * r mode is SH or UN, pass lock-sh actions to lm + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + /* grant in order, so break here */ + if (act->op == LD_OP_LOCK && act->mode == LD_LK_EX) + break; + + if (act->op == LD_OP_LOCK && act->mode == LD_LK_SH) { + lm_retry = 0; + + rv = res_lock(ls, r, act, &lm_retry); + if ((rv == -EAGAIN) && + (act->retries <= act->max_retries) && + (lm_retry || (r->type != LD_RT_LV))) { + /* leave act on list */ + log_debug("S %s R %s res_lock EAGAIN retry", ls->name, r->name); + act->retries++; + *retry_out = 1; + } else { + act->result = rv; + list_del(&act->list); + add_client_result(act); + } + if (rv == -EUNATCH) + goto r_free; + } + } + + /* + * r mode is SH, any ex lock action is blocked, just quit + */ + + if (r->mode == LD_LK_SH) + return; + + /* + * r mode is UN, pass lock-ex action to lm + */ + + list_for_each_entry_safe(act, safe, &r->actions, list) { + if (act->op == LD_OP_LOCK && act->mode == LD_LK_EX) { + lm_retry = 0; + + rv = res_lock(ls, r, act, &lm_retry); + if ((rv == -EAGAIN) && + (act->retries <= act->max_retries) && + (lm_retry || (r->type != LD_RT_LV))) { + /* leave act on list */ + log_debug("S %s R %s res_lock EAGAIN retry", ls->name, r->name); + act->retries++; + *retry_out = 1; + } else { + act->result = rv; + list_del(&act->list); + add_client_result(act); + } + if (rv == -EUNATCH) + goto r_free; + break; + } + } + + return; + +r_free: + /* For the EUNATCH case it may be possible there are queued actions? 
 */
	/* (continuation of res_process r_free path) cancel every action
	   still queued on the resource before freeing it */
	list_for_each_entry_safe(act, safe, &r->actions, list) {
		log_error("S %s R %s res_process r_free cancel %s client %d",
			  ls->name, r->name, op_str(act->op), act->client_id);
		act->result = -ECANCELED;
		list_del(&act->list);
		add_client_result(act);
	}
	log_debug("S %s R %s res_process free", ls->name, r->name);
	lm_rem_resource(ls, r);
	list_del(&r->list);
	free_resource(r);
}

/* selector values for for_each_lock(): which resource type to test for */
#define LOCKS_EXIST_ANY 1
#define LOCKS_EXIST_GL 2
#define LOCKS_EXIST_VG 3
#define LOCKS_EXIST_LV 4

/*
 * Scan all resources in the lockspace and return 1 as soon as a held
 * lock matching the locks_do selector is found, 0 if none exists.
 */
static int for_each_lock(struct lockspace *ls, int locks_do)
{
	struct resource *r;
	struct lock *lk;

	list_for_each_entry(r, &ls->resources, list) {
		list_for_each_entry(lk, &r->locks, list) {
			if (locks_do == LOCKS_EXIST_ANY)
				return 1;

			if (locks_do == LOCKS_EXIST_GL && r->type == LD_RT_GL)
				return 1;

			if (locks_do == LOCKS_EXIST_VG && r->type == LD_RT_VG)
				return 1;

			if (locks_do == LOCKS_EXIST_LV && r->type == LD_RT_LV)
				return 1;
		}
	}

	return 0;
}

/*
 * Forcibly drop every lock in the lockspace, unlock the corresponding
 * lm resources, cancel any pending actions, and free all resources.
 * Called when the lockspace is being stopped.  Any lock still held here
 * is logged as an error because its holder still believes it owns it.
 *
 * free_vg: pass LMUF_FREE_VG to lm_unlock (vgremove unlock-rename path).
 * Returns the number of locks that were cleared.
 */
static int clear_locks(struct lockspace *ls, int free_vg)
{
	struct resource *r, *r_safe;
	struct lock *lk, *lk_safe;
	struct action *act, *act_safe;
	uint32_t lk_version;
	uint32_t r_version;
	int lk_count = 0;
	int rv;

	list_for_each_entry_safe(r, r_safe, &ls->resources, list) {
		lk_version = 0;

		list_for_each_entry_safe(lk, lk_safe, &r->locks, list) {
			lk_count++;

			if (lk->flags & LD_LF_PERSISTENT)
				log_error("S %s R %s clear lock persistent", ls->name, r->name);
			else
				log_error("S %s R %s clear lock client %d", ls->name, r->name, lk->client_id);

			/* remember the highest lock version seen so it can
			   be written back to the lm below */
			if (lk->version > lk_version)
				lk_version = lk->version;

			list_del(&lk->list);
			free_lock(lk);
		}

		/* nothing held in the lm for this resource */
		if (r->mode == LD_LK_UN)
			goto r_free;

		if ((r->type == LD_RT_GL) && (r->mode == LD_LK_EX)) {
			/* gl held ex: bump the resource version on unlock */
			r->version++;
			r_version = r->version;
			log_debug("S %s R %s clear_locks r_version inc %u",
				  ls->name, r->name, r_version);

		} else if ((r->type == LD_RT_VG) && (r->mode == LD_LK_EX) && (lk_version > r->version)) {
			/* vg held ex with a newer per-lock version: adopt it */
			r->version = lk_version;
			r_version = r->version;
			log_debug("S %s R %s clear_locks r_version new %u",
				  ls->name, r->name, r_version);

		} else {
			/* 0 tells lm_unlock not to update the version */
			r_version = 0;
		}

		rv = lm_unlock(ls, r, NULL, r_version, free_vg ? LMUF_FREE_VG : 0);
		if (rv < 0) {
			/* should never happen */
			log_error("S %s R %s clear_locks free %d lm unlock error %d",
				  ls->name, r->name, free_vg, rv);
		}

		/* cancel actions still queued on this resource */
		list_for_each_entry_safe(act, act_safe, &r->actions, list) {
			log_error("S %s R %s clear_locks cancel %s client %d",
				  ls->name, r->name, op_str(act->op), act->client_id);
			act->result = -ECANCELED;
			list_del(&act->list);
			add_client_result(act);
		}
 r_free:
		log_debug("S %s R %s free", ls->name, r->name);
		lm_rem_resource(ls, r);
		list_del(&r->list);
		free_resource(r);
	}

	return lk_count;
}

/*
 * find and return the resource that is referenced by the action
 * - there is a single gl resource per lockspace
 * - there is a single vg resource per lockspace
 * - there can be many lv resources per lockspace, compare names
 *
 * If not found and nocreate is 0, allocate a new unlocked resource,
 * add it to ls->resources and return it (NULL on allocation failure).
 */
static struct resource *find_resource_act(struct lockspace *ls,
					  struct action *act,
					  int nocreate)
{
	struct resource *r;

	list_for_each_entry(r, &ls->resources, list) {
		if (r->type != act->rt)
			continue;

		if (r->type == LD_RT_GL && act->rt == LD_RT_GL)
			return r;

		if (r->type == LD_RT_VG && act->rt == LD_RT_VG)
			return r;

		/* lv resources are keyed by lv uuid */
		if (r->type == LD_RT_LV && act->rt == LD_RT_LV &&
		    !strcmp(r->name, act->lv_uuid))
			return r;
	}

	if (nocreate)
		return NULL;

	if (!(r = alloc_resource()))
		return NULL;

	r->type = act->rt;

	r->mode = LD_LK_UN;

	if (r->type == LD_RT_GL)
		strncpy(r->name, R_NAME_GL, MAX_NAME);
	else if (r->type == LD_RT_VG)
		strncpy(r->name, R_NAME_VG, MAX_NAME);
	else if (r->type == LD_RT_LV)
		strncpy(r->name, act->lv_uuid, MAX_NAME);

	list_add_tail(&r->list, &ls->resources);

	return r;
}

/* Remove and free every resource in the lockspace (locks already gone). */
static void free_ls_resources(struct lockspace *ls)
{
	struct
resource *r, *r_safe; + + list_for_each_entry_safe(r, r_safe, &ls->resources, list) { + lm_rem_resource(ls, r); + list_del(&r->list); + free_resource(r); + } +} + +/* + * Process actions queued for this lockspace by + * client_recv_action / add_lock_action. + * + * The lockspace_thread can touch its own ls struct without holding + * lockspaces_mutex until it sets ls->thread_done, after which it + * cannot touch ls without holding lockspaces_mutex. + */ + +#define LOCK_RETRY_MS 1000 /* milliseconds to delay between retry */ + +static void *lockspace_thread_main(void *arg_in) +{ + struct lockspace *ls = arg_in; + struct resource *r, *r2; + struct action *add_act, *act, *safe; + struct list_head tmp_act; + struct list_head act_close; + int free_vg = 0; + int error = 0; + int adopt_flag = 0; + int wait_flag = 0; + int retry; + int rv; + + INIT_LIST_HEAD(&act_close); + + /* first action may be client add */ + pthread_mutex_lock(&ls->mutex); + act = NULL; + add_act = NULL; + if (!list_empty(&ls->actions)) { + act = list_first_entry(&ls->actions, struct action, list); + if (act->op == LD_OP_START) { + add_act = act; + list_del(&add_act->list); + + if (add_act->flags & LD_AF_WAIT) + wait_flag = 1; + if (add_act->flags & LD_AF_ADOPT) + adopt_flag = 1; + } + } + pthread_mutex_unlock(&ls->mutex); + + log_debug("S %s lm_add_lockspace %s wait %d adopt %d", + ls->name, lm_str(ls->lm_type), wait_flag, adopt_flag); + + /* + * The prepare step does not wait for anything and is quick; + * it tells us if the parameters are valid and the lm is running. + */ + error = lm_prepare_lockspace(ls, add_act); + + if (add_act && (!wait_flag || error)) { + /* send initial join result back to client */ + add_act->result = error; + add_client_result(add_act); + add_act = NULL; + } + + /* + * The actual lockspace join can take a while. 
+ */ + if (!error) { + error = lm_add_lockspace(ls, add_act, adopt_flag); + + log_debug("S %s lm_add_lockspace done %d", ls->name, error); + + if (ls->sanlock_gl_enabled && gl_lsname_sanlock[0] && + strcmp(ls->name, gl_lsname_sanlock)) + sanlock_gl_dup = 1; + + if (add_act) { + /* send final join result back to client */ + add_act->result = error; + add_client_result(add_act); + } + } + + pthread_mutex_lock(&ls->mutex); + if (error) { + ls->thread_stop = 1; + ls->create_fail = 1; + } else { + ls->create_done = 1; + } + pthread_mutex_unlock(&ls->mutex); + + if (error) + goto out_act; + + while (1) { + pthread_mutex_lock(&ls->mutex); + while (!ls->thread_work) { + if (ls->thread_stop) { + pthread_mutex_unlock(&ls->mutex); + goto out_rem; + } + pthread_cond_wait(&ls->cond, &ls->mutex); + } + + /* + * Process all the actions queued for this lockspace. + * The client thread queues actions on ls->actions. + * + * Here, take all the actions off of ls->actions, and: + * + * - For lock operations, move the act to r->actions. + * These lock actions/operations processed by res_process(). + * + * - For non-lock operations, e.g. related to managing + * the lockspace, process them in this loop. + */ + + while (1) { + if (list_empty(&ls->actions)) { + ls->thread_work = 0; + break; + } + + act = list_first_entry(&ls->actions, struct action, list); + + if (sanlock_gl_dup && ls->sanlock_gl_enabled) + act->flags |= LD_AF_DUP_GL_LS; + + if (act->op == LD_OP_STOP) { + ls->thread_work = 0; + break; + } + + if (act->op == LD_OP_FREE && act->rt == LD_RT_VG) { + /* vgremove */ + log_debug("S %s checking for lockspace hosts", ls->name); + rv = lm_hosts(ls, 1); + if (rv) { + /* + * Checking for hosts here in addition to after the + * main loop allows vgremove to fail and be rerun + * after the ls is stopped on other hosts. 
+ */ + log_error("S %s lockspace hosts %d", ls->name, rv); + list_del(&act->list); + act->result = -EBUSY; + add_client_result(act); + continue; + } + ls->thread_work = 0; + ls->thread_stop = 1; + free_vg = 1; + break; + } + + if (act->op == LD_OP_RENAME_BEFORE && act->rt == LD_RT_VG) { + /* vgrename */ + log_debug("S %s checking for lockspace hosts", ls->name); + rv = lm_hosts(ls, 1); + if (rv) { + log_error("S %s lockspace hosts %d", ls->name, rv); + list_del(&act->list); + act->result = -EBUSY; + add_client_result(act); + continue; + } + ls->thread_work = 0; + ls->thread_stop = 1; + /* Do we want to check hosts again below like vgremove? */ + break; + } + + if (act->op == LD_OP_FIND_FREE_LOCK && act->rt == LD_RT_VG) { + uint64_t free_offset = 0; + log_debug("S %s find free lock", ls->name); + rv = lm_find_free_lock(ls, &free_offset); + log_debug("S %s find free lock %d offset %llu", + ls->name, rv, (unsigned long long)free_offset); + ls->free_lock_offset = free_offset; + list_del(&act->list); + act->result = rv; + add_client_result(act); + continue; + } + + list_del(&act->list); + + /* applies to all resources */ + if (act->op == LD_OP_CLOSE) { + list_add(&act->list, &act_close); + continue; + } + + /* + * All the other op's are for locking. + * Find the specific resource that the lock op is for, + * and add the act to the resource's list of lock ops. + * + * (This creates a new resource if the one named in + * the act is not found.) + */ + + r = find_resource_act(ls, act, (act->op == LD_OP_FREE) ? 1 : 0); + if (!r) { + act->result = (act->op == LD_OP_FREE) ? -ENOENT : -ENOMEM; + add_client_result(act); + continue; + } + + list_add_tail(&act->list, &r->actions); + + log_debug("S %s R %s action %s %s", ls->name, r->name, + op_str(act->op), mode_str(act->mode)); + } + pthread_mutex_unlock(&ls->mutex); + + /* + * Process the lock operations that have been queued for each + * resource. 
+ */ + + retry = 0; + + list_for_each_entry_safe(r, r2, &ls->resources, list) + res_process(ls, r, &act_close, &retry); + + list_for_each_entry_safe(act, safe, &act_close, list) { + list_del(&act->list); + free_action(act); + } + + if (retry) { + ls->thread_work = 1; + usleep(LOCK_RETRY_MS * 1000); + } + } + +out_rem: + log_debug("S %s stopping", ls->name); + + /* + * For sanlock, we need to unlock any existing locks + * before removing the lockspace, otherwise the sanlock + * daemon will kill us when the lockspace goes away. + * For dlm, we leave with force, so all locks will + * automatically be dropped when we leave the lockspace, + * so unlocking all before leaving could be skipped. + * + * Blindly dropping all existing locks must only be + * allowed in emergency/force situations, otherwise it's + * obviously dangerous, since the lock holders are still + * operating under the assumption that they hold the lock. + * + * For vgremove of a sanlock vg, the vg lock will be held, + * and possibly the gl lock if this vg holds the gl. + * sanlock vgremove wants to unlock-rename these locks. + */ + + log_debug("S %s clearing locks", ls->name); + + rv = clear_locks(ls, free_vg); + + /* + * Tell any other hosts in the lockspace to leave it + * before we remove it (for vgremove). We do this + * before leaving the lockspace ourself because we + * need to be in the lockspace to see others. + */ + + if (free_vg) { + log_debug("S %s checking for lockspace hosts", ls->name); + rv = lm_hosts(ls, 1); + if (rv) + log_error("S %s other lockspace hosts %d", ls->name, rv); + } + + /* + * Leave the lockspace. + */ + + rv = lm_rem_lockspace(ls, NULL, free_vg); + + log_debug("S %s rem_lockspace done %d", ls->name, rv); + +out_act: + /* + * Move remaining actions to results; this will usually (always?) + * be only the stop action. 
+ */ + INIT_LIST_HEAD(&tmp_act); + + pthread_mutex_lock(&ls->mutex); + list_for_each_entry_safe(act, safe, &ls->actions, list) { + if (act->op == LD_OP_FREE) + act->result = 0; + else if (act->op == LD_OP_STOP) + act->result = 0; + else if (act->op == LD_OP_RENAME_BEFORE) + act->result = 0; + else + act->result = -ENOLS; + list_del(&act->list); + list_add_tail(&act->list, &tmp_act); + } + pthread_mutex_unlock(&ls->mutex); + + pthread_mutex_lock(&client_mutex); + list_for_each_entry_safe(act, safe, &tmp_act, list) { + list_del(&act->list); + list_add_tail(&act->list, &client_results); + } + pthread_cond_signal(&client_cond); + pthread_mutex_unlock(&client_mutex); + + pthread_mutex_lock(&lockspaces_mutex); + ls->thread_done = 1; + pthread_mutex_unlock(&lockspaces_mutex); + + /* + * worker_thread will join this thread, and move the + * ls struct from lockspaces list to lockspaces_inactive. + */ + pthread_mutex_lock(&worker_mutex); + worker_wake = 1; + pthread_cond_signal(&worker_cond); + pthread_mutex_unlock(&worker_mutex); + + return NULL; +} + +int lockspaces_empty(void) +{ + int rv; + pthread_mutex_lock(&lockspaces_mutex); + rv = list_empty(&lockspaces); + pthread_mutex_unlock(&lockspaces_mutex); + return rv; +} + +/* + * lockspaces_mutex is locked + * + * When duplicate sanlock global locks have been seen, + * this function has a secondary job of counting the + * number of lockspaces that exist with the gl enabled, + * with the side effect of setting sanlock_gl_dup back to + * zero when the duplicates have been removed/disabled. 
+ */ + +static struct lockspace *find_lockspace_name(char *ls_name) +{ + struct lockspace *ls_found = NULL; + struct lockspace *ls; + int gl_count = 0; + + list_for_each_entry(ls, &lockspaces, list) { + if (!strcmp(ls->name, ls_name)) + ls_found = ls; + + if (!sanlock_gl_dup && ls_found) + return ls_found; + + if (sanlock_gl_dup && ls->sanlock_gl_enabled) + gl_count++; + } + + /* this is the side effect we want from this function */ + if (sanlock_gl_dup && gl_count < 2) + sanlock_gl_dup = 0; + + return ls_found; +} + +/* + * If lvm_<vg_name> is longer than max lockspace name (64) we just ignore the + * extra characters. For sanlock vgs, the name is shortened further to 48 in + * the sanlock code. + */ + +static int vg_ls_name(const char *vg_name, char *ls_name) +{ + if (strlen(vg_name) + 4 > MAX_NAME) { + log_error("vg name too long %s", vg_name); + return -1; + } + + snprintf(ls_name, MAX_NAME, "%s%s", LVM_LS_PREFIX, vg_name); + return 0; +} + +/* FIXME: add mutex for gl_lsname_ ? */ + +static int gl_ls_name(char *ls_name) +{ + if (gl_use_dlm) + memcpy(ls_name, gl_lsname_dlm, MAX_NAME); + else if (gl_use_sanlock) + memcpy(ls_name, gl_lsname_sanlock, MAX_NAME); + else { + log_error("gl_ls_name: global lockspace type unknown"); + return -1; + } + return 0; +} + +/* + * When this function returns an error, the caller needs to deal + * with act (in the cases where act exists). 
+ */ + +static int add_lockspace_thread(const char *ls_name, + const char *vg_name, + const char *vg_uuid, + int lm_type, const char *vg_args, + struct action *act) +{ + struct lockspace *ls, *ls2; + struct resource *r; + uint32_t version = 0; + int rv; + + if (act) + version = act->version; + + log_debug("add_lockspace_thread %s %s version %u", + lm_str(lm_type), ls_name, version); + + if (!(ls = alloc_lockspace())) + return -ENOMEM; + + strncpy(ls->name, ls_name, MAX_NAME); + ls->lm_type = lm_type; + + if (act) + ls->start_client_id = act->client_id; + + if (vg_uuid) + strncpy(ls->vg_uuid, vg_uuid, 64); + + if (vg_name) + strncpy(ls->vg_name, vg_name, MAX_NAME); + + if (vg_args) + strncpy(ls->vg_args, vg_args, MAX_ARGS); + + if (act) + ls->host_id = act->host_id; + + if (!(r = alloc_resource())) { + free(ls); + return -ENOMEM; + } + + r->type = LD_RT_VG; + r->mode = LD_LK_UN; + r->version = version; + strncpy(r->name, R_NAME_VG, MAX_NAME); + list_add_tail(&r->list, &ls->resources); + + pthread_mutex_lock(&lockspaces_mutex); + ls2 = find_lockspace_name(ls->name); + if (ls2) { + if (ls2->thread_stop) + rv = -EAGAIN; + else + rv = -EEXIST; + pthread_mutex_unlock(&lockspaces_mutex); + free_resource(r); + free(ls); + return rv; + } + + /* + * act will be null when this lockspace is added automatically/internally + * and not by an explicit client action that wants a result. + */ + if (act) + list_add(&act->list, &ls->actions); + + clear_lockspace_inactive(ls->name); + + list_add_tail(&ls->list, &lockspaces); + pthread_mutex_unlock(&lockspaces_mutex); + + rv = pthread_create(&ls->thread, NULL, lockspace_thread_main, ls); + if (rv < 0) { + pthread_mutex_lock(&lockspaces_mutex); + list_del(&ls->list); + pthread_mutex_unlock(&lockspaces_mutex); + free_resource(r); + free(ls); + return rv; + } + + return 0; +} + +/* + * There is no add_sanlock_global_lockspace or + * rem_sanlock_global_lockspace because with sanlock, + * the global lockspace is one of the vg lockspaces. 
+ */ + +static int add_dlm_global_lockspace(struct action *act) +{ + int rv; + + if (gl_running_dlm) + return -EEXIST; + + gl_running_dlm = 1; + + /* Keep track of whether we automatically added + the global ls, so we know to automatically + remove it. */ + + if (act) + gl_auto_dlm = 0; + else + gl_auto_dlm = 1; + + /* + * There's a short period after which a previous gl lockspace thread + * has set gl_running_dlm = 0, but before its ls struct has been + * deleted, during which this add_lockspace_thread() can fail with + * -EAGAIN. + */ + + rv = add_lockspace_thread(gl_lsname_dlm, NULL, NULL, LD_LM_DLM, NULL, act); + + if (rv < 0) { + log_error("add_dlm_global_lockspace add_lockspace_thread %d", rv); + gl_running_dlm = 0; + gl_auto_dlm = 0; + } + + return rv; +} + +/* + * If dlm gl lockspace is the only one left, then stop it. + * This is not used for an explicit rem_lockspace action from + * the client, only for auto remove. + */ + +static int rem_dlm_global_lockspace(void) +{ + struct lockspace *ls, *ls_gl = NULL; + int others = 0; + int rv = 0; + + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry(ls, &lockspaces, list) { + if (!strcmp(ls->name, gl_lsname_dlm)) { + ls_gl = ls; + continue; + } + if (ls->thread_stop) + continue; + others++; + break; + } + + if (others) { + rv = -EAGAIN; + goto out; + } + + if (!ls_gl) { + rv = -ENOENT; + goto out; + } + + ls = ls_gl; + pthread_mutex_lock(&ls->mutex); + ls->thread_stop = 1; + ls->thread_work = 1; + pthread_cond_signal(&ls->cond); + pthread_mutex_unlock(&ls->mutex); + rv = 0; +out: + pthread_mutex_unlock(&lockspaces_mutex); + return rv; +} + +/* + * When the first dlm lockspace is added for a vg, + * automatically add a separate dlm lockspace for the + * global lock if it hasn't been done explicitly. + * This is to make the dlm global lockspace work similarly to + * the sanlock global lockspace, which is "automatic" by + * nature of being one of the vg lockspaces. 
+ * + * For sanlock, a separate lockspace is not used for + * the global lock, but the gl lock lives in a vg + * lockspace, (although it's recommended to create a + * special vg dedicated to holding the gl). + * + * N.B. for dlm, if this is an add+WAIT action for a vg + * lockspace, and this triggered the automatic addition + * of the global lockspace, then the action may complete + * for the vg ls add, while the gl ls add is still in + * progress. If the caller wants to ensure that the + * gl ls add is complete, they should explicitly add+WAIT + * the gl ls. + * + * If this function returns and error, the caller + * will queue the act with that error for the client. + */ + +static int add_lockspace(struct action *act) +{ + char ls_name[MAX_NAME+1]; + int rv; + + memset(ls_name, 0, sizeof(ls_name)); + + if (act->rt == LD_RT_GL) { + if (gl_use_dlm) { + rv = add_dlm_global_lockspace(act); + return rv; + } else { + return -EINVAL; + } + } + + if (act->rt == LD_RT_VG) { + if (gl_use_dlm) { + rv = add_dlm_global_lockspace(NULL); + if (rv < 0 && rv != -EEXIST) + return rv; + } + + vg_ls_name(act->vg_name, ls_name); + + rv = add_lockspace_thread(ls_name, act->vg_name, act->vg_uuid, + act->lm_type, act->vg_args, + act); + + if (rv) + log_error("add_lockspace %s add_lockspace_thread %d", ls_name, rv); + return rv; + } + + log_error("add_lockspace bad type %d", act->rt); + return -1; +} + +/* + * vgchange --lock-stop vgname will lock the vg ex, then send a stop, + * so we exect to find the ex vg lock held here, and will automatically + * unlock it when stopping. + * + * Should we attempt to stop the lockspace containing the gl last? 
 */

/*
 * Handle a client remove-lockspace action: signal the lockspace thread
 * to stop, queueing act on it for the final result.  Without LD_AF_FORCE
 * the stop is refused (-EBUSY) while lv locks are still held.
 * Returns 0 once the stop has been requested; the thread completes it.
 */
static int rem_lockspace(struct action *act)
{
	struct lockspace *ls;
	char ls_name[MAX_NAME+1];
	int force = act->flags & LD_AF_FORCE;
	int rt = act->rt;

	/* only dlm has an explicit gl lockspace to remove */
	if (act->rt == LD_RT_GL && act->lm_type != LD_LM_DLM)
		return -EINVAL;

	memset(ls_name, 0, sizeof(ls_name));

	if (act->rt == LD_RT_GL)
		gl_ls_name(ls_name);
	else
		vg_ls_name(act->vg_name, ls_name);

	pthread_mutex_lock(&lockspaces_mutex);
	ls = find_lockspace_name(ls_name);
	if (!ls) {
		pthread_mutex_unlock(&lockspaces_mutex);
		return -ENOLS;
	}

	pthread_mutex_lock(&ls->mutex);
	/* already stopping */
	if (ls->thread_stop) {
		pthread_mutex_unlock(&ls->mutex);
		pthread_mutex_unlock(&lockspaces_mutex);
		return -ESTALE;
	}

	if (!force && for_each_lock(ls, LOCKS_EXIST_LV)) {
		pthread_mutex_unlock(&ls->mutex);
		pthread_mutex_unlock(&lockspaces_mutex);
		return -EBUSY;
	}
	ls->thread_work = 1;
	ls->thread_stop = 1;
	/* NOTE(review): act is dereferenced unconditionally above, so this
	   null check appears redundant -- confirm */
	if (act)
		list_add_tail(&act->list, &ls->actions);
	pthread_cond_signal(&ls->cond);
	pthread_mutex_unlock(&ls->mutex);
	pthread_mutex_unlock(&lockspaces_mutex);

	/*
	 * If the dlm global lockspace was automatically added when
	 * the first dlm vg lockspace was added, then reverse that
	 * by automatically removing the dlm global lockspace when
	 * the last dlm vg lockspace is removed.
	 */

	if (rt == LD_RT_VG && gl_use_dlm && gl_auto_dlm)
		rem_dlm_global_lockspace();

	return 0;
}

/*
 * count how many lockspaces started by this client are still starting;
 * the client will use this to wait for all its start operations to finish
 * (START_WAIT).
 */

static int count_lockspace_starting(uint32_t client_id)
{
	struct lockspace *ls;
	int count = 0;
	int done = 0;
	int fail = 0;

	pthread_mutex_lock(&lockspaces_mutex);
	list_for_each_entry(ls, &lockspaces, list) {
		if (ls->start_client_id != client_id)
			continue;

		/* still starting: neither done nor failed yet */
		if (!ls->create_done && !ls->create_fail) {
			count++;
			continue;
		}

		if (ls->create_done)
			done++;
		if (ls->create_fail)
			fail++;
	}
	pthread_mutex_unlock(&lockspaces_mutex);

	log_debug("count_lockspace_starting client %u count %d done %d fail %d",
		  client_id, count, done, fail);

	return count;
}

/* lockspaces_mutex is held */
/* Look up a lockspace by name on the inactive list; NULL if not found. */
static struct lockspace *find_lockspace_inactive(char *ls_name)
{
	struct lockspace *ls;

	list_for_each_entry(ls, &lockspaces_inactive, list) {
		if (!strcmp(ls->name, ls_name))
			return ls;
	}

	return NULL;
}

/* lockspaces_mutex is held */
/* Drop a named lockspace from the inactive list (e.g. before re-adding it). */
static void clear_lockspace_inactive(char *ls_name)
{
	struct lockspace *ls;

	ls = find_lockspace_inactive(ls_name);
	if (ls) {
		list_del(&ls->list);
		free(ls);
	}
}

/* Free every lockspace struct on the inactive list. */
static void free_lockspaces_inactive(void)
{
	struct lockspace *ls, *safe;

	pthread_mutex_lock(&lockspaces_mutex);
	list_for_each_entry_safe(ls, safe, &lockspaces_inactive, list) {
		list_del(&ls->list);
		free(ls);
	}
	pthread_mutex_unlock(&lockspaces_mutex);
}

/*
 * Loop through all lockspaces, and:
 * - if do_stop is set, stop any that are not stopped
 * - if do_free is set, join any that are done stopping (and free ls)
 *
 * do_stop will not stop an ls with lv locks unless force is set.
 *
 * This function does not block or wait for anything.
 *
 * do_stop (no do_free):
 * returns count of lockspaces that need stop (have locks and no force)
 *
 * do_free (no do_stop):
 * returns count of lockspaces that are stopped and need freeing
 *
 * do_stop and do_free:
 * returns sum of the previous two
 */

static int for_each_lockspace(int do_stop, int do_free, int do_force)
{
	struct lockspace *ls, *safe;
	int need_stop = 0;
	int need_free = 0;
	int stop_count = 0;
	int free_count = 0;
	int done;
	int stop;

	pthread_mutex_lock(&lockspaces_mutex);

	if (do_stop) {
		list_for_each_entry(ls, &lockspaces, list) {

			pthread_mutex_lock(&ls->mutex);
			/* already stopping */
			if (ls->thread_stop) {
				pthread_mutex_unlock(&ls->mutex);
				continue;
			}

			if (!do_force && for_each_lock(ls, LOCKS_EXIST_ANY)) {
				/* has locks and no force: leave running */
				need_stop++;
			} else {
				ls->thread_work = 1;
				ls->thread_stop = 1;
				pthread_cond_signal(&ls->cond);
				stop_count++;
			}
			pthread_mutex_unlock(&ls->mutex);
		}
	}

	if (do_free) {
		list_for_each_entry_safe(ls, safe, &lockspaces, list) {

			pthread_mutex_lock(&ls->mutex);
			done = ls->thread_done;
			stop = ls->thread_stop;
			pthread_mutex_unlock(&ls->mutex);

			/* This ls has locks and force is not set. */
			if (!stop)
				continue;

			/*
			 * Once thread_done is set, we know that the lockspace_thread
			 * will not be using/touching the ls struct. Any other
			 * thread touches the ls struct under lockspaces_mutex.
			 */
			if (done) {
				pthread_join(ls->thread, NULL);
				list_del(&ls->list);

				/* In future we may need to free ls->actions here */
				free_ls_resources(ls);
				list_add(&ls->list, &lockspaces_inactive);
				free_count++;
			} else {
				/* stopping but thread not finished yet */
				need_free++;
			}
		}
	}

	/* with no lockspaces left, forget the (non-static) gl type choice */
	if (list_empty(&lockspaces)) {
		if (!gl_type_static) {
			gl_use_dlm = 0;
			gl_use_sanlock = 0;
		}
	}
	pthread_mutex_unlock(&lockspaces_mutex);

	if (stop_count || free_count || need_stop || need_free) {
		log_debug("for_each_lockspace do_stop %d do_free %d "
			  "stop_count %d free_count %d need_stop %d need_free %d",
			  do_stop, do_free, stop_count, free_count, need_stop, need_free);
	}

	return need_stop + need_free;
}

/*
 * This is only called when the daemon is exiting so the sleep/retry
 * loop doesn't have any adverse impact.
 */

/* Repeat for_each_lockspace until nothing is left needing stop/free. */
static void for_each_lockspace_retry(int do_stop, int do_free, int do_force)
{
	int count;

	while (1) {
		count = for_each_lockspace(do_stop, do_free, do_force);
		if (!count)
			break;

		log_debug("for_each_lockspace_retry remaining %d", count);
		sleep(1);
	}
}

/*
 * Handle a vgcreate init action: check for lockspace name collisions,
 * then initialize the vg lock storage via the lock-manager backend.
 */
static int work_init_vg(struct action *act)
{
	struct lockspace *ls;
	char ls_name[MAX_NAME+1];
	int rv = 0;

	memset(ls_name, 0, sizeof(ls_name));

	vg_ls_name(act->vg_name, ls_name);

	/*
	 * The max dlm ls name is 64 and the max sanlock ls name is 48.  So,
	 * after the "lvm_" prefix, only the first 60/44 characters of the VG
	 * name are used for the lockspace name.  This will cause a collision
	 * in the lock manager if two different VG names have the first 60/44
	 * chars in common.  At the time of vgcreate (here), check if any other
	 * VG's are known that would collide.  If the collision is not detected
	 * at vgcreate time, it will be detected at start time and add_lockspace
	 * will fail for the second of the two matching ls names.
+ */ + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry(ls, &lockspaces, list) { + if ((ls->lm_type == LD_LM_SANLOCK) && !strncmp(ls->name, ls_name, 48)) { + rv = -EEXIST; + break; + } + if ((ls->lm_type == LD_LM_DLM) && !strcmp(ls->name, ls_name)) { + rv = -EEXIST; + break; + } + } + pthread_mutex_unlock(&lockspaces_mutex); + + if (rv == -EEXIST) { + log_error("Existing lockspace name %s matches new %s VG names %s %s", + ls->name, ls_name, ls->vg_name, act->vg_name); + return rv; + } + + if (act->lm_type == LD_LM_SANLOCK) + rv = lm_init_vg_sanlock(ls_name, act->vg_name, act->flags, act->vg_args); + else if (act->lm_type == LD_LM_DLM) + rv = lm_init_vg_dlm(ls_name, act->vg_name, act->flags, act->vg_args); + else + rv = -EINVAL; + + return rv; +} + +static int work_rename_vg(struct action *act) +{ + char ls_name[MAX_NAME+1]; + int rv = 0; + + memset(ls_name, 0, sizeof(ls_name)); + + vg_ls_name(act->vg_name, ls_name); + + if (act->lm_type == LD_LM_SANLOCK) + rv = lm_rename_vg_sanlock(ls_name, act->vg_name, act->flags, act->vg_args); + else if (act->lm_type == LD_LM_DLM) + return 0; + else + rv = -EINVAL; + + return rv; +} + +static void work_test_gl(void) +{ + struct lockspace *ls; + int is_enabled = 0; + + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry(ls, &lockspaces, list) { + if (ls->lm_type != LD_LM_SANLOCK) + continue; + + pthread_mutex_lock(&ls->mutex); + if (ls->create_done && !ls->thread_stop) { + is_enabled = lm_gl_is_enabled(ls); + if (is_enabled) { + log_debug("S %s worker found gl_is_enabled", ls->name); + strncpy(gl_lsname_sanlock, ls->name, MAX_NAME); + } + } + pthread_mutex_unlock(&ls->mutex); + + if (is_enabled) + break; + } + + if (!is_enabled) + log_debug("worker found no gl_is_enabled"); + pthread_mutex_unlock(&lockspaces_mutex); +} + +static int work_init_lv(struct action *act) +{ + struct lockspace *ls; + char ls_name[MAX_NAME+1]; + char vg_args[MAX_ARGS]; + char lv_args[MAX_ARGS]; + uint64_t free_offset = 0; + int 
lm_type = 0; + int rv = 0; + + memset(ls_name, 0, sizeof(ls_name)); + memset(vg_args, 0, MAX_ARGS); + memset(lv_args, 0, MAX_ARGS); + + vg_ls_name(act->vg_name, ls_name); + + pthread_mutex_lock(&lockspaces_mutex); + ls = find_lockspace_name(ls_name); + if (ls) { + lm_type = ls->lm_type; + memcpy(vg_args, ls->vg_args, MAX_ARGS); + free_offset = ls->free_lock_offset; + ls->free_lock_offset = 0; + } + pthread_mutex_unlock(&lockspaces_mutex); + + if (!ls) { + lm_type = act->lm_type; + memcpy(vg_args, act->vg_args, MAX_ARGS); + } + + if (act->lm_type != lm_type) { + log_error("init_lv ls_name %s wrong lm_type %d %d", + ls_name, act->lm_type, lm_type); + return -EINVAL; + } + + if (lm_type == LD_LM_SANLOCK) { + rv = lm_init_lv_sanlock(ls_name, act->vg_name, act->lv_uuid, + vg_args, lv_args, free_offset); + + memcpy(act->lv_args, lv_args, MAX_ARGS); + return rv; + + } else if (act->lm_type == LD_LM_DLM) { + return 0; + } else { + log_error("init_lv ls_name %s bad lm_type %d", ls_name, act->lm_type); + return -EINVAL; + } +} + +/* + * When an action is queued for the worker_thread, it is processed right away. + * After processing, some actions need to be retried again in a short while. + * These actions are put on the delayed_list, and the worker_thread will + * process these delayed actions again in SHORT_DELAY_PERIOD. 
+ */ + +#define SHORT_DELAY_PERIOD 2 +#define LONG_DELAY_PERIOD 60 + +static void *worker_thread_main(void *arg_in) +{ + struct list_head delayed_list; + struct timespec ts; + struct action *act, *safe; + uint64_t last_delayed_time = 0; + int delay_sec = LONG_DELAY_PERIOD; + int rv; + + INIT_LIST_HEAD(&delayed_list); + + while (1) { + pthread_mutex_lock(&worker_mutex); + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_sec += delay_sec; + rv = 0; + act = NULL; + + while (list_empty(&worker_list) && !worker_stop && !worker_wake && !rv) { + rv = pthread_cond_timedwait(&worker_cond, &worker_mutex, &ts); + } + worker_wake = 0; + + if (worker_stop) { + pthread_mutex_unlock(&worker_mutex); + goto out; + } + + if (!list_empty(&worker_list)) { + act = list_first_entry(&worker_list, struct action, list); + list_del(&act->list); + } + pthread_mutex_unlock(&worker_mutex); + + /* + * Do new work actions before processing delayed work actions. + */ + + if (!act) + goto delayed_work; + + if (act->op == LD_OP_RUNNING_LM) { + int run_sanlock = lm_is_running_sanlock(); + int run_dlm = lm_is_running_dlm(); + + if (run_sanlock && run_dlm) + act->result = -EXFULL; + else if (!run_sanlock && !run_dlm) + act->result = -ENOLCK; + else if (run_sanlock) + act->result = LD_LM_SANLOCK; + else if (run_dlm) + act->result = LD_LM_DLM; + add_client_result(act); + + } else if ((act->op == LD_OP_LOCK) && (act->flags & LD_AF_SEARCH_LS)) { + /* + * worker_thread used as a helper to search existing + * sanlock vgs for an enabled gl. 
+ */ + log_debug("work search for gl"); + work_test_gl(); + + /* try again to find a gl lockspace for this act */ + rv = add_lock_action(act); + if (rv < 0) { + act->result = rv; + add_client_result(act); + } + + } else if ((act->op == LD_OP_INIT) && (act->rt == LD_RT_VG)) { + log_debug("work init_vg %s", act->vg_name); + act->result = work_init_vg(act); + add_client_result(act); + + } else if ((act->op == LD_OP_INIT) && (act->rt == LD_RT_LV)) { + log_debug("work init_lv %s/%s uuid %s", act->vg_name, act->lv_name, act->lv_uuid); + act->result = work_init_lv(act); + add_client_result(act); + + } else if ((act->op == LD_OP_RENAME_FINAL) && (act->rt == LD_RT_VG)) { + log_debug("work rename_vg %s", act->vg_name); + act->result = work_rename_vg(act); + add_client_result(act); + + } else if (act->op == LD_OP_START_WAIT) { + act->result = count_lockspace_starting(act->client_id); + if (!act->result) + add_client_result(act); + else + list_add(&act->list, &delayed_list); + + } else if (act->op == LD_OP_STOP_ALL) { + act->result = for_each_lockspace(DO_STOP, DO_FREE, (act->flags & LD_AF_FORCE) ? DO_FORCE : NO_FORCE); + if (!act->result || !(act->flags & LD_AF_WAIT)) + add_client_result(act); + else + list_add(&act->list, &delayed_list); + + } else { + log_error("work unknown op %d", act->op); + act->result = -EINVAL; + add_client_result(act); + } + + delayed_work: + /* + * We may want to track retry times per action so that + * we can delay different actions by different amounts. 
+ */ + + if (monotime() - last_delayed_time < SHORT_DELAY_PERIOD) { + delay_sec = 1; + continue; + } + last_delayed_time = monotime(); + + list_for_each_entry_safe(act, safe, &delayed_list, list) { + if (act->op == LD_OP_START_WAIT) { + log_debug("work delayed start_wait for client %u", act->client_id); + act->result = count_lockspace_starting(act->client_id); + if (!act->result) { + list_del(&act->list); + add_client_result(act); + } + + } else if (act->op == LD_OP_STOP_ALL) { + log_debug("work delayed stop_all"); + act->result = for_each_lockspace(DO_STOP, DO_FREE, (act->flags & LD_AF_FORCE) ? DO_FORCE : NO_FORCE); + if (!act->result) { + list_del(&act->list); + act->result = 0; + add_client_result(act); + } + } + } + + /* + * This is not explicitly queued work, and not delayed work, + * but lockspace thread cleanup that's needed when a + * lockspace has been stopped/removed or failed to start. + */ + + for_each_lockspace(NO_STOP, DO_FREE, NO_FORCE); + + if (list_empty(&delayed_list)) + delay_sec = LONG_DELAY_PERIOD; + else + delay_sec = 1; + } +out: + list_for_each_entry_safe(act, safe, &delayed_list, list) { + list_del(&act->list); + free_action(act); + } + + pthread_mutex_lock(&worker_mutex); + list_for_each_entry_safe(act, safe, &worker_list, list) { + list_del(&act->list); + free_action(act); + } + pthread_mutex_unlock(&worker_mutex); + return NULL; +} + +static int setup_worker_thread(void) +{ + int rv; + + INIT_LIST_HEAD(&worker_list); + + pthread_mutex_init(&worker_mutex, NULL); + pthread_cond_init(&worker_cond, NULL); + + rv = pthread_create(&worker_thread, NULL, worker_thread_main, NULL); + if (rv) + return -1; + return 0; +} + +static void close_worker_thread(void) +{ + pthread_mutex_lock(&worker_mutex); + worker_stop = 1; + pthread_cond_signal(&worker_cond); + pthread_mutex_unlock(&worker_mutex); + pthread_join(worker_thread, NULL); +} + +/* client_mutex is locked */ +static struct client *find_client_work(void) +{ + struct client *cl; + + 
list_for_each_entry(cl, &client_list, list) { + if (cl->recv || cl->dead) + return cl; + } + return NULL; +} + +/* client_mutex is locked */ +static struct client *find_client_id(uint32_t id) +{ + struct client *cl; + + list_for_each_entry(cl, &client_list, list) { + if (cl->id == id) + return cl; + } + return NULL; +} + +/* client_mutex is locked */ +static struct client *find_client_pi(int pi) +{ + struct client *cl; + + list_for_each_entry(cl, &client_list, list) { + if (cl->pi == pi) + return cl; + } + return NULL; +} + +/* + * wake up poll() because we have added an fd + * back into pollfd and poll() needs to be restarted + * to recognize it. + */ +static void restart_poll(void) +{ + int rv; + rv = write(restart_fds[1], "w", 1); + if (!rv || rv < 0) + log_debug("restart_poll write %d", errno); +} + +/* poll will take requests from client again, cl->mutex must be held */ +static void client_resume(struct client *cl) +{ + if (cl->dead) + return; + + if (!cl->poll_ignore || cl->fd == -1 || cl->pi == -1) { + /* shouldn't happen */ + log_error("client_resume %d bad state ig %d fd %d pi %d", + cl->id, cl->poll_ignore, cl->fd, cl->pi); + return; + } + + pthread_mutex_lock(&pollfd_mutex); + if (pollfd[cl->pi].fd != POLL_FD_IGNORE) { + log_error("client_resume %d pi %d fd %d not IGNORE", + cl->id, cl->pi, cl->fd); + } + pollfd[cl->pi].fd = cl->fd; + pollfd[cl->pi].events = POLLIN; + pthread_mutex_unlock(&pollfd_mutex); + + restart_poll(); +} + +/* called from client_thread, cl->mutex is held */ +static void client_send_result(struct client *cl, struct action *act) +{ + response res; + char result_flags[128]; + + if (cl->dead) { + log_debug("client send %d skip dead", cl->id); + return; + } + + memset(result_flags, 0, sizeof(result_flags)); + + buffer_init(&res.buffer); + + /* + * EUNATCH is returned when the global lock existed, + * but had been disabled when we tried to lock it, + * so we removed it, and no longer have a gl to lock. 
+ */ + + if (act->result == -EUNATCH) + act->result = -ENOLS; + + /* + * init_vg with dlm|sanlock returns vg_args + * init_lv with sanlock returns lv_args + */ + + if (act->result == -ENOLS) { + /* + * The lockspace could not be found, in which case + * the caller may want to know if any lockspaces exist + * or if lockspaces exist, but not one with the global lock. + * Given this detail, it may be able to procede without + * the lock. + * + * FIXME: it would also help the caller to know if there + * are other sanlock VGs that have not been started. + * If there are, then one of them might have a global + * lock enabled. In that case, vgcreate may not want + * to create a new sanlock vg with gl enabled. + */ + pthread_mutex_lock(&lockspaces_mutex); + if (list_empty(&lockspaces)) + strcat(result_flags, "NO_LOCKSPACES,"); + pthread_mutex_unlock(&lockspaces_mutex); + + if (gl_use_sanlock && !gl_lsname_sanlock[0]) + strcat(result_flags, "NO_GL_LS,"); + else if (gl_use_dlm && !gl_lsname_dlm[0]) + strcat(result_flags, "NO_GL_LS,"); + else + strcat(result_flags, "NO_GL_LS,"); + } + + if (act->flags & LD_AF_DUP_GL_LS) + strcat(result_flags, "DUP_GL_LS,"); + + if (act->flags & LD_AF_INACTIVE_LS) + strcat(result_flags, "INACTIVE_LS,"); + + if (act->flags & LD_AF_ADD_LS_ERROR) + strcat(result_flags, "ADD_LS_ERROR,"); + + if (act->op == LD_OP_INIT) { + /* + * init is a special case where lock args need + * to be passed back to the client. + */ + const char *vg_args = "none"; + const char *lv_args = "none"; + + if (act->vg_args[0]) + vg_args = act->vg_args; + + if (act->lv_args[0]) + lv_args = act->lv_args; + + log_debug("send %s[%d.%u] %s %s rv %d vg_args %s lv_args %s", + cl->name[0] ? cl->name : "client", cl->pid, cl->id, + op_str(act->op), rt_str(act->rt), + act->result, vg_args ? vg_args : "", lv_args ? 
lv_args : ""); + + res = daemon_reply_simple("OK", + "op = %d", act->op, + "op_result = %d", act->result, + "lm_result = %d", act->lm_rv, + "vg_lock_args = %s", vg_args, + "lv_lock_args = %s", lv_args, + "result_flags = %s", result_flags[0] ? result_flags : "none", + NULL); + } else { + /* + * A normal reply. + */ + + log_debug("send %s[%d.%u] %s %s rv %d %s %s", + cl->name[0] ? cl->name : "client", cl->pid, cl->id, + op_str(act->op), rt_str(act->rt), + act->result, (act->result == -ENOLS) ? "ENOLS" : "", result_flags); + + res = daemon_reply_simple("OK", + "op = %d", act->op, + "lock_type = %s", lm_str(act->lm_type), + "op_result = %d", act->result, + "lm_result = %d", act->lm_rv, + "result_flags = %s", result_flags[0] ? result_flags : "none", + NULL); + } + + buffer_write(cl->fd, &res.buffer); + buffer_destroy(&res.buffer); + + client_resume(cl); +} + +/* called from client_thread */ +static void client_purge(struct client *cl) +{ + struct lockspace *ls; + struct action *act; + + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry(ls, &lockspaces, list) { + if (!(act = alloc_action())) + continue; + + act->op = LD_OP_CLOSE; + act->client_id = cl->id; + + pthread_mutex_lock(&ls->mutex); + if (!ls->thread_stop) { + list_add_tail(&act->list, &ls->actions); + ls->thread_work = 1; + pthread_cond_signal(&ls->cond); + } else { + free_action(act); + } + pthread_mutex_unlock(&ls->mutex); + } + pthread_mutex_unlock(&lockspaces_mutex); +} + +static int add_lock_action(struct action *act) +{ + struct lockspace *ls = NULL; + char ls_name[MAX_NAME+1]; + + memset(ls_name, 0, sizeof(ls_name)); + + /* Determine which lockspace this action is for, and set ls_name. 
*/ + + if (act->rt == LD_RT_GL && gl_use_sanlock && + (act->op == LD_OP_ENABLE || act->op == LD_OP_DISABLE)) + vg_ls_name(act->vg_name, ls_name); + else if (act->rt == LD_RT_GL) + gl_ls_name(ls_name); + else + vg_ls_name(act->vg_name, ls_name); + + retry: + pthread_mutex_lock(&lockspaces_mutex); + if (ls_name[0]) + ls = find_lockspace_name(ls_name); + if (!ls) { + int ls_inactive = 0; + int ls_create_fail = 0; + + ls = find_lockspace_inactive(ls_name); + if (ls) { + ls_inactive = 1; + ls_create_fail = ls->create_fail; + ls = NULL; + } + pthread_mutex_unlock(&lockspaces_mutex); + + if (act->op == LD_OP_UPDATE && act->rt == LD_RT_VG) { + log_debug("lockspace not found ignored for vg update"); + return -ENOLS; + + } else if (act->flags & LD_AF_SEARCH_LS) { + /* fail if we've already tried searching for the ls */ + log_error("lockspace search repeated %s", ls_name); + return -ENOLS; + + } else if (act->op == LD_OP_LOCK && act->rt == LD_RT_GL && gl_use_sanlock) { + /* gl may have been enabled in an existing vg */ + log_debug("gl lockspace not found check sanlock vgs"); + act->flags |= LD_AF_SEARCH_LS; + add_work_action(act); + return 0; + + } else if (act->op == LD_OP_LOCK && act->rt == LD_RT_GL && gl_use_dlm) { + log_debug("gl lockspace not found add dlm global"); + act->flags |= LD_AF_SEARCH_LS; + act->flags |= LD_AF_WAIT_STARTING; + add_dlm_global_lockspace(NULL); + gl_ls_name(ls_name); + goto retry; + + } else if (act->op == LD_OP_LOCK && act->mode == LD_LK_UN) { + log_debug("lockspace not found ignored for unlock"); + return -ENOLS; + + } else if (act->op == LD_OP_LOCK && act->rt == LD_RT_VG && ls_inactive) { + /* ls has been stopped or previously failed to start */ + log_debug("lockspace inactive create_fail %d %s", + ls_create_fail, ls_name); + act->flags |= LD_AF_INACTIVE_LS; + if (ls_create_fail) + act->flags |= LD_AF_ADD_LS_ERROR; + return -ENOLS; + + } else { + log_error("lockspace not found %s", ls_name); + return -ENOLS; + } + } + + if (act->lm_type == 
LD_LM_NONE) { + /* return to the command the type we are using */ + act->lm_type = ls->lm_type; + } else if (act->lm_type != ls->lm_type) { + /* should not happen */ + log_error("S %s add_lock_action bad lm_type %d ls %d", + ls_name, act->lm_type, ls->lm_type); + return -EINVAL; + } + + pthread_mutex_lock(&ls->mutex); + if (ls->thread_stop) { + pthread_mutex_unlock(&ls->mutex); + pthread_mutex_unlock(&lockspaces_mutex); + log_error("lockspace is stopping %s", ls_name); + return -ESTALE; + } + + if (!ls->create_fail && !ls->create_done && !(act->flags & LD_AF_WAIT_STARTING)) { + pthread_mutex_unlock(&ls->mutex); + pthread_mutex_unlock(&lockspaces_mutex); + log_debug("lockspace is starting %s", ls_name); + return -ESTARTING; + } + + list_add_tail(&act->list, &ls->actions); + ls->thread_work = 1; + pthread_cond_signal(&ls->cond); + pthread_mutex_unlock(&ls->mutex); + pthread_mutex_unlock(&lockspaces_mutex); + + /* lockspace_thread_main / res_process take it from here */ + + return 0; +} + +static int str_to_op_rt(const char *req_name, int *op, int *rt) +{ + if (!req_name) + goto out; + + if (!strcmp(req_name, "hello")) { + *op = LD_OP_HELLO; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "quit")) { + *op = LD_OP_QUIT; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "info")) { + *op = LD_OP_DUMP_INFO; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "dump")) { + *op = LD_OP_DUMP_LOG; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "init_vg")) { + *op = LD_OP_INIT; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "init_lv")) { + *op = LD_OP_INIT; + *rt = LD_RT_LV; + return 0; + } + if (!strcmp(req_name, "free_vg")) { + *op = LD_OP_FREE; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "free_lv")) { + *op = LD_OP_FREE; + *rt = LD_RT_LV; + return 0; + } + if (!strcmp(req_name, "start_vg")) { + *op = LD_OP_START; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "stop_vg")) { + *op = LD_OP_STOP; + *rt = LD_RT_VG; + return 0; + } + if 
(!strcmp(req_name, "start_wait")) { + *op = LD_OP_START_WAIT; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "stop_all")) { + *op = LD_OP_STOP_ALL; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "lock_gl")) { + *op = LD_OP_LOCK; + *rt = LD_RT_GL; + return 0; + } + if (!strcmp(req_name, "lock_vg")) { + *op = LD_OP_LOCK; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "lock_lv")) { + *op = LD_OP_LOCK; + *rt = LD_RT_LV; + return 0; + } + if (!strcmp(req_name, "vg_update")) { + *op = LD_OP_UPDATE; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "enable_gl")) { + *op = LD_OP_ENABLE; + *rt = LD_RT_GL; + return 0; + } + if (!strcmp(req_name, "disable_gl")) { + *op = LD_OP_DISABLE; + *rt = LD_RT_GL; + return 0; + } + if (!strcmp(req_name, "rename_vg_before")) { + *op = LD_OP_RENAME_BEFORE; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "rename_vg_final")) { + *op = LD_OP_RENAME_FINAL; + *rt = LD_RT_VG; + return 0; + } + if (!strcmp(req_name, "running_lm")) { + *op = LD_OP_RUNNING_LM; + *rt = 0; + return 0; + } + if (!strcmp(req_name, "find_free_lock")) { + *op = LD_OP_FIND_FREE_LOCK; + *rt = LD_RT_VG; + return 0; + } +out: + return -1; +} + +static int str_to_mode(const char *str) +{ + if (!str) + goto out; + if (!strcmp(str, "un")) + return LD_LK_UN; + if (!strcmp(str, "nl")) + return LD_LK_NL; + if (!strcmp(str, "sh")) + return LD_LK_SH; + if (!strcmp(str, "ex")) + return LD_LK_EX; +out: + return LD_LK_IV; +} + +static int str_to_lm(const char *str) +{ + if (!str || !strcmp(str, "none")) + return LD_LM_NONE; + if (!strcmp(str, "sanlock")) + return LD_LM_SANLOCK; + if (!strcmp(str, "dlm")) + return LD_LM_DLM; + return -2; +} + +static uint32_t str_to_opts(const char *str) +{ + uint32_t flags = 0; + + if (!str) + goto out; + if (strstr(str, "persistent")) + flags |= LD_AF_PERSISTENT; + if (strstr(str, "unlock_cancel")) + flags |= LD_AF_UNLOCK_CANCEL; + if (strstr(str, "next_version")) + flags |= LD_AF_NEXT_VERSION; + if (strstr(str, 
"wait")) + flags |= LD_AF_WAIT; + if (strstr(str, "force")) + flags |= LD_AF_FORCE; + if (strstr(str, "ex_disable")) + flags |= LD_AF_EX_DISABLE; + if (strstr(str, "enable")) + flags |= LD_AF_ENABLE; + if (strstr(str, "disable")) + flags |= LD_AF_DISABLE; +out: + return flags; +} + +/* + * dump info + * client_list: each client struct + * lockspaces: each lockspace struct + * lockspace actions: each action struct + * lockspace resources: each resource struct + * lockspace resource actions: each action struct + * lockspace resource locks: each lock struct + */ + +static int setup_dump_socket(void) +{ + int s; + + s = socket(AF_LOCAL, SOCK_DGRAM, 0); + if (s < 0) + return s; + + memset(&dump_addr, 0, sizeof(dump_addr)); + dump_addr.sun_family = AF_LOCAL; + strcpy(&dump_addr.sun_path[1], DUMP_SOCKET_NAME); + dump_addrlen = sizeof(sa_family_t) + strlen(dump_addr.sun_path+1) + 1; + + return s; +} + +static int send_dump_buf(int fd, int dump_len) +{ + int pos = 0; + int ret; + +retry: + ret = sendto(fd, dump_buf + pos, dump_len - pos, MSG_DONTWAIT | MSG_NOSIGNAL, + (struct sockaddr *)&dump_addr, dump_addrlen); + if (ret <= 0) + return ret; + + pos += ret; + + if (pos < dump_len) + goto retry; + + return 0; +} + +static int print_structs(const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "unused_action_count=%d " + "unused_client_count=%d " + "unused_resource_count=%d " + "unused_lock_count=%d\n", + prefix, + unused_action_count, + unused_client_count, + unused_resource_count, + unused_lock_count); +} + +static int print_client(struct client *cl, const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "pid=%d " + "fd=%d " + "pi=%d " + "id=%u " + "name=%s\n", + prefix, + cl->pid, + cl->fd, + cl->pi, + cl->id, + cl->name[0] ? 
cl->name : "."); +} + +static int print_lockspace(struct lockspace *ls, const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "ls_name=%s " + "vg_name=%s " + "vg_uuid=%s " + "vg_sysid=%s " + "vg_args=%s " + "lm_type=%s " + "host_id=%llu " + "create_fail=%d " + "create_done=%d " + "thread_work=%d " + "thread_stop=%d " + "thread_done=%d " + "sanlock_gl_enabled=%d " + "sanlock_gl_dup=%d\n", + prefix, + ls->name, + ls->vg_name, + ls->vg_uuid, + ls->vg_sysid[0] ? ls->vg_sysid : ".", + ls->vg_args, + lm_str(ls->lm_type), + (unsigned long long)ls->host_id, + ls->create_fail ? 1 : 0, + ls->create_done ? 1 : 0, + ls->thread_work ? 1 : 0, + ls->thread_stop ? 1 : 0, + ls->thread_done ? 1 : 0, + ls->sanlock_gl_enabled ? 1 : 0, + ls->sanlock_gl_dup ? 1 : 0); +} + +static int print_action(struct action *act, const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "client_id=%u " + "flags=0x%x " + "version=%u " + "op=%s " + "rt=%s " + "mode=%s " + "lm_type=%s " + "result=%d " + "lm_rv=%d\n", + prefix, + act->client_id, + act->flags, + act->version, + op_str(act->op), + rt_str(act->rt), + mode_str(act->mode), + lm_str(act->lm_type), + act->result, + act->lm_rv); +} + +static int print_resource(struct resource *r, const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "name=%s " + "type=%s " + "mode=%s " + "sh_count=%d " + "version=%u\n", + prefix, + r->name, + rt_str(r->type), + mode_str(r->mode), + r->sh_count, + r->version); +} + +static int print_lock(struct lock *lk, const char *prefix, int pos, int len) +{ + return snprintf(dump_buf + pos, len - pos, + "info=%s " + "mode=%s " + "version=%u " + "flags=0x%x " + "client_id=%u\n", + prefix, + mode_str(lk->mode), + lk->version, + lk->flags, + lk->client_id); +} + +static int dump_info(int *dump_len) +{ + struct client *cl; + struct lockspace *ls; + struct resource *r; + struct lock *lk; + struct 
action *act; + int len, pos, ret; + int rv = 0; + + memset(dump_buf, 0, sizeof(dump_buf)); + len = sizeof(dump_buf); + pos = 0; + + /* + * memory + */ + + pthread_mutex_lock(&unused_struct_mutex); + ret = print_structs("structs", pos, len); + if (ret >= len - pos) { + pthread_mutex_unlock(&unused_struct_mutex); + return -ENOSPC; + } + pos += ret; + pthread_mutex_unlock(&unused_struct_mutex); + + /* + * clients + */ + + pthread_mutex_lock(&client_mutex); + list_for_each_entry(cl, &client_list, list) { + ret = print_client(cl, "client", pos, len); + if (ret >= len - pos) { + rv = -ENOSPC; + break; + } + pos += ret; + } + pthread_mutex_unlock(&client_mutex); + + if (rv < 0) + return rv; + + /* + * lockspaces with their action/resource/lock info + */ + + pthread_mutex_lock(&lockspaces_mutex); + list_for_each_entry(ls, &lockspaces, list) { + + ret = print_lockspace(ls, "ls", pos, len); + if (ret >= len - pos) { + rv = -ENOSPC; + goto out; + } + pos += ret; + + list_for_each_entry(act, &ls->actions, list) { + ret = print_action(act, "ls_action", pos, len); + if (ret >= len - pos) { + rv = -ENOSPC; + goto out; + } + pos += ret; + } + + list_for_each_entry(r, &ls->resources, list) { + ret = print_resource(r, "r", pos, len); + if (ret >= len - pos) { + rv = -ENOSPC; + goto out; + } + pos += ret; + + list_for_each_entry(lk, &r->locks, list) { + ret = print_lock(lk, "lk", pos, len); + if (ret >= len - pos) { + rv = -ENOSPC; + goto out; + } + pos += ret; + } + + list_for_each_entry(act, &r->actions, list) { + ret = print_action(act, "r_action", pos, len); + if (ret >= len - pos) { + rv = -ENOSPC; + goto out; + } + pos += ret; + } + } + } +out: + pthread_mutex_unlock(&lockspaces_mutex); + + *dump_len = pos; + + return rv; +} + +/* called from client_thread, cl->mutex is held */ +static void client_recv_action(struct client *cl) +{ + request req; + response res; + struct action *act; + const char *cl_name; + const char *vg_name; + const char *vg_uuid; + const char *vg_sysid; + 
const char *str; + int64_t val; + uint32_t opts = 0; + int result = 0; + int cl_pid; + int op, rt, lm, mode; + int rv; + + buffer_init(&req.buffer); + + rv = buffer_read(cl->fd, &req.buffer); + if (!rv) { + if (errno == ECONNRESET) { + log_debug("client recv %d ECONNRESET", cl->id); + cl->dead = 1; + } else { + log_error("client recv %d buffer_read error %d", cl->id, errno); + } + buffer_destroy(&req.buffer); + client_resume(cl); + return; + } + + req.cft = dm_config_from_string(req.buffer.mem); + if (!req.cft) { + log_error("client recv %d config_from_string error", cl->id); + buffer_destroy(&req.buffer); + client_resume(cl); + return; + } + + str = daemon_request_str(req, "request", NULL); + rv = str_to_op_rt(str, &op, &rt); + if (rv < 0) { + log_error("client recv %d bad request name \"%s\"", cl->id, str ? str : ""); + dm_config_destroy(req.cft); + buffer_destroy(&req.buffer); + client_resume(cl); + return; + } + + if (op == LD_OP_HELLO || op == LD_OP_QUIT || + op == LD_OP_DUMP_INFO || op == LD_OP_DUMP_LOG) { + + /* + * FIXME: add the client command name to the hello messages + * so it can be saved in cl->name here. 
+ */ + + result = 0; + + if (op == LD_OP_QUIT) { + log_debug("op quit"); + pthread_mutex_lock(&lockspaces_mutex); + if (list_empty(&lockspaces)) { + daemon_quit = 1; + } else { + result = -EBUSY; + } + pthread_mutex_unlock(&lockspaces_mutex); + } + + buffer_init(&res.buffer); + + if (op == LD_OP_DUMP_INFO || op == LD_OP_DUMP_LOG) { + int dump_len = 0; + int fd; + + fd = setup_dump_socket(); + if (fd < 0) + result = fd; + else if (op == LD_OP_DUMP_INFO) + result = dump_info(&dump_len); + else if (op == LD_OP_DUMP_LOG) + result = dump_log(&dump_len); + else + result = -EINVAL; + + res = daemon_reply_simple("OK", + "result = %d", result, + "dump_len = %d", dump_len, + NULL); + if (fd >= 0) { + send_dump_buf(fd, dump_len); + close(fd); + } + + } else { + res = daemon_reply_simple("OK", + "result = %d", result, + "protocol = %s", lvmlockd_protocol, + "version = %d", lvmlockd_protocol_version, + NULL); + } + + buffer_write(cl->fd, &res.buffer); + buffer_destroy(&res.buffer); + dm_config_destroy(req.cft); + buffer_destroy(&req.buffer); + client_resume(cl); + return; + } + + cl_name = daemon_request_str(req, "cmd", NULL); + cl_pid = daemon_request_int(req, "pid", 0); + vg_name = daemon_request_str(req, "vg_name", NULL); + vg_uuid = daemon_request_str(req, "vg_uuid", NULL); + vg_sysid = daemon_request_str(req, "vg_sysid", NULL); + str = daemon_request_str(req, "mode", NULL); + mode = str_to_mode(str); + str = daemon_request_str(req, "opts", NULL); + opts = str_to_opts(str); + str = daemon_request_str(req, "vg_lock_type", NULL); + lm = str_to_lm(str); + + if (cl_pid && cl_pid != cl->pid) + log_error("client recv bad message pid %d client %d", cl_pid, cl->pid); + + /* FIXME: do this in hello message instead */ + if (!cl->name[0] && cl_name) + strncpy(cl->name, cl_name, MAX_NAME); + + if (!gl_use_dlm && !gl_use_sanlock && (lm > 0)) { + if (lm == LD_LM_DLM) + gl_use_dlm = 1; + else if (lm == LD_LM_SANLOCK) + gl_use_sanlock = 1; + + log_debug("set gl_use_%s", lm_str(lm)); + } + 
+ if (!(act = alloc_action())) { + log_error("No memory for action"); + dm_config_destroy(req.cft); + buffer_destroy(&req.buffer); + client_resume(cl); + return; + } + + act->client_id = cl->id; + act->op = op; + act->rt = rt; + act->mode = mode; + act->flags = opts; + act->lm_type = lm; + + if (vg_name && strcmp(vg_name, "none")) + strncpy(act->vg_name, vg_name, MAX_NAME); + + if (vg_uuid && strcmp(vg_uuid, "none")) + strncpy(act->vg_uuid, vg_uuid, 64); + + if (vg_sysid && strcmp(vg_sysid, "none")) + strncpy(act->vg_sysid, vg_sysid, MAX_NAME); + + str = daemon_request_str(req, "lv_name", NULL); + if (str && strcmp(str, "none")) + strncpy(act->lv_name, str, MAX_NAME); + + str = daemon_request_str(req, "lv_uuid", NULL); + if (str && strcmp(str, "none")) + strncpy(act->lv_uuid, str, MAX_NAME); + + val = daemon_request_int(req, "version", 0); + if (val) + act->version = (uint32_t)val; + + str = daemon_request_str(req, "vg_lock_args", NULL); + if (str && strcmp(str, "none")) + strncpy(act->vg_args, str, MAX_ARGS); + + str = daemon_request_str(req, "lv_lock_args", NULL); + if (str && strcmp(str, "none")) + strncpy(act->lv_args, str, MAX_ARGS); + + /* start_vg will include lvmlocal.conf local/host_id here */ + val = daemon_request_int(req, "host_id", 0); + if (val) + act->host_id = val; + + act->max_retries = daemon_request_int(req, "max_retries", DEFAULT_MAX_RETRIES); + + dm_config_destroy(req.cft); + buffer_destroy(&req.buffer); + + log_debug("recv %s[%d.%u] %s %s \"%s\" mode %s flags %x", + cl->name[0] ? 
cl->name : "client", cl->pid, cl->id, + op_str(act->op), rt_str(act->rt), act->vg_name, mode_str(act->mode), opts); + + switch (act->op) { + case LD_OP_START: + rv = add_lockspace(act); + break; + case LD_OP_STOP: + rv = rem_lockspace(act); + break; + case LD_OP_INIT: + case LD_OP_START_WAIT: + case LD_OP_STOP_ALL: + case LD_OP_RENAME_FINAL: + case LD_OP_RUNNING_LM: + add_work_action(act); + rv = 0; + break; + case LD_OP_LOCK: + case LD_OP_UPDATE: + case LD_OP_ENABLE: + case LD_OP_DISABLE: + case LD_OP_FREE: + case LD_OP_RENAME_BEFORE: + case LD_OP_FIND_FREE_LOCK: + rv = add_lock_action(act); + break; + default: + rv = -EINVAL; + }; + + if (rv < 0) { + act->result = rv; + add_client_result(act); + } +} + +static void *client_thread_main(void *arg_in) +{ + struct client *cl; + struct action *act; + + while (1) { + pthread_mutex_lock(&client_mutex); + while (!client_work && list_empty(&client_results)) { + if (client_stop) { + pthread_mutex_unlock(&client_mutex); + goto out; + } + pthread_cond_wait(&client_cond, &client_mutex); + } + + /* + * Send outgoing results back to clients + */ + + if (!list_empty(&client_results)) { + act = list_first_entry(&client_results, struct action, list); + list_del(&act->list); + cl = find_client_id(act->client_id); + pthread_mutex_unlock(&client_mutex); + + if (cl) { + pthread_mutex_lock(&cl->mutex); + client_send_result(cl, act); + pthread_mutex_unlock(&cl->mutex); + } else { + log_debug("no client for result"); + } + free_action(act); + continue; + } + + /* + * Queue incoming actions for lockspace threads + */ + + if (client_work) { + cl = find_client_work(); + if (!cl) + client_work = 0; + pthread_mutex_unlock(&client_mutex); + + if (!cl) + continue; + + pthread_mutex_lock(&cl->mutex); + + if (cl->recv) { + cl->recv = 0; + client_recv_action(cl); + } + + if (cl->dead) { + /* + log_debug("client rem %d pi %d fd %d ig %d", + cl->id, cl->pi, cl->fd, cl->poll_ignore); + */ + /* + * If cl->dead was set in main_loop, then the + * fd has 
already been closed and the pollfd + * entry is already unused. + * main_loop set dead=1, ignore=0, pi=-1, fd=-1 + * + * if cl->dead was not set in main_loop, but + * set in client_recv_action, then the main_loop + * should be ignoring this client fd. + * main_loop set ignore=1 + */ + + if (cl->poll_ignore) { + log_debug("client close %d pi %d fd %d", + cl->id, cl->pi, cl->fd); + /* assert cl->pi != -1 */ + /* assert pollfd[pi].fd == FD_IGNORE */ + close(cl->fd); + rem_pollfd(cl->pi); + cl->pi = -1; + cl->fd = -1; + cl->poll_ignore = 0; + } else { + /* main thread should have closed */ + if (cl->pi != -1 || cl->fd != -1) { + log_error("client %d bad state pi %d fd %d", + cl->id, cl->pi, cl->fd); + } + } + pthread_mutex_unlock(&cl->mutex); + + pthread_mutex_lock(&client_mutex); + list_del(&cl->list); + pthread_mutex_unlock(&client_mutex); + + client_purge(cl); + + free_client(cl); + } else { + pthread_mutex_unlock(&cl->mutex); + } + } + pthread_mutex_unlock(&client_mutex); + } +out: + return NULL; +} + +static int setup_client_thread(void) +{ + int rv; + + INIT_LIST_HEAD(&client_list); + INIT_LIST_HEAD(&client_results); + + pthread_mutex_init(&client_mutex, NULL); + pthread_cond_init(&client_cond, NULL); + + rv = pthread_create(&client_thread, NULL, client_thread_main, NULL); + if (rv) + return -1; + return 0; +} + +static void close_client_thread(void) +{ + pthread_mutex_lock(&client_mutex); + client_stop = 1; + pthread_cond_signal(&client_cond); + pthread_mutex_unlock(&client_mutex); + pthread_join(client_thread, NULL); +} + +/* + * Get a list of all VGs with a lockd type (sanlock|dlm) from lvmetad. + * We'll match this list against a list of existing lockspaces that are + * found in the lock manager. + * + * For each of these VGs, also create a struct resource on ls->resources to + * represent each LV in the VG that uses a lock. For each of these LVs + * that are active, we'll attempt to adopt a lock. 
+ */ + +static int get_lockd_vgs(struct list_head *vg_lockd) +{ + struct list_head update_vgs; + daemon_reply reply; + struct dm_config_node *cn; + struct dm_config_node *metadata; + struct dm_config_node *md_cn; + struct dm_config_node *lv_cn; + struct lockspace *ls, *safe; + struct resource *r; + const char *vg_name; + const char *vg_uuid; + const char *lv_uuid; + const char *lock_type; + const char *lock_args; + char find_str_path[PATH_MAX]; + int mutex_unlocked = 0; + int rv = 0; + + INIT_LIST_HEAD(&update_vgs); + + pthread_mutex_lock(&lvmetad_mutex); + reply = daemon_send_simple(lvmetad_handle, "vg_list", + "token = %s", "skip", + NULL); + + if (reply.error || strcmp(daemon_reply_str(reply, "response", ""), "OK")) { + log_error("vg_list from lvmetad failed %d", reply.error); + rv = -EINVAL; + goto destroy; + } + + if (!(cn = dm_config_find_node(reply.cft->root, "volume_groups"))) { + log_error("get_lockd_vgs no vgs"); + rv = -EINVAL; + goto destroy; + } + + /* create an update_vgs list of all vg uuids */ + + for (cn = cn->child; cn; cn = cn->sib) { + vg_uuid = cn->key; + + if (!(ls = alloc_lockspace())) { + rv = -ENOMEM; + break; + } + + strncpy(ls->vg_uuid, vg_uuid, 64); + list_add_tail(&ls->list, &update_vgs); + log_debug("get_lockd_vgs %s", vg_uuid); + } + destroy: + daemon_reply_destroy(reply); + + if (rv < 0) + goto out; + + /* get vg_name and lock_type for each vg uuid entry in update_vgs */ + + list_for_each_entry(ls, &update_vgs, list) { + reply = daemon_send_simple(lvmetad_handle, "vg_lookup", + "token = %s", "skip", + "uuid = %s", ls->vg_uuid, + NULL); + + if (reply.error || strcmp(daemon_reply_str(reply, "response", ""), "OK")) { + log_error("vg_lookup from lvmetad failed %d", reply.error); + rv = -EINVAL; + goto next; + } + + vg_name = daemon_reply_str(reply, "name", NULL); + if (!vg_name) { + log_error("get_lockd_vgs %s no name", ls->vg_uuid); + rv = -EINVAL; + goto next; + } + + strncpy(ls->vg_name, vg_name, MAX_NAME); + + metadata = 
dm_config_find_node(reply.cft->root, "metadata"); + if (!metadata) { + log_error("get_lockd_vgs %s name %s no metadata", + ls->vg_uuid, ls->vg_name); + rv = -EINVAL; + goto next; + } + + lock_type = dm_config_find_str(metadata, "metadata/lock_type", NULL); + ls->lm_type = str_to_lm(lock_type); + + if ((ls->lm_type != LD_LM_SANLOCK) && (ls->lm_type != LD_LM_DLM)) { + log_debug("get_lockd_vgs %s not lockd type", ls->vg_name); + continue; + } + + lock_args = dm_config_find_str(metadata, "metadata/lock_args", NULL); + if (lock_args) + strncpy(ls->vg_args, lock_args, MAX_ARGS); + + log_debug("get_lockd_vgs %s lock_type %s lock_args %s", + ls->vg_name, lock_type, lock_args ?: "none"); + + /* + * Make a record (struct resource) of each lv that uses a lock. + * For any lv that uses a lock, we'll check if the lv is active + * and if so try to adopt a lock for it. + */ + + for (md_cn = metadata->child; md_cn; md_cn = md_cn->sib) { + if (strcmp(md_cn->key, "logical_volumes")) + continue; + + for (lv_cn = md_cn->child; lv_cn; lv_cn = lv_cn->sib) { + snprintf(find_str_path, PATH_MAX, "%s/lock_type", lv_cn->key); + lock_type = dm_config_find_str(lv_cn, find_str_path, NULL); + + if (!lock_type) + continue; + + snprintf(find_str_path, PATH_MAX, "%s/lock_args", lv_cn->key); + lock_args = dm_config_find_str(lv_cn, find_str_path, NULL); + + snprintf(find_str_path, PATH_MAX, "%s/id", lv_cn->key); + lv_uuid = dm_config_find_str(lv_cn, find_str_path, NULL); + + if (!lv_uuid) { + log_error("get_lock_vgs no lv id for name %s", lv_cn->key); + continue; + } + + if (!(r = alloc_resource())) { + rv = -ENOMEM; + goto next; + } + + r->type = LD_RT_LV; + strncpy(r->name, lv_uuid, MAX_NAME); + if (lock_args) + strncpy(r->lv_args, lock_args, MAX_ARGS); + list_add_tail(&r->list, &ls->resources); + log_debug("get_lockd_vgs %s lv %s %s (name %s)", + ls->vg_name, r->name, lock_args ? 
lock_args : "", lv_cn->key); + } + } + next: + daemon_reply_destroy(reply); + + if (rv < 0) + break; + } + pthread_mutex_unlock(&lvmetad_mutex); + mutex_unlocked = 1; +out: + /* Return lockd VG's on the vg_lockd list. */ + + list_for_each_entry_safe(ls, safe, &update_vgs, list) { + list_del(&ls->list); + + if ((ls->lm_type == LD_LM_SANLOCK) || (ls->lm_type == LD_LM_DLM)) + list_add_tail(&ls->list, vg_lockd); + else + free(ls); + } + + if (!mutex_unlocked) + pthread_mutex_unlock(&lvmetad_mutex); + + return rv; +} + +static char _dm_uuid[64]; + +static char *get_dm_uuid(char *dm_name) +{ + struct dm_info info; + struct dm_task *dmt; + const char *uuid; + + if (!(dmt = dm_task_create(DM_DEVICE_INFO))) + goto fail_out; + + if (!dm_task_set_name(dmt, dm_name)) + goto fail; + + if (!dm_task_run(dmt)) + goto fail; + + if (!dm_task_get_info(dmt, &info)) + goto fail; + + if (!info.exists) + goto fail; + + uuid = dm_task_get_uuid(dmt); + if (!uuid) { + log_error("Failed to get uuid for device %s", dm_name); + goto fail; + } + + if (strncmp(uuid, "LVM", 3)) { + log_debug("dm device %s is not from LVM", dm_name); + goto fail; + } + + memset(_dm_uuid, 0, sizeof(_dm_uuid)); + strcpy(_dm_uuid, uuid); + dm_task_destroy(dmt); + return _dm_uuid; + +fail: + dm_task_destroy(dmt); +fail_out: + return NULL; +} + +/* + * dm reports the LV uuid as: + * LVM-ydpRIdDWBDX25upmj2k0D4deat6oxH8er03T0f4xM8rPIV8XqIhwv3h8Y7xRWjMr + * + * the lock name for the LV is: + * r03T0f-4xM8-rPIV-8XqI-hwv3-h8Y7-xRWjMr + * + * This function formats both as: + * r03T0f4xM8rPIV8XqIhwv3h8Y7xRWjMr + * + * and returns 1 if they match. 
+ */ + +static int match_dm_uuid(char *dm_uuid, char *lv_lock_uuid) +{ + char buf1[64]; + char buf2[64]; + int i, j; + + memset(buf1, 0, sizeof(buf1)); + memset(buf2, 0, sizeof(buf2)); + + for (i = 0, j = 0; i < strlen(lv_lock_uuid); i++) { + if (lv_lock_uuid[i] == '-') + continue; + buf1[j] = lv_lock_uuid[i]; + j++; + } + + for (i = 36, j = 0; i < 69; i++) { + buf2[j] = dm_uuid[i]; + j++; + } + + if (!strcmp(buf1, buf2)) + return 1; + return 0; +} + +/* + * All LVs with a lock_type are on ls->resources. + * Remove any that are not active. The remaining + * will have locks adopted. + */ + +static int remove_inactive_lvs(struct list_head *vg_lockd) +{ + struct lockspace *ls; + struct resource *r, *rsafe; + struct dm_names *names; + struct dm_task *dmt; + char *dm_uuid; + char *vgname, *lvname, *layer; + char namebuf[MAX_NAME+1]; + unsigned next = 0; + int rv = 0; + + if (!(dmt = dm_task_create(DM_DEVICE_LIST))) + return -1; + + if (!dm_task_run(dmt)) { + log_error("Failed to get dm devices"); + rv = -1; + goto ret; + } + + if (!(names = dm_task_get_names(dmt))) { + log_error("Failed to get dm names"); + rv = -1; + goto ret; + } + + if (!names->dev) { + log_debug("dm names none found"); + goto out; + } + + /* + * For each dm name, compare it to each lv in each lockd vg. + */ + + do { + names = (struct dm_names *)((char *) names + next); + + dm_uuid = get_dm_uuid(names->name); + if (!dm_uuid) + goto next_dmname; + + vgname = NULL; + lvname = NULL; + layer = NULL; + + memset(namebuf, 0, sizeof(namebuf)); + strncpy(namebuf, names->name, MAX_NAME); + vgname = namebuf; + + dm_split_lvm_name(NULL, NULL, &vgname, &lvname, &layer); + + log_debug("adopt remove_inactive dm name %s dm uuid %s vgname %s lvname %s", + names->name, dm_uuid, vgname, lvname); + + if (!vgname || !lvname) { + log_debug("dm name %s invalid split vg %s lv %s layer %s", + names->name, vgname ? vgname : "", lvname ? lvname : "", layer ? 
layer : ""); + goto next_dmname; + } + + list_for_each_entry(ls, vg_lockd, list) { + if (strcmp(vgname, ls->vg_name)) + continue; + + if (!strcmp(lvname, "lvmlock")) + continue; + + list_for_each_entry(r, &ls->resources, list) { + if (!match_dm_uuid(dm_uuid, r->name)) + continue; + + /* Found an active LV in a lockd VG. */ + log_debug("dm device %s adopt in vg %s lv %s", + names->name, ls->vg_name, r->name); + r->adopt = 1; + goto next_dmname; + } + } +next_dmname: + next = names->next; + } while (next); + +out: + /* Remove any struct resources that do not need locks adopted. */ + list_for_each_entry(ls, vg_lockd, list) { + list_for_each_entry_safe(r, rsafe, &ls->resources, list) { + if (r->adopt) { + r->adopt = 0; + } else { + log_debug("lockd vg %s remove inactive lv %s", ls->vg_name, r->name); + list_del(&r->list); + free_resource(r); + } + } + } +ret: + dm_task_destroy(dmt); + return rv; +} + +static void adopt_locks(void) +{ + struct list_head ls_found; + struct list_head vg_lockd; + struct list_head to_unlock; + struct lockspace *ls, *lsafe; + struct lockspace *ls1, *l1safe; + struct lockspace *ls2, *l2safe; + struct resource *r, *rsafe; + struct action *act, *asafe; + int count_start = 0, count_start_done = 0, count_start_fail = 0; + int count_adopt = 0, count_adopt_done = 0, count_adopt_fail = 0; + int found, rv; + + INIT_LIST_HEAD(&adopt_results); + + INIT_LIST_HEAD(&ls_found); + INIT_LIST_HEAD(&vg_lockd); + INIT_LIST_HEAD(&to_unlock); + + /* + * Get list of lockspaces from lock managers. + * Get list of VGs from lvmetad with a lockd type. + * Get list of active lockd type LVs from /dev. + * + * ECONNREFUSED means the lock manager is not running. + * This is expected for at least one of them. 
+ */ + + rv = lm_get_lockspaces_dlm(&ls_found); + if ((rv < 0) && (rv != -ECONNREFUSED)) + goto fail; + + rv = lm_get_lockspaces_sanlock(&ls_found); + if ((rv < 0) && (rv != -ECONNREFUSED)) + goto fail; + + if (list_empty(&ls_found)) { + log_debug("No lockspaces found to adopt"); + return; + } + + /* + * Adds a struct lockspace to vg_lockd for each lockd VG. + * Adds a struct resource to ls->resources for each LV. + */ + rv = get_lockd_vgs(&vg_lockd); + if (rv < 0) { + log_error("adopt_locks get_lockd_vgs failed"); + goto fail; + } + + /* + * For each resource on each lockspace, check if the + * corresponding LV is active. If so, leave the + * resource struct, if not free the resource struct. + * The remain entries need to have locks adopted. + */ + rv = remove_inactive_lvs(&vg_lockd); + if (rv < 0) { + log_error("adopt_locks remove_inactive_lvs failed"); + goto fail; + } + + list_for_each_entry(ls, &ls_found, list) { + if (ls->lm_type == LD_LM_DLM) + gl_use_dlm = 1; + + log_debug("adopt %s lockspace %s vg %s", + lm_str(ls->lm_type), ls->name, ls->vg_name); + } + + if (!gl_use_dlm) + gl_use_sanlock = 1; + + list_for_each_entry(ls, &vg_lockd, list) { + log_debug("adopt lvmetad vg %s lock_type %s lock_args %s", + ls->vg_name, lm_str(ls->lm_type), ls->vg_args); + + list_for_each_entry(r, &ls->resources, list) + log_debug("adopt lv %s %s", ls->vg_name, r->name); + } + + /* + * Compare and merge the list of lockspaces in ls_found + * and the list of lockd VGs in vg_lockd. + * + * An ls from ls_found may not have had any active lvs when + * previous lvmlockd died, but the ls should still be joined, + * and checked for GL/VG locks. + * + * An ls from vg_lockd with active lvs should be in ls_found. + * If it's not then we might want to join the ls and acquire locks + * for the active lvs (as opposed to adopting orphans for them.) + * The orphan lock in the ls should have prevented the ls in + * the lock manager from going away. 
+ * + * If an ls in vg_lockd has no active lvs and does not have + * a matching entry in ls_found, then skip it. + * + * An ls in ls_found should always have a matching ls in + * vg_lockd. If it doesn't, then maybe the vg has been + * removed even though the lockspace for the vg is still + * in the lock manager. Just leave the ls in the lm + * alone, and skip the ls_found entry. + */ + + list_for_each_entry_safe(ls1, l1safe, &ls_found, list) { + + /* The dlm global lockspace is special and doesn't match a VG. */ + if (!strcmp(ls1->name, gl_lsname_dlm)) { + list_del(&ls1->list); + free(ls1); + continue; + } + + found = 0; + + list_for_each_entry_safe(ls2, l2safe, &vg_lockd, list) { + if (strcmp(ls1->vg_name, ls2->vg_name)) + continue; + + /* + * LS in both ls_found and vg_lockd. + */ + log_debug("ls %s matches vg %s", ls1->name, ls2->vg_name); + memcpy(ls1->vg_uuid, ls2->vg_uuid, 64); + memcpy(ls1->vg_args, ls2->vg_args, MAX_ARGS); + list_for_each_entry_safe(r, rsafe, &ls2->resources, list) { + list_del(&r->list); + list_add(&r->list, &ls1->resources); + } + list_del(&ls2->list); + free(ls2); + found = 1; + break; + } + + /* + * LS in ls_found, not in vg_lockd. + * An lvm lockspace found in the lock manager has no + * corresponding VG in lvmetad. This shouldn't usually + * happen, but it's possible the VG could have been removed + * while the orphaned lockspace from it was still around. + * Report an error and leave the ls in the lm alone. + */ + if (!found) { + log_error("No VG %s found for lockspace %s %s", + ls1->vg_name, ls1->name, lm_str(ls1->lm_type)); + list_del(&ls1->list); + free(ls1); + } + } + + /* + * LS in vg_lockd, not in ls_found. + * lockd vgs from lvmetad that do not have an existing lockspace. + * This wouldn't be unusual; we just skip the vg. + * But, if the vg has active lvs, then it should have had locks + * and a lockspace. Should we attempt to join the lockspace and + * acquire (not adopt) locks for these LVs? 
+ */ + + list_for_each_entry_safe(ls, lsafe, &vg_lockd, list) { + if (!list_empty(&ls->resources)) { + /* We should have found a lockspace. */ + /* add this ls and acquire locks for ls->resources? */ + log_error("No lockspace %s %s found for VG %s with active LVs", + ls->name, lm_str(ls->lm_type), ls->vg_name); + } else { + /* The VG wasn't started in the previous lvmlockd. */ + log_debug("No ls found for vg %s", ls->vg_name); + } + + list_del(&ls->list); + free(ls); + } + + /* + * Create and queue start actions to add lockspaces. + */ + + if (gl_use_dlm) { + if (!(act = alloc_action())) + goto fail; + log_debug("adopt add dlm global lockspace"); + act->op = LD_OP_START; + act->flags = (LD_AF_ADOPT | LD_AF_WAIT); + act->rt = LD_RT_GL; + act->lm_type = LD_LM_DLM; + act->client_id = ADOPT_CLIENT_ID; + add_dlm_global_lockspace(act); + count_start++; + } + + list_for_each_entry_safe(ls, lsafe, &ls_found, list) { + if (!(act = alloc_action())) + goto fail; + act->op = LD_OP_START; + act->flags = (LD_AF_ADOPT | LD_AF_WAIT); + act->rt = LD_RT_VG; + act->lm_type = ls->lm_type; + act->client_id = ADOPT_CLIENT_ID; + strncpy(act->vg_name, ls->vg_name, MAX_NAME); + memcpy(act->vg_uuid, ls->vg_uuid, 64); + memcpy(act->vg_args, ls->vg_args, MAX_ARGS); + act->host_id = ls->host_id; + + /* set act->version from lvmetad data? */ + + log_debug("adopt add %s vg lockspace %s", lm_str(act->lm_type), act->vg_name); + + rv = add_lockspace_thread(ls->name, act->vg_name, act->vg_uuid, + act->lm_type, act->vg_args, act); + if (rv < 0) { + log_error("Failed to create lockspace thread for VG %s", ls->vg_name); + list_del(&ls->list); + free(ls); + free_action(act); + count_start_fail++; + continue; + } + + /* + * When the lockspace_thread is done with the start act, + * it will see the act ADOPT flag and move the act onto + * the adopt_results list for us to collect below. 
+ */ + count_start++; + } + + log_debug("adopt starting %d lockspaces", count_start); + + /* + * Wait for all start/rejoin actions to complete. Each start action + * queued above will appear on the adopt_results list when finished. + */ + + while (count_start_done < count_start) { + sleep(1); + act = NULL; + + pthread_mutex_lock(&client_mutex); + if (!list_empty(&adopt_results)) { + act = list_first_entry(&adopt_results, struct action, list); + list_del(&act->list); + } + pthread_mutex_unlock(&client_mutex); + + if (!act) + continue; + + if (act->result < 0) { + log_error("adopt add lockspace failed vg %s %d", act->vg_name, act->result); + count_start_fail++; + } + + free_action(act); + count_start_done++; + } + + log_debug("adopt started %d lockspaces done %d fail %d", + count_start, count_start_done, count_start_fail); + + /* + * Create lock-adopt actions for active LVs (ls->resources), + * and GL/VG locks (we don't know if these locks were held + * and orphaned by the last lvmlockd, so try to adopt them + * to see.) + * + * A proper struct lockspace now exists on the lockspaces list + * for each ls in ls_found. Lock ops for one of those + * lockspaces can be done as OP_LOCK actions queued using + * add_lock_action(); + * + * Start by attempting to adopt the lock in the most likely + * mode it was left in (ex for lvs, sh for vg/gl). If + * the mode is wrong, the lm will return an error and we + * try again with the other mode. + */ + + list_for_each_entry(ls, &ls_found, list) { + + /* + * Adopt orphan LV locks. 
+ */ + + list_for_each_entry(r, &ls->resources, list) { + if (!(act = alloc_action())) + goto fail; + act->op = LD_OP_LOCK; + act->rt = LD_RT_LV; + act->mode = LD_LK_EX; + act->flags = (LD_AF_ADOPT | LD_AF_PERSISTENT); + act->client_id = ADOPT_CLIENT_ID; + act->lm_type = ls->lm_type; + strncpy(act->vg_name, ls->vg_name, MAX_NAME); + strncpy(act->lv_uuid, r->name, MAX_NAME); + strncpy(act->lv_args, r->lv_args, MAX_ARGS); + + log_debug("adopt lock for lv %s %s", act->vg_name, act->lv_uuid); + + rv = add_lock_action(act); + if (rv < 0) { + log_error("adopt add_lock_action lv %s %s error %d", act->vg_name, act->lv_uuid, rv); + count_adopt_fail++; + free_action(act); + } else { + count_adopt++; + } + } + + /* + * Adopt orphan VG lock. + */ + + if (!(act = alloc_action())) + goto fail; + act->op = LD_OP_LOCK; + act->rt = LD_RT_VG; + act->mode = LD_LK_SH; + act->flags = LD_AF_ADOPT; + act->client_id = ADOPT_CLIENT_ID; + act->lm_type = ls->lm_type; + strncpy(act->vg_name, ls->vg_name, MAX_NAME); + + log_debug("adopt lock for vg %s", act->vg_name); + + rv = add_lock_action(act); + if (rv < 0) { + log_error("adopt add_lock_action vg %s error %d", act->vg_name, rv); + count_adopt_fail++; + free_action(act); + } else { + count_adopt++; + } + } + + /* + * Adopt orphan GL lock. + */ + + if (!(act = alloc_action())) + goto fail; + act->op = LD_OP_LOCK; + act->rt = LD_RT_GL; + act->mode = LD_LK_SH; + act->flags = LD_AF_ADOPT; + act->client_id = ADOPT_CLIENT_ID; + act->lm_type = (gl_use_sanlock ? LD_LM_SANLOCK : LD_LM_DLM); + + log_debug("adopt lock for gl"); + + rv = add_lock_action(act); + if (rv < 0) { + log_error("adopt add_lock_action gl %s error %d", act->vg_name, rv); + count_adopt_fail++; + free_action(act); + } else { + count_adopt++; + } + + /* + * Wait for lock-adopt actions to complete. The completed + * actions are passed back here via the adopt_results list. 
+ */ + + while (count_adopt_done < count_adopt) { + sleep(1); + act = NULL; + + pthread_mutex_lock(&client_mutex); + if (!list_empty(&adopt_results)) { + act = list_first_entry(&adopt_results, struct action, list); + list_del(&act->list); + } + pthread_mutex_unlock(&client_mutex); + + if (!act) + continue; + + /* + * lock adopt results + */ + + if (act->result == -EUCLEAN) { + /* + * Adopt failed because the orphan has a different mode + * than initially requested. Repeat the lock-adopt operation + * with the other mode. N.B. this logic depends on first + * trying sh then ex for GL/VG locks, and ex then sh for + * LV locks. + */ + + if ((act->rt != LD_RT_LV) && (act->mode == LD_LK_SH)) { + /* GL/VG locks: attempt to adopt ex after sh failed. */ + act->mode = LD_LK_EX; + rv = add_lock_action(act); + + } else if ((act->rt == LD_RT_LV) && (act->mode == LD_LK_EX)) { + /* LV locks: attempt to adopt sh after ex failed. */ + act->mode = LD_LK_SH; + rv = add_lock_action(act); + + } else { + log_error("Failed to adopt %s lock in vg %s error %d", + rt_str(act->rt), act->vg_name, act->result); + count_adopt_fail++; + count_adopt_done++; + free_action(act); + rv = 0; + } + + if (rv < 0) { + log_error("adopt add_lock_action again %s", act->vg_name); + count_adopt_fail++; + count_adopt_done++; + free_action(act); + } + + } else if (act->result == -ENOENT) { + /* + * No orphan lock exists. This is common for GL/VG locks + * because they may not have been held when lvmlockd exited. + * It's also expected for LV types that do not use a lock. + */ + + if (act->rt == LD_RT_LV) { + /* Unexpected, we should have found an orphan. */ + log_error("Failed to adopt LV lock for %s %s error %d", + act->vg_name, act->lv_uuid, act->result); + count_adopt_fail++; + } else { + /* Normal, no GL/VG lock was orphaned. 
*/ + log_debug("Did not adopt %s lock in vg %s error %d", + rt_str(act->rt), act->vg_name, act->result); + } + + count_adopt_done++; + free_action(act); + + } else if (act->result < 0) { + /* + * Some unexpected error. + */ + + log_error("adopt lock rt %s vg %s lv %s error %d", + rt_str(act->rt), act->vg_name, act->lv_uuid, act->result); + count_adopt_fail++; + count_adopt_done++; + free_action(act); + + } else { + /* + * Adopt success. + */ + + if (act->rt == LD_RT_LV) { + log_debug("adopt success lv %s %s %s", act->vg_name, act->lv_uuid, mode_str(act->mode)); + free_action(act); + } else if (act->rt == LD_RT_VG) { + log_debug("adopt success vg %s %s", act->vg_name, mode_str(act->mode)); + list_add_tail(&act->list, &to_unlock); + } else if (act->rt == LD_RT_GL) { + log_debug("adopt success gl %s %s", act->vg_name, mode_str(act->mode)); + list_add_tail(&act->list, &to_unlock); + } + count_adopt_done++; + } + } + + /* + * Release adopted GL/VG locks. + * The to_unlock actions were the ones used to lock-adopt the GL/VG locks; + * now use them to do the unlocks. These actions will again be placed + * on adopt_results for us to collect because they have the ADOPT flag set. + */ + + count_adopt = 0; + count_adopt_done = 0; + + list_for_each_entry_safe(act, asafe, &to_unlock, list) { + list_del(&act->list); + + if (act->mode == LD_LK_EX) { + /* + * FIXME: we probably want to check somehow that + * there's no lvm command still running that's + * using this ex lock and changing things. + */ + log_warn("adopt releasing ex %s lock %s", + rt_str(act->rt), act->vg_name); + } + + act->mode = LD_LK_UN; + + log_debug("adopt unlock for %s %s", rt_str(act->rt), act->vg_name); + + rv = add_lock_action(act); + if (rv < 0) { + log_error("adopt unlock add_lock_action error %d", rv); + free_action(act); + } else { + count_adopt++; + } + } + + /* Wait for the unlocks to complete. 
*/ + + while (count_adopt_done < count_adopt) { + sleep(1); + act = NULL; + + pthread_mutex_lock(&client_mutex); + if (!list_empty(&adopt_results)) { + act = list_first_entry(&adopt_results, struct action, list); + list_del(&act->list); + } + pthread_mutex_unlock(&client_mutex); + + if (!act) + continue; + + if (act->result < 0) + log_error("adopt unlock error %d", act->result); + + count_adopt_done++; + free_action(act); + } + + + /* FIXME: purge any remaining orphan locks in each rejoined ls? */ + + if (count_start_fail || count_adopt_fail) + goto fail; + + log_debug("adopt_locks done"); + return; + +fail: + log_error("adopt_locks failed, reset host"); +} + +static int get_peer_pid(int fd) +{ + struct ucred cred; + unsigned int len = sizeof(cred); + + if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cred, &len) != 0) + return -1; + + return cred.pid; +} + +static void process_listener(int poll_fd) +{ + struct client *cl; + int fd, pi; + + /* assert poll_fd == listen_fd */ + + fd = accept(listen_fd, NULL, NULL); + if (fd < 0) + return; + + if (!(cl = alloc_client())) + return; + + pi = add_pollfd(fd); + if (pi < 0) { + log_error("process_listener add_pollfd error %d", pi); + free_client(cl); + return; + } + + cl->pi = pi; + cl->fd = fd; + cl->pid = get_peer_pid(fd); + + pthread_mutex_init(&cl->mutex, NULL); + + pthread_mutex_lock(&client_mutex); + client_ids++; + + if (client_ids == ADOPT_CLIENT_ID) + client_ids++; + if (!client_ids) + client_ids++; + + cl->id = client_ids; + list_add_tail(&cl->list, &client_list); + pthread_mutex_unlock(&client_mutex); + + log_debug("client add id %d pi %d fd %d", cl->id, cl->pi, cl->fd); +} + +/* + * main loop polls on pipe[0] so that a thread can + * restart the poll by writing to pipe[1]. 
+ */ +static int setup_restart(void) +{ + if (pipe(restart_fds)) { + log_error("setup_restart pipe error %d", errno); + return -1; + } + + restart_pi = add_pollfd(restart_fds[0]); + if (restart_pi < 0) + return restart_pi; + + return 0; +} + +/* + * thread wrote 'w' to restart_fds[1] to restart poll() + * after adding an fd back into pollfd. + */ +static void process_restart(int fd) +{ + char wake[1]; + int rv; + + /* assert fd == restart_fds[0] */ + + rv = read(restart_fds[0], wake, 1); + if (!rv || rv < 0) + log_debug("process_restart error %d", errno); +} + +static void sigterm_handler(int sig __attribute__((unused))) +{ + daemon_quit = 1; +} + +static int main_loop(daemon_state *ds_arg) +{ + struct client *cl; + int i, rv, is_recv, is_dead; + + signal(SIGTERM, &sigterm_handler); + + rv = setup_structs(); + if (rv < 0) { + log_error("Can't allocate memory"); + return rv; + } + + strcpy(gl_lsname_dlm, S_NAME_GL_DLM); + + INIT_LIST_HEAD(&lockspaces); + INIT_LIST_HEAD(&lockspaces_inactive); + pthread_mutex_init(&lockspaces_mutex, NULL); + pthread_mutex_init(&pollfd_mutex, NULL); + pthread_mutex_init(&log_mutex, NULL); + + openlog("lvmlockd", LOG_CONS | LOG_PID, LOG_DAEMON); + log_warn("lvmlockd started"); + + listen_fd = ds_arg->socket_fd; + listen_pi = add_pollfd(listen_fd); + + setup_client_thread(); + setup_worker_thread(); + setup_restart(); + + pthread_mutex_init(&lvmetad_mutex, NULL); + lvmetad_handle = lvmetad_open(NULL); + if (lvmetad_handle.error || lvmetad_handle.socket_fd < 0) + log_error("lvmetad_open error %d", lvmetad_handle.error); + else + lvmetad_connected = 1; + + /* + * Attempt to rejoin lockspaces and adopt locks from a previous + * instance of lvmlockd that left behind lockspaces/locks. + */ + if (adopt_opt) + adopt_locks(); + + while (1) { + rv = poll(pollfd, pollfd_maxi + 1, -1); + if (rv == -1 && errno == EINTR) { + if (daemon_quit) { + int count; + /* first sigterm would trigger stops, and + second sigterm may finish the joins. 
*/ + count = for_each_lockspace(DO_STOP, DO_FREE, NO_FORCE); + if (!count) + break; + log_debug("ignore shutdown for %d lockspaces", count); + daemon_quit = 0; + } + continue; + } + if (rv < 0) { + log_error("poll errno %d", errno); + break; + } + + for (i = 0; i <= pollfd_maxi; i++) { + if (pollfd[i].fd < 0) + continue; + + is_recv = 0; + is_dead = 0; + + if (pollfd[i].revents & POLLIN) + is_recv = 1; + if (pollfd[i].revents & (POLLERR | POLLHUP | POLLNVAL)) + is_dead = 1; + + if (!is_recv && !is_dead) + continue; + + if (i == listen_pi) { + process_listener(pollfd[i].fd); + continue; + } + + if (i == restart_pi) { + process_restart(pollfd[i].fd); + continue; + } + + /* + log_debug("poll pi %d fd %d revents %x", + i, pollfd[i].fd, pollfd[i].revents); + */ + + pthread_mutex_lock(&client_mutex); + cl = find_client_pi(i); + if (cl) { + pthread_mutex_lock(&cl->mutex); + + if (cl->recv) { + /* should not happen */ + log_error("main client %d already recv", cl->id); + + } else if (cl->dead) { + /* should not happen */ + log_error("main client %d already dead", cl->id); + + } else if (is_dead) { + log_debug("close %s[%d.%u] fd %d", + cl->name[0] ? 
cl->name : "client", + cl->pid, cl->id, cl->fd); + cl->dead = 1; + cl->pi = -1; + cl->fd = -1; + cl->poll_ignore = 0; + close(pollfd[i].fd); + pollfd[i].fd = POLL_FD_UNUSED; + pollfd[i].events = 0; + pollfd[i].revents = 0; + + } else if (is_recv) { + cl->recv = 1; + cl->poll_ignore = 1; + pollfd[i].fd = POLL_FD_IGNORE; + pollfd[i].events = 0; + pollfd[i].revents = 0; + } + + pthread_mutex_unlock(&cl->mutex); + + client_work = 1; + pthread_cond_signal(&client_cond); + + /* client_thread will pick up and work on any + client with cl->recv or cl->dead set */ + + } else { + /* don't think this can happen */ + log_error("no client for index %d fd %d", + i, pollfd[i].fd); + close(pollfd[i].fd); + pollfd[i].fd = POLL_FD_UNUSED; + pollfd[i].events = 0; + pollfd[i].revents = 0; + } + pthread_mutex_unlock(&client_mutex); + + /* After set_dead, should we scan pollfd for + last unused slot and reduce pollfd_maxi? */ + } + } + + for_each_lockspace_retry(DO_STOP, DO_FREE, DO_FORCE); + free_lockspaces_inactive(); + close_worker_thread(); + close_client_thread(); + closelog(); + daemon_close(lvmetad_handle); + return 0; +} + +static void usage(char *prog, FILE *file) +{ + fprintf(file, "Usage:\n"); + fprintf(file, "%s [options]\n\n", prog); + fprintf(file, " --help | -h\n"); + fprintf(file, " Show this help information.\n"); + fprintf(file, " --version | -V\n"); + fprintf(file, " Show version of lvmlockd.\n"); + fprintf(file, " --test | -T\n"); + fprintf(file, " Test mode, do not call lock manager.\n"); + fprintf(file, " --foreground | -f\n"); + fprintf(file, " Don't fork.\n"); + fprintf(file, " --daemon-debug | -D\n"); + fprintf(file, " Don't fork and print debugging to stdout.\n"); + fprintf(file, " --pid-file | -p <path>\n"); + fprintf(file, " Set path to the pid file. [%s]\n", LVMLOCKD_PIDFILE); + fprintf(file, " --socket-path | -s <path>\n"); + fprintf(file, " Set path to the socket to listen on. 
[%s]\n", LVMLOCKD_SOCKET); + fprintf(file, " --syslog-priority | -S err|warning|debug\n"); + fprintf(file, " Write log messages from this level up to syslog. [%s]\n", _syslog_num_to_name(LOG_SYSLOG_PRIO)); + fprintf(file, " --gl-type | -g <str>\n"); + fprintf(file, " Set global lock type to be dlm|sanlock.\n"); + fprintf(file, " --host-id | -i <num>\n"); + fprintf(file, " Set the local sanlock host id.\n"); + fprintf(file, " --host-id-file | -F <path>\n"); + fprintf(file, " A file containing the local sanlock host_id.\n"); + fprintf(file, " --adopt | -A 0|1\n"); + fprintf(file, " Adopt locks from a previous instance of lvmlockd.\n"); +} + +int main(int argc, char *argv[]) +{ + daemon_state ds; + + ds.daemon_main = main_loop; + ds.daemon_init = NULL; + ds.daemon_fini = NULL; + ds.pidfile = getenv("LVM_LVMLOCKD_PIDFILE"); + ds.socket_path = getenv("LVM_LVMLOCKD_SOCKET"); + ds.protocol = lvmlockd_protocol; + ds.protocol_version = lvmlockd_protocol_version; + ds.name = "lvmlockd"; + + static struct option long_options[] = { + {"help", no_argument, 0, 'h' }, + {"version", no_argument, 0, 'V' }, + {"test", no_argument, 0, 'T' }, + {"foreground", no_argument, 0, 'f' }, + {"daemon-debug", no_argument, 0, 'D' }, + {"pid-file", required_argument, 0, 'p' }, + {"socket-path", required_argument, 0, 's' }, + {"gl-type", required_argument, 0, 'g' }, + {"host-id", required_argument, 0, 'i' }, + {"host-id-file", required_argument, 0, 'F' }, + {"adopt", required_argument, 0, 'A' }, + {"syslog-priority", required_argument, 0, 'S' }, + {0, 0, 0, 0 } + }; + + while (1) { + int c; + int lm; + int option_index = 0; + + c = getopt_long(argc, argv, "hVTfDp:s:l:g:S:I:A:", + long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case '0': + break; + case 'h': + usage(argv[0], stdout); + exit(EXIT_SUCCESS); + case 'V': + printf("lvmlockd version: " LVM_VERSION "\n"); + exit(EXIT_SUCCESS); + case 'T': + daemon_test = 1; + break; + case 'f': + ds.foreground = 1; + break; + 
case 'D': + ds.foreground = 1; + daemon_debug = 1; + break; + case 'p': + ds.pidfile = strdup(optarg); + break; + case 's': + ds.socket_path = strdup(optarg); + break; + case 'g': + lm = str_to_lm(optarg); + if (lm == LD_LM_DLM) + gl_use_dlm = 1; + else if (lm == LD_LM_SANLOCK) + gl_use_sanlock = 1; + else { + fprintf(stderr, "invalid gl-type option"); + exit(EXIT_FAILURE); + } + break; + case 'i': + daemon_host_id = atoi(optarg); + break; + case 'F': + daemon_host_id_file = strdup(optarg); + break; + case 'A': + adopt_opt = atoi(optarg); + break; + case 'S': + syslog_priority = _syslog_name_to_num(optarg); + break; + case '?': + default: + usage(argv[0], stdout); + exit(EXIT_FAILURE); + } + } + + if (!ds.pidfile) + ds.pidfile = LVMLOCKD_PIDFILE; + + if (!ds.socket_path) + ds.socket_path = LVMLOCKD_SOCKET; + + /* runs daemon_main/main_loop */ + daemon_start(ds); + + return 0; +} diff --git a/daemons/lvmlockd/lvmlockd-dlm.c b/daemons/lvmlockd/lvmlockd-dlm.c new file mode 100644 index 000000000..96b5b2be0 --- /dev/null +++ b/daemons/lvmlockd/lvmlockd-dlm.c @@ -0,0 +1,666 @@ +/* + * Copyright (C) 2014 Red Hat, Inc. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU Lesser General Public License v.2.1. 
+ */ + +#define _XOPEN_SOURCE 500 /* pthread */ +#define _ISOC99_SOURCE +#define _GNU_SOURCE + +#include <assert.h> +#include <pthread.h> +#include <stdint.h> +#include <stddef.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdio.h> +#include <poll.h> +#include <errno.h> +#include <string.h> +#include <endian.h> +#include <fcntl.h> +#include <byteswap.h> +#include <syslog.h> +#include <dirent.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/socket.h> + +#include "configure.h" +#include "daemon-server.h" +#include "daemon-log.h" +#include "xlate.h" + +#include "lvmlockd-internal.h" +#include "lvmlockd-client.h" + +/* + * Using synchronous _wait dlm apis so do not define _REENTRANT and + * link with non-threaded version of library, libdlm_lt. + */ +#include "libdlm.h" + +struct lm_dlm { + dlm_lshandle_t *dh; +}; + +struct rd_dlm { + struct dlm_lksb lksb; + struct val_blk *vb; +}; + +int lm_data_size_dlm(void) +{ + return sizeof(struct rd_dlm); +} + +/* + * lock_args format + * + * vg_lock_args format for dlm is + * vg_version_string:undefined:cluster_name + * + * lv_lock_args are not used for dlm + * + * version_string is MAJOR.MINOR.PATCH + * undefined may contain ":" + */ + +#define VG_LOCK_ARGS_MAJOR 1 +#define VG_LOCK_ARGS_MINOR 0 +#define VG_LOCK_ARGS_PATCH 0 + +static int cluster_name_from_args(char *vg_args, char *clustername) +{ + return last_string_from_args(vg_args, clustername); +} + +static int check_args_version(char *vg_args) +{ + unsigned int major = 0; + int rv; + + rv = version_from_args(vg_args, &major, NULL, NULL); + if (rv < 0) { + log_error("check_args_version %s error %d", vg_args, rv); + return rv; + } + + if (major > VG_LOCK_ARGS_MAJOR) { + log_error("check_args_version %s major %d %d", vg_args, major, VG_LOCK_ARGS_MAJOR); + return -1; + } + + return 0; +} + +/* This will be set after dlm_controld is started. 
*/ +#define DLM_CLUSTER_NAME_PATH "/sys/kernel/config/dlm/cluster/cluster_name" + +static int read_cluster_name(char *clustername) +{ + char *n; + int fd; + int rv; + + if (daemon_test) { + sprintf(clustername, "%s", "test"); + return 0; + } + + fd = open(DLM_CLUSTER_NAME_PATH, O_RDONLY); + if (fd < 0) { + log_error("read_cluster_name: open error %d, check dlm_controld", fd); + return fd; + } + + rv = read(fd, clustername, MAX_ARGS - 1); + if (rv < 0) { + log_error("read_cluster_name: cluster name read error %d, check dlm_controld", fd); + close(fd); + return rv; + } + + n = strstr(clustername, "\n"); + if (n) + *n = '\0'; + close(fd); + return 0; +} + +int lm_init_vg_dlm(char *ls_name, char *vg_name, uint32_t flags, char *vg_args) +{ + char clustername[MAX_ARGS]; + char lock_args_version[MAX_ARGS]; + int rv; + + memset(clustername, 0, sizeof(clustername)); + memset(lock_args_version, 0, sizeof(lock_args_version)); + + snprintf(lock_args_version, MAX_ARGS, "%u.%u.%u", + VG_LOCK_ARGS_MAJOR, VG_LOCK_ARGS_MINOR, VG_LOCK_ARGS_PATCH); + + rv = read_cluster_name(clustername); + if (rv < 0) + return -EMANAGER; + + if (strlen(clustername) + strlen(lock_args_version) + 2 > MAX_ARGS) { + log_error("init_vg_dlm args too long"); + return -EARGS; + } + + snprintf(vg_args, MAX_ARGS, "%s:%s", lock_args_version, clustername); + rv = 0; + + log_debug("init_vg_dlm done %s vg_args %s", ls_name, vg_args); + return rv; +} + +int lm_prepare_lockspace_dlm(struct lockspace *ls) +{ + char sys_clustername[MAX_ARGS]; + char arg_clustername[MAX_ARGS]; + struct lm_dlm *lmd; + int rv; + + memset(sys_clustername, 0, sizeof(sys_clustername)); + memset(arg_clustername, 0, sizeof(arg_clustername)); + + rv = read_cluster_name(sys_clustername); + if (rv < 0) + return -EMANAGER; + + if (!ls->vg_args[0]) { + /* global lockspace has no vg args */ + goto skip_args; + } + + rv = check_args_version(ls->vg_args); + if (rv < 0) + return -EARGS; + + rv = cluster_name_from_args(ls->vg_args, arg_clustername); + 
if (rv < 0) { + log_error("prepare_lockspace_dlm %s no cluster name from args %s", ls->name, ls->vg_args); + return -EARGS; + } + + if (strcmp(sys_clustername, arg_clustername)) { + log_error("prepare_lockspace_dlm %s mismatching cluster names sys %s arg %s", + ls->name, sys_clustername, arg_clustername); + return -EARGS; + } + + skip_args: + lmd = malloc(sizeof(struct lm_dlm)); + if (!lmd) + return -ENOMEM; + + ls->lm_data = lmd; + return 0; +} + +int lm_add_lockspace_dlm(struct lockspace *ls, int adopt) +{ + struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data; + + if (daemon_test) + return 0; + + if (adopt) + lmd->dh = dlm_open_lockspace(ls->name); + else + lmd->dh = dlm_new_lockspace(ls->name, 0600, DLM_LSFL_NEWEXCL); + + if (!lmd->dh) { + log_error("add_lockspace_dlm %s adopt %d error", ls->name, adopt); + free(lmd); + ls->lm_data = NULL; + return -1; + } + + return 0; +} + +int lm_rem_lockspace_dlm(struct lockspace *ls, int free_vg) +{ + struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data; + int rv; + + if (daemon_test) + goto out; + + /* + * If free_vg is set, it means we are doing vgremove, and we may want + * to tell any other nodes to leave the lockspace. This is not really + * necessary since there should be no harm in having an unused + * lockspace sitting around. A new "notification lock" would need to + * be added with a callback to signal this. 
+ */ + + rv = dlm_release_lockspace(ls->name, lmd->dh, 1); + if (rv < 0) { + log_error("rem_lockspace_dlm error %d", rv); + return rv; + } + out: + free(lmd); + ls->lm_data = NULL; + + if (!strcmp(ls->name, gl_lsname_dlm)) { + gl_running_dlm = 0; + gl_auto_dlm = 0; + } + + return 0; +} + +static int lm_add_resource_dlm(struct lockspace *ls, struct resource *r, int with_lock_nl) +{ + struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data; + struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data; + uint32_t flags = 0; + char *buf; + int rv; + + if (r->type == LD_RT_GL || r->type == LD_RT_VG) { + buf = malloc(sizeof(struct val_blk) + DLM_LVB_LEN); + if (!buf) + return -ENOMEM; + memset(buf, 0, sizeof(struct val_blk) + DLM_LVB_LEN); + + rdd->vb = (struct val_blk *)buf; + rdd->lksb.sb_lvbptr = buf + sizeof(struct val_blk); + + flags |= LKF_VALBLK; + } + + if (!with_lock_nl) + goto out; + + /* because this is a new NL lock request */ + flags |= LKF_EXPEDITE; + + if (daemon_test) + goto out; + + rv = dlm_ls_lock_wait(lmd->dh, LKM_NLMODE, &rdd->lksb, flags, + r->name, strlen(r->name), + 0, NULL, NULL, NULL); + if (rv < 0) { + log_error("S %s R %s add_resource_dlm lock error %d", ls->name, r->name, rv); + return rv; + } + out: + return 0; +} + +int lm_rem_resource_dlm(struct lockspace *ls, struct resource *r) +{ + struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data; + struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data; + struct dlm_lksb *lksb; + int rv = 0; + + if (daemon_test) + goto out; + + lksb = &rdd->lksb; + + if (!lksb->sb_lkid) + goto out; + + rv = dlm_ls_unlock_wait(lmd->dh, lksb->sb_lkid, 0, lksb); + if (rv < 0) { + log_error("S %s R %s rem_resource_dlm unlock error %d", ls->name, r->name, rv); + } + out: + if (rdd->vb) + free(rdd->vb); + + memset(rdd, 0, sizeof(struct rd_dlm)); + r->lm_init = 0; + return rv; +} + +static int to_dlm_mode(int ld_mode) +{ + switch (ld_mode) { + case LD_LK_EX: + return LKM_EXMODE; + case LD_LK_SH: + return LKM_PRMODE; + }; + return -1; +} + +static 
int lm_adopt_dlm(struct lockspace *ls, struct resource *r, int ld_mode, + uint32_t *r_version) +{ + struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data; + struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data; + struct dlm_lksb *lksb; + uint32_t flags = 0; + int mode; + int rv; + + *r_version = 0; + + if (!r->lm_init) { + rv = lm_add_resource_dlm(ls, r, 0); + if (rv < 0) + return rv; + r->lm_init = 1; + } + + lksb = &rdd->lksb; + + flags |= LKF_PERSISTENT; + flags |= LKF_ORPHAN; + + if (rdd->vb) + flags |= LKF_VALBLK; + + mode = to_dlm_mode(ld_mode); + if (mode < 0) { + log_error("adopt_dlm invalid mode %d", ld_mode); + rv = -EINVAL; + goto fail; + } + + log_debug("S %s R %s adopt_dlm", ls->name, r->name); + + if (daemon_test) + return 0; + + /* + * dlm returns 0 for success, -EAGAIN if an orphan is + * found with another mode, and -ENOENT if no orphan. + * + * cast/bast/param are (void *)1 because the kernel + * returns errors if some are null. + */ + + rv = dlm_ls_lockx(lmd->dh, mode, lksb, flags, + r->name, strlen(r->name), 0, + (void *)1, (void *)1, (void *)1, + NULL, NULL); + + if (rv == -EAGAIN) { + log_debug("S %s R %s adopt_dlm adopt mode %d try other mode", + ls->name, r->name, ld_mode); + rv = -EUCLEAN; + goto fail; + } + if (rv < 0) { + log_debug("S %s R %s adopt_dlm mode %d flags %x error %d errno %d", + ls->name, r->name, mode, flags, rv, errno); + goto fail; + } + + /* + * FIXME: For GL/VG locks we probably want to read the lvb, + * especially if adopting an ex lock, because when we + * release this adopted ex lock we may want to write new + * lvb values based on the current lvb values (at lease + * in the GL case where we increment the current values.) + * + * It should be possible to read the lvb by requesting + * this lock in the same mode it's already in. 
+ */ + + return rv; + + fail: + lm_rem_resource_dlm(ls, r); + return rv; +} + +/* + * Use PERSISTENT so that if lvmlockd exits while holding locks, + * the locks will remain orphaned in the dlm, still protecting what + * they were acquired to protect. + */ + +int lm_lock_dlm(struct lockspace *ls, struct resource *r, int ld_mode, + uint32_t *r_version, int adopt) +{ + struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data; + struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data; + struct dlm_lksb *lksb; + struct val_blk vb; + uint32_t flags = 0; + uint16_t vb_version; + int mode; + int rv; + + if (adopt) { + /* When adopting, we don't follow the normal method + of acquiring a NL lock then converting it to the + desired mode. */ + return lm_adopt_dlm(ls, r, ld_mode, r_version); + } + + if (!r->lm_init) { + rv = lm_add_resource_dlm(ls, r, 1); + if (rv < 0) + return rv; + r->lm_init = 1; + } + + lksb = &rdd->lksb; + + flags |= LKF_CONVERT; + flags |= LKF_NOQUEUE; + flags |= LKF_PERSISTENT; + + if (rdd->vb) + flags |= LKF_VALBLK; + + mode = to_dlm_mode(ld_mode); + if (mode < 0) { + log_error("lock_dlm invalid mode %d", ld_mode); + return -EINVAL; + } + + log_debug("S %s R %s lock_dlm", ls->name, r->name); + + if (daemon_test) { + *r_version = 0; + return 0; + } + + rv = dlm_ls_lock_wait(lmd->dh, mode, lksb, flags, + r->name, strlen(r->name), + 0, NULL, NULL, NULL); + if (rv == -EAGAIN) { + log_error("S %s R %s lock_dlm mode %d rv EAGAIN", ls->name, r->name, mode); + return -EAGAIN; + } + if (rv < 0) { + log_error("S %s R %s lock_dlm error %d", ls->name, r->name, rv); + return rv; + } + + if (rdd->vb) { + if (lksb->sb_flags & DLM_SBF_VALNOTVALID) { + log_debug("S %s R %s lock_dlm VALNOTVALID", ls->name, r->name); + memset(rdd->vb, 0, sizeof(struct val_blk)); + *r_version = 0; + goto out; + } + + memcpy(&vb, lksb->sb_lvbptr, sizeof(struct val_blk)); + vb_version = le16_to_cpu(vb.version); + + if (vb_version && ((vb_version & 0xFF00) > (VAL_BLK_VERSION & 0xFF00))) { + log_error("S 
%s R %s lock_dlm ignore vb_version %x", + ls->name, r->name, vb_version); + *r_version = 0; + free(rdd->vb); + rdd->vb = NULL; + lksb->sb_lvbptr = NULL; + goto out; + } + + *r_version = le32_to_cpu(vb.r_version); + memcpy(rdd->vb, &vb, sizeof(vb)); /* rdd->vb saved as le */ + + log_debug("S %s R %s lock_dlm get r_version %u", + ls->name, r->name, *r_version); + } +out: + return 0; +} + +int lm_convert_dlm(struct lockspace *ls, struct resource *r, + int ld_mode, uint32_t r_version) +{ + struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data; + struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data; + struct dlm_lksb *lksb = &rdd->lksb; + uint32_t mode; + uint32_t flags = 0; + int rv; + + log_debug("S %s R %s convert_dlm", ls->name, r->name); + + flags |= LKF_CONVERT; + flags |= LKF_NOQUEUE; + flags |= LKF_PERSISTENT; + + if (rdd->vb && r_version && (r->mode == LD_LK_EX)) { + if (!rdd->vb->version) { + /* first time vb has been written */ + rdd->vb->version = cpu_to_le16(VAL_BLK_VERSION); + } + rdd->vb->r_version = cpu_to_le32(r_version); + memcpy(lksb->sb_lvbptr, rdd->vb, sizeof(struct val_blk)); + + log_debug("S %s R %s convert_dlm set r_version %u", + ls->name, r->name, r_version); + + flags |= LKF_VALBLK; + } + + mode = to_dlm_mode(ld_mode); + + if (daemon_test) + return 0; + + rv = dlm_ls_lock_wait(lmd->dh, mode, lksb, flags, + r->name, strlen(r->name), + 0, NULL, NULL, NULL); + if (rv == -EAGAIN) { + /* FIXME: When does this happen? Should something different be done? 
*/ + log_error("S %s R %s convert_dlm mode %d rv EAGAIN", ls->name, r->name, mode); + return -EAGAIN; + } + if (rv < 0) { + log_error("S %s R %s convert_dlm error %d", ls->name, r->name, rv); + } + return rv; +} + +int lm_unlock_dlm(struct lockspace *ls, struct resource *r, + uint32_t r_version, uint32_t lmuf_flags) +{ + struct lm_dlm *lmd = (struct lm_dlm *)ls->lm_data; + struct rd_dlm *rdd = (struct rd_dlm *)r->lm_data; + struct dlm_lksb *lksb = &rdd->lksb; + uint32_t flags = 0; + int rv; + + log_debug("S %s R %s unlock_dlm r_version %u flags %x", + ls->name, r->name, r_version, lmuf_flags); + + /* + * Do not set PERSISTENT, because we don't need an orphan + * NL lock to protect anything. + */ + + flags |= LKF_CONVERT; + + if (rdd->vb && r_version && (r->mode == LD_LK_EX)) { + if (!rdd->vb->version) { + /* first time vb has been written */ + rdd->vb->version = cpu_to_le16(VAL_BLK_VERSION); + } + if (r_version) + rdd->vb->r_version = cpu_to_le32(r_version); + memcpy(lksb->sb_lvbptr, rdd->vb, sizeof(struct val_blk)); + + log_debug("S %s R %s unlock_dlm set r_version %u", + ls->name, r->name, r_version); + + flags |= LKF_VALBLK; + } + + if (daemon_test) + return 0; + + rv = dlm_ls_lock_wait(lmd->dh, LKM_NLMODE, lksb, flags, + r->name, strlen(r->name), + 0, NULL, NULL, NULL); + if (rv < 0) { + log_error("S %s R %s unlock_dlm error %d", ls->name, r->name, rv); + } + + return rv; +} + +/* + * This list could be read from dlm_controld via libdlmcontrol, + * but it's simpler to get it from sysfs. 
+ */ + +#define DLM_LOCKSPACES_PATH "/sys/kernel/config/dlm/cluster/spaces" + +int lm_get_lockspaces_dlm(struct list_head *ls_rejoin) +{ + struct lockspace *ls; + struct dirent *de; + DIR *ls_dir; + + if (!(ls_dir = opendir(DLM_LOCKSPACES_PATH))) + return -ECONNREFUSED; + + while ((de = readdir(ls_dir))) { + if (de->d_name[0] == '.') + continue; + + if (strncmp(de->d_name, LVM_LS_PREFIX, strlen(LVM_LS_PREFIX))) + continue; + + if (!(ls = alloc_lockspace())) { + closedir(ls_dir); + return -ENOMEM; + } + + ls->lm_type = LD_LM_DLM; + strncpy(ls->name, de->d_name, MAX_NAME); + strncpy(ls->vg_name, ls->name + strlen(LVM_LS_PREFIX), MAX_NAME); + list_add_tail(&ls->list, ls_rejoin); + } + + closedir(ls_dir); + return 0; +} + +int lm_is_running_dlm(void) +{ + char sys_clustername[MAX_ARGS]; + int rv; + + memset(sys_clustername, 0, sizeof(sys_clustername)); + + rv = read_cluster_name(sys_clustername); + if (rv < 0) + return 0; + return 1; +} diff --git a/daemons/lvmlockd/lvmlockd-internal.h b/daemons/lvmlockd/lvmlockd-internal.h new file mode 100644 index 000000000..b6f4056f6 --- /dev/null +++ b/daemons/lvmlockd/lvmlockd-internal.h @@ -0,0 +1,371 @@ +/* + * Copyright (C) 2014 Red Hat, Inc. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU Lesser General Public License v.2.1. 
+ */ + +#ifndef _LVM_LVMLOCKD_INTERNAL_H +#define _LVM_LVMLOCKD_INTERNAL_H + +#define MAX_NAME 64 +#define MAX_ARGS 64 + +#define R_NAME_GL_DISABLED "_GLLK_disabled" +#define R_NAME_GL "GLLK" +#define R_NAME_VG "VGLK" +#define S_NAME_GL_DLM "lvm_global" +#define LVM_LS_PREFIX "lvm_" /* ls name is prefix + vg_name */ +/* global lockspace name for sanlock is a vg name */ + +/* lock manager types */ +enum { + LD_LM_NONE = 0, + LD_LM_UNUSED = 1, /* place holder so values match lib/locking/lvmlockd.h */ + LD_LM_DLM = 2, + LD_LM_SANLOCK = 3, +}; + +/* operation types */ +enum { + LD_OP_HELLO = 1, + LD_OP_QUIT, + LD_OP_INIT, + LD_OP_FREE, + LD_OP_START, + LD_OP_STOP, + LD_OP_LOCK, + LD_OP_UPDATE, + LD_OP_CLOSE, + LD_OP_ENABLE, + LD_OP_DISABLE, + LD_OP_START_WAIT, + LD_OP_STOP_ALL, + LD_OP_DUMP_INFO, + LD_OP_DUMP_LOG, + LD_OP_RENAME_BEFORE, + LD_OP_RENAME_FINAL, + LD_OP_RUNNING_LM, + LD_OP_FIND_FREE_LOCK, +}; + +/* resource types */ +enum { + LD_RT_GL = 1, + LD_RT_VG, + LD_RT_LV, +}; + +/* lock modes, more restrictive must be larger value */ +enum { + LD_LK_IV = -1, + LD_LK_UN = 0, + LD_LK_NL = 1, + LD_LK_SH = 2, + LD_LK_EX = 3, +}; + +struct list_head { + struct list_head *next, *prev; +}; + +struct client { + struct list_head list; + pthread_mutex_t mutex; + int pid; + int fd; + int pi; + uint32_t id; + unsigned int recv : 1; + unsigned int dead : 1; + unsigned int poll_ignore : 1; + char name[MAX_NAME+1]; +}; + +#define LD_AF_PERSISTENT 0x00000001 +#define LD_AF_UNUSED 0x00000002 /* use me */ +#define LD_AF_UNLOCK_CANCEL 0x00000004 +#define LD_AF_NEXT_VERSION 0x00000008 +#define LD_AF_WAIT 0x00000010 +#define LD_AF_FORCE 0x00000020 +#define LD_AF_EX_DISABLE 0x00000040 +#define LD_AF_ENABLE 0x00000080 +#define LD_AF_DISABLE 0x00000100 +#define LD_AF_SEARCH_LS 0x00000200 +#define LD_AF_WAIT_STARTING 0x00001000 +#define LD_AF_DUP_GL_LS 0x00002000 +#define LD_AF_INACTIVE_LS 0x00004000 +#define LD_AF_ADD_LS_ERROR 0x00008000 +#define LD_AF_ADOPT 0x00010000 + +/* + * Number of 
times to repeat a lock request after + * a lock conflict (-EAGAIN) if unspecified in the + * request. + */ +#define DEFAULT_MAX_RETRIES 4 + +struct action { + struct list_head list; + uint32_t client_id; + uint32_t flags; /* LD_AF_ */ + uint32_t version; + uint64_t host_id; + int8_t op; /* operation type LD_OP_ */ + int8_t rt; /* resource type LD_RT_ */ + int8_t mode; /* lock mode LD_LK_ */ + int8_t lm_type; /* lock manager: LM_DLM, LM_SANLOCK */ + int retries; + int max_retries; + int result; + int lm_rv; /* return value from lm_ function */ + char vg_uuid[64]; + char vg_name[MAX_NAME+1]; + char lv_name[MAX_NAME+1]; + char lv_uuid[MAX_NAME+1]; + char vg_args[MAX_ARGS]; + char lv_args[MAX_ARGS]; + char vg_sysid[MAX_NAME+1]; +}; + +struct resource { + struct list_head list; /* lockspace.resources */ + char name[MAX_NAME+1]; /* vg name or lv name */ + int8_t type; /* resource type LD_RT_ */ + int8_t mode; + unsigned int sh_count; /* number of sh locks on locks list */ + uint32_t version; + unsigned int lm_init : 1; /* lm_data is initialized */ + unsigned int adopt : 1; /* temp flag in remove_inactive_lvs */ + unsigned int version_zero_valid : 1; + struct list_head locks; + struct list_head actions; + struct val_blk *vb; + char lv_args[MAX_ARGS]; + char lm_data[0]; /* lock manager specific data */ +}; + +#define LD_LF_PERSISTENT 0x00000001 + +struct lock { + struct list_head list; /* resource.locks */ + int8_t mode; /* lock mode LD_LK_ */ + uint32_t version; + uint32_t flags; /* LD_LF_ */ + uint32_t client_id; /* may be 0 for persistent or internal locks */ +}; + +struct lockspace { + struct list_head list; /* lockspaces */ + char name[MAX_NAME+1]; + char vg_name[MAX_NAME+1]; + char vg_uuid[64]; + char vg_args[MAX_ARGS]; /* lock manager specific args */ + char vg_sysid[MAX_NAME+1]; + int8_t lm_type; /* lock manager: LM_DLM, LM_SANLOCK */ + void *lm_data; + uint64_t host_id; + uint64_t free_lock_offset; /* start search for free lock here */ + + uint32_t 
start_client_id; /* client_id that started the lockspace */ + pthread_t thread; /* makes synchronous lock requests */ + pthread_cond_t cond; + pthread_mutex_t mutex; + unsigned int create_fail : 1; + unsigned int create_done : 1; + unsigned int thread_work : 1; + unsigned int thread_stop : 1; + unsigned int thread_done : 1; + unsigned int sanlock_gl_enabled: 1; + unsigned int sanlock_gl_dup: 1; + + struct list_head actions; /* new client actions */ + struct list_head resources; /* resource/lock state for gl/vg/lv */ +}; + +#define VAL_BLK_VERSION 0x0101 + +struct val_blk { + uint16_t version; + uint16_t flags; + uint32_t r_version; +}; + +/* lm_unlock flags */ +#define LMUF_FREE_VG 0x00000001 + +struct lockspace *alloc_lockspace(void); +int lockspaces_empty(void); +int last_string_from_args(char *args_in, char *last); +int version_from_args(char *args, unsigned int *major, unsigned int *minor, unsigned int *patch); + +int lm_init_vg_dlm(char *ls_name, char *vg_name, uint32_t flags, char *vg_args); +int lm_prepare_lockspace_dlm(struct lockspace *ls); +int lm_add_lockspace_dlm(struct lockspace *ls, int adopt); +int lm_rem_lockspace_dlm(struct lockspace *ls, int free_vg); +int lm_lock_dlm(struct lockspace *ls, struct resource *r, int ld_mode, + uint32_t *r_version, int adopt); +int lm_convert_dlm(struct lockspace *ls, struct resource *r, + int ld_mode, uint32_t r_version); +int lm_unlock_dlm(struct lockspace *ls, struct resource *r, + uint32_t r_version, uint32_t lmu_flags); +int lm_rem_resource_dlm(struct lockspace *ls, struct resource *r); +int lm_get_lockspaces_dlm(struct list_head *ls_rejoin); +int lm_data_size_dlm(void); +int lm_is_running_dlm(void); + +int lm_init_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_args); +int lm_init_lv_sanlock(char *ls_name, char *vg_name, char *lv_name, char *vg_args, char *lv_args, uint64_t free_offset); +int lm_free_lv_sanlock(struct lockspace *ls, struct resource *r); +int lm_rename_vg_sanlock(char *ls_name, 
char *vg_name, uint32_t flags, char *vg_args); +int lm_prepare_lockspace_sanlock(struct lockspace *ls); +int lm_add_lockspace_sanlock(struct lockspace *ls, int adopt); +int lm_rem_lockspace_sanlock(struct lockspace *ls, int free_vg); +int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode, + uint32_t *r_version, int *retry, int adopt); +int lm_convert_sanlock(struct lockspace *ls, struct resource *r, + int ld_mode, uint32_t r_version); +int lm_unlock_sanlock(struct lockspace *ls, struct resource *r, + uint32_t r_version, uint32_t lmu_flags); +int lm_able_gl_sanlock(struct lockspace *ls, int enable); +int lm_ex_disable_gl_sanlock(struct lockspace *ls); +int lm_hosts_sanlock(struct lockspace *ls, int notify); +int lm_rem_resource_sanlock(struct lockspace *ls, struct resource *r); +int lm_gl_is_enabled(struct lockspace *ls); +int lm_get_lockspaces_sanlock(struct list_head *ls_rejoin); +int lm_data_size_sanlock(void); +int lm_is_running_sanlock(void); +int lm_find_free_lock_sanlock(struct lockspace *ls, uint64_t *free_offset); + +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +static inline void INIT_LIST_HEAD(struct list_head *list) +{ + list->next = list; + list->prev = list; +} + +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static inline void __list_del(struct list_head *prev, struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} 
+ +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} + +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) + +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) + +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + + +/* to improve readability */ +#define WAIT 1 +#define NO_WAIT 0 +#define FORCE 1 +#define NO_FORCE 0 + +/* + * global variables + */ + +#ifndef EXTERN +#define EXTERN extern +#define INIT(X) +#else +#undef EXTERN +#define EXTERN +#define INIT(X) =X +#endif + +/* + * gl_type_static and gl_use_ are set by command line or config file + * to specify whether the global lock comes from dlm or sanlock. + * Without a static setting, lvmlockd will figure out where the + * global lock should be (but it could get mixed up in cases where + * both sanlock and dlm vgs exist.) + * + * gl_use_dlm means that the gl should come from lockspace gl_lsname_dlm + * gl_use_sanlock means that the gl should come from lockspace gl_lsname_sanlock + * + * gl_use_dlm has precedence over gl_use_sanlock, so if a node sees both + * dlm and sanlock vgs, it will use the dlm gl. + * + * gl_use_ is set when the first evidence of that lm_type is seen + * in any command. + * + * gl_lsname_sanlock is set when the first vg is seen in which an + * enabled gl is exists, or when init_vg creates a vg with gl enabled, + * or when enable_gl is used. + * + * gl_lsname_sanlock is cleared when free_vg deletes a vg with gl enabled + * or when disable_gl matches. 
+ */ + +EXTERN int gl_type_static; +EXTERN int gl_use_dlm; +EXTERN int gl_use_sanlock; +EXTERN pthread_mutex_t gl_type_mutex; + +EXTERN char gl_lsname_dlm[MAX_NAME+1]; +EXTERN char gl_lsname_sanlock[MAX_NAME+1]; + +EXTERN int gl_running_dlm; +EXTERN int gl_auto_dlm; + +EXTERN int daemon_test; /* run as much as possible without a live lock manager */ +EXTERN int daemon_debug; +EXTERN int daemon_host_id; +EXTERN const char *daemon_host_id_file; + +void log_level(int level, const char *fmt, ...) __attribute__((format(printf, 2, 3))); +#define log_debug(fmt, args...) log_level(LOG_DEBUG, fmt, ##args) +#define log_error(fmt, args...) log_level(LOG_ERR, fmt, ##args) +#define log_warn(fmt, args...) log_level(LOG_WARNING, fmt, ##args) + +#endif diff --git a/daemons/lvmlockd/lvmlockd-sanlock.c b/daemons/lvmlockd/lvmlockd-sanlock.c new file mode 100644 index 000000000..85a52c880 --- /dev/null +++ b/daemons/lvmlockd/lvmlockd-sanlock.c @@ -0,0 +1,1716 @@ +/* + * Copyright (C) 2014 Red Hat, Inc. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU Lesser General Public License v.2.1. 
+ */ + +#define _XOPEN_SOURCE 500 /* pthread */ +#define _ISOC99_SOURCE +#define _GNU_SOURCE + +#include <assert.h> +#include <pthread.h> +#include <stdint.h> +#include <stddef.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdio.h> +#include <poll.h> +#include <errno.h> +#include <string.h> +#include <syslog.h> +#include <sys/types.h> +#include <sys/socket.h> + +#include "configure.h" +#include "daemon-server.h" +#include "daemon-log.h" +#include "xlate.h" + +#include "lvmlockd-internal.h" +#include "lvmlockd-client.h" + +#include "sanlock.h" +#include "sanlock_rv.h" +#include "sanlock_admin.h" +#include "sanlock_resource.h" + +/* + * If access to the pv containing the vg's leases is lost, sanlock cannot renew + * the leases we have acquired for locked LVs. This means that we could soon + * loose the lease to another host which could activate our LV exclusively. We + * do not want to get to the point of two hosts having the same LV active + * exclusively (it obviously violates the purpose of LV locks.) + * + * The default method of preventing this problem is for lvmlockd to do nothing, + * which produces a safe but potentially inconvenient result. Doing nothing + * leads to our LV leases not being released, which leads to sanlock using the + * local watchdog to reset us before another host can acquire our lock. It + * would often be preferrable to avoid the abrupt hard reset from the watchdog. + * + * There are other options to avoid being reset by our watchdog. If we can + * quickly stop using the LVs in question and release the locks for them, then + * we could avoid a reset (there's a certain grace period of about 40 seconds + * in which we can attempt this.) To do this, we can tell sanlock to run a + * specific program when it has lost access to our leases. We could use this + * program to: + * + * 1. Deactivate all lvs in the effected vg. 
If all the leases are + * deactivated, then our LV locks would be released and sanlock would no longer + * use the watchdog to reset us. If file systems are mounted on the active + * lvs, then deactivating them would fail, so this option would be of limited + * usefulness. + * + * 2. Option 1 could be extended to kill pids using the fs on the lv, unmount + * the fs, and deactivate the lv. This is probably out of scope for lvm + * directly, and would likely need the help of another system service. + * + * 3. Use dmsetup suspend to block access to lvs in the effected vg. If this + * was successful, the local host could no longer write to the lvs, we could + * safely release the LV locks, and sanlock would no longer reset us. At this + * point, with suspended lvs, the host would be in a fairly hobbled state, and + * would almost certainly need a manual, forcible reset. + * + * 4. Option 3 could be extended to monitor the lost storage, and if it is + * reconnected, the leases could be reacquired, and the suspended lvs resumed + * (reacquiring leases will fail if another host has acquired them since they + * were released.) This complexity of this option, combined with the fact that + * the error conditions are often not as simple as storage being lost and then + * later connecting, will result in this option being too unreliable. + * + * Add a config option that we could use to select a different behavior than + * the default. Then implement one of the simpler options as a proof of + * concept, which could be extended if needed. + */ + +/* + * Each lockspace thread has its own sanlock daemon connection. + * If they shared one, sanlock acquire/release calls would be + * serialized. Some aspects of sanlock expect a single connection + * from each pid: signals due to a sanlock_request, and + * acquire/release/convert/inquire. 
The later can probably be + * addressed with a flag to indicate that the pid field should be + * interpretted as 'ci' (which the caller would need to figure + * out somehow.) + */ + +struct lm_sanlock { + struct sanlk_lockspace ss; + int align_size; + int sock; /* sanlock daemon connection */ +}; + +struct rd_sanlock { + union { + struct sanlk_resource rs; + char buf[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; + }; + struct val_blk *vb; +}; + +struct sanlk_resourced { + union { + struct sanlk_resource rs; + char buf[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; + }; +}; + +int lm_data_size_sanlock(void) +{ + return sizeof(struct rd_sanlock); +} + +/* + * lock_args format + * + * vg_lock_args format for sanlock is + * vg_version_string:undefined:lock_lv_name + * + * lv_lock_args format for sanlock is + * lv_version_string:undefined:offset + * + * version_string is MAJOR.MINOR.PATCH + * undefined may contain ":" + * + * If a new version of the lock_args string cannot be + * handled by an old version of lvmlockd, then the + * new lock_args string should contain a larger major number. + */ + +#define VG_LOCK_ARGS_MAJOR 1 +#define VG_LOCK_ARGS_MINOR 0 +#define VG_LOCK_ARGS_PATCH 0 + +#define LV_LOCK_ARGS_MAJOR 1 +#define LV_LOCK_ARGS_MINOR 0 +#define LV_LOCK_ARGS_PATCH 0 + +/* + * offset 0 is lockspace + * offset align_size * 1 is unused + * offset align_size * 2 is unused + * ... + * offset align_size * 64 is unused + * offset align_size * 65 is gl lock + * offset align_size * 66 is vg lock + * offset align_size * 67 is first lv lock + * offset align_size * 68 is second lv lock + * ... 
+ */ + +#define LS_BEGIN 0 +#define GL_LOCK_BEGIN 65 +#define VG_LOCK_BEGIN 66 +#define LV_LOCK_BEGIN 67 + +static int lock_lv_name_from_args(char *vg_args, char *lock_lv_name) +{ + return last_string_from_args(vg_args, lock_lv_name); +} + +static int lock_lv_offset_from_args(char *lv_args, uint64_t *lock_lv_offset) +{ + char offset_str[MAX_ARGS]; + int rv; + + memset(offset_str, 0, sizeof(offset_str)); + + rv = last_string_from_args(lv_args, offset_str); + if (rv < 0) + return rv; + + *lock_lv_offset = strtoull(offset_str, NULL, 10); + return 0; +} + +static int check_args_version(char *args, unsigned int our_major) +{ + unsigned int major = 0; + int rv; + + rv = version_from_args(args, &major, NULL, NULL); + if (rv < 0) { + log_error("check_args_version %s error %d", args, rv); + return rv; + } + + if (major > our_major) { + log_error("check_args_version %s major %u %u", args, major, our_major); + return -1; + } + + return 0; +} + +#define MAX_LINE 64 + +static int read_host_id_file(void) +{ + FILE *file; + char line[MAX_LINE]; + char key_str[MAX_LINE]; + char val_str[MAX_LINE]; + char *key, *val, *sep; + int host_id = 0; + + file = fopen(daemon_host_id_file, "r"); + if (!file) + goto out; + + while (fgets(line, MAX_LINE, file)) { + if (line[0] == '#' || line[0] == '\n') + continue; + + key = line; + sep = strstr(line, "="); + val = sep + 1; + + if (!sep || !val) + continue; + + *sep = '\0'; + memset(key_str, 0, sizeof(key_str)); + memset(val_str, 0, sizeof(val_str)); + sscanf(key, "%s", key_str); + sscanf(val, "%s", val_str); + + if (!strcmp(key_str, "host_id")) { + host_id = atoi(val_str); + break; + } + } + fclose(file); +out: + log_debug("host_id %d from %s", host_id, daemon_host_id_file); + return host_id; +} + +/* + * vgcreate + * + * For init_vg, vgcreate passes the internal lv name as vg_args. + * This constructs the full/proper vg_args format, containing the + * version and lv name, and returns the real lock_args in vg_args. 
+ */ + +int lm_init_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_args) +{ + struct sanlk_lockspace ss; + struct sanlk_resourced rd; + struct sanlk_disk disk; + char lock_lv_name[MAX_ARGS]; + char lock_args_version[MAX_ARGS]; + const char *gl_name = NULL; + uint32_t daemon_version; + uint32_t daemon_proto; + uint64_t offset; + int align_size; + int i, rv; + + memset(&ss, 0, sizeof(ss)); + memset(&rd, 0, sizeof(rd)); + memset(&disk, 0, sizeof(disk)); + memset(lock_lv_name, 0, sizeof(lock_lv_name)); + memset(lock_args_version, 0, sizeof(lock_args_version)); + + if (!vg_args || !vg_args[0] || !strcmp(vg_args, "none")) { + log_error("S %s init_vg_san vg_args missing", ls_name); + return -EARGS; + } + + snprintf(lock_args_version, MAX_ARGS, "%u.%u.%u", + VG_LOCK_ARGS_MAJOR, VG_LOCK_ARGS_MINOR, VG_LOCK_ARGS_PATCH); + + /* see comment above about input vg_args being only lock_lv_name */ + snprintf(lock_lv_name, MAX_ARGS, "%s", vg_args); + + if (strlen(lock_lv_name) + strlen(lock_args_version) + 2 > MAX_ARGS) + return -EARGS; + + snprintf(disk.path, SANLK_PATH_LEN, "/dev/mapper/%s-%s", vg_name, lock_lv_name); + + log_debug("S %s init_vg_san path %s", ls_name, disk.path); + + if (daemon_test) { + if (!gl_lsname_sanlock[0]) + strncpy(gl_lsname_sanlock, ls_name, MAX_NAME); + return 0; + } + + rv = sanlock_version(0, &daemon_version, &daemon_proto); + if (rv < 0) { + log_error("S %s init_vg_san failed to connect to sanlock daemon", ls_name); + return -EMANAGER; + } + + log_debug("sanlock daemon version %08x proto %08x", + daemon_version, daemon_proto); + + align_size = sanlock_align(&disk); + if (align_size <= 0) { + log_error("S %s init_vg_san bad disk align size %d %s", + ls_name, align_size, disk.path); + return -EARGS; + } + + strncpy(ss.name, ls_name, SANLK_NAME_LEN); + memcpy(ss.host_id_disk.path, disk.path, SANLK_PATH_LEN); + ss.host_id_disk.offset = LS_BEGIN * align_size; + + rv = sanlock_write_lockspace(&ss, 0, 0, 0); + if (rv < 0) { + log_error("S 
%s init_vg_san write_lockspace error %d %s", + ls_name, rv, ss.host_id_disk.path); + return rv; + } + + /* + * We want to create the global lock in the first sanlock vg. + * If other sanlock vgs exist, then one of them must contain + * the gl. If gl_lsname_sanlock is not set, then perhaps + * the sanlock vg with the gl has been removed or has not yet + * been seen. (Would vgcreate get this far in that case?) + * If dlm vgs exist, then we choose to use the dlm gl and + * not a sanlock gl. + */ + + if (flags & LD_AF_ENABLE) + gl_name = R_NAME_GL; + else if (flags & LD_AF_DISABLE) + gl_name = R_NAME_GL_DISABLED; + else if (!gl_use_sanlock || gl_lsname_sanlock[0] || !lockspaces_empty()) + gl_name = R_NAME_GL_DISABLED; + else + gl_name = R_NAME_GL; + + memcpy(rd.rs.lockspace_name, ss.name, SANLK_NAME_LEN); + strncpy(rd.rs.name, gl_name, SANLK_NAME_LEN); + memcpy(rd.rs.disks[0].path, disk.path, SANLK_PATH_LEN); + rd.rs.disks[0].offset = align_size * GL_LOCK_BEGIN; + rd.rs.num_disks = 1; + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv < 0) { + log_error("S %s init_vg_san write_resource gl error %d %s", + ls_name, rv, rd.rs.disks[0].path); + return rv; + } + + memcpy(rd.rs.lockspace_name, ss.name, SANLK_NAME_LEN); + strncpy(rd.rs.name, R_NAME_VG, SANLK_NAME_LEN); + memcpy(rd.rs.disks[0].path, disk.path, SANLK_PATH_LEN); + rd.rs.disks[0].offset = align_size * VG_LOCK_BEGIN; + rd.rs.num_disks = 1; + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv < 0) { + log_error("S %s init_vg_san write_resource vg error %d %s", + ls_name, rv, rd.rs.disks[0].path); + return rv; + } + + if (!strcmp(gl_name, R_NAME_GL)) + strncpy(gl_lsname_sanlock, ls_name, MAX_NAME); + + snprintf(vg_args, MAX_ARGS, "%s:%s", lock_args_version, lock_lv_name); + + log_debug("S %s init_vg_san done vg_args %s", ls_name, vg_args); + + /* + * Go through all lv resource slots and initialize them with the + * correct lockspace name but a special resource name that indicates + * it is unused. 
+ */ + + memset(&rd, 0, sizeof(rd)); + rd.rs.num_disks = 1; + memcpy(rd.rs.disks[0].path, disk.path, SANLK_PATH_LEN); + strncpy(rd.rs.lockspace_name, ls_name, SANLK_NAME_LEN); + strcpy(rd.rs.name, "#unused"); + + offset = align_size * LV_LOCK_BEGIN; + + log_debug("S %s init_vg_san clearing lv lease areas", ls_name); + + for (i = 0; ; i++) { + rd.rs.disks[0].offset = offset; + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv == -EMSGSIZE || rv == -ENOSPC) { + /* This indicates the end of the device is reached. */ + rv = -EMSGSIZE; + break; + } + + if (rv) { + log_error("clear lv resource area %llu error %d", + (unsigned long long)offset, rv); + break; + } + offset += align_size; + } + + return 0; +} + +/* + * lvcreate + * + * The offset at which the lv lease is written is passed + * all the way back to the lvcreate command so that it + * can be saved in the lv's lock_args in the vg metadata. + */ + +int lm_init_lv_sanlock(char *ls_name, char *vg_name, char *lv_name, + char *vg_args, char *lv_args, uint64_t free_offset) +{ + struct sanlk_resourced rd; + char lock_lv_name[MAX_ARGS]; + char lock_args_version[MAX_ARGS]; + uint64_t offset; + int align_size; + int rv; + + memset(&rd, 0, sizeof(rd)); + memset(lock_lv_name, 0, sizeof(lock_lv_name)); + memset(lock_args_version, 0, sizeof(lock_args_version)); + + rv = lock_lv_name_from_args(vg_args, lock_lv_name); + if (rv < 0) { + log_error("S %s init_lv_san lock_lv_name_from_args error %d %s", + ls_name, rv, vg_args); + return rv; + } + + snprintf(lock_args_version, MAX_ARGS, "%u.%u.%u", + LV_LOCK_ARGS_MAJOR, LV_LOCK_ARGS_MINOR, LV_LOCK_ARGS_PATCH); + + strncpy(rd.rs.lockspace_name, ls_name, SANLK_NAME_LEN); + rd.rs.num_disks = 1; + snprintf(rd.rs.disks[0].path, SANLK_PATH_LEN, "/dev/mapper/%s-%s", vg_name, lock_lv_name); + + align_size = sanlock_align(&rd.rs.disks[0]); + if (align_size <= 0) { + log_error("S %s init_lv_san align error %d", ls_name, align_size); + return -EINVAL; + } + + if (free_offset) + offset = 
free_offset; + else + offset = align_size * LV_LOCK_BEGIN; + rd.rs.disks[0].offset = offset; + + if (daemon_test) { + snprintf(lv_args, MAX_ARGS, "%s:%llu", + lock_args_version, (unsigned long long)1111); + return 0; + } + + while (1) { + rd.rs.disks[0].offset = offset; + + memset(rd.rs.name, 0, SANLK_NAME_LEN); + + rv = sanlock_read_resource(&rd.rs, 0); + if (rv == -EMSGSIZE || rv == -ENOSPC) { + /* This indicates the end of the device is reached. */ + log_debug("S %s init_lv_san read limit offset %llu", + ls_name, (unsigned long long)offset); + rv = -EMSGSIZE; + return rv; + } + + if (rv && rv != SANLK_LEADER_MAGIC) { + log_error("S %s init_lv_san read error %d offset %llu", + ls_name, rv, (unsigned long long)offset); + break; + } + + if (!strncmp(rd.rs.name, lv_name, SANLK_NAME_LEN)) { + log_error("S %s init_lv_san resource name %s already exists at %llu", + ls_name, lv_name, (unsigned long long)offset); + return -EEXIST; + } + + /* + * If we read newly extended space, it will not be initialized + * with an "#unused" resource, but will return SANLK_LEADER_MAGIC + * indicating an uninitialized paxos structure on disk. + */ + if ((rv == SANLK_LEADER_MAGIC) || !strcmp(rd.rs.name, "#unused")) { + log_debug("S %s init_lv_san %s found unused area at %llu", + ls_name, lv_name, (unsigned long long)offset); + + strncpy(rd.rs.name, lv_name, SANLK_NAME_LEN); + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (!rv) { + snprintf(lv_args, MAX_ARGS, "%s:%llu", + lock_args_version, (unsigned long long)offset); + } else { + log_error("S %s init_lv_san write error %d offset %llu", + ls_name, rv, (unsigned long long)rv); + } + break; + } + + offset += align_size; + } + + return rv; +} + +/* + * Read the lockspace and each resource, replace the lockspace name, + * and write it back. 
+ */ + +int lm_rename_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_args) +{ + struct sanlk_lockspace ss; + struct sanlk_resourced rd; + struct sanlk_disk disk; + char lock_lv_name[MAX_ARGS]; + uint64_t offset; + uint32_t io_timeout; + int align_size; + int i, rv; + + memset(&disk, 0, sizeof(disk)); + memset(lock_lv_name, 0, sizeof(lock_lv_name)); + + if (!vg_args || !vg_args[0] || !strcmp(vg_args, "none")) { + log_error("S %s rename_vg_san vg_args missing", ls_name); + return -EINVAL; + } + + rv = lock_lv_name_from_args(vg_args, lock_lv_name); + if (rv < 0) { + log_error("S %s init_lv_san lock_lv_name_from_args error %d %s", + ls_name, rv, vg_args); + return rv; + } + + snprintf(disk.path, SANLK_PATH_LEN, "/dev/mapper/%s-%s", vg_name, lock_lv_name); + + log_debug("S %s rename_vg_san path %s", ls_name, disk.path); + + if (daemon_test) + return 0; + + /* FIXME: device is not always ready for us here */ + sleep(1); + + align_size = sanlock_align(&disk); + if (align_size <= 0) { + log_error("S %s rename_vg_san bad align size %d %s", + ls_name, align_size, disk.path); + return -EINVAL; + } + + /* + * Lockspace + */ + + memset(&ss, 0, sizeof(ss)); + memcpy(ss.host_id_disk.path, disk.path, SANLK_PATH_LEN); + ss.host_id_disk.offset = LS_BEGIN * align_size; + + rv = sanlock_read_lockspace(&ss, 0, &io_timeout); + if (rv < 0) { + log_error("S %s rename_vg_san read_lockspace error %d %s", + ls_name, rv, ss.host_id_disk.path); + return rv; + } + + strncpy(ss.name, ls_name, SANLK_NAME_LEN); + + rv = sanlock_write_lockspace(&ss, 0, 0, 0); + if (rv < 0) { + log_error("S %s rename_vg_san write_lockspace error %d %s", + ls_name, rv, ss.host_id_disk.path); + return rv; + } + + /* + * GL resource + */ + + memset(&rd, 0, sizeof(rd)); + memcpy(rd.rs.disks[0].path, disk.path, SANLK_PATH_LEN); + rd.rs.disks[0].offset = align_size * GL_LOCK_BEGIN; + rd.rs.num_disks = 1; + + rv = sanlock_read_resource(&rd.rs, 0); + if (rv < 0) { + log_error("S %s rename_vg_san 
read_resource gl error %d %s", + ls_name, rv, rd.rs.disks[0].path); + return rv; + } + + strncpy(rd.rs.lockspace_name, ss.name, SANLK_NAME_LEN); + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv < 0) { + log_error("S %s rename_vg_san write_resource gl error %d %s", + ls_name, rv, rd.rs.disks[0].path); + return rv; + } + + /* + * VG resource + */ + + memset(&rd, 0, sizeof(rd)); + memcpy(rd.rs.disks[0].path, disk.path, SANLK_PATH_LEN); + rd.rs.disks[0].offset = align_size * VG_LOCK_BEGIN; + rd.rs.num_disks = 1; + + rv = sanlock_read_resource(&rd.rs, 0); + if (rv < 0) { + log_error("S %s rename_vg_san write_resource vg error %d %s", + ls_name, rv, rd.rs.disks[0].path); + return rv; + } + + strncpy(rd.rs.lockspace_name, ss.name, SANLK_NAME_LEN); + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv < 0) { + log_error("S %s rename_vg_san write_resource vg error %d %s", + ls_name, rv, rd.rs.disks[0].path); + return rv; + } + + /* + * LV resources + */ + + offset = align_size * LV_LOCK_BEGIN; + + for (i = 0; ; i++) { + memset(&rd, 0, sizeof(rd)); + memcpy(rd.rs.disks[0].path, disk.path, SANLK_PATH_LEN); + rd.rs.disks[0].offset = offset; + rd.rs.num_disks = 1; + + rv = sanlock_read_resource(&rd.rs, 0); + if (rv == -EMSGSIZE || rv == -ENOSPC) { + /* This indicates the end of the device is reached. 
*/ + rv = -EMSGSIZE; + break; + } + + if (rv < 0) { + log_error("S %s rename_vg_san read_resource resource area %llu error %d", + ls_name, (unsigned long long)offset, rv); + break; + } + + strncpy(rd.rs.lockspace_name, ss.name, SANLK_NAME_LEN); + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv) { + log_error("S %s rename_vg_san write_resource resource area %llu error %d", + ls_name, (unsigned long long)offset, rv); + break; + } + offset += align_size; + } + + return 0; +} + +/* lvremove */ +int lm_free_lv_sanlock(struct lockspace *ls, struct resource *r) +{ + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + struct sanlk_resource *rs = &rds->rs; + int rv; + + log_debug("S %s R %s free_lv_san", ls->name, r->name); + + if (daemon_test) + return 0; + + strcpy(rs->name, "#unused"); + + rv = sanlock_write_resource(rs, 0, 0, 0); + if (rv < 0) { + log_error("S %s R %s free_lv_san write error %d", + ls->name, r->name, rv); + } + + return rv; +} + +int lm_ex_disable_gl_sanlock(struct lockspace *ls) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct sanlk_resourced rd1; + struct sanlk_resourced rd2; + struct sanlk_resource *rs1; + struct sanlk_resource *rs2; + struct sanlk_resource **rs_args; + int rv; + + rs_args = malloc(2 * sizeof(struct sanlk_resource *)); + if (!rs_args) + return -ENOMEM; + + rs1 = &rd1.rs; + rs2 = &rd2.rs; + + memset(&rd1, 0, sizeof(rd1)); + memset(&rd2, 0, sizeof(rd2)); + + strncpy(rd1.rs.lockspace_name, ls->name, SANLK_NAME_LEN); + strncpy(rd1.rs.name, R_NAME_GL, SANLK_NAME_LEN); + + strncpy(rd2.rs.lockspace_name, ls->name, SANLK_NAME_LEN); + strncpy(rd2.rs.name, R_NAME_GL_DISABLED, SANLK_NAME_LEN); + + rd1.rs.num_disks = 1; + strncpy(rd1.rs.disks[0].path, lms->ss.host_id_disk.path, SANLK_PATH_LEN); + rd1.rs.disks[0].offset = lms->align_size * GL_LOCK_BEGIN; + + rv = sanlock_acquire(lms->sock, -1, 0, 1, &rs1, NULL); + if (rv < 0) { + log_error("S %s ex_disable_gl_san acquire error %d", + ls->name, rv); + goto 
out; + } + + rs_args[0] = rs1; + rs_args[1] = rs2; + + rv = sanlock_release(lms->sock, -1, SANLK_REL_RENAME, 2, rs_args); + if (rv < 0) { + log_error("S %s ex_disable_gl_san release_rename error %d", + ls->name, rv); + } + +out: + free(rs_args); + return rv; +} + +/* + * enable/disable exist because each vg contains a global lock, + * but we only want to use the gl from one of them. The first + * sanlock vg created, has its gl enabled, and subsequent + * sanlock vgs have their gl disabled. If the vg containing the + * gl is removed, the gl from another sanlock vg needs to be + * enabled. Or, if gl in multiple vgs are somehow enabled, we + * want to be able to disable one of them. + * + * Disable works by naming/renaming the gl resource to have a + * name that is different from the predefined name. + * When a host attempts to acquire the gl with its standard + * predefined name, it will fail because the resource's name + * on disk doesn't match. + */ + +int lm_able_gl_sanlock(struct lockspace *ls, int enable) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct sanlk_resourced rd; + const char *gl_name; + int rv; + + if (enable) + gl_name = R_NAME_GL; + else + gl_name = R_NAME_GL_DISABLED; + + memset(&rd, 0, sizeof(rd)); + + strncpy(rd.rs.lockspace_name, ls->name, SANLK_NAME_LEN); + strncpy(rd.rs.name, gl_name, SANLK_NAME_LEN); + + rd.rs.num_disks = 1; + strncpy(rd.rs.disks[0].path, lms->ss.host_id_disk.path, SANLK_PATH_LEN); + rd.rs.disks[0].offset = lms->align_size * GL_LOCK_BEGIN; + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv < 0) { + log_error("S %s able_gl %d write_resource gl error %d %s", + ls->name, enable, rv, rd.rs.disks[0].path); + return rv; + } + + log_debug("S %s able_gl %s", ls->name, gl_name); + + ls->sanlock_gl_enabled = enable; + if (ls->sanlock_gl_dup && !enable) + ls->sanlock_gl_dup = 0; + + if (enable) + strncpy(gl_lsname_sanlock, ls->name, MAX_NAME); + + if (!enable && !strcmp(gl_lsname_sanlock, ls->name)) + 
memset(gl_lsname_sanlock, 0, sizeof(gl_lsname_sanlock)); + + return 0; +} + +static int gl_is_enabled(struct lockspace *ls, struct lm_sanlock *lms) +{ + char strname[SANLK_NAME_LEN + 1]; + struct sanlk_resourced rd; + uint64_t offset; + int rv; + + memset(&rd, 0, sizeof(rd)); + + strncpy(rd.rs.lockspace_name, ls->name, SANLK_NAME_LEN); + + /* leave rs.name empty, it is what we're checking */ + + rd.rs.num_disks = 1; + strncpy(rd.rs.disks[0].path, lms->ss.host_id_disk.path, SANLK_PATH_LEN); + + offset = lms->align_size * GL_LOCK_BEGIN; + rd.rs.disks[0].offset = offset; + + rv = sanlock_read_resource(&rd.rs, 0); + if (rv < 0) { + log_error("gl_is_enabled read_resource error %d", rv); + return rv; + } + + memset(strname, 0, sizeof(strname)); + memcpy(strname, rd.rs.name, SANLK_NAME_LEN); + + if (!strcmp(strname, R_NAME_GL_DISABLED)) { + return 0; + } + + if (!strcmp(strname, R_NAME_GL)) { + return 1; + } + + log_error("gl_is_enabled invalid gl name %s", strname); + return -1; +} + +int lm_gl_is_enabled(struct lockspace *ls) +{ + int rv; + rv = gl_is_enabled(ls, ls->lm_data); + ls->sanlock_gl_enabled = rv; + return rv; +} + +/* + * This is called at the beginning of lvcreate to + * ensure there is free space for a new LV lock. + * If not, lvcreate will extend the lvmlock lv + * before continuing with creating the new LV. + * This way, lm_init_lv_san() should find a free + * lock (unless the autoextend of lvmlock lv has + * been disabled.) 
+ */ + +int lm_find_free_lock_sanlock(struct lockspace *ls, uint64_t *free_offset) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct sanlk_resourced rd; + uint64_t offset; + int rv; + + if (daemon_test) + return 0; + + memset(&rd, 0, sizeof(rd)); + + strncpy(rd.rs.lockspace_name, ls->name, SANLK_NAME_LEN); + rd.rs.num_disks = 1; + strncpy(rd.rs.disks[0].path, lms->ss.host_id_disk.path, SANLK_PATH_LEN); + + offset = lms->align_size * LV_LOCK_BEGIN; + + while (1) { + rd.rs.disks[0].offset = offset; + + memset(rd.rs.name, 0, SANLK_NAME_LEN); + + rv = sanlock_read_resource(&rd.rs, 0); + if (rv == -EMSGSIZE || rv == -ENOSPC) { + /* This indicates the end of the device is reached. */ + log_debug("S %s find_free_lock_san read limit offset %llu", + ls->name, (unsigned long long)offset); + return -EMSGSIZE; + } + + /* + * If we read newly extended space, it will not be initialized + * with an "#unused" resource, but will return an error about + * an invalid paxos structure on disk. + */ + if (rv == SANLK_LEADER_MAGIC) { + log_debug("S %s find_free_lock_san found empty area at %llu", + ls->name, (unsigned long long)offset); + *free_offset = offset; + return 0; + } + + if (rv) { + log_error("S %s find_free_lock_san read error %d offset %llu", + ls->name, rv, (unsigned long long)offset); + break; + } + + if (!strcmp(rd.rs.name, "#unused")) { + log_debug("S %s find_free_lock_san found unused area at %llu", + ls->name, (unsigned long long)offset); + *free_offset = offset; + return 0; + } + + offset += lms->align_size; + } + + return rv; +} + +/* + * host A: start_vg/add_lockspace + * host B: vgremove + * + * The global lock cannot always be held around start_vg + * on host A because the gl is in a vg that may not be + * started yet, or may be in the vg we are starting. + * + * If B removes the vg, destroying the delta leases, + * while A is a lockspace member, it will cause A's + * sanlock delta lease renewal to fail, and lockspace + * recovery. 
+ * + * I expect this overlap would usually cause a failure + * in the add_lockspace() on host A when it sees that + * the lockspace structures have been clobbered by B. + * Having add_lockspace() fail should be a fine result. + * + * If add_lockspace was somehow able to finish, the + * subsequent renewal would probably fail instead. + * This should also not create any major problems. + */ + +int lm_prepare_lockspace_sanlock(struct lockspace *ls) +{ + struct stat st; + struct lm_sanlock *lms = NULL; + char lock_lv_name[MAX_ARGS]; + char lsname[SANLK_NAME_LEN + 1]; + char disk_path[SANLK_PATH_LEN]; + int gl_found; + int ret, rv; + + memset(disk_path, 0, sizeof(disk_path)); + memset(lock_lv_name, 0, sizeof(lock_lv_name)); + + rv = check_args_version(ls->vg_args, VG_LOCK_ARGS_MAJOR); + if (rv < 0) { + ret = -EARGS; + goto fail; + } + + rv = lock_lv_name_from_args(ls->vg_args, lock_lv_name); + if (rv < 0) { + log_error("S %s prepare_lockspace_san lock_lv_name_from_args error %d %s", + ls->name, rv, ls->vg_args); + ret = -EARGS; + goto fail; + } + + snprintf(disk_path, SANLK_PATH_LEN, "/dev/mapper/%s-%s", + ls->vg_name, lock_lv_name); + + /* + * When a vg is started, the internal sanlock lv should be + * activated before lvmlockd is asked to add the lockspace. + * (sanlock needs to use the lv.) + * + * In the future we might be able to ask something on the system + * to activate the sanlock lv from here, and with that we might be + * able to start sanlock VGs without requiring a + * vgchange --lock-start command. 
+ */ + + /* FIXME: device is not always ready for us here */ + sleep(1); + + rv = stat(disk_path, &st); + if (rv < 0) { + log_error("S %s prepare_lockspace_san stat error %d disk_path %s", + ls->name, errno, disk_path); + ret = -EARGS; + goto fail; + } + + if (!ls->host_id) { + if (daemon_host_id) + ls->host_id = daemon_host_id; + else if (daemon_host_id_file) + ls->host_id = read_host_id_file(); + } + + if (!ls->host_id || ls->host_id > 2000) { + log_error("S %s prepare_lockspace_san invalid host_id %llu", + ls->name, (unsigned long long)ls->host_id); + ret = -EHOSTID; + goto fail; + } + + lms = malloc(sizeof(struct lm_sanlock)); + if (!lms) { + ret = -ENOMEM; + goto fail; + } + + memset(lsname, 0, sizeof(lsname)); + strncpy(lsname, ls->name, SANLK_NAME_LEN); + + memset(lms, 0, sizeof(struct lm_sanlock)); + memcpy(lms->ss.name, lsname, SANLK_NAME_LEN); + lms->ss.host_id_disk.offset = 0; + lms->ss.host_id = ls->host_id; + strncpy(lms->ss.host_id_disk.path, disk_path, SANLK_PATH_LEN); + + if (daemon_test) { + if (!gl_lsname_sanlock[0]) { + strncpy(gl_lsname_sanlock, lsname, MAX_NAME); + log_debug("S %s prepare_lockspace_san use global lock", lsname); + } + goto out; + } + + lms->sock = sanlock_register(); + if (lms->sock < 0) { + log_error("S %s prepare_lockspace_san register error %d", lsname, lms->sock); + lms->sock = 0; + ret = -EMANAGER; + goto fail; + } + + rv = sanlock_restrict(lms->sock, SANLK_RESTRICT_SIGKILL); + if (rv < 0) { + log_error("S %s restrict error %d", lsname, rv); + ret = -EMANAGER; + goto fail; + } + + lms->align_size = sanlock_align(&lms->ss.host_id_disk); + if (lms->align_size <= 0) { + log_error("S %s prepare_lockspace_san align error %d", lsname, lms->align_size); + ret = -EMANAGER; + goto fail; + } + + gl_found = gl_is_enabled(ls, lms); + if (gl_found < 0) { + log_error("S %s prepare_lockspace_san gl_enabled error %d", lsname, gl_found); + ret = -EARGS; + goto fail; + } + + ls->sanlock_gl_enabled = gl_found; + + if (gl_found) { + if 
(gl_use_dlm) { + log_error("S %s prepare_lockspace_san gl_use_dlm is set", lsname); + } else if (gl_lsname_sanlock[0] && strcmp(gl_lsname_sanlock, lsname)) { + log_error("S %s prepare_lockspace_san multiple sanlock global locks current %s", + lsname, gl_lsname_sanlock); + } else { + strncpy(gl_lsname_sanlock, lsname, MAX_NAME); + log_debug("S %s prepare_lockspace_san use global lock %s", + lsname, gl_lsname_sanlock); + } + } + +out: + ls->lm_data = lms; + log_debug("S %s prepare_lockspace_san done", lsname); + return 0; + +fail: + if (lms && lms->sock) + close(lms->sock); + if (lms) + free(lms); + return ret; +} + +int lm_add_lockspace_sanlock(struct lockspace *ls, int adopt) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + int rv; + + rv = sanlock_add_lockspace(&lms->ss, 0); + if (rv == -EEXIST && adopt) { + /* We could alternatively just skip the sanlock call for adopt. */ + log_debug("S %s add_lockspace_san adopt found ls", ls->name); + goto out; + } + if (rv < 0) { + /* retry for some errors? */ + log_error("S %s add_lockspace_san add_lockspace error %d", ls->name, rv); + goto fail; + } + + /* + * Don't let the lockspace be cleanly released if orphan locks + * exist, because the orphan locks are still protecting resources + * that are being used on the host, e.g. active lvs. If the + * lockspace is cleanly released, another host could acquire the + * orphan leases. 
+ */ + + rv = sanlock_set_config(ls->name, 0, SANLK_CONFIG_USED_BY_ORPHANS, NULL); + if (rv < 0) { + log_error("S %s add_lockspace_san set_config error %d", ls->name, rv); + sanlock_rem_lockspace(&lms->ss, 0); + goto fail; + } + +out: + log_debug("S %s add_lockspace_san done", ls->name); + return 0; + +fail: + close(lms->sock); + free(lms); + ls->lm_data = NULL; + return rv; +} + +int lm_rem_lockspace_sanlock(struct lockspace *ls, int free_vg) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + int rv; + + if (daemon_test) + goto out; + + rv = sanlock_rem_lockspace(&lms->ss, 0); + if (rv < 0) { + log_error("S %s rem_lockspace_san error %d", ls->name, rv); + return rv; + } + + if (free_vg) { + /* + * Destroy sanlock lockspace (delta leases). Forces failure for any + * other host that is still using or attempts to use this lockspace. + * This shouldn't be generally necessary, but there may some races + * between nodes starting and removing a vg which this could help. + */ + strncpy(lms->ss.name, "#unused", SANLK_NAME_LEN); + + rv = sanlock_write_lockspace(&lms->ss, 0, 0, 0); + if (rv < 0) { + log_error("S %s rem_lockspace free_vg write_lockspace error %d %s", + ls->name, rv, lms->ss.host_id_disk.path); + } + } +out: + close(lms->sock); + + free(lms); + ls->lm_data = NULL; + + /* FIXME: should we only clear gl_lsname when doing free_vg? 
*/ + + if (!strcmp(ls->name, gl_lsname_sanlock)) + memset(gl_lsname_sanlock, 0, sizeof(gl_lsname_sanlock)); + + return 0; +} + +static int lm_add_resource_sanlock(struct lockspace *ls, struct resource *r) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + + strncpy(rds->rs.lockspace_name, ls->name, SANLK_NAME_LEN); + strncpy(rds->rs.name, r->name, SANLK_NAME_LEN); + rds->rs.num_disks = 1; + memcpy(rds->rs.disks[0].path, lms->ss.host_id_disk.path, SANLK_PATH_LEN); + + if (r->type == LD_RT_GL) + rds->rs.disks[0].offset = GL_LOCK_BEGIN * lms->align_size; + else if (r->type == LD_RT_VG) + rds->rs.disks[0].offset = VG_LOCK_BEGIN * lms->align_size; + + /* LD_RT_LV offset is set in each lm_lock call from lv_args. */ + + if (r->type == LD_RT_GL || r->type == LD_RT_VG) { + rds->vb = malloc(sizeof(struct val_blk)); + if (!rds->vb) + return -ENOMEM; + memset(rds->vb, 0, sizeof(struct val_blk)); + } + + return 0; +} + +int lm_rem_resource_sanlock(struct lockspace *ls, struct resource *r) +{ + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + + /* FIXME: assert r->mode == UN or unlock if it's not? 
*/ + + if (rds->vb) + free(rds->vb); + + memset(rds, 0, sizeof(struct rd_sanlock)); + r->lm_init = 0; + return 0; +} + +int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode, + uint32_t *r_version, int *retry, int adopt) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + struct sanlk_resource *rs; + uint64_t lock_lv_offset; + uint32_t flags = 0; + struct val_blk vb; + uint16_t vb_version; + int added = 0; + int rv; + + if (!r->lm_init) { + rv = lm_add_resource_sanlock(ls, r); + if (rv < 0) + return rv; + r->lm_init = 1; + added = 1; + } + + rs = &rds->rs; + + if (r->type == LD_RT_LV) { + /* + * The lv may have been removed and recreated with a new lease + * offset, so we need to get the offset from lv_args each time + * instead of reusing the value that we last set in rds->rs. + * act->lv_args is copied to r->lv_args before every lm_lock(). + */ + + rv = check_args_version(r->lv_args, LV_LOCK_ARGS_MAJOR); + if (rv < 0) { + log_error("S %s R %s lock_san wrong lv_args version %s", + ls->name, r->name, r->lv_args); + return rv; + } + + rv = lock_lv_offset_from_args(r->lv_args, &lock_lv_offset); + if (rv < 0) { + log_error("S %s R %s lock_san lv_offset_from_args error %d %s", + ls->name, r->name, rv, r->lv_args); + return rv; + } + + if (!added && (rds->rs.disks[0].offset != lock_lv_offset)) { + log_debug("S %s R %s lock_san offset old %llu new %llu", + ls->name, r->name, + (unsigned long long)rds->rs.disks[0].offset, + (unsigned long long)lock_lv_offset); + } + + rds->rs.disks[0].offset = lock_lv_offset; + } + + if (ld_mode == LD_LK_SH) { + rs->flags |= SANLK_RES_SHARED; + } else if (ld_mode == LD_LK_EX) { + rs->flags &= ~SANLK_RES_SHARED; + } else { + log_error("lock_san invalid mode %d", ld_mode); + return -EINVAL; + } + + /* + * Use PERSISTENT because if lvmlockd exits while holding + * a lock, it's not safe to simply clear/drop the lock while + * a command or lv is using 
it. + */ + + rs->flags |= SANLK_RES_PERSISTENT; + + log_debug("S %s R %s lock_san acquire %s:%llu", + ls->name, r->name, rs->disks[0].path, + (unsigned long long)rs->disks[0].offset); + + if (daemon_test) { + *r_version = 0; + return 0; + } + + if (rds->vb) + flags |= SANLK_ACQUIRE_LVB; + if (adopt) + flags |= SANLK_ACQUIRE_ORPHAN_ONLY; + + rv = sanlock_acquire(lms->sock, -1, flags, 1, &rs, NULL); + + if (rv == -EAGAIN) { + /* + * It appears that sanlock_acquire returns EAGAIN when we request + * a shared lock but the lock is held ex by another host. + * There's no point in retrying this case, just return an error. + */ + log_debug("S %s R %s lock_san acquire mode %d rv EAGAIN", ls->name, r->name, ld_mode); + *retry = 0; + return -EAGAIN; + } + + if ((rv == -EMSGSIZE) && (r->type == LD_RT_LV)) { + /* + * sanlock tried to read beyond the end of the device, + * so the offset of the lv lease is beyond the end of the + * device, which means that the lease lv was extended, and + * the lease for this lv was allocated in the new space. + * The lvm command will see this error, refresh the lvmlock + * lv, and try again. + */ + log_debug("S %s R %s lock_san acquire offset %llu rv EMSGSIZE", + ls->name, r->name, (unsigned long long)rs->disks[0].offset); + *retry = 0; + return -EMSGSIZE; + } + + if (adopt && (rv == -EUCLEAN)) { + /* + * The orphan lock exists but in a different mode than we asked + * for, so the caller should try again with the other mode. + */ + log_debug("S %s R %s lock_san adopt mode %d try other mode", + ls->name, r->name, ld_mode); + *retry = 0; + return -EUCLEAN; + } + + if (adopt && (rv == -ENOENT)) { + /* + * No orphan lock exists. + */ + log_debug("S %s R %s lock_san adopt mode %d no orphan found", + ls->name, r->name, ld_mode); + *retry = 0; + return -ENOENT; + } + + if (rv == SANLK_ACQUIRE_IDLIVE || rv == SANLK_ACQUIRE_OWNED || rv == SANLK_ACQUIRE_OTHER) { + /* + * The lock is held by another host. 
These failures can + * happen while multiple hosts are concurrently acquiring + * shared locks. We want to retry a couple times in this + * case because we'll probably get the sh lock. + * + * I believe these are also the errors when requesting an + * ex lock that another host holds ex. We want to report + * something like: "lock is held by another host" in this case. + * Retry is pointless here. + * + * We can't distinguish between the two cases above, + * so if requesting a sh lock, retry a couple times, + * otherwise don't. + */ + log_debug("S %s R %s lock_san acquire mode %d rv %d", ls->name, r->name, ld_mode, rv); + *retry = (ld_mode == LD_LK_SH) ? 1 : 0; + return -EAGAIN; + } + + if (rv < 0) { + log_error("S %s R %s lock_san acquire error %d", + ls->name, r->name, rv); + + if (added) { + lm_rem_resource_sanlock(ls, r); + return rv; + } + + /* if the gl has been disabled, remove and free the gl resource */ + if ((rv == SANLK_LEADER_RESOURCE) && (r->type == LD_RT_GL)) { + if (!lm_gl_is_enabled(ls)) { + log_error("S %s R %s lock_san gl has been disabled", + ls->name, r->name); + if (!strcmp(gl_lsname_sanlock, ls->name)) + memset(gl_lsname_sanlock, 0, sizeof(gl_lsname_sanlock)); + return -EUNATCH; + } + } + + return rv; + } + + if (rds->vb) { + rv = sanlock_get_lvb(0, rs, (char *)&vb, sizeof(vb)); + if (rv < 0) { + log_error("S %s R %s lock_san get_lvb error %d", ls->name, r->name, rv); + *r_version = 0; + goto out; + } + + vb_version = le16_to_cpu(vb.version); + + if (vb_version && ((vb_version & 0xFF00) > (VAL_BLK_VERSION & 0xFF00))) { + log_error("S %s R %s lock_san ignore vb_version %x", + ls->name, r->name, vb_version); + *r_version = 0; + free(rds->vb); + rds->vb = NULL; + goto out; + } + + *r_version = le32_to_cpu(vb.r_version); + memcpy(rds->vb, &vb, sizeof(vb)); /* rds->vb saved as le */ + + log_debug("S %s R %s lock_san get r_version %u", + ls->name, r->name, *r_version); + } +out: + return rv; +} + +int lm_convert_sanlock(struct lockspace *ls, struct 
resource *r, + int ld_mode, uint32_t r_version) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + struct sanlk_resource *rs = &rds->rs; + struct val_blk vb; + uint32_t flags = 0; + int rv; + + log_debug("S %s R %s convert_san", ls->name, r->name); + + if (daemon_test) + goto rs_flag; + + if (rds->vb && r_version && (r->mode == LD_LK_EX)) { + if (!rds->vb->version) { + /* first time vb has been written */ + rds->vb->version = cpu_to_le16(VAL_BLK_VERSION); + } + if (r_version) + rds->vb->r_version = cpu_to_le32(r_version); + memcpy(&vb, rds->vb, sizeof(vb)); + + log_debug("S %s R %s convert_san set r_version %u", + ls->name, r->name, r_version); + + rv = sanlock_set_lvb(0, rs, (char *)&vb, sizeof(vb)); + if (rv < 0) { + log_error("S %s R %s convert_san set_lvb error %d", + ls->name, r->name, rv); + } + } + + rs_flag: + if (ld_mode == LD_LK_SH) + rs->flags |= SANLK_RES_SHARED; + else + rs->flags &= ~SANLK_RES_SHARED; + + if (daemon_test) + return 0; + + rv = sanlock_convert(lms->sock, -1, flags, rs); + if (rv == -EAGAIN) { + /* FIXME: When could this happen? Should something different be done? 
*/ + log_error("S %s R %s convert_san EAGAIN", ls->name, r->name); + return -EAGAIN; + } + if (rv < 0) { + log_error("S %s R %s convert_san convert error %d", ls->name, r->name, rv); + } + + return rv; +} + +static int release_rename(struct lockspace *ls, struct resource *r) +{ + struct rd_sanlock rd1; + struct rd_sanlock rd2; + struct sanlk_resource *res1; + struct sanlk_resource *res2; + struct sanlk_resource **res_args; + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + int rv; + + log_debug("S %s R %s release rename", ls->name, r->name); + + res_args = malloc(2 * sizeof(struct sanlk_resource *)); + if (!res_args) + return -ENOMEM; + + memcpy(&rd1, rds, sizeof(struct rd_sanlock)); + memcpy(&rd2, rds, sizeof(struct rd_sanlock)); + + res1 = (struct sanlk_resource *)&rd1; + res2 = (struct sanlk_resource *)&rd2; + + strcpy(res2->name, "invalid_removed"); + + res_args[0] = res1; + res_args[1] = res2; + + rv = sanlock_release(lms->sock, -1, SANLK_REL_RENAME, 2, res_args); + if (rv < 0) { + log_error("S %s R %s unlock_san release rename error %d", ls->name, r->name, rv); + } + + free(res_args); + + return rv; +} + +/* + * rds->vb is stored in le + * + * r_version is r->version + * + * for GL locks lvmlockd just increments this value + * each time the global lock is released from ex. + * + * for VG locks it is the seqno from the vg metadata. 
+ */ + +int lm_unlock_sanlock(struct lockspace *ls, struct resource *r, + uint32_t r_version, uint32_t lmu_flags) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + struct sanlk_resource *rs = &rds->rs; + struct val_blk vb; + int rv; + + log_debug("S %s R %s unlock_san r_version %u flags %x", + ls->name, r->name, r_version, lmu_flags); + + if (daemon_test) + return 0; + + if (rds->vb && r_version && (r->mode == LD_LK_EX)) { + if (!rds->vb->version) { + /* first time vb has been written */ + rds->vb->version = cpu_to_le16(VAL_BLK_VERSION); + } + if (r_version) + rds->vb->r_version = cpu_to_le32(r_version); + memcpy(&vb, rds->vb, sizeof(vb)); + + log_debug("S %s R %s unlock_san set r_version %u", + ls->name, r->name, r_version); + + rv = sanlock_set_lvb(0, rs, (char *)&vb, sizeof(vb)); + if (rv < 0) { + log_error("S %s R %s unlock_san set_lvb error %d", + ls->name, r->name, rv); + } + } + + /* + * For vgremove (FREE_VG) we unlock-rename the vg and gl locks + * so they cannot be reacquired. 
+ */ + if ((lmu_flags & LMUF_FREE_VG) && + (r->type == LD_RT_GL || r->type == LD_RT_VG)) { + return release_rename(ls, r); + } + + rv = sanlock_release(lms->sock, -1, 0, 1, &rs); + if (rv < 0) { + log_error("S %s R %s unlock_san release error %d", ls->name, r->name, rv); + } + + return rv; +} + +int lm_hosts_sanlock(struct lockspace *ls, int notify) +{ + struct sanlk_host *hss = NULL; + struct sanlk_host *hs; + uint32_t state; + int hss_count = 0; + int found_self = 0; + int found_others = 0; + int i, rv; + + rv = sanlock_get_hosts(ls->name, 0, &hss, &hss_count, 0); + if (rv < 0) { + log_error("S %s hosts_san get_hosts error %d", ls->name, rv); + return 0; + } + + if (!hss || !hss_count) { + log_error("S %s hosts_san zero hosts", ls->name); + return 0; + } + + hs = hss; + + for (i = 0; i < hss_count; i++) { + log_debug("S %s hosts_san host_id %llu gen %llu flags %x", + ls->name, + (unsigned long long)hs->host_id, + (unsigned long long)hs->generation, + hs->flags); + + if (hs->host_id == ls->host_id) { + found_self = 1; + hs++; + continue; + } + + state = hs->flags & SANLK_HOST_MASK; + if (state == SANLK_HOST_LIVE) + found_others++; + hs++; + } + free(hss); + + if (found_others && notify) { + /* + * We could use the sanlock event mechanism to notify lvmlockd + * on other hosts to stop this VG. lvmlockd would need to + * register for and listen for sanlock events in the main loop. + * The events are slow to propagate. We'd need to retry for a + * while before all the hosts see the event and stop the VG. + * sanlock_set_event(ls->name, &he, SANLK_SETEV_ALL_HOSTS); + * + * Wait to try this until there appears to be real value/interest + * in doing it. 
+ */ + } + + if (!found_self) { + log_error("S %s hosts_san self not found others %d", ls->name, found_others); + return 0; + } + + return found_others; +} + +int lm_get_lockspaces_sanlock(struct list_head *ls_rejoin) +{ + struct sanlk_lockspace *ss_all = NULL; + struct sanlk_lockspace *ss; + struct lockspace *ls; + int ss_count = 0; + int i, rv; + + rv = sanlock_get_lockspaces(&ss_all, &ss_count, 0); + if (rv < 0) + return rv; + + if (!ss_all || !ss_count) + return 0; + + ss = ss_all; + + for (i = 0; i < ss_count; i++) { + + if (strncmp(ss->name, LVM_LS_PREFIX, strlen(LVM_LS_PREFIX))) + continue; + + if (!(ls = alloc_lockspace())) + return -ENOMEM; + + ls->lm_type = LD_LM_SANLOCK; + ls->host_id = ss->host_id; + strncpy(ls->name, ss->name, MAX_NAME); + strncpy(ls->vg_name, ss->name + strlen(LVM_LS_PREFIX), MAX_NAME); + list_add_tail(&ls->list, ls_rejoin); + + ss++; + } + + free(ss_all); + return 0; +} + +int lm_is_running_sanlock(void) +{ + uint32_t daemon_version; + uint32_t daemon_proto; + int rv; + + rv = sanlock_version(0, &daemon_version, &daemon_proto); + if (rv < 0) + return 0; + return 1; +} + diff --git a/include/.symlinks.in b/include/.symlinks.in index d6a95fd3d..dc4456a3b 100644 --- a/include/.symlinks.in +++ b/include/.symlinks.in @@ -3,11 +3,13 @@ @top_srcdir@/daemons/lvmetad/lvmetad-client.h @top_srcdir@/daemons/lvmpolld/lvmpolld-protocol.h @top_srcdir@/daemons/lvmpolld/polling_ops.h +@top_srcdir@/daemons/lvmlockd/lvmlockd-client.h @top_srcdir@/liblvm/lvm2app.h @top_srcdir@/lib/activate/activate.h @top_srcdir@/lib/activate/targets.h @top_srcdir@/lib/cache/lvmcache.h @top_srcdir@/lib/cache/lvmetad.h +@top_srcdir@/lib/locking/lvmlockd.h @top_srcdir@/lib/commands/toolcontext.h @top_srcdir@/lib/config/config.h @top_srcdir@/lib/config/config_settings.h diff --git a/lib/Makefile.in b/lib/Makefile.in index 4a282eb8d..4380ae5bd 100644 --- a/lib/Makefile.in +++ b/lib/Makefile.in @@ -201,6 +201,11 @@ ifeq ("@BUILD_LVMPOLLD@", "yes") lvmpolld/lvmpolld-client.c 
endif +ifeq ("@BUILD_LVMLOCKD@", "yes") + SOURCES +=\ + locking/lvmlockd.c +endif + ifeq ("@DMEVENTD@", "yes") CLDFLAGS += -L$(top_builddir)/daemons/dmeventd LIBS += -ldevmapper-event diff --git a/lib/commands/toolcontext.c b/lib/commands/toolcontext.c index 47aa209d5..2a1e0335d 100644 --- a/lib/commands/toolcontext.c +++ b/lib/commands/toolcontext.c @@ -29,6 +29,7 @@ #include "segtype.h" #include "lvmcache.h" #include "lvmetad.h" +#include "lvmlockd.h" #include "archiver.h" #include "lvmpolld-client.h" @@ -479,7 +480,10 @@ static int _process_config(struct cmd_context *cmd) int64_t pv_min_kb; const char *lvmetad_socket; const char *lvmpolld_socket; + const char *lvmlockd_socket; int udev_disabled = 0; + int locking_type; + int use_lvmlockd; char sysfs_dir[PATH_MAX]; if (!_check_config(cmd)) @@ -656,6 +660,33 @@ static int _process_config(struct cmd_context *cmd) lvmpolld_set_active(find_config_tree_bool(cmd, global_use_lvmpolld_CFG, NULL)); + /* + * clvmd and lvmlockd cannot be used concurrently, it is + * one or the other. + * global/locking_type=3 is the clvmd configuration. + * global/use_lvmlockd=1 is the lvmlockd configuration. + * + * use_lvmlockd should be combined with locking_type 1 (local). 
+ */ + + locking_type = find_config_tree_int(cmd, global_locking_type_CFG, NULL); + use_lvmlockd = find_config_tree_bool(cmd, global_use_lvmlockd_CFG, NULL); + + if (locking_type == 3 && use_lvmlockd) { + log_error("ERROR: configuration setting use_lvmlockd cannot be used with locking_type 3."); + return 0; + } + + lvmlockd_disconnect(); /* start over when tool context is refreshed */ + lvmlockd_socket = getenv("LVM_LVMLOCKD_SOCKET"); + if (!lvmlockd_socket) + lvmlockd_socket = DEFAULT_RUN_DIR "/lvmlockd.socket"; + + lvmlockd_set_socket(lvmlockd_socket); + lvmlockd_set_use(use_lvmlockd); + if (use_lvmlockd) + lvmlockd_init(cmd); + return 1; } @@ -2075,6 +2106,7 @@ void destroy_toolcontext(struct cmd_context *cmd) #endif dm_free(cmd); + lvmlockd_disconnect(); lvmetad_release_token(); lvmetad_disconnect(); lvmpolld_disconnect(); diff --git a/lib/commands/toolcontext.h b/lib/commands/toolcontext.h index 66bea5fc9..26fbc55ca 100644 --- a/lib/commands/toolcontext.h +++ b/lib/commands/toolcontext.h @@ -100,6 +100,9 @@ struct cmd_context { unsigned include_foreign_vgs:1; unsigned include_active_foreign_vgs:1; unsigned error_foreign_vgs:1; + unsigned lockd_vg_disable:1; + unsigned lockd_lv_disable:1; + unsigned lockd_vg_default_sh:1; struct dev_types *dev_types; @@ -142,6 +145,11 @@ struct cmd_context { const char *report_list_item_separator; int hosttags; + /* Locking */ + const char *lock_gl_mode; /* gl mode, from --lock-gl */ + const char *lock_vg_mode; /* vg mode, from --lock-vg */ + const char *lock_lv_mode; /* lv mode, from --lock-lv */ + const char *lib_dir; /* Cache value global/library_dir */ char system_dir[PATH_MAX]; char dev_dir[PATH_MAX]; diff --git a/lib/config/config_settings.h b/lib/config/config_settings.h index 2c5e2f8c9..6c6a34fb5 100644 --- a/lib/config/config_settings.h +++ b/lib/config/config_settings.h @@ -831,6 +831,27 @@ cfg(global_use_lvmetad_CFG, "use_lvmetad", global_CFG_SECTION, 0, CFG_TYPE_BOOL, "LVM prints warnings and ignores lvmetad if this 
combination\n" "is seen.\n") +cfg(global_use_lvmlockd_CFG, "use_lvmlockd", global_CFG_SECTION, 0, CFG_TYPE_BOOL, 0, vsn(2, 2, 120), NULL, 0, NULL, + "Use lvmlockd for locking among hosts using LVM on shared storage.\n") + +cfg(global_lock_retries_CFG, "lock_retries", global_CFG_SECTION, CFG_DEFAULT_COMMENTED, CFG_TYPE_INT, DEFAULT_LOCK_RETRIES, vsn(2, 2, 120), NULL, 0, NULL, + "Retry lvmlockd lock requests this many times.\n") + +cfg(global_sanlock_lv_extend_CFG, "sanlock_lv_extend", global_CFG_SECTION, CFG_DEFAULT_COMMENTED, CFG_TYPE_INT, DEFAULT_SANLOCK_LV_EXTEND_MB, vsn(2, 2, 120), NULL, 0, NULL, + "Size in MiB to extend the internal LV holding sanlock locks.\n" + "The internal LV holds locks for each LV in the VG, and after\n" + "enough LVs have been created, the internal LV needs to be extended.\n" + "lvcreate will automatically extend the internal LV when needed by\n" + "the amount specified here. Setting this to 0 disables the\n" + "automatic extension and can cause lvcreate to fail.\n") + +cfg(global_allow_override_lock_modes_CFG, "allow_override_lock_modes", global_CFG_SECTION, 0, CFG_TYPE_BOOL, 0, vsn(2, 2, 120), NULL, 0, NULL, + "Allow command options to override normal locking.\n") + +cfg(global_read_only_lock_modes_CFG, "read_only_lock_modes", global_CFG_SECTION, 0, CFG_TYPE_BOOL, 0, vsn(2, 2, 120), NULL, 0, NULL, + "Limit commands to actions that use read locks.\n" + "This disallows any actions that require a write (exclusive) lock.\n") + cfg(global_thin_check_executable_CFG, "thin_check_executable", global_CFG_SECTION, CFG_ALLOW_EMPTY | CFG_DEFAULT_COMMENTED, CFG_TYPE_STRING, THIN_CHECK_CMD, vsn(2, 2, 94), "@THIN_CHECK_CMD@", 0, NULL, "The full path to the thin_check command.\n" "LVM uses this command to check that a thin metadata\n" @@ -1256,6 +1277,14 @@ cfg(activation_mode_CFG, "activation_mode", activation_CFG_SECTION, 0, CFG_TYPE_ "sometimes assist with data recovery.\n" "The '--activationmode' option overrides this setting.\n") 
+cfg_array(activation_lock_start_list_CFG, "lock_start_list", activation_CFG_SECTION, CFG_ALLOW_EMPTY|CFG_DEFAULT_UNDEFINED, CFG_TYPE_STRING, NULL, vsn(2, 2, 120), NULL, 0, NULL, + "Locking is started only for VGs selected by this list.\n" + "The rules are the same as those for LVs in volume_list.\n") + +cfg_array(activation_auto_lock_start_list_CFG, "auto_lock_start_list", activation_CFG_SECTION, CFG_ALLOW_EMPTY|CFG_DEFAULT_UNDEFINED, CFG_TYPE_STRING, NULL, vsn(2, 2, 120), NULL, 0, NULL, + "Locking is auto-started only for VGs selected by this list.\n" + "The rules are the same as those for LVs in auto_activation_volume_list.\n") + cfg(metadata_pvmetadatacopies_CFG, "pvmetadatacopies", metadata_CFG_SECTION, CFG_ADVANCED | CFG_DEFAULT_COMMENTED, CFG_TYPE_INT, DEFAULT_PVMETADATACOPIES, vsn(1, 0, 0), NULL, 0, NULL, "Number of copies of metadata to store on each PV.\n" "Possible options are: 0, 1, 2.\n" @@ -1507,4 +1536,9 @@ cfg_array(local_extra_system_ids_CFG, "extra_system_ids", local_CFG_SECTION, CFG "Use this only after consulting 'man lvmsystemid'\n" "to be certain of correct usage and possible dangers.\n") +cfg(local_host_id_CFG, "host_id", local_CFG_SECTION, CFG_DEFAULT_COMMENTED, CFG_TYPE_INT, 0, vsn(2, 2, 120), NULL, 0, NULL, + "The lvmlockd sanlock host_id.\n" + "This must be a unique among all hosts,\n" + "and must be between 1 and 2000.\n") + cfg(CFG_COUNT, NULL, root_CFG_SECTION, 0, CFG_TYPE_INT, 0, vsn(0, 0, 0), NULL, 0, NULL, NULL) diff --git a/lib/config/defaults.h b/lib/config/defaults.h index d764ec9a3..e581ca981 100644 --- a/lib/config/defaults.h +++ b/lib/config/defaults.h @@ -51,11 +51,14 @@ #define DEFAULT_FALLBACK_TO_LOCAL_LOCKING 1 #define DEFAULT_FALLBACK_TO_CLUSTERED_LOCKING 1 #define DEFAULT_WAIT_FOR_LOCKS 1 +#define DEFAULT_LOCK_RETRIES 3 #define DEFAULT_PRIORITISE_WRITE_LOCKS 1 #define DEFAULT_USE_MLOCKALL 0 #define DEFAULT_METADATA_READ_ONLY 0 #define DEFAULT_LVDISPLAY_SHOWS_FULL_DEVICE_PATH 0 +#define DEFAULT_SANLOCK_LV_EXTEND_MB 256 + 
#define DEFAULT_MIRRORLOG MIRROR_LOG_DISK #define DEFAULT_MIRROR_LOG_FAULT_POLICY "allocate" #define DEFAULT_MIRROR_IMAGE_FAULT_POLICY "remove" @@ -226,4 +229,6 @@ #define DEFAULT_THIN_POOL_AUTOEXTEND_THRESHOLD 100 #define DEFAULT_THIN_POOL_AUTOEXTEND_PERCENT 20 +#define DEFAULT_CY_LOCK_TYPE "sanlock" + #endif /* _LVM_DEFAULTS_H */ diff --git a/lib/display/display.c b/lib/display/display.c index 8f075d910..669f11d6f 100644 --- a/lib/display/display.c +++ b/lib/display/display.c @@ -86,6 +86,38 @@ alloc_policy_t get_alloc_from_string(const char *str) return ALLOC_INVALID; } +const char *get_lock_type_string(lock_type_t lock_type) +{ + switch (lock_type) { + case LOCK_TYPE_INVALID: + return "invalid"; + case LOCK_TYPE_NONE: + return "none"; + case LOCK_TYPE_CLVM: + return "clvm"; + case LOCK_TYPE_DLM: + return "dlm"; + case LOCK_TYPE_SANLOCK: + return "sanlock"; + } + return "invalid"; +} + +lock_type_t get_lock_type_from_string(const char *str) +{ + if (!str) + return LOCK_TYPE_NONE; + if (!strcmp(str, "none")) + return LOCK_TYPE_NONE; + if (!strcmp(str, "clvm")) + return LOCK_TYPE_CLVM; + if (!strcmp(str, "dlm")) + return LOCK_TYPE_DLM; + if (!strcmp(str, "sanlock")) + return LOCK_TYPE_SANLOCK; + return LOCK_TYPE_INVALID; +} + static const char *_percent_types[7] = { "NONE", "VG", "FREE", "LV", "PVS", "ORIGIN" }; const char *get_percent_string(percent_type_t def) diff --git a/lib/display/display.h b/lib/display/display.h index cc5654b61..f4e766c09 100644 --- a/lib/display/display.h +++ b/lib/display/display.h @@ -64,6 +64,9 @@ const char *get_alloc_string(alloc_policy_t alloc); char alloc_policy_char(alloc_policy_t alloc); alloc_policy_t get_alloc_from_string(const char *str); +const char *get_lock_type_string(lock_type_t lock_type); +lock_type_t get_lock_type_from_string(const char *str); + const char *get_percent_string(percent_type_t def); char yes_no_prompt(const char *prompt, ...) 
__attribute__ ((format(printf, 1, 2))); diff --git a/lib/format_text/export.c b/lib/format_text/export.c index 018772eb8..a8fc2f4a2 100644 --- a/lib/format_text/export.c +++ b/lib/format_text/export.c @@ -430,8 +430,11 @@ static int _print_vg(struct formatter *f, struct volume_group *vg) else if (vg->lvm1_system_id && *vg->lvm1_system_id) outf(f, "system_id = \"%s\"", vg->lvm1_system_id); - if (vg->lock_type) + if (vg->lock_type) { outf(f, "lock_type = \"%s\"", vg->lock_type); + if (vg->lock_args) + outf(f, "lock_args = \"%s\"", vg->lock_args); + } outsize(f, (uint64_t) vg->extent_size, "extent_size = %u", vg->extent_size); @@ -657,6 +660,9 @@ static int _print_lv(struct formatter *f, struct logical_volume *lv) lv->timestamp); } + if (lv->lock_args) + outf(f, "lock_args = \"%s\"", lv->lock_args); + if (lv->alloc != ALLOC_INHERIT) outf(f, "allocation_policy = \"%s\"", get_alloc_string(lv->alloc)); diff --git a/lib/format_text/flags.c b/lib/format_text/flags.c index bbd47c307..717ef65ab 100644 --- a/lib/format_text/flags.c +++ b/lib/format_text/flags.c @@ -67,6 +67,7 @@ static const struct flag _lv_flags[] = { {LV_NOSCAN, NULL, 0}, {LV_TEMPORARY, NULL, 0}, {POOL_METADATA_SPARE, NULL, 0}, + {LOCKD_SANLOCK_LV, NULL, 0}, {RAID, NULL, 0}, {RAID_META, NULL, 0}, {RAID_IMAGE, NULL, 0}, diff --git a/lib/format_text/import_vsn1.c b/lib/format_text/import_vsn1.c index 048c8fe63..2cd75153a 100644 --- a/lib/format_text/import_vsn1.c +++ b/lib/format_text/import_vsn1.c @@ -20,6 +20,7 @@ #include "toolcontext.h" #include "lvmcache.h" #include "lvmetad.h" +#include "lvmlockd.h" #include "lv_alloc.h" #include "pv_alloc.h" #include "segtype.h" @@ -578,6 +579,11 @@ static int _read_lvnames(struct format_instance *fid __attribute__((unused)), return 0; } + if (dm_config_get_str(lvn, "lock_args", &str)) { + if (!(lv->lock_args = dm_pool_strdup(mem, str))) + return_0; + } + lv->alloc = ALLOC_INHERIT; if (dm_config_get_str(lvn, "allocation_policy", &str)) { lv->alloc = 
get_alloc_from_string(str); @@ -643,6 +649,12 @@ static int _read_lvnames(struct format_instance *fid __attribute__((unused)), vg->pool_metadata_spare_lv = lv; } + if (!lv_is_visible(lv) && !strcmp(lv->name, LOCKD_SANLOCK_LV_NAME)) { + log_debug_metadata("Logical volume %s is sanlock lv.", lv->name); + lv->status |= LOCKD_SANLOCK_LV; + vg->sanlock_lv = lv; + } + return 1; } @@ -795,6 +807,11 @@ static struct volume_group *_read_vg(struct format_instance *fid, goto bad; } + if (dm_config_get_str(vgn, "lock_args", &str)) { + if (!(vg->lock_args = dm_pool_strdup(vg->vgmem, str))) + goto bad; + } + if (!_read_id(&vg->id, vgn, "id")) { log_error("Couldn't read uuid for volume group %s.", vg->name); goto bad; diff --git a/lib/locking/lvmlockd.c b/lib/locking/lvmlockd.c new file mode 100644 index 000000000..00448cf9e --- /dev/null +++ b/lib/locking/lvmlockd.c @@ -0,0 +1,2426 @@ +/* + * Copyright (C) 2014 Red Hat, Inc. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU Lesser General Public License v.2.1. 
+ */ + +#include "lib.h" +#include "toolcontext.h" +#include "metadata.h" +#include "segtype.h" +#include "activate.h" +#include "lvmetad.h" +#include "lvmlockd.h" +#include "lvmcache.h" +#include "lvmlockd-client.h" + +static daemon_handle _lvmlockd; +static const char *_lvmlockd_socket = NULL; +static struct cmd_context *_lvmlockd_cmd = NULL; +static int _use_lvmlockd; /* is 1 if command is configured to use lvmlockd */ +static int _lvmlockd_connected; /* is 1 if command is connected to lvmlockd */ +static int _lvmlockd_init_failed; /* used to suppress further warnings */ + +void lvmlockd_set_socket(const char *sock) +{ + _lvmlockd_socket = sock; +} + +/* + * Set directly from global/use_lvmlockd + */ +void lvmlockd_set_use(int use) +{ + _use_lvmlockd = use; +} + +/* + * Returns the value of global/use_lvmlockd being used by the command. + */ +int lvmlockd_use(void) +{ + return _use_lvmlockd; +} + +/* + * The command continues even if init and/or connect fail, + * because the command is allowed to use local VGs without lvmlockd, + * and is allowed to read lockd VGs without locks from lvmlockd. + */ +void lvmlockd_init(struct cmd_context *cmd) +{ + if (!_use_lvmlockd) { + /* Should never happen, don't call init when not using lvmlockd. */ + log_error("Should not initialize lvmlockd with use_lvmlockd=0."); + } + + if (!_lvmlockd_socket) { + log_warn("WARNING: lvmlockd socket location is not configured."); + _lvmlockd_init_failed = 1; + } + + if (!!access(LVMLOCKD_PIDFILE, F_OK)) { + log_warn("WARNING: lvmlockd process is not running."); + _lvmlockd_init_failed = 1; + } else { + _lvmlockd_init_failed = 0; + } + + _lvmlockd_cmd = cmd; +} + +void lvmlockd_connect(void) +{ + if (!_use_lvmlockd) { + /* Should never happen, don't call connect when not using lvmlockd. */ + log_error("Should not connect to lvmlockd with use_lvmlockd=0."); + } + + if (_lvmlockd_connected) { + /* Should never happen, only call connect once. 
*/ + log_error("lvmlockd is already connected."); + } + + if (_lvmlockd_init_failed) + return; + + _lvmlockd = lvmlockd_open(_lvmlockd_socket); + + if (_lvmlockd.socket_fd >= 0 && !_lvmlockd.error) { + log_debug("Successfully connected to lvmlockd on fd %d.", _lvmlockd.socket_fd); + _lvmlockd_connected = 1; + } else { + log_warn("WARNING: lvmlockd connect failed."); + } +} + +void lvmlockd_disconnect(void) +{ + if (_lvmlockd_connected) + daemon_close(_lvmlockd); + _lvmlockd_connected = 0; + _lvmlockd_cmd = NULL; +} + +/* Translate the result strings from lvmlockd to bit flags. */ +static void _flags_str_to_lockd_flags(const char *flags_str, uint32_t *lockd_flags) +{ + if (strstr(flags_str, "NO_LOCKSPACES")) + *lockd_flags |= LD_RF_NO_LOCKSPACES; + + if (strstr(flags_str, "NO_GL_LS")) + *lockd_flags |= LD_RF_NO_GL_LS; + + if (strstr(flags_str, "LOCAL_LS")) + *lockd_flags |= LD_RF_LOCAL_LS; + + if (strstr(flags_str, "DUP_GL_LS")) + *lockd_flags |= LD_RF_DUP_GL_LS; + + if (strstr(flags_str, "INACTIVE_LS")) + *lockd_flags |= LD_RF_INACTIVE_LS; + + if (strstr(flags_str, "ADD_LS_ERROR")) + *lockd_flags |= LD_RF_ADD_LS_ERROR; +} + +/* + * evaluate the reply from lvmlockd, check for errors, extract + * the result and lockd_flags returned by lvmlockd. + * 0 failure (no result/lockd_flags set) + * 1 success (result/lockd_flags set) + */ + +/* + * This is an arbitrary number that we know lvmlockd + * will not return. daemon_reply_int reverts to this + * value if it finds no result value. 
+ */ +#define NO_LOCKD_RESULT -1000 + +static int _lockd_result(daemon_reply reply, int *result, uint32_t *lockd_flags) +{ + int reply_result; + const char *flags_str = NULL; + const char *lock_type = NULL; + + if (reply.error) { + log_error("lockd_result reply error %d", reply.error); + return 0; + } + + if (strcmp(daemon_reply_str(reply, "response", ""), "OK")) { + log_error("lockd_result bad response"); + return 0; + } + + reply_result = daemon_reply_int(reply, "op_result", NO_LOCKD_RESULT); + if (reply_result == NO_LOCKD_RESULT) { + log_error("lockd_result no op_result"); + return 0; + } + + /* The lock_type that lvmlockd used for locking. */ + lock_type = daemon_reply_str(reply, "lock_type", "none"); + + *result = reply_result; + + if (lockd_flags) { + if ((flags_str = daemon_reply_str(reply, "result_flags", NULL))) + _flags_str_to_lockd_flags(flags_str, lockd_flags); + } + + log_debug("lockd_result %d flags %s lm %s", reply_result, + flags_str ? flags_str : "none", lock_type); + return 1; +} + +static daemon_reply _lockd_send(const char *req_name, ...) +{ + va_list ap; + daemon_reply repl; + daemon_request req; + + req = daemon_request_make(req_name); + + va_start(ap, req_name); + daemon_request_extend_v(req, ap); + va_end(ap); + + repl = daemon_send(_lvmlockd, req); + + daemon_request_destroy(req); + + return repl; +} + +/* + * result/lockd_flags are values returned from lvmlockd. + * + * return 0 (failure) + * return 1 (result/lockd_flags indicate success/failure) + * + * return 1 result 0 (success) + * return 1 result < 0 (failure) + * + * caller may ignore result < 0 failure depending on + * lockd_flags and the specific command/mode. + * + * When this function returns 0 (failure), no result/lockd_flags + * were obtained from lvmlockd. + * + * When this function returns 1 (success), result/lockd_flags may + * have been obtained from lvmlockd. This lvmlockd result may + * indicate a locking failure. 
+ */ + +static int _lockd_request(struct cmd_context *cmd, + const char *req_name, + const char *vg_name, + const char *vg_lock_type, + const char *vg_lock_args, + const char *lv_name, + const char *lv_uuid, + const char *lv_lock_args, + const char *mode, + const char *opts, + int *result, + uint32_t *lockd_flags) +{ + const char *cmd_name = get_cmd_name(); + daemon_reply reply; + int pid = getpid(); + + *result = 0; + *lockd_flags = 0; + + if (!strcmp(mode, "na")) + return 1; + + if (!_use_lvmlockd) + return 1; + if (!_lvmlockd_connected) + return 0; + + /* cmd and pid are passed for informational and debugging purposes */ + + if (!cmd_name || !cmd_name[0]) + cmd_name = "none"; + + if (vg_name && lv_name) { + reply = _lockd_send(req_name, + "cmd = %s", cmd_name, + "pid = %d", pid, + "mode = %s", mode, + "opts = %s", opts ?: "none", + "vg_name = %s", vg_name, + "lv_name = %s", lv_name, + "lv_uuid = %s", lv_uuid, + "vg_lock_type = %s", vg_lock_type ?: "none", + "vg_lock_args = %s", vg_lock_args ?: "none", + "lv_lock_args = %s", lv_lock_args ?: "none", + NULL); + + if (!_lockd_result(reply, result, lockd_flags)) + goto fail; + + log_debug("lvmlockd %s %s vg %s lv %s result %d %x", + req_name, mode, vg_name, lv_name, *result, *lockd_flags); + + } else if (vg_name) { + reply = _lockd_send(req_name, + "cmd = %s", cmd_name, + "pid = %d", pid, + "mode = %s", mode, + "opts = %s", opts ?: "none", + "vg_name = %s", vg_name, + "vg_lock_type = %s", vg_lock_type ?: "none", + "vg_lock_args = %s", vg_lock_args ?: "none", + NULL); + + if (!_lockd_result(reply, result, lockd_flags)) + goto fail; + + log_debug("lvmlockd %s %s vg %s result %d %x", + req_name, mode, vg_name, *result, *lockd_flags); + + } else { + reply = _lockd_send(req_name, + "cmd = %s", cmd_name, + "pid = %d", pid, + "mode = %s", mode, + "opts = %s", opts ?: "none", + "vg_lock_type = %s", vg_lock_type ?: "none", + NULL); + + if (!_lockd_result(reply, result, lockd_flags)) + goto fail; + + log_debug("lvmlockd %s %s 
result %d %x", + req_name, mode, *result, *lockd_flags); + } + + daemon_reply_destroy(reply); + + /* result/lockd_flags have lvmlockd result */ + return 1; + + fail: + /* no result was obtained from lvmlockd */ + + log_error("lvmlockd %s %s failed no result", req_name, mode); + + daemon_reply_destroy(reply); + return 0; +} + +/* + * Eventually add an option to specify which pv the lvmlock lv should be placed on. + */ + +static int _create_sanlock_lv(struct cmd_context *cmd, struct volume_group *vg, + const char *lock_lv_name, int extend_mb) +{ + struct logical_volume *lv; + struct lvcreate_params lp = { + .activate = CHANGE_ALY, + .alloc = ALLOC_INHERIT, + .extents = (extend_mb * 1024 * 1024) / (vg->extent_size * SECTOR_SIZE), + .major = -1, + .minor = -1, + .permission = LVM_READ | LVM_WRITE, + .pvh = &vg->pvs, + .read_ahead = DM_READ_AHEAD_NONE, + .stripes = 1, + .vg_name = vg->name, + .lv_name = dm_pool_strdup(cmd->mem, lock_lv_name), + .zero = 1, + }; + + dm_list_init(&lp.tags); + + if (!(lp.segtype = get_segtype_from_string(vg->cmd, "striped"))) + return_0; + + lv = lv_create_single(vg, &lp); + if (!lv) { + log_error("Failed to create sanlock lv %s in vg %s", lock_lv_name, vg->name); + return 0; + } + + vg->sanlock_lv = lv; + + return 1; +} + +static int _remove_sanlock_lv(struct cmd_context *cmd, struct volume_group *vg) +{ + if (!lv_remove(vg->sanlock_lv)) { + log_error("Failed to remove sanlock LV %s/%s", vg->name, vg->sanlock_lv->name); + return 0; + } + + return 1; +} + +static int _extend_sanlock_lv(struct cmd_context *cmd, struct volume_group *vg, int extend_mb) +{ + struct logical_volume *lv = vg->sanlock_lv; + struct lvresize_params lp = { + .lv_name = vg->sanlock_lv->name, + .sign = SIGN_NONE, + .percent = PERCENT_NONE, + .resize = LV_EXTEND, + .ac_force = 1, + .sizeargs = 1, + }; + + lp.size = lv->size + ((extend_mb * 1024 * 1024) / SECTOR_SIZE); + + if (!lv_resize_prepare(cmd, lv, &lp, &vg->pvs) || + !lv_resize(cmd, lv, &lp, &vg->pvs)) { + 
log_error("Extend LV %s/%s to size %llu failed.", + vg->name, lv->name, (unsigned long long)lp.size); + return 0; + } + + return 1; +} + +/* When one host does _extend_sanlock_lv, the others need to refresh the size. */ + +static int _refresh_sanlock_lv(struct cmd_context *cmd, struct volume_group *vg) +{ + if (!lv_refresh_suspend_resume(cmd, vg->sanlock_lv)) { + log_error("Failed to refresh %s.", vg->sanlock_lv->name); + return 0; + } + + return 1; +} + +/* + * Called at the beginning of lvcreate in a sanlock VG to ensure + * that there is space in the sanlock LV for a new lock. If it's + * full, then this extends it. + */ + +int handle_sanlock_lv(struct cmd_context *cmd, struct volume_group *vg) +{ + daemon_reply reply; + int extend_mb; + int result; + int ret; + + if (!_use_lvmlockd) + return 1; + if (!_lvmlockd_connected) + return 0; + + extend_mb = find_config_tree_int(cmd, global_sanlock_lv_extend_CFG, NULL); + + /* + * User can choose to not automatically extend the lvmlock LV + * so they can manually extend it. + */ + if (!extend_mb) + return 1; + + /* + * Another host may have extended the lvmlock LV already. + * Refresh so that we'll find the new space they added + * when we search for new space. + */ + if (!_refresh_sanlock_lv(cmd, vg)) + return 0; + + /* + * Ask lvmlockd/sanlock to look for an unused lock. + */ + reply = _lockd_send("find_free_lock", + "pid = %d", getpid(), + "vg_name = %s", vg->name, + NULL); + + if (!_lockd_result(reply, &result, NULL)) { + ret = 0; + } else { + ret = (result < 0) ? 0 : 1; + } + + /* No space on the lvmlock lv for a new lease. 
*/ + if (result == -EMSGSIZE) + ret = _extend_sanlock_lv(cmd, vg, extend_mb); + + daemon_reply_destroy(reply); + + return ret; +} + +static int _activate_sanlock_lv(struct cmd_context *cmd, struct volume_group *vg) +{ + if (!activate_lv(cmd, vg->sanlock_lv)) { + log_error("Failed to activate sanlock lv %s/%s", vg->name, vg->sanlock_lv->name); + return 0; + } + + return 1; +} + +static int _deactivate_sanlock_lv(struct cmd_context *cmd, struct volume_group *vg) +{ + if (!deactivate_lv(cmd, vg->sanlock_lv)) { + log_error("Failed to deactivate sanlock lv %s/%s", vg->name, vg->sanlock_lv->name); + return 0; + } + + return 1; +} + +static int _init_vg_dlm(struct cmd_context *cmd, struct volume_group *vg) +{ + daemon_reply reply; + const char *reply_str; + const char *vg_lock_args = NULL; + int result; + int ret; + + if (!_use_lvmlockd) + return 1; + if (!_lvmlockd_connected) + return 0; + + reply = _lockd_send("init_vg", + "pid = %d", getpid(), + "vg_name = %s", vg->name, + "vg_lock_type = %s", "dlm", + NULL); + + if (!_lockd_result(reply, &result, NULL)) { + ret = 0; + result = -ELOCKD; + } else { + ret = (result < 0) ? 
0 : 1; + } + + switch (result) { + case 0: + break; + case -ELOCKD: + log_error("VG %s init failed: lvmlockd not available", vg->name); + break; + case -EARGS: + log_error("VG %s init failed: invalid parameters for dlm", vg->name); + break; + case -EMANAGER: + log_error("VG %s init failed: lock manager dlm is not running", vg->name); + break; + default: + log_error("VG %s init failed: %d", vg->name, result); + } + + if (!ret) + goto out; + + if (!(reply_str = daemon_reply_str(reply, "vg_lock_args", NULL))) { + log_error("VG %s init failed: lock_args not returned", vg->name); + ret = 0; + goto out; + } + + if (!(vg_lock_args = dm_pool_strdup(cmd->mem, reply_str))) { + log_error("VG %s init failed: lock_args alloc failed", vg->name); + ret = 0; + goto out; + } + + vg->lock_type = "dlm"; + vg->lock_args = vg_lock_args; + + if (!vg_write(vg) || !vg_commit(vg)) { + log_error("VG %s init failed: vg_write vg_commit", vg->name); + ret = 0; + goto out; + } + + ret = 1; +out: + daemon_reply_destroy(reply); + return ret; +} + +static int _init_vg_sanlock(struct cmd_context *cmd, struct volume_group *vg) +{ + daemon_reply reply; + const char *reply_str; + const char *vg_lock_args = NULL; + const char *opts = NULL; + int extend_mb; + int result; + int ret; + + if (!_use_lvmlockd) + return 1; + if (!_lvmlockd_connected) + return 0; + + /* + * Automatic extension of the sanlock lv is disabled by + * setting sanlock_lv_extend to 0. Zero won't work as + * an initial size, so in this case, use the default as + * the initial size. + */ + if (!(extend_mb = find_config_tree_int(cmd, global_sanlock_lv_extend_CFG, NULL))) + extend_mb = DEFAULT_SANLOCK_LV_EXTEND_MB; + + /* + * Creating the sanlock LV writes the VG containing the new lvmlock + * LV, then activates the lvmlock LV. The lvmlock LV must be active + * before we ask lvmlockd to initialize the VG because sanlock needs + * to initialize leases on the lvmlock LV. 
+ */ + if (!_create_sanlock_lv(cmd, vg, LOCKD_SANLOCK_LV_NAME, extend_mb)) { + log_error("Failed to create internal lv."); + return 0; + } + + /* + * N.B. this passes the sanlock lv name as vg_lock_args + * even though it is only part of the final args string + * which will be returned from lvmlockd. + */ + + reply = _lockd_send("init_vg", + "pid = %d", getpid(), + "vg_name = %s", vg->name, + "vg_lock_type = %s", "sanlock", + "vg_lock_args = %s", vg->sanlock_lv->name, + "opts = %s", opts ?: "none", + NULL); + + if (!_lockd_result(reply, &result, NULL)) { + ret = 0; + result = -ELOCKD; + } else { + ret = (result < 0) ? 0 : 1; + } + + switch (result) { + case 0: + break; + case -ELOCKD: + log_error("VG %s init failed: lvmlockd not available", vg->name); + break; + case -EARGS: + log_error("VG %s init failed: invalid parameters for sanlock", vg->name); + break; + case -EMANAGER: + log_error("VG %s init failed: lock manager sanlock is not running", vg->name); + break; + case -EMSGSIZE: + log_error("VG %s init failed: no disk space for leases", vg->name); + break; + default: + log_error("VG %s init failed: %d", vg->name, result); + } + + if (!ret) + goto out; + + if (!(reply_str = daemon_reply_str(reply, "vg_lock_args", NULL))) { + log_error("VG %s init failed: lock_args not returned", vg->name); + ret = 0; + goto out; + } + + if (!(vg_lock_args = dm_pool_strdup(cmd->mem, reply_str))) { + log_error("VG %s init failed: lock_args alloc failed", vg->name); + ret = 0; + goto out; + } + + lv_set_hidden(vg->sanlock_lv); + vg->sanlock_lv->status |= LOCKD_SANLOCK_LV; + + vg->lock_type = "sanlock"; + vg->lock_args = vg_lock_args; + + if (!vg_write(vg) || !vg_commit(vg)) { + log_error("VG %s init failed: vg_write vg_commit", vg->name); + ret = 0; + goto out; + } + + ret = 1; +out: + if (!ret) { + /* + * The usleep delay gives sanlock time to close the lock lv, + * and usually avoids having an annoying error printed. 
+ */ + usleep(1000000); + _deactivate_sanlock_lv(cmd, vg); + _remove_sanlock_lv(cmd, vg); + if (!vg_write(vg) || !vg_commit(vg)) + stack; + } + + daemon_reply_destroy(reply); + return ret; +} + +/* called after vg_remove on disk */ + +static int _free_vg_dlm(struct cmd_context *cmd, struct volume_group *vg) +{ + uint32_t lockd_flags; + int result; + int ret; + + /* + * Unlocking the vg lock here preempts the lvmlockd unlock in + * toollib.c which happens too late since the lockspace is + * left here. + */ + + /* Equivalent to a standard unlock. */ + ret = _lockd_request(cmd, "lock_vg", + vg->name, NULL, NULL, NULL, NULL, NULL, "un", NULL, + &result, &lockd_flags); + + if (!ret || result < 0) { + log_error("_free_vg_dlm lvmlockd result %d", result); + return 0; + } + + /* Leave the dlm lockspace. */ + lockd_stop_vg(cmd, vg); + + return 1; +} + +/* called before vg_remove on disk */ + +static int _free_vg_sanlock(struct cmd_context *cmd, struct volume_group *vg) +{ + daemon_reply reply; + int result; + int ret; + + if (!_use_lvmlockd) + return 1; + if (!_lvmlockd_connected) + return 0; + + if (!vg->lock_args || !strlen(vg->lock_args)) { + /* Shouldn't happen in general, but maybe in some error cases? */ + log_debug("_free_vg_sanlock %s no lock_args", vg->name); + return 1; + } + + reply = _lockd_send("free_vg", + "pid = %d", getpid(), + "vg_name = %s", vg->name, + "vg_lock_type = %s", vg->lock_type, + "vg_lock_args = %s", vg->lock_args, + NULL); + + if (!_lockd_result(reply, &result, NULL)) { + ret = 0; + } else { + ret = (result < 0) ? 0 : 1; + } + + /* + * Other hosts could still be joined to the lockspace, which means they + * are using the internal sanlock LV, which means we cannot remove the + * VG. Once other hosts stop using the VG it can be removed. 
+ */ + if (result == -EBUSY) { + log_error("Lockspace for \"%s\" not stopped on other hosts", vg->name); + goto out; + } + + if (!ret) { + log_error("_free_vg_sanlock lvmlockd result %d", result); + goto out; + } + + /* + * The usleep delay gives sanlock time to close the lock lv, + * and usually avoids having an annoying error printed. + */ + usleep(1000000); + + _deactivate_sanlock_lv(cmd, vg); + _remove_sanlock_lv(cmd, vg); + out: + daemon_reply_destroy(reply); + + return ret; +} + +/* vgcreate */ + +int lockd_init_vg(struct cmd_context *cmd, struct volume_group *vg, + const char *lock_type) +{ + switch (get_lock_type_from_string(lock_type)) { + case LOCK_TYPE_NONE: + case LOCK_TYPE_CLVM: + return 1; + case LOCK_TYPE_DLM: + return _init_vg_dlm(cmd, vg); + case LOCK_TYPE_SANLOCK: + return _init_vg_sanlock(cmd, vg); + default: + log_error("Unknown lock_type."); + return 0; + } +} + +/* vgremove before the vg is removed */ + +int lockd_free_vg_before(struct cmd_context *cmd, struct volume_group *vg) +{ + if (cmd->lock_vg_mode && !strcmp(cmd->lock_vg_mode, "na")) + return 1; + + switch (get_lock_type_from_string(vg->lock_type)) { + case LOCK_TYPE_NONE: + case LOCK_TYPE_CLVM: + case LOCK_TYPE_DLM: + return 1; + case LOCK_TYPE_SANLOCK: + /* returning an error will prevent vg_remove() */ + return _free_vg_sanlock(cmd, vg); + default: + log_error("Unknown lock_type."); + return 0; + } +} + +/* vgremove after the vg is removed */ + +void lockd_free_vg_final(struct cmd_context *cmd, struct volume_group *vg) +{ + if (cmd->lock_vg_mode && !strcmp(cmd->lock_vg_mode, "na")) + return; + + switch (get_lock_type_from_string(vg->lock_type)) { + case LOCK_TYPE_NONE: + case LOCK_TYPE_CLVM: + case LOCK_TYPE_SANLOCK: + break; + case LOCK_TYPE_DLM: + _free_vg_dlm(cmd, vg); + break; + default: + log_error("Unknown lock_type."); + } + + /* The vg lock no longer exists, so don't bother trying to unlock. */ + cmd->lockd_vg_disable = 1; +} + +/* + * Starting a vg involves: + * 1. 
reading the vg without a lock + * 2. getting the lock_type/lock_args from the vg metadata + * 3. doing start_vg in lvmlockd for the lock_type; + * this means joining the lockspace + * + * The vg read in step 1 should not be used for anything + * other than getting the lock_type/lock_args/uuid necessary + * for starting the lockspace. To use the vg after starting + * the lockspace, follow the standard method which is: + * lock the vg, read/use/write the vg, unlock the vg. + */ + +int lockd_start_vg(struct cmd_context *cmd, struct volume_group *vg) +{ + char uuid[64] __attribute__((aligned(8))); + daemon_reply reply; + int host_id = 0; + int result; + int ret; + + memset(uuid, 0, sizeof(uuid)); + + if (!_use_lvmlockd) + return 1; + + /* Skip starting the vg lockspace when the vg lock is skipped. */ + + if (cmd->lock_vg_mode && !strcmp(cmd->lock_vg_mode, "na")) + return 1; + + if (!is_lockd_type(vg->lock_type)) + return 1; + + if (!_lvmlockd_connected) { + log_error("VG %s start failed: lvmlockd not running", vg->name); + return 0; + } + + log_debug("lockd_start_vg %s lock_type %s", vg->name, + vg->lock_type ? vg->lock_type : "empty"); + + if (vg->lock_type && !strcmp(vg->lock_type, "sanlock")) { + /* + * This is the big difference between starting + * sanlock vgs vs starting dlm vgs: the internal + * sanlock lv needs to be activated before lvmlockd + * does the start because sanlock needs to use the lv + * to access locks. + */ + if (!_activate_sanlock_lv(cmd, vg)) + return 0; + + host_id = find_config_tree_int(cmd, local_host_id_CFG, NULL); + } + + id_write_format(&vg->id, uuid, sizeof(uuid)); + + reply = _lockd_send("start_vg", + "pid = %d", getpid(), + "vg_name = %s", vg->name, + "vg_lock_type = %s", vg->lock_type, + "vg_lock_args = %s", vg->lock_args ?: "none", + "vg_uuid = %s", uuid[0] ? 
uuid : "none", + "version = %d", (int64_t)vg->seqno, + "host_id = %d", host_id, + NULL); + + if (!_lockd_result(reply, &result, NULL)) { + ret = 0; + result = -ELOCKD; + } else { + ret = (result < 0) ? 0 : 1; + } + + switch (result) { + case 0: + log_print_unless_silent("VG %s starting %s lockspace", vg->name, vg->lock_type); + break; + case -ELOCKD: + log_error("VG %s start failed: lvmlockd not available", vg->name); + break; + case -EEXIST: + log_debug("VG %s start error: already started", vg->name); + ret = 1; + break; + case -EARGS: + log_error("VG %s start failed: invalid parameters for %s", vg->name, vg->lock_type); + break; + case -EHOSTID: + log_error("VG %s start failed: invalid sanlock host_id, set in lvmlocal.conf", vg->name); + break; + case -EMANAGER: + log_error("VG %s start failed: lock manager %s is not running", vg->name, vg->lock_type); + break; + default: + log_error("VG %s start failed: %d", vg->name, result); + } + + daemon_reply_destroy(reply); + + return ret; +} + +int lockd_stop_vg(struct cmd_context *cmd, struct volume_group *vg) +{ + daemon_reply reply; + int result; + int ret; + + if (!is_lockd_type(vg->lock_type)) + return 1; + + if (!_use_lvmlockd) + return 1; + if (!_lvmlockd_connected) + return 0; + + log_debug("lockd_stop_vg %s lock_type %s", vg->name, + vg->lock_type ? vg->lock_type : "empty"); + + reply = _lockd_send("stop_vg", + "pid = %d", getpid(), + "vg_name = %s", vg->name, + NULL); + + if (!_lockd_result(reply, &result, NULL)) { + ret = 0; + } else { + ret = (result < 0) ? 
0 : 1; + } + + if (result == -ENOLS) { + ret = 1; + goto out; + } + + if (result == -EBUSY) { + log_error("VG %s stop failed: LVs must first be deactivated", vg->name); + goto out; + } + + if (!ret) { + log_error("VG %s stop failed: %d", vg->name, result); + goto out; + } + + if (!strcmp(vg->lock_type, "sanlock")) { + log_debug("lockd_stop_vg deactivate sanlock lv"); + _deactivate_sanlock_lv(cmd, vg); + } +out: + daemon_reply_destroy(reply); + + return ret; +} + +int lockd_start_wait(struct cmd_context *cmd) +{ + daemon_reply reply; + int result; + int ret; + + if (!_use_lvmlockd) + return 1; + if (!_lvmlockd_connected) + return 0; + + reply = _lockd_send("start_wait", + "pid = %d", getpid(), + NULL); + + if (!_lockd_result(reply, &result, NULL)) { + ret = 0; + } else { + ret = (result < 0) ? 0 : 1; + } + + if (!ret) + log_error("Lock start failed"); + + /* + * FIXME: get a list of vgs that started so we can + * better report what worked and what didn't? + */ + + daemon_reply_destroy(reply); + + return ret; +} + +static int _mode_num(const char *mode) +{ + if (!strcmp(mode, "na")) + return -2; + if (!strcmp(mode, "un")) + return -1; + if (!strcmp(mode, "nl")) + return 0; + if (!strcmp(mode, "sh")) + return 1; + if (!strcmp(mode, "ex")) + return 2; + return -3; +} + +/* same rules as strcmp */ +static int _mode_compare(const char *m1, const char *m2) +{ + int n1 = _mode_num(m1); + int n2 = _mode_num(m2); + + if (n1 < n2) + return -1; + if (n1 == n2) + return 0; + if (n1 > n2) + return 1; + return -2; +} + +/* + * lockd_gl_create() is a variation of lockd_gl() used only by vgcreate. + * It handles the case that when using sanlock, the global lock does + * not exist until after the first vgcreate is complete, since the global + * lock exists on storage within an actual VG. So, the first vgcreate + * needs special logic to detect this bootstrap case. + * + * When the vgcreate is not creating the first VG, then lockd_gl_create() + * behaves the same as lockd_gl(). 
+ * + * vgcreate will have a lock_type for the new VG which lockd_gl_create() + * can provide in the lock-gl call. + * + * lockd_gl() and lockd_gl_create() differ in the specific cases where + * ENOLS (no lockspace found) is overriden. In the vgcreate case, the + * override cases are related to sanlock bootstrap, and the lock_type of + * the vg being created is needed. + * + * 1. vgcreate of the first lockd-type vg calls lockd_gl_create() + * to acquire the global lock. + * + * 2. vgcreate/lockd_gl_create passes gl lock request to lvmlockd, + * along with lock_type of the new vg. + * + * 3. lvmlockd finds no global lockspace/lock. + * + * 4. dlm: + * If the lock_type from vgcreate is dlm, lvmlockd creates the + * dlm global lockspace, and queues the global lock request + * for vgcreate. lockd_gl_create returns sucess with the gl held. + * + * sanlock: + * If the lock_type from vgcreate is sanlock, lvmlockd returns -ENOLS + * with the NO_GL_LS flag. lvmlockd cannot create or acquire a sanlock + * global lock until the VG exists on disk (the locks live within the VG). + * + * lockd_gl_create sees sanlock/ENOLS/NO_GL_LS (and optionally the + * "enable" lock-gl arg), determines that this is the sanlock + * bootstrap special case, and returns success without the global lock. + * + * vgcreate creates the VG on disk, and calls lockd_init_vg() which + * initializes/enables a global lock on the new VG's internal sanlock lv. + * Future lockd_gl/lockd_gl_create calls will acquire the existing gl. + */ + +int lockd_gl_create(struct cmd_context *cmd, const char *def_mode, const char *vg_lock_type) +{ + const char *mode = NULL; + uint32_t lockd_flags; + int retries = 0; + int result; + + /* A specific lock mode was given on the command line. 
*/ + if (cmd->lock_gl_mode) { + mode = cmd->lock_gl_mode; + if (mode && def_mode && strcmp(mode, "enable") && (_mode_compare(mode, def_mode) < 0)) { + if (!find_config_tree_bool(cmd, global_allow_override_lock_modes_CFG, NULL)) { + log_error("Disallowed lock-gl mode \"%s\"", mode); + return 0; + } else { + log_warn("WARNING: overriding default global lock mode."); + } + } + } + + if (!mode) + mode = def_mode; + if (!mode) { + log_error("Unknown lock-gl mode"); + return 0; + } + + if (!strcmp(mode, "ex") && find_config_tree_bool(cmd, global_read_only_lock_modes_CFG, NULL)) { + log_error("Exclusive global lock not allowed with read_only_lock_modes"); + return 0; + } + + req: + if (!_lockd_request(cmd, "lock_gl", + NULL, vg_lock_type, NULL, NULL, NULL, NULL, mode, NULL, + &result, &lockd_flags)) { + /* No result from lvmlockd, it is probably not running. */ + log_error("Global lock failed: check that lvmlockd is running."); + return 0; + } + + if (result == -EAGAIN) { + if (retries < find_config_tree_int(cmd, global_lock_retries_CFG, NULL)) { + log_warn("Retrying %s global lock", mode); + sleep(1); + retries++; + goto req; + } + } + + /* + * ENOLS: no lockspace was found with a global lock. + * It may not exist (perhaps this command is creating the first), + * or it may not be visible or started on the system yet. + */ + + if (result == -ENOLS) { + if (!strcmp(mode, "un")) + return 1; + + /* + * This is the explicit sanlock bootstrap condition for + * proceding without the global lock: a chicken/egg case + * for the first sanlock VG that is created. + * + * When creating the first sanlock VG, there is no global + * lock to acquire because the gl will exist in the VG + * being created. 
The "enable" option makes explicit that + * this is expected: + * + * vgcreate --lock-type sanlock --lock-gl enable + * + * There are three indications that this is the unique + * first-sanlock-vg bootstrap case: + * + * - result from lvmlockd is -ENOLS because lvmlockd found + * no lockspace for this VG; expected because it's being + * created here. + * + * - result flag LD_RF_NO_GL_LS from lvmlockd means that + * lvmlockd has seen no other lockspace with a global lock. + * This implies that this is probably the first sanlock vg + * to be created. If other sanlock vgs exist, the global + * lock should be available from one of them. + * + * - command line lock-gl arg is "enable" which means the + * user expects this to be the first sanlock vg, and the + * global lock should be enabled in it. + */ + + if ((lockd_flags & LD_RF_NO_GL_LS) && + !strcmp(vg_lock_type, "sanlock") && + !strcmp(mode, "enable")) { + log_print_unless_silent("Enabling sanlock global lock"); + lvmetad_validate_global_cache(cmd, 1); + return 1; + } + + /* + * This is an implicit sanlock bootstrap condition for + * proceeding without the global lock. The command line does + * not indicate explicitly that this is a bootstrap situation + * (via "enable"), but it seems likely to be because lvmlockd + * has seen no lockd-type vgs. It is possible that a global + * lock does exist in a vg that has not yet been seen. If that + * vg appears after this creates a new vg with a new enabled + * gl, then there will be two enabled global locks, and one + * will need to be disabled. (We could instead return an error + * here and insist with an error message that the --lock-gl + * enable option be used to exercise the explicit case above.) 
+ */ + + if ((lockd_flags & LD_RF_NO_GL_LS) && + (lockd_flags & LD_RF_NO_LOCKSPACES) && + !strcmp(vg_lock_type, "sanlock")) { + log_print_unless_silent("Enabling sanlock global lock"); + lvmetad_validate_global_cache(cmd, 1); + return 1; + } + + if (!strcmp(vg_lock_type, "sanlock")) + log_error("Global lock failed: check that VG holding global lock exists and is started."); + else + log_error("Global lock failed: check that global lockspace is started."); + return 0; + } + + /* + * Check for each specific error that can be returned so a helpful + * message can be printed for it. + */ + if (result < 0) { + if (result == -ESTARTING) + log_error("Global lock failed: lockspace is starting."); + else if (result -EAGAIN) + log_error("Global lock failed: held by other host."); + else + log_error("Global lock failed: error %d", result); + return 0; + } + + lvmetad_validate_global_cache(cmd, 1); + + return 1; +} + +/* + * The global lock protects: + * + * - The global VG namespace. Two VGs cannot have the same name. + * Used by any command that creates or removes a VG name, + * e.g. vgcreate, vgremove, vgrename, vgsplit, vgmerge. + * + * - The set of orphan PVs. + * Used by any command that changes a non-PV device into an orphan PV, + * an orphan PV into a device, a non-orphan PV (in a VG) into an orphan PV + * (not in a VG), or an orphan PV into a non-orphan PV, + * e.g. pvcreate, pvremove, vgcreate, vgremove, vgextend, vgreduce. + * + * - The properties of orphan PVs. It is possible to make changes to the + * properties of an orphan PV, e.g. pvresize, pvchange. + * + * These are things that cannot be protected by a VG lock alone, since + * orphan PVs do not belong to a real VG (an artificial VG does not + * apply since a sanlock lock only exists on real storage.) + * + * If a command will change any of the things above, it must first acquire + * the global lock in exclusive mode. 
 *
 * If command is reading any of the things above, it must acquire the global
 * lock in shared mode.  A number of commands read the things above, including:
 *
 * - Reporting/display commands which show all VGs.  Any command that
 *   will iterate through the entire VG namespace must first acquire the
 *   global lock shared so that it has an accurate view of the namespace.
 *
 * - A command where a tag name is used to identify what to process.
 *   A tag requires reading all VGs to check if they match the tag.
 *
 * In these cases, the global lock must be acquired before the list of
 * all VGs is created.
 *
 * The global lock is not generally unlocked explicitly in the code.
 * When the command disconnects from lvmlockd, lvmlockd automatically
 * releases the locks held by the command.  The exception is if a command
 * will continue running for a long time while not needing the global lock,
 * e.g. commands that poll to report progress.
 *
 * Acquiring the global lock also updates the local lvmetad cache if
 * necessary.  lockd_gl() first acquires the lock via lvmlockd, then
 * before returning to the caller, it checks that the global information
 * (e.g. VG namespace, set of orphans) is up to date in lvmetad.  If
 * not, it scans disks and updates the lvmetad cache before returning
 * to the caller.  It does this checking using a version number associated
 * with the global lock.  The version number is incremented each time
 * a change is made to the state associated with the global lock, and
 * if the local version number is lower than the version number in the
 * lock, then the local lvmetad state must be updated.
 */

int lockd_gl(struct cmd_context *cmd, const char *def_mode, uint32_t flags)
{
	const char *mode = NULL;
	const char *opts = NULL;
	uint32_t lockd_flags;
	int retries = 0;
	int result;

	/* A specific lock mode was given on the command line. */
	if (!(flags & LDGL_MODE_NOARG) && cmd->lock_gl_mode) {
		mode = cmd->lock_gl_mode;
		/* A weaker mode than the default requires explicit config permission. */
		if (mode && def_mode && (_mode_compare(mode, def_mode) < 0)) {
			if (!find_config_tree_bool(cmd, global_allow_override_lock_modes_CFG, NULL)) {
				log_error("Disallowed lock-gl mode \"%s\"", mode);
				return 0;
			} else {
				log_warn("WARNING: overriding default global lock mode.");
			}
		}
	}

	if (!mode)
		mode = def_mode;
	if (!mode) {
		log_error("Unknown lock-gl mode");
		return 0;
	}

	if (!strcmp(mode, "ex") && find_config_tree_bool(cmd, global_read_only_lock_modes_CFG, NULL)) {
		log_error("Exclusive global lock not allowed with read_only_lock_modes");
		return 0;
	}

 req:
	if (!_lockd_request(cmd, "lock_gl",
			    NULL, NULL, NULL, NULL, NULL, NULL, mode, opts,
			    &result, &lockd_flags)) {
		/* No result from lvmlockd, it is probably not running. */

		/* We don't care if an unlock fails. */
		if (!strcmp(mode, "un"))
			return 1;

		/* We can continue reading if a shared lock fails. */
		if (!strcmp(mode, "sh")) {
			log_warn("Reading without shared global lock.");
			lvmetad_validate_global_cache(cmd, 1);
			return 1;
		}

		log_error("Global lock failed: check that lvmlockd is running.");
		return 0;
	}

	/* Lock is busy; retry a configurable number of times before failing. */
	if (result == -EAGAIN) {
		if (retries < find_config_tree_int(cmd, global_lock_retries_CFG, NULL)) {
			log_warn("Retrying %s global lock", mode);
			sleep(1);
			retries++;
			goto req;
		}
	}

	/*
	 * ENOLS: no lockspace was found with a global lock.
	 * The VG with the global lock may not be visible or started yet,
	 * this should be a temporary condition.
	 *
	 * ESTARTING: the lockspace with the gl is starting.
	 * The VG with the global lock is starting and should finish shortly.
	 */

	if (result == -ENOLS || result == -ESTARTING) {
		if (!strcmp(mode, "un"))
			return 1;

		/*
		 * If an ex global lock fails, then the command fails.
		 */
		if (strcmp(mode, "sh")) {
			if (result == -ESTARTING)
				log_error("Global lock failed: lockspace is starting.");
			else if (result == -ENOLS)
				log_error("Global lock failed: check that global lockspace is started.");
			else
				log_error("Global lock failed: error %d", result);
			return 0;
		}

		/*
		 * If a sh global lock fails, then the command can continue
		 * reading without it, but force a global cache validation,
		 * and print a warning.
		 */

		if (result == -ESTARTING) {
			log_warn("Skipping global lock: lockspace is starting");
			lvmetad_validate_global_cache(cmd, 1);
			return 1;
		}

		if ((lockd_flags & LD_RF_NO_GL_LS) || (lockd_flags & LD_RF_NO_LOCKSPACES)) {
			log_warn("Skipping global lock: lockspace not found or started");
			lvmetad_validate_global_cache(cmd, 1);
			return 1;
		}

		/*
		 * This is for completeness.  If we reach here, then
		 * a specific check for the error should be added above
		 * with a more helpful message.
		 */
		log_error("Global lock failed: error %d", result);
		return 0;
	}

	if ((lockd_flags & LD_RF_DUP_GL_LS) && strcmp(mode, "un"))
		log_warn("Duplicate sanlock global locks should be corrected");

	if (result < 0) {
		if (ignorelockingfailure()) {
			log_debug("Ignore failed locking for global lock");
			lvmetad_validate_global_cache(cmd, 1);
			return 1;
		} else if (result == -EAGAIN) {
			/*
			 * Most of the time, retries should avoid this case.
			 */
			log_error("Global lock failed: held by other host.");
			return 0;
		} else {
			/*
			 * We don't intend to reach this.  We should check
			 * any known/possible error specifically and print
			 * a more helpful message.  This is for completeness.
			 */
			log_error("Global lock failed: error %d.", result);
			return 0;
		}
	}

	if (!(flags & LDGL_SKIP_CACHE_VALIDATE))
		lvmetad_validate_global_cache(cmd, 0);

	return 1;
}

/*
 * VG lock
 *
 * Return 1: continue, lockd_state may still indicate an error
 * Return 0: failure, do not continue
 *
 * lvmlockd could also return the lock_type that it used for the VG,
 * and we could encode that in lockd_state, and verify later that it
 * matches vg->lock_type.
 *
 * The result of the VG lock operation needs to be saved in lockd_state
 * because the result needs to be passed into vg_read so it can be
 * assessed in combination with vg->lock_state.
 *
 * The VG lock protects the VG metadata on disk from concurrent access
 * among hosts.  The VG lock also ensures that the local lvmetad cache
 * contains the latest version of the VG metadata from disk.  (Since
 * another host may have changed the VG since it was last read.)
 *
 * The VG lock must be acquired before the VG is read, i.e. before vg_read().
 * The result from lockd_vg() is saved in the "lockd_state" variable, and
 * this result is passed into vg_read().  After vg_read() reads the VG,
 * it checks if the VG lock_type (sanlock or dlm) requires a lock to be
 * held, and if so, it verifies that the lock was correctly acquired by
 * looking at lockd_state.  If vg_read() sees that the VG is a local VG,
 * i.e. lock_type is not sanlock or dlm, then no lock is required, and it
 * ignores lockd_state (which would indicate no lock was found.)
 *
 * When acquiring the VG lock, lvmlockd checks if the local cached copy
 * of the VG metadata in lvmetad is up to date.  If not, it invalidates
 * the VG cached in lvmetad.  This would happen if another host changed
 * the VG since it was last read.  When lvm commands read the VG from
 * lvmetad, they will check if the metadata is invalid, and if so they
 * will reread it from disk, and update the copy in lvmetad.
 */

int lockd_vg(struct cmd_context *cmd, const char *vg_name, const char *def_mode,
	     uint32_t flags, uint32_t *lockd_state)
{
	const char *mode = NULL;
	uint32_t lockd_flags;
	int retries = 0;
	int result;
	int ret;

	/* Internal/artificial VG names never have a lockd lock. */
	if (!is_real_vg(vg_name))
		return 1;

	/*
	 * Some special cases need to disable the vg lock.
	 */
	if (cmd->lockd_vg_disable)
		return 1;

	/*
	 * An unlock is simply sent or skipped without any need
	 * for the mode checking for sh/ex.
	 *
	 * Look at lockd_state from the sh/ex lock, and if it failed,
	 * don't bother sending the unlock to lvmlockd.  The main
	 * purpose of this is to avoid sending an unnecessary unlock
	 * for local VGs (the lockd_state from sh/ex on the local VG
	 * will be failed.)  This implies that the lockd_state value
	 * should be preserved from the sh/ex lockd_vg() call and
	 * passed back to lockd_vg() for the corresponding unlock.
	 */
	if (def_mode && !strcmp(def_mode, "un")) {
		if (cmd->lock_vg_mode && !strcmp(cmd->lock_vg_mode, "na"))
			return 1;

		if (*lockd_state & LDST_FAIL) {
			log_debug("VG %s unlock skipped: lockd_state is failed", vg_name);
			return 1;
		}

		mode = "un";
		goto req;
	}

	/* Fresh lock request: reset the caller's saved state. */
	*lockd_state = 0;

	/*
	 * A specific lock mode was given on the command line.
	 * LDVG_MODE_NOARG disables getting the mode from --lock-vg arg.
	 */
	if (!(flags & LDVG_MODE_NOARG) && cmd->lock_vg_mode) {
		mode = cmd->lock_vg_mode;
		if (mode && def_mode && (_mode_compare(mode, def_mode) < 0)) {
			if (!find_config_tree_bool(cmd, global_allow_override_lock_modes_CFG, NULL)) {
				log_error("Disallowed lock-vg mode \"%s\"", mode);
				return 0;
			} else {
				log_warn("WARNING: overriding default VG lock mode.");
			}
		}
	}

	/*
	 * The default mode may not have been provided in the
	 * function args.  This happens when lockd_vg is called
	 * from a process_each function that handles different
	 * commands.  Commands that only read/check/report/display
	 * the vg have LOCKD_VG_SH set in commands.h, which is
	 * copied to lockd_vg_default_sh.  Commands without this
	 * set modify the vg and need ex.
	 */
	if (!mode)
		mode = def_mode;
	if (!mode)
		mode = cmd->lockd_vg_default_sh ? "sh" : "ex";

	if (!strcmp(mode, "ex") && find_config_tree_bool(cmd, global_read_only_lock_modes_CFG, NULL)) {
		log_error("Exclusive VG lock not allowed with read_only_lock_modes");
		return 0;
	}

	/* Record the requested mode so vg_read can check it later. */
	if (!strcmp(mode, "ex"))
		*lockd_state |= LDST_EX;
	else if (!strcmp(mode, "sh"))
		*lockd_state |= LDST_SH;

 req:
	if (!_lockd_request(cmd, "lock_vg",
			    vg_name, NULL, NULL, NULL, NULL, NULL, mode, NULL,
			    &result, &lockd_flags)) {
		/*
		 * No result from lvmlockd, it is probably not running.
		 * Decide if it is ok to continue without a lock after
		 * reading the VG and looking at the lock_type.
		 */
		*lockd_state |= LDST_FAIL_REQUEST;
		return 1;
	}

	/* Lock is busy; retry a configurable number of times before failing. */
	if (result == -EAGAIN) {
		if (retries < find_config_tree_int(cmd, global_lock_retries_CFG, NULL)) {
			log_warn("Retrying %s lock on VG %s", mode, vg_name);
			sleep(1);
			retries++;
			goto req;
		}
	}

	/* Encode the outcome for later evaluation against vg->lock_type. */
	switch (result) {
	case 0:
		/* success */
		break;
	case -ENOLS:
		*lockd_state |= LDST_FAIL_NOLS;
		break;
	case -ESTARTING:
		*lockd_state |= LDST_FAIL_STARTING;
		break;
	default:
		*lockd_state |= LDST_FAIL_OTHER;
	}

	/*
	 * Normal success.
	 */
	if (!result) {
		ret = 1;
		goto out;
	}

	/*
	 * The lockspace for the VG is starting (the VG must not
	 * be local), and is not yet ready to do locking.  Allow
	 * reading without a sh lock during this period.
	 */
	if (result == -ESTARTING) {
		if (!strcmp(mode, "un")) {
			ret = 1;
			goto out;
		} else if (!strcmp(mode, "sh")) {
			log_warn("VG %s lock skipped: lock start in progress", vg_name);
			ret = 1;
			goto out;
		} else {
			log_error("VG %s lock failed: lock start in progress", vg_name);
			ret = 0;
			goto out;
		}
	}

	/*
	 * An unused/previous lockspace for the VG was found.
	 * This means it must be a lockd VG, not local.  The
	 * lockspace needs to be started to be used.
	 */
	if ((result == -ENOLS) && (lockd_flags & LD_RF_INACTIVE_LS)) {
		if (!strcmp(mode, "un")) {
			ret = 1;
			goto out;
		} else if (!strcmp(mode, "sh")) {
			log_warn("VG %s lock skipped: lockspace is inactive", vg_name);
			ret = 1;
			goto out;
		} else {
			log_error("VG %s lock failed: lockspace is inactive", vg_name);
			ret = 0;
			goto out;
		}
	}

	/*
	 * An unused lockspace for the VG was found.  The previous
	 * start of the lockspace failed, so we can print a more useful
	 * error message.
	 */
	if ((result == -ENOLS) && (lockd_flags & LD_RF_ADD_LS_ERROR)) {
		if (!strcmp(mode, "un")) {
			ret = 1;
			goto out;
		} else if (!strcmp(mode, "sh")) {
			log_warn("VG %s lock skipped: lockspace start error", vg_name);
			ret = 1;
			goto out;
		} else {
			log_error("VG %s lock failed: lockspace start error", vg_name);
			ret = 0;
			goto out;
		}
	}

	/*
	 * No lockspace for the VG was found.  It may be a local
	 * VG that lvmlockd doesn't keep track of, or it may be
	 * a lockd VG that lvmlockd doesn't yet know about (it hasn't
	 * been started yet.)  Decide what to do after the VG is
	 * read and we can see the lock_type.
	 */
	if (result == -ENOLS) {
		ret = 1;
		goto out;
	}

	/*
	 * Another error.  We don't intend to reach here, but
	 * want to check for each specific error above so that
	 * a helpful message can be printed.
	 */
	if (result) {
		if (!strcmp(mode, "un")) {
			ret = 1;
			goto out;
		} else if (!strcmp(mode, "sh")) {
			log_warn("VG %s lock skipped: error %d", vg_name, result);
			ret = 1;
			goto out;
		} else {
			log_error("VG %s lock failed: error %d", vg_name, result);
			ret = 0;
			goto out;
		}
	}

out:
	/*
	 * A notice from lvmlockd that duplicate gl locks have been found.
	 * It would be good for the user to disable one of them.
	 */
	if ((lockd_flags & LD_RF_DUP_GL_LS) && strcmp(mode, "un"))
		log_warn("Duplicate sanlock global lock in VG %s", vg_name);

	if (!ret && ignorelockingfailure()) {
		log_debug("Ignore failed locking for VG %s", vg_name);
		return 1;
	}

	return ret;
}

/*
 * This must be called before a new version of the VG metadata is
 * written to disk.  For local VGs, this is a no-op, but for lockd
 * VGs, this notifies lvmlockd of the new VG seqno.  lvmlockd must
 * know the latest VG seqno so that it can save it within the lock's
 * LVB.  The VG seqno in the VG lock's LVB is used by other hosts to
 * detect when their cached copy of the VG metadata is stale, i.e.
 * the cached VG metadata has a lower seqno than the seqno seen in
 * the VG lock.
 */

int lockd_vg_update(struct volume_group *vg)
{
	daemon_reply reply;
	int result;
	int ret;

	if (!is_lockd_type(vg->lock_type))
		return 1;

	if (!_use_lvmlockd)
		return 1;
	if (!_lvmlockd_connected)
		return 0;

	/* %d request values are passed as int64_t, hence the seqno cast. */
	reply = _lockd_send("vg_update",
			    "pid = %d", getpid(),
			    "vg_name = %s", vg->name,
			    "version = %d", (int64_t)vg->seqno,
			    NULL);

	if (!_lockd_result(reply, &result, NULL)) {
		ret = 0;
	} else {
		ret = (result < 0) ? 0 : 1;
	}

	daemon_reply_destroy(reply);
	return ret;
}

/*
 * When this is called directly (as opposed to being called from
 * lockd_lv), the caller knows that the LV has a lock.
 */

int lockd_lv_name(struct cmd_context *cmd, struct volume_group *vg,
		  const char *lv_name, struct id *lv_id,
		  const char *lock_args, const char *def_mode, uint32_t flags)
{
	char lv_uuid[64] __attribute__((aligned(8)));
	const char *mode = NULL;
	const char *opts = NULL;
	uint32_t lockd_flags;
	int refreshed = 0;
	int result;

	if (cmd->lockd_lv_disable)
		return 1;

	id_write_format(lv_id, lv_uuid, sizeof(lv_uuid));

	/*
	 * For lvchange/vgchange activation, def_mode is "sh" or "ex"
	 * according to the specific -a{e,s}y mode designation.
	 * No e,s designation gives NULL def_mode.
	 *
	 * The --lock-lv option is saved in cmd->lock_lv_mode.
	 */

	if (cmd->lock_lv_mode && def_mode && strcmp(cmd->lock_lv_mode, "na") &&
	    strcmp(cmd->lock_lv_mode, def_mode)) {
		log_error("Different LV lock modes from activation %s and lock-lv %s",
			  def_mode, cmd->lock_lv_mode);
		return 0;
	}

	/* A specific lock mode was given on the command line. */
	if (cmd->lock_lv_mode && (_mode_compare(cmd->lock_lv_mode, "sh") < 0)) {
		if (!find_config_tree_bool(cmd, global_allow_override_lock_modes_CFG, NULL)) {
			log_error("Disallowed lock-lv mode \"%s\"", cmd->lock_lv_mode);
			return 0;
		} else {
			log_warn("WARNING: overriding default LV lock mode.");
		}
	}

	if (cmd->lock_lv_mode)
		mode = cmd->lock_lv_mode;
	else if (def_mode)
		mode = def_mode;

	if (mode && !strcmp(mode, "sh") && (flags & LDLV_MODE_NO_SH)) {
		log_error("Shared activation not compatible with LV type: %s/%s",
			  vg->name, lv_name);
		return 0;
	}

	/* LV locks default to exclusive. */
	if (!mode)
		mode = "ex";

	if (flags & LDLV_PERSISTENT)
		opts = "persistent";

 retry:
	if (!_lockd_request(cmd, "lock_lv",
			    vg->name, vg->lock_type, vg->lock_args,
			    lv_name, lv_uuid, lock_args, mode, opts,
			    &result, &lockd_flags)) {
		/* No result from lvmlockd, it is probably not running. */
		log_error("Locking failed for LV %s/%s", vg->name, lv_name);
		return 0;
	}

	/* The lv was not active/locked. */
	if (result == -ENOENT && !strcmp(mode, "un"))
		return 1;

	/* Already locked in the requested mode: success. */
	if (result == -EALREADY)
		return 1;

	if (result == -EAGAIN) {
		log_error("LV locked by other host: %s/%s", vg->name, lv_name);
		return 0;
	}

	if (result == -EMSGSIZE) {
		/* Another host probably extended lvmlock. */
		/* Refresh once and retry; a second -EMSGSIZE falls through to the error below. */
		if (!refreshed++) {
			log_debug("Refresh lvmlock");
			_refresh_sanlock_lv(cmd, vg);
			goto retry;
		}
	}

	if (result == -ENOLS) {
		log_error("LV %s/%s lock failed: lockspace is inactive", vg->name, lv_name);
		return 0;
	}

	if (result < 0) {
		log_error("LV %s/%s lock failed: error %d", vg->name, lv_name, result);
		return 0;
	}

	return 1;
}

/*
 * Direct the lock request to the pool LV.
 * For a thin pool and all its thin volumes, one ex lock is used.
 * It is the one specified in metadata of the pool data lv.
 */

static int _lockd_lv_thin(struct cmd_context *cmd, struct logical_volume *lv,
			  const char *def_mode, uint32_t flags)
{
	struct logical_volume *pool_lv;

	if (lv_is_thin_volume(lv)) {
		struct lv_segment *pool_seg = first_seg(lv);
		pool_lv = pool_seg ? pool_seg->pool_lv : NULL;

	} else if (lv_is_thin_pool(lv)) {
		pool_lv = lv;

	} else {
		/* This should not happen AFAIK. */
		log_error("Lock on incorrect thin lv type %s/%s",
			  lv->vg->name, lv->name);
		return 0;
	}

	if (!pool_lv) {
		/* This should not happen. */
		log_error("Cannot find thin pool for %s/%s",
			  lv->vg->name, lv->name);
		return 0;
	}

	/*
	 * Locking a locked lv (pool in this case) is a no-op.
	 * Unlock when the pool is no longer active.
	 */

	if (def_mode && !strcmp(def_mode, "un") && pool_is_active(pool_lv))
		return 1;

	flags |= LDLV_MODE_NO_SH;

	return lockd_lv_name(cmd, pool_lv->vg, pool_lv->name, &pool_lv->lvid.id[1],
			     pool_lv->lock_args, def_mode, flags);
}

/*
 * If the VG has no lock_type, then this function can return immediately.
 * The LV itself may have no lock (NULL lv->lock_args), but the lock request
 * may be directed to another lock, e.g. the pool LV lock in _lockd_lv_thin.
 * If the lock request is not directed to another LV, and the LV has no
 * lock_type set, it means that the LV has no lock, and no locking is done
 * for it.
 *
 * An LV lock is acquired before the LV is activated, and released
 * after the LV is deactivated.  If the LV lock cannot be acquired,
 * it means that the LV is active on another host and the activation
 * fails.  Commands that modify an inactive LV also acquire the LV lock.
 *
 * In non-lockd VGs, this is a no-op.
 *
 * In lockd VGs, normal LVs each have their own lock, but other
 * LVs do not have their own lock, e.g. the lock for a thin LV is
 * acquired on the thin pool LV, and a thin LV does not have a lock
 * of its own.  A cache pool LV does not have a lock of its own.
 * When the cache pool LV is linked to an origin LV, the lock of
 * the orgin LV protects the combined origin + cache pool.
 */

int lockd_lv(struct cmd_context *cmd, struct logical_volume *lv,
	     const char *def_mode, uint32_t flags)
{
	if (!is_lockd_type(lv->vg->lock_type))
		return 1;

	/* Thin LVs share the single lock held on their pool LV. */
	if (lv_is_thin_type(lv))
		return _lockd_lv_thin(cmd, lv, def_mode, flags);

	/*
	 * An LV with NULL lock_args does not have a lock of its own.
	 */
	if (!lv->lock_args)
		return 1;

	/*
	 * LV type cannot be active concurrently on multiple hosts,
	 * so shared mode activation is not allowed.
	 */
	if (lv_is_external_origin(lv) ||
	    lv_is_thin_type(lv) ||
	    lv_is_mirror_type(lv) ||
	    lv_is_raid_type(lv) ||
	    lv_is_cache_type(lv)) {
		flags |= LDLV_MODE_NO_SH;
	}

	return lockd_lv_name(cmd, lv->vg, lv->name, &lv->lvid.id[1],
			     lv->lock_args, def_mode, flags);
}

/* Allocate a new sanlock lease for an LV; returns the lock_args via lock_args_ret. */

static int _init_lv_sanlock(struct cmd_context *cmd, struct volume_group *vg,
			    const char *lv_name, struct id *lv_id,
			    const char **lock_args_ret)
{
	char lv_uuid[64] __attribute__((aligned(8)));
	daemon_reply reply;
	const char *reply_str;
	const char *lv_lock_args = NULL;
	int result;
	int ret;

	if (!_use_lvmlockd)
		return 1;
	if (!_lvmlockd_connected)
		return 0;

	id_write_format(lv_id, lv_uuid, sizeof(lv_uuid));

	reply = _lockd_send("init_lv",
			    "pid = %d", getpid(),
			    "vg_name = %s", vg->name,
			    "lv_name = %s", lv_name,
			    "lv_uuid = %s", lv_uuid,
			    "vg_lock_type = %s", "sanlock",
			    "vg_lock_args = %s", vg->lock_args,
			    NULL);

	if (!_lockd_result(reply, &result, NULL)) {
		ret = 0;
	} else {
		ret = (result < 0) ? 0 : 1;
	}

	if (result == -EEXIST) {
		log_error("Lock already exists for LV %s/%s", vg->name, lv_name);
		goto out;
	}

	if (result == -EMSGSIZE) {
		/*
		 * No space on the lvmlock lv for a new lease, this should be
		 * detected by handle_sanlock_lv() called before.
		 */
		log_error("No sanlock space for lock for LV %s/%s", vg->name, lv_name);
		goto out;
	}

	if (!ret) {
		log_error("_init_lv_sanlock lvmlockd result %d", result);
		goto out;
	}

	if (!(reply_str = daemon_reply_str(reply, "lv_lock_args", NULL))) {
		log_error("lv_lock_args not returned");
		ret = 0;
		goto out;
	}

	/* Copy out of the reply before it is destroyed below. */
	if (!(lv_lock_args = dm_pool_strdup(cmd->mem, reply_str))) {
		log_error("lv_lock_args allocation failed");
		ret = 0;
	}
out:
	daemon_reply_destroy(reply);

	/* NULL on any failure path above. */
	*lock_args_ret = lv_lock_args;
	return ret;
}

/* Release the lock resource held in lvmlockd for a removed LV. */

static int _free_lv(struct cmd_context *cmd, struct volume_group *vg,
		    const char *lv_name, struct id *lv_id, const char *lock_args)
{
	char lv_uuid[64] __attribute__((aligned(8)));
	daemon_reply reply;
	int result;
	int ret;

	if (!_use_lvmlockd)
		return 1;
	if (!_lvmlockd_connected)
		return 0;

	id_write_format(lv_id, lv_uuid, sizeof(lv_uuid));

	reply = _lockd_send("free_lv",
			    "pid = %d", getpid(),
			    "vg_name = %s", vg->name,
			    "lv_name = %s", lv_name,
			    "lv_uuid = %s", lv_uuid,
			    "vg_lock_type = %s", vg->lock_type,
			    "vg_lock_args = %s", vg->lock_args,
			    "lv_lock_args = %s", lock_args ?: "none",
			    NULL);

	if (!_lockd_result(reply, &result, NULL)) {
		ret = 0;
	} else {
		ret = (result < 0) ? 0 : 1;
	}

	if (!ret)
		log_error("_free_lv lvmlockd result %d", result);

	daemon_reply_destroy(reply);

	return ret;
}

int lockd_init_lv_args(struct cmd_context *cmd, struct volume_group *vg,
		       struct logical_volume *lv,
		       const char *lock_type, const char **lock_args)
{
	/* sanlock is the only lock type that sets per-LV lock_args. */
	if (!strcmp(lock_type, "sanlock"))
		return _init_lv_sanlock(cmd, vg, lv->name, &lv->lvid.id[1], lock_args);
	return 1;
}

/*
 * lvcreate
 *
 * An LV created in a lockd VG inherits the lock_type of the VG.  In some
 * cases, e.g.
thin LVs, this function may decide that the LV should not be + * given a lock, in which case it sets lp lock_args to NULL, which will cause + * the LV to not have lock_args set in its metadata. A lockd_lv() request on + * an LV with no lock_args will do nothing (unless the LV type causes the lock + * request to be directed to another LV with a lock, e.g. to the thin pool LV + * for thin LVs.) + * + * Current limitations: + * - cache-type LV's in a lockd VG must be created with lvconvert. + * - creating a thin pool and thin lv in one command is not allowed. + */ + +int lockd_init_lv(struct cmd_context *cmd, struct volume_group *vg, struct logical_volume *lv, + struct lvcreate_params *lp) +{ + int lock_type_num = get_lock_type_from_string(vg->lock_type); + + if (cmd->lock_lv_mode && !strcmp(cmd->lock_lv_mode, "na")) + return 1; + + switch (lock_type_num) { + case LOCK_TYPE_NONE: + case LOCK_TYPE_CLVM: + return 1; + case LOCK_TYPE_SANLOCK: + case LOCK_TYPE_DLM: + break; + default: + log_error("lockd_init_lv: unknown lock_type."); + return 0; + } + + if (!lp->needs_lockd_init) { + /* needs_lock_init is set for LVs that need a lockd lock. */ + return 1; + + } else if (seg_is_cache(lp) || seg_is_cache_pool(lp)) { + log_error("Use lvconvert for cache with lock type %s", vg->lock_type); + return 0; + + } else if (!seg_is_thin_volume(lp) && lp->snapshot) { + struct logical_volume *origin_lv; + + /* + * COW snapshots are associated with their origin LV, + * and only the origin LV needs its own lock, which + * represents itself and all associated cow snapshots. 
+ */ + + if (!(origin_lv = find_lv(vg, lp->origin_name))) { + log_error("Failed to find origin LV %s/%s", vg->name, lp->origin_name); + return 0; + } + if (!lockd_lv(cmd, origin_lv, "ex", LDLV_PERSISTENT)) { + log_error("Failed to lock origin LV %s/%s", vg->name, lp->origin_name); + return 0; + } + lv->lock_args = NULL; + return 1; + + } else if (seg_is_thin(lp)) { + if ((seg_is_thin_volume(lp) && !lp->create_pool) || + (!seg_is_thin_volume(lp) && lp->snapshot)) { + struct lv_list *lvl; + + /* + * Creating a new thin lv or snapshot. These lvs do not get + * their own lock but use the pool lock. If an lv does not + * use its own lock, its lock_args is set to NULL. + */ + + if (!(lvl = find_lv_in_vg(vg, lp->pool_name))) { + log_error("Failed to find thin pool %s/%s", vg->name, lp->pool_name); + return 0; + } + if (!lockd_lv(cmd, lvl->lv, "ex", LDLV_PERSISTENT)) { + log_error("Failed to lock thin pool %s/%s", vg->name, lp->pool_name); + return 0; + } + lv->lock_args = NULL; + return 1; + + } else if (seg_is_thin_volume(lp) && lp->create_pool) { + /* + * Creating a thin pool and a thin lv in it. We could + * probably make this work. + */ + log_error("Create thin pool and thin LV separately with lock type %s", + vg->lock_type); + return 0; + + } else if (!seg_is_thin_volume(lp) && lp->create_pool) { + /* Creating a thin pool only. */ + /* lv_name_lock = lp->pool_name; */ + + } else { + log_error("Unknown thin options for lock init."); + return 0; + } + + } else { + /* Creating a normal lv. */ + /* lv_name_lock = lv_name; */ + } + + /* + * The LV gets its own lock, so set lock_args to non-NULL. + * + * lockd_init_lv_args() will be called during vg_write() + * to complete the sanlock LV lock initialization, where + * actual space on disk is allocated. Waiting to do this + * last step until vg_write() avoids the need to revert + * the sanlock allocation if the lvcreate function isn't + * completed. 
+ * + * This works, but would leave the sanlock lease allocated + * unless the lease was freed on each early exit path from + * lvcreate: + * + * return lockd_init_lv_args(cmd, vg, lv_name_lock, lv_id, + * vg->lock_type, &lv->lock_args); + */ + + if (!strcmp(vg->lock_type, "sanlock")) + lv->lock_args = "pending"; + else if (!strcmp(vg->lock_type, "dlm")) + lv->lock_args = "dlm"; + + return 1; +} + +/* lvremove */ + +int lockd_free_lv(struct cmd_context *cmd, struct volume_group *vg, + const char *lv_name, struct id *lv_id, const char *lock_args) +{ + if (cmd->lock_lv_mode && !strcmp(cmd->lock_lv_mode, "na")) + return 1; + + switch (get_lock_type_from_string(vg->lock_type)) { + case LOCK_TYPE_NONE: + case LOCK_TYPE_CLVM: + return 1; + case LOCK_TYPE_DLM: + case LOCK_TYPE_SANLOCK: + if (!lock_args) + return 1; + return _free_lv(cmd, vg, lv_name, lv_id, lock_args); + default: + log_error("lockd_free_lv: unknown lock_type."); + return 0; + } +} + +int lockd_rename_vg_before(struct cmd_context *cmd, struct volume_group *vg) +{ + struct lv_list *lvl; + daemon_reply reply; + int result; + int ret; + + if (!is_lockd_type(vg->lock_type)) + return 1; + + if (lvs_in_vg_activated(vg)) { + log_error("LVs must be inactive before vgrename."); + return 0; + } + + /* Check that no LVs are active on other hosts. 
*/ + + dm_list_iterate_items(lvl, &vg->lvs) { + if (!lockd_lv(cmd, lvl->lv, "ex", 0)) { + log_error("LV %s/%s must be inactive on all hosts before vgrename.", + vg->name, lvl->lv->name); + return 0; + } + + if (!lockd_lv(cmd, lvl->lv, "un", 0)) { + log_error("Failed to unlock LV %s/%s.", vg->name, lvl->lv->name); + return 0; + } + } + + /* + * lvmlockd: + * checks for other hosts in lockspace + * leaves the lockspace + */ + + reply = _lockd_send("rename_vg_before", + "pid = %d", getpid(), + "vg_name = %s", vg->name, + "vg_lock_type = %s", vg->lock_type, + "vg_lock_args = %s", vg->lock_args, + NULL); + + if (!_lockd_result(reply, &result, NULL)) { + ret = 0; + } else { + ret = (result < 0) ? 0 : 1; + } + + daemon_reply_destroy(reply); + + if (!ret) { + log_error("lockd_rename_vg_before lvmlockd result %d", result); + return 0; + } + + if (!strcmp(vg->lock_type, "sanlock")) { + log_debug("lockd_rename_vg_before deactivate sanlock lv"); + _deactivate_sanlock_lv(cmd, vg); + } + + return 1; +} + +int lockd_rename_vg_final(struct cmd_context *cmd, struct volume_group *vg, int success) +{ + daemon_reply reply; + int result; + int ret; + + if (!is_lockd_type(vg->lock_type)) + return 1; + + if (!success) { + /* + * Depending on the problem that caused the rename to + * fail, it may make sense to not restart the VG here. + */ + if (!lockd_start_vg(cmd, vg)) + log_error("Failed to restart VG %s lockspace.", vg->name); + return 1; + } + + if (!strcmp(vg->lock_type, "sanlock")) { + if (!_activate_sanlock_lv(cmd, vg)) + return 0; + + /* + * lvmlockd needs to rewrite the leases on disk + * with the new VG (lockspace) name. + */ + reply = _lockd_send("rename_vg_final", + "pid = %d", getpid(), + "vg_name = %s", vg->name, + "vg_lock_type = %s", vg->lock_type, + "vg_lock_args = %s", vg->lock_args, + NULL); + + if (!_lockd_result(reply, &result, NULL)) { + ret = 0; + } else { + ret = (result < 0) ? 
0 : 1; + } + + daemon_reply_destroy(reply); + + if (!ret) { + /* + * The VG has been renamed on disk, but renaming the + * sanlock leases failed. Cleaning this up can + * probably be done by converting the VG to lock_type + * none, then converting back to sanlock. + */ + log_error("lockd_rename_vg_final lvmlockd result %d", result); + return 0; + } + } + + if (!lockd_start_vg(cmd, vg)) + log_error("Failed to start VG %s lockspace.", vg->name); + + return 1; +} + +const char *lockd_running_lock_type(struct cmd_context *cmd) +{ + daemon_reply reply; + const char *lock_type = NULL; + int result; + + if (!_use_lvmlockd) + return NULL; + if (!_lvmlockd_connected) + return NULL; + + reply = _lockd_send("running_lm", + "pid = %d", getpid(), + NULL); + + if (!_lockd_result(reply, &result, NULL)) { + log_error("Failed to get result from lvmlockd"); + goto out; + } + + switch (result) { + case -EXFULL: + log_error("lvmlockd found multiple lock managers, use --lock-type to select one."); + break; + case -ENOLCK: + log_error("lvmlockd found no lock manager running."); + break; + case LOCK_TYPE_SANLOCK: + log_debug("lvmlockd found sanlock"); + lock_type = "sanlock"; + break; + case LOCK_TYPE_DLM: + log_debug("lvmlockd found dlm"); + lock_type = "dlm"; + break; + default: + log_error("Failed to find a running lock manager."); + break; + } +out: + daemon_reply_destroy(reply); + + return lock_type; +} + +/* Some LV types have no lock. 
*/ + +int lockd_lv_uses_lock(struct logical_volume *lv) +{ + if (!lv_is_visible(lv) || + lv_is_thin_volume(lv) || + lv_is_thin_pool_data(lv) || + lv_is_thin_pool_metadata(lv) || + lv_is_pool_metadata_spare(lv) || + lv_is_cache_pool(lv) || + lv_is_cache_pool_data(lv) || + lv_is_cache_pool_metadata(lv) || + lv_is_lockd_sanlock_lv(lv)) + return 0; + return 1; +} + diff --git a/lib/locking/lvmlockd.h b/lib/locking/lvmlockd.h new file mode 100644 index 000000000..6db632df0 --- /dev/null +++ b/lib/locking/lvmlockd.h @@ -0,0 +1,262 @@ +/* + * Copyright (C) 2014 Red Hat, Inc. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU Lesser General Public License v.2.1. + */ + +#ifndef _LVMLOCKD_H +#define _LVMLOCKD_H + +#include "config-util.h" +#include "daemon-client.h" + +#define LOCKD_SANLOCK_LV_NAME "lvmlock" + +/* lockd_gl flags */ +#define LDGL_MODE_NOARG 0x00000001 +#define LDGL_SKIP_CACHE_VALIDATE 0x00000002 +#define LDGL_UPDATE_NAMES 0x00000004 + +/* lockd_vg flags */ +#define LDVG_MODE_NOARG 0x00000001 + +/* lockd_lv flags */ +#define LDLV_MODE_NOARG 0x00000001 +#define LDLV_MODE_NO_SH 0x00000002 +#define LDLV_PERSISTENT 0x00000004 + +/* lvmlockd result flags */ +#define LD_RF_NO_LOCKSPACES 0x00000001 +#define LD_RF_NO_GL_LS 0x00000002 +#define LD_RF_LOCAL_LS 0x00000004 +#define LD_RF_DUP_GL_LS 0x00000008 +#define LD_RF_INACTIVE_LS 0x00000010 +#define LD_RF_ADD_LS_ERROR 0x00000020 + +/* lockd_state flags */ +#define LDST_EX 0x00000001 +#define LDST_SH 0x00000002 +#define LDST_FAIL_REQUEST 0x00000004 +#define LDST_FAIL_NOLS 0x00000008 +#define LDST_FAIL_STARTING 0x00000010 +#define LDST_FAIL_OTHER 0x00000020 +#define LDST_FAIL (LDST_FAIL_REQUEST | LDST_FAIL_NOLS | LDST_FAIL_STARTING | LDST_FAIL_OTHER) + +/* + * Check if a lock_type uses lvmlockd. + * If not (none, clvm), return 0. + * If so (dlm, sanlock), return 1. 
+ */ + +static inline int is_lockd_type(const char *lock_type) +{ + if (!lock_type) + return 0; + + if (!strcmp(lock_type, "dlm")) + return 1; + if (!strcmp(lock_type, "sanlock")) + return 1; + + return 0; +} + +#ifdef LVMLOCKD_SUPPORT + +/* lvmlockd connection and communication */ + +void lvmlockd_set_socket(const char *sock); +void lvmlockd_set_use(int use); +int lvmlockd_use(void); +void lvmlockd_init(struct cmd_context *cmd); +void lvmlockd_connect(void); +void lvmlockd_disconnect(void); + +/* vgcreate/vgremove use init/free */ + +int lockd_init_vg(struct cmd_context *cmd, struct volume_group *vg, const char *lock_type); +int lockd_free_vg_before(struct cmd_context *cmd, struct volume_group *vg); +void lockd_free_vg_final(struct cmd_context *cmd, struct volume_group *vg); + +/* vgrename */ + +int lockd_rename_vg_before(struct cmd_context *cmd, struct volume_group *vg); +int lockd_rename_vg_final(struct cmd_context *cmd, struct volume_group *vg, int success); + +/* start and stop the lockspace for a vg */ + +int lockd_start_vg(struct cmd_context *cmd, struct volume_group *vg); +int lockd_stop_vg(struct cmd_context *cmd, struct volume_group *vg); +int lockd_start_wait(struct cmd_context *cmd); + +/* locking */ + +int lockd_gl_create(struct cmd_context *cmd, const char *def_mode, const char *vg_lock_type); +int lockd_gl(struct cmd_context *cmd, const char *def_mode, uint32_t flags); +int lockd_vg(struct cmd_context *cmd, const char *vg_name, const char *def_mode, + uint32_t flags, uint32_t *lockd_state); +int lockd_vg_update(struct volume_group *vg); + +int lockd_lv_name(struct cmd_context *cmd, struct volume_group *vg, + const char *lv_name, struct id *lv_id, + const char *lock_args, const char *def_mode, uint32_t flags); +int lockd_lv(struct cmd_context *cmd, struct logical_volume *lv, + const char *def_mode, uint32_t flags); + +/* lvcreate/lvremove use init/free */ + +int lockd_init_lv(struct cmd_context *cmd, struct volume_group *vg, struct logical_volume *lv, 
+ struct lvcreate_params *lp); +int lockd_init_lv_args(struct cmd_context *cmd, struct volume_group *vg, + struct logical_volume *lv, const char *lock_type, const char **lock_args); +int lockd_free_lv(struct cmd_context *cmd, struct volume_group *vg, + const char *lv_name, struct id *lv_id, const char *lock_args); + +const char *lockd_running_lock_type(struct cmd_context *cmd); + +int handle_sanlock_lv(struct cmd_context *cmd, struct volume_group *vg); + +int lockd_lv_uses_lock(struct logical_volume *lv); + +#else /* LVMLOCKD_SUPPORT */ + +static inline void lvmlockd_set_socket(const char *sock) +{ +} + +static inline void lvmlockd_set_use(int use) +{ +} + +static inline void lvmlockd_init(struct cmd_context *cmd) +{ +} + +static inline void lvmlockd_disconnect(void) +{ +} + +static inline void lvmlockd_connect(void) +{ +} + +static inline int lvmlockd_use(void) +{ + return 0; +} + +static inline int lockd_init_vg(struct cmd_context *cmd, struct volume_group *vg, const char *lock_type) +{ + return 1; +} + +static inline int lockd_free_vg_before(struct cmd_context *cmd, struct volume_group *vg) +{ + return 1; +} + +static inline void lockd_free_vg_final(struct cmd_context *cmd, struct volume_group *vg) +{ + return; +} + +static inline int lockd_rename_vg_before(struct cmd_context *cmd, struct volume_group *vg) +{ + return 1; +} + +static inline int lockd_rename_vg_final(struct cmd_context *cmd, struct volume_group *vg, int success) +{ + return 1; +} + +static inline int lockd_start_vg(struct cmd_context *cmd, struct volume_group *vg) +{ + return 0; +} + +static inline int lockd_stop_vg(struct cmd_context *cmd, struct volume_group *vg) +{ + return 0; +} + +static inline int lockd_start_wait(struct cmd_context *cmd) +{ + return 0; +} + +static inline int lockd_gl_create(struct cmd_context *cmd, const char *def_mode, const char *vg_lock_type) +{ + return 1; +} + +static inline int lockd_gl(struct cmd_context *cmd, const char *def_mode, uint32_t flags) +{ + return 1; +} 
+ +static inline int lockd_vg(struct cmd_context *cmd, const char *vg_name, const char *def_mode, + uint32_t flags, uint32_t *lockd_state) +{ + return 1; +} + +static inline int lockd_vg_update(struct volume_group *vg) +{ + return 1; +} + +static inline int lockd_lv_name(struct cmd_context *cmd, struct volume_group *vg, + const char *lv_name, struct id *lv_id, + const char *lock_args, const char *def_mode, uint32_t flags) +{ + return 1; +} + +static inline int lockd_lv(struct cmd_context *cmd, struct logical_volume *lv, + const char *def_mode, uint32_t flags) +{ + return 1; +} + +static inline int lockd_init_lv(struct cmd_context *cmd, struct volume_group *vg, + struct logical_volume *lv, struct lvcreate_params *lp) +{ + return 0; +} + +static inline int lockd_init_lv_args(struct cmd_context *cmd, struct volume_group *vg, + struct logical_volume *lv, const char *lock_type, const char **lock_args) +{ + return 0; +} + +static inline int lockd_free_lv(struct cmd_context *cmd, struct volume_group *vg, + const char *lv_name, struct id *lv_id, const char *lock_args) +{ + return 0; +} + +static inline const char *lockd_running_lock_type(struct cmd_context *cmd) +{ + return NULL; +} + +static inline int handle_sanlock_lv(struct cmd_context *cmd, struct volume_group *vg) +{ + return 0; +} + +static inline int lockd_lv_uses_lock(struct logical_volume *lv) +{ + return 0; +} + +#endif /* LVMLOCKD_SUPPORT */ + +#endif + diff --git a/lib/metadata/lv.c b/lib/metadata/lv.c index 9a99c8d1c..24fbefe05 100644 --- a/lib/metadata/lv.c +++ b/lib/metadata/lv.c @@ -20,6 +20,7 @@ #include "toolcontext.h" #include "segtype.h" #include "str_list.h" +#include "lvmlockd.h" #include <time.h> #include <sys/utsname.h> @@ -911,6 +912,19 @@ static int _lv_is_exclusive(struct logical_volume *lv) int lv_active_change(struct cmd_context *cmd, struct logical_volume *lv, enum activation_change activate, int needs_exclusive) { + const char *ay_with_mode = NULL; + + if (activate == CHANGE_ASY) + 
ay_with_mode = "sh"; + if (activate == CHANGE_AEY) + ay_with_mode = "ex"; + + if (is_change_activating(activate) && + !lockd_lv(cmd, lv, ay_with_mode, LDLV_PERSISTENT)) { + log_error("Failed to lock logical volume %s/%s", lv->vg->name, lv->name); + return 0; + } + switch (activate) { case CHANGE_AN: deactivate: @@ -963,6 +977,10 @@ exclusive: return_0; } + if (!is_change_activating(activate) && + !lockd_lv(cmd, lv, "un", LDLV_PERSISTENT)) + log_error("Failed to unlock logical volume %s/%s", lv->vg->name, lv->name); + return 1; } @@ -1002,6 +1020,12 @@ char *lv_profile_dup(struct dm_pool *mem, const struct logical_volume *lv) return dm_pool_strdup(mem, profile_name); } +char *lv_lock_args_dup(struct dm_pool *mem, const struct logical_volume *lv) +{ + const char *lock_args = lv->lock_args ? lv->lock_args : ""; + return dm_pool_strdup(mem, lock_args); +} + /* For given LV find recursively the LV which holds lock for it */ const struct logical_volume *lv_lock_holder(const struct logical_volume *lv) { diff --git a/lib/metadata/lv.h b/lib/metadata/lv.h index 44750851a..d7977709a 100644 --- a/lib/metadata/lv.h +++ b/lib/metadata/lv.h @@ -51,7 +51,9 @@ struct logical_volume { struct dm_list segs_using_this_lv; uint64_t timestamp; + unsigned new_lock_args:1; const char *hostname; + const char *lock_args; }; struct lv_with_info_and_seg_status; @@ -103,6 +105,7 @@ const struct logical_volume *lv_lock_holder(const struct logical_volume *lv); const struct logical_volume *lv_ondisk(const struct logical_volume *lv); struct profile *lv_config_profile(const struct logical_volume *lv); char *lv_profile_dup(struct dm_pool *mem, const struct logical_volume *lv); +char *lv_lock_args_dup(struct dm_pool *mem, const struct logical_volume *lv); int lv_mirror_image_in_sync(const struct logical_volume *lv); int lv_raid_image_in_sync(const struct logical_volume *lv); int lv_raid_healthy(const struct logical_volume *lv); diff --git a/lib/metadata/lv_manip.c b/lib/metadata/lv_manip.c index 
de9674339..ad8eccbd7 100644 --- a/lib/metadata/lv_manip.c +++ b/lib/metadata/lv_manip.c @@ -30,6 +30,7 @@ #include "lvm-exec.h" #include "lvm-signal.h" #include "memlock.h" +#include "lvmlockd.h" typedef enum { PREFERRED, @@ -4588,7 +4589,9 @@ static int _lvresize_check_lv(struct cmd_context *cmd, struct logical_volume *lv return 0; } - if (!lv_is_visible(lv) && !lv_is_thin_pool_metadata(lv)) { + /* FIXME: use a status flag instead of the name "lvmlock". */ + + if (!lv_is_visible(lv) && !lv_is_thin_pool_metadata(lv) && strcmp(lv->name, "lvmlock")) { log_error("Can't resize internal logical volume %s", lv->name); return 0; } @@ -5238,6 +5241,13 @@ int lv_resize(struct cmd_context *cmd, struct logical_volume *lv, return 0; } + /* + * If the LV is locked from activation, this lock call is a no-op. + * Otherwise, this acquires a transient lock on the lv (not PERSISTENT). + */ + if (!lockd_lv(cmd, lv, "ex", 0)) + return_0; + if (lp->sizeargs && !(lock_lv = _lvresize_volume(cmd, lv, lp, pvh))) return_0; @@ -5586,6 +5596,7 @@ int lv_remove_single(struct cmd_context *cmd, struct logical_volume *lv, int format1_reload_required = 0; int visible; struct logical_volume *pool_lv = NULL; + struct logical_volume *lock_lv = lv; struct lv_segment *cache_seg = NULL; int ask_discard; struct lv_list *lvl; @@ -5632,14 +5643,19 @@ int lv_remove_single(struct cmd_context *cmd, struct logical_volume *lv, log_error("Can't remove logical volume %s used by a pool.", lv->name); return 0; - } else if (lv_is_thin_volume(lv)) + } else if (lv_is_thin_volume(lv)) { pool_lv = first_seg(lv)->pool_lv; + lock_lv = pool_lv; + } if (lv_is_locked(lv)) { log_error("Can't remove locked LV %s", lv->name); return 0; } + if (!lockd_lv(cmd, lock_lv, "ex", LDLV_PERSISTENT)) + return_0; + /* FIXME Ensure not referred to by another existing LVs */ ask_discard = find_config_tree_bool(cmd, devices_issue_discards_CFG, NULL); @@ -5814,6 +5830,9 @@ int lv_remove_single(struct cmd_context *cmd, struct logical_volume 
*lv, backup(vg); + lockd_lv(cmd, lock_lv, "un", LDLV_PERSISTENT | LDLV_MODE_NOARG); + lockd_free_lv(cmd, vg, lv->name, &lv->lvid.id[1], lv->lock_args); + if (!suppress_remove_message && visible) log_print_unless_silent("Logical volume \"%s\" successfully removed", lv->name); @@ -7196,6 +7215,14 @@ static struct logical_volume *_lv_create_an_lv(struct volume_group *vg, lv->major, lv->minor); } + /* + * The specific LV may not use a lock. lockd_init_lv() sets + * lv->lock_args to NULL if this LV does not use its own lock. + */ + + if (!lockd_init_lv(vg->cmd, vg, lv, lp)) + return_NULL; + dm_list_splice(&lv->tags, &lp->tags); if (!lv_extend(lv, create_segtype, @@ -7506,6 +7533,8 @@ deactivate_and_revert_new_lv: } revert_new_lv: + lockd_free_lv(vg->cmd, vg, lp->lv_name, &lv->lvid.id[1], lp->lock_args); + /* FIXME Better to revert to backup of metadata? */ if (!lv_remove(lv) || !vg_write(vg) || !vg_commit(vg)) log_error("Manual intervention may be required to remove " diff --git a/lib/metadata/metadata-exported.h b/lib/metadata/metadata-exported.h index ff10a98b9..cda8e28e8 100644 --- a/lib/metadata/metadata-exported.h +++ b/lib/metadata/metadata-exported.h @@ -101,6 +101,7 @@ #define THIN_POOL_DATA UINT64_C(0x0000004000000000) /* LV - Internal use only */ #define THIN_POOL_METADATA UINT64_C(0x0000008000000000) /* LV - Internal use only */ #define POOL_METADATA_SPARE UINT64_C(0x0000010000000000) /* LV - Internal use only */ +#define LOCKD_SANLOCK_LV UINT64_C(0x0000020000000000) /* LV - Internal use only */ #define LV_WRITEMOSTLY UINT64_C(0x0000020000000000) /* LV (RAID1) */ @@ -228,6 +229,7 @@ #define lv_is_pool_data(lv) (((lv)->status & (CACHE_POOL_DATA | THIN_POOL_DATA)) ? 1 : 0) #define lv_is_pool_metadata(lv) (((lv)->status & (CACHE_POOL_METADATA | THIN_POOL_METADATA)) ? 1 : 0) #define lv_is_pool_metadata_spare(lv) (((lv)->status & POOL_METADATA_SPARE) ? 1 : 0) +#define lv_is_lockd_sanlock_lv(lv) (((lv)->status & LOCKD_SANLOCK_LV) ? 
1 : 0) #define lv_is_rlog(lv) (((lv)->status & REPLICATOR_LOG) ? 1 : 0) @@ -262,6 +264,14 @@ typedef enum { THIN_DISCARDS_PASSDOWN, } thin_discards_t; +typedef enum { + LOCK_TYPE_INVALID = -1, + LOCK_TYPE_NONE = 0, + LOCK_TYPE_CLVM = 1, + LOCK_TYPE_DLM = 2, + LOCK_TYPE_SANLOCK = 3, +} lock_type_t; + struct cmd_context; struct format_handler; struct labeller; @@ -640,9 +650,9 @@ int lv_resize(struct cmd_context *cmd, struct logical_volume *lv, * Return a handle to VG metadata. */ struct volume_group *vg_read(struct cmd_context *cmd, const char *vg_name, - const char *vgid, uint32_t flags); + const char *vgid, uint32_t flags, uint32_t lockd_state); struct volume_group *vg_read_for_update(struct cmd_context *cmd, const char *vg_name, - const char *vgid, uint32_t flags); + const char *vgid, uint32_t flags, uint32_t lockd_state); /* * Test validity of a VG handle. @@ -685,6 +695,7 @@ struct volume_group *vg_create(struct cmd_context *cmd, const char *vg_name); int vg_remove_mdas(struct volume_group *vg); int vg_remove_check(struct volume_group *vg); void vg_remove_pvs(struct volume_group *vg); +int vg_remove_direct(struct volume_group *vg); int vg_remove(struct volume_group *vg); int vg_rename(struct cmd_context *cmd, struct volume_group *vg, const char *new_name); @@ -863,12 +874,15 @@ struct lvcreate_params { #define THIN_CHUNK_SIZE_CALC_METHOD_GENERIC 0x01 #define THIN_CHUNK_SIZE_CALC_METHOD_PERFORMANCE 0x02 int thin_chunk_size_calc_policy; + unsigned needs_lockd_init : 1; const char *vg_name; /* only-used when VG is not yet opened (in /tools) */ const char *lv_name; /* all */ const char *origin_name; /* snap */ const char *pool_name; /* thin */ + const char *lock_args; + /* Keep args given by the user on command line */ /* FIXME: create some more universal solution here */ #define PASS_ARG_CHUNK_SIZE 0x01 @@ -1211,6 +1225,8 @@ struct vgcreate_params { int clustered; /* FIXME: put this into a 'status' variable instead? 
*/ uint32_t vgmetadatacopies; const char *system_id; + const char *lock_type; + const char *lock_args; }; int validate_major_minor(const struct cmd_context *cmd, diff --git a/lib/metadata/metadata.c b/lib/metadata/metadata.c index 75f3038d2..641d6d43a 100644 --- a/lib/metadata/metadata.c +++ b/lib/metadata/metadata.c @@ -31,6 +31,7 @@ #include "locking.h" #include "archiver.h" #include "defaults.h" +#include "lvmlockd.h" #include <math.h> #include <sys/param.h> @@ -557,20 +558,14 @@ void vg_remove_pvs(struct volume_group *vg) } } -int vg_remove(struct volume_group *vg) +int vg_remove_direct(struct volume_group *vg) { struct physical_volume *pv; struct pv_list *pvl; int ret = 1; - if (!lock_vol(vg->cmd, VG_ORPHANS, LCK_VG_WRITE, NULL)) { - log_error("Can't get lock for orphan PVs"); - return 0; - } - if (!vg_remove_mdas(vg)) { log_error("vg_remove_mdas %s failed", vg->name); - unlock_vg(vg->cmd, VG_ORPHANS); return 0; } @@ -604,6 +599,8 @@ int vg_remove(struct volume_group *vg) if (!lvmetad_vg_remove(vg)) stack; + lockd_vg_update(vg); + if (!backup_remove(vg->cmd, vg->name)) stack; @@ -612,6 +609,20 @@ int vg_remove(struct volume_group *vg) else log_error("Volume group \"%s\" not properly removed", vg->name); + return ret; +} + +int vg_remove(struct volume_group *vg) +{ + int ret; + + if (!lock_vol(vg->cmd, VG_ORPHANS, LCK_VG_WRITE, NULL)) { + log_error("Can't get lock for orphan PVs"); + return 0; + } + + ret = vg_remove_direct(vg); + unlock_vg(vg->cmd, VG_ORPHANS); return ret; } @@ -2428,6 +2439,7 @@ struct validate_hash { struct dm_hash_table *lvname; struct dm_hash_table *lvid; struct dm_hash_table *pvid; + struct dm_hash_table *lv_lock_args; }; /* @@ -2786,6 +2798,87 @@ int vg_validate(struct volume_group *vg) if (vg_max_lv_reached(vg)) stack; + + if (!(vhash.lv_lock_args = dm_hash_create(lv_count))) { + log_error("Failed to allocate lv_lock_args hash"); + r = 0; + goto out; + } + + if (is_lockd_type(vg->lock_type)) { + if (!vg->lock_args) { + 
log_error(INTERNAL_ERROR "VG %s with lock_type %s without lock_args", + vg->name, vg->lock_type); + r = 0; + } + + if (vg_is_clustered(vg)) { + log_error(INTERNAL_ERROR "VG %s with lock_type %s is clustered", + vg->name, vg->lock_type); + r = 0; + } + + if (vg->system_id && vg->system_id[0]) { + log_error(INTERNAL_ERROR "VG %s with lock_type %s has system_id %s", + vg->name, vg->lock_type, vg->system_id); + r = 0; + } + + if (strcmp(vg->lock_type, "sanlock") && strcmp(vg->lock_type, "dlm")) { + log_error(INTERNAL_ERROR "VG %s has unknown lock_type %s", + vg->name, vg->lock_type); + r = 0; + } + } else { + if (vg->lock_args) { + log_error(INTERNAL_ERROR "VG %s has lock_args %s without lock_type", + vg->name, vg->lock_args); + r = 0; + } + } + + dm_list_iterate_items(lvl, &vg->lvs) { + if (is_lockd_type(vg->lock_type)) { + if (lockd_lv_uses_lock(lvl->lv)) { + if (vg->skip_validate_lock_args) { + continue; + } else if (!lvl->lv->lock_args) { + log_error(INTERNAL_ERROR "LV %s/%s missing lock_args", + vg->name, lvl->lv->name); + r = 0; + } else if (!strcmp(vg->lock_type, "sanlock")) { + if (dm_hash_lookup(vhash.lv_lock_args, lvl->lv->lock_args)) { + log_error(INTERNAL_ERROR "LV %s/%s has duplicate lock_args %s.", + vg->name, lvl->lv->name, lvl->lv->lock_args); + r = 0; + } + + if (!dm_hash_insert(vhash.lv_lock_args, lvl->lv->lock_args, lvl)) { + log_error("Failed to hash lvname."); + r = 0; + } + + } else if (!strcmp(vg->lock_type, "dlm") && strcmp(lvl->lv->lock_args, "dlm")) { + log_error(INTERNAL_ERROR "LV %s/%s bad dlm lock_args %s", + vg->name, lvl->lv->name, lvl->lv->lock_args); + r = 0; + } + } else { + if (lvl->lv->lock_args) { + log_error(INTERNAL_ERROR "LV %s/%s shouldn't have lock_args", + vg->name, lvl->lv->name); + r = 0; + } + } + } else { + if (lvl->lv->lock_args) { + log_error(INTERNAL_ERROR "LV %s/%s with no lock_type has lock_args %s", + vg->name, lvl->lv->name, lvl->lv->lock_args); + r = 0; + } + } + } + out: if (vhash.lvid) 
dm_hash_destroy(vhash.lvid); @@ -2793,6 +2886,8 @@ out: dm_hash_destroy(vhash.lvname); if (vhash.pvid) dm_hash_destroy(vhash.pvid); + if (vhash.lv_lock_args) + dm_hash_destroy(vhash.lv_lock_args); return r; } @@ -2806,8 +2901,19 @@ int vg_write(struct volume_group *vg) struct dm_list *mdah; struct pv_to_create *pv_to_create; struct metadata_area *mda; + struct lv_list *lvl; int revert = 0, wrote = 0; + dm_list_iterate_items(lvl, &vg->lvs) { + if (lvl->lv->lock_args && !strcmp(lvl->lv->lock_args, "pending")) { + if (!lockd_init_lv_args(vg->cmd, vg, lvl->lv, vg->lock_type, &lvl->lv->lock_args)) { + log_error("Cannot allocate lock for new LV."); + return 0; + } + lvl->lv->new_lock_args = 1; + } + } + if (!vg_validate(vg)) return_0; @@ -2974,6 +3080,8 @@ int vg_commit(struct volume_group *vg) cache_updated = _vg_commit_mdas(vg); + lockd_vg_update(vg); + if (cache_updated) { /* Instruct remote nodes to upgrade cached metadata. */ if (!remote_commit_cached_metadata(vg)) @@ -3007,6 +3115,14 @@ int vg_commit(struct volume_group *vg) void vg_revert(struct volume_group *vg) { struct metadata_area *mda; + struct lv_list *lvl; + + dm_list_iterate_items(lvl, &vg->lvs) { + if (lvl->lv->new_lock_args) { + lockd_free_lv(vg->cmd, vg, lvl->lv->name, &lvl->lv->lvid.id[1], lvl->lv->lock_args); + lvl->lv->new_lock_args = 0; + } + } release_vg(vg->vg_precommitted); /* VG is no longer needed */ vg->vg_precommitted = NULL; @@ -3818,6 +3934,16 @@ static struct volume_group *_vg_read_by_vgid(struct cmd_context *cmd, release_vg(vg); } + /* + * When using lvmlockd we should never reach this point. + * The VG is locked, then vg_read() is done, which gets + * the latest VG from lvmetad, or disk if lvmetad has + * been invalidated. When we get here the VG should + * always be cached and returned above. + */ + if (lvmlockd_use()) + log_error(INTERNAL_ERROR "vg_read_by_vgid failed with lvmlockd"); + /* Mustn't scan if memory locked: ensure cache gets pre-populated! 
*/ if (critical_section()) return_NULL; @@ -4506,18 +4632,47 @@ static int _access_vg_clustered(struct cmd_context *cmd, struct volume_group *vg return 1; } -static int _access_vg_lock_type(struct cmd_context *cmd, struct volume_group *vg) +static int _access_vg_lock_type(struct cmd_context *cmd, struct volume_group *vg, + uint32_t lockd_state) { if (!is_real_vg(vg->name)) return 1; + if (cmd->lockd_vg_disable) + return 1; + /* - * Until lock_type support is added, reject any VG that has a lock_type. + * Local VG requires no lock from lvmlockd. */ - if (vg->lock_type && vg->lock_type[0] && strcmp(vg->lock_type, "none")) { - log_error("Cannot access VG %s with unsupported lock_type %s.", - vg->name, vg->lock_type); - return 0; + if (!is_lockd_type(vg->lock_type)) + return 1; + + /* + * When lvmlockd is not used, only allow read access to the VG. + */ + if (!lvmlockd_use()) { + if (lockd_state & LDST_EX) { + log_error("Cannot access VG %s which requires lvmlockd for lock_type %s.", + vg->name, vg->lock_type); + return 0; + } else { + log_warn("Reading VG %s without a lock.", vg->name); + return 1; + } + } + + /* + * The lock failed. If the lock was ex, we cannot continue. + * If the lock was sh, we can allow reading. + */ + if (lockd_state & LDST_FAIL) { + if (lockd_state & LDST_EX) { + log_error("Cannot access VG %s due to failed lock.", vg->name); + return 0; + } else { + log_warn("Reading VG %s without a lock.", vg->name); + return 1; + } } return 1; @@ -4598,7 +4753,8 @@ static int _access_vg_systemid(struct cmd_context *cmd, struct volume_group *vg) /* * FIXME: move _vg_bad_status_bits() checks in here. */ -static int _vg_access_permitted(struct cmd_context *cmd, struct volume_group *vg, uint32_t *failure) +static int _vg_access_permitted(struct cmd_context *cmd, struct volume_group *vg, + uint32_t lockd_state, uint32_t *failure) { if (!is_real_vg(vg->name)) { /* Disallow use of LVM1 orphans when a host system ID is set. 
*/ @@ -4614,7 +4770,7 @@ static int _vg_access_permitted(struct cmd_context *cmd, struct volume_group *vg return 0; } - if (!_access_vg_lock_type(cmd, vg)) { + if (!_access_vg_lock_type(cmd, vg, lockd_state)) { *failure |= FAILED_LOCK_TYPE; return 0; } @@ -4640,7 +4796,8 @@ static int _vg_access_permitted(struct cmd_context *cmd, struct volume_group *vg */ static struct volume_group *_vg_lock_and_read(struct cmd_context *cmd, const char *vg_name, const char *vgid, uint32_t lock_flags, - uint64_t status_flags, uint32_t misc_flags) + uint64_t status_flags, uint32_t misc_flags, + uint32_t lockd_state) { struct volume_group *vg = NULL; int consistent = 1; @@ -4686,7 +4843,7 @@ static struct volume_group *_vg_lock_and_read(struct cmd_context *cmd, const cha goto bad; } - if (!_vg_access_permitted(cmd, vg, &failure)) + if (!_vg_access_permitted(cmd, vg, lockd_state, &failure)) goto bad; /* consistent == 0 when VG is not found, but failed == FAILED_NOTFOUND */ @@ -4762,7 +4919,7 @@ bad_no_unlock: * *consistent = 1. */ struct volume_group *vg_read(struct cmd_context *cmd, const char *vg_name, - const char *vgid, uint32_t flags) + const char *vgid, uint32_t flags, uint32_t lockd_state) { uint64_t status = UINT64_C(0); uint32_t lock_flags = LCK_VG_READ; @@ -4775,7 +4932,7 @@ struct volume_group *vg_read(struct cmd_context *cmd, const char *vg_name, if (flags & READ_ALLOW_EXPORTED) status &= ~EXPORTED_VG; - return _vg_lock_and_read(cmd, vg_name, vgid, lock_flags, status, flags); + return _vg_lock_and_read(cmd, vg_name, vgid, lock_flags, status, flags, lockd_state); } /* @@ -4784,9 +4941,9 @@ struct volume_group *vg_read(struct cmd_context *cmd, const char *vg_name, * request the new metadata to be written and committed). 
*/ struct volume_group *vg_read_for_update(struct cmd_context *cmd, const char *vg_name, - const char *vgid, uint32_t flags) + const char *vgid, uint32_t flags, uint32_t lockd_state) { - return vg_read(cmd, vg_name, vgid, flags | READ_FOR_UPDATE); + return vg_read(cmd, vg_name, vgid, flags | READ_FOR_UPDATE, lockd_state); } /* diff --git a/lib/metadata/raid_manip.c b/lib/metadata/raid_manip.c index 64cfb3f13..f88381b4e 100644 --- a/lib/metadata/raid_manip.c +++ b/lib/metadata/raid_manip.c @@ -21,6 +21,7 @@ #include "activate.h" #include "lv_alloc.h" #include "lvm-string.h" +#include "lvmlockd.h" static int _lv_is_raid_with_tracking(const struct logical_volume *lv, struct logical_volume **tracking) @@ -1082,6 +1083,12 @@ int lv_raid_split(struct logical_volume *lv, const char *split_name, dm_list_init(&removal_list); dm_list_init(&data_list); + if (is_lockd_type(lv->vg->lock_type)) { + log_error("Splitting raid image is not allowed with lock_type %s", + lv->vg->lock_type); + return 0; + } + if ((old_count - new_count) != 1) { log_error("Unable to split more than one image from %s/%s", lv->vg->name, lv->name); diff --git a/lib/metadata/replicator_manip.c b/lib/metadata/replicator_manip.c index 54dc75929..fc4bf5fa1 100644 --- a/lib/metadata/replicator_manip.c +++ b/lib/metadata/replicator_manip.c @@ -566,7 +566,7 @@ int cmd_vg_read(struct cmd_context *cmd, struct dm_list *cmd_vgs) /* Iterate through alphabeticaly ordered cmd_vg list */ dm_list_iterate_items(cvl, cmd_vgs) { - cvl->vg = vg_read(cmd, cvl->vg_name, cvl->vgid, cvl->flags); + cvl->vg = vg_read(cmd, cvl->vg_name, cvl->vgid, cvl->flags, 0); if (vg_read_error(cvl->vg)) { log_debug_metadata("Failed to vg_read %s", cvl->vg_name); return 0; @@ -644,7 +644,7 @@ int lv_read_replicator_vgs(const struct logical_volume *lv) dm_list_iterate_items(rsite, &first_seg(lv)->replicator->rsites) { if (!rsite->vg_name) continue; - vg = vg_read(lv->vg->cmd, rsite->vg_name, 0, 0); // READ_WITHOUT_LOCK + vg = vg_read(lv->vg->cmd, 
rsite->vg_name, 0, 0, 0); // READ_WITHOUT_LOCK if (vg_read_error(vg)) { log_error("Unable to read volume group %s", rsite->vg_name); diff --git a/lib/metadata/vg.c b/lib/metadata/vg.c index 1db8e7588..0e5aee5af 100644 --- a/lib/metadata/vg.c +++ b/lib/metadata/vg.c @@ -20,6 +20,7 @@ #include "toolcontext.h" #include "lvmcache.h" #include "archiver.h" +#include "lvmlockd.h" struct volume_group *alloc_vg(const char *pool_name, struct cmd_context *cmd, const char *vg_name) @@ -134,6 +135,16 @@ char *vg_system_id_dup(const struct volume_group *vg) return dm_pool_strdup(vg->vgmem, vg->system_id ? : vg->lvm1_system_id ? : ""); } +char *vg_lock_type_dup(const struct volume_group *vg) +{ + return dm_pool_strdup(vg->vgmem, vg->lock_type ? : vg->lock_type ? : ""); +} + +char *vg_lock_args_dup(const struct volume_group *vg) +{ + return dm_pool_strdup(vg->vgmem, vg->lock_args ? : vg->lock_args ? : ""); +} + char *vg_uuid_dup(const struct volume_group *vg) { return id_format_and_copy(vg->vgmem, &vg->id); @@ -637,6 +648,19 @@ int vg_set_system_id(struct volume_group *vg, const char *system_id) return 1; } +int vg_set_lock_type(struct volume_group *vg, const char *lock_type) +{ + if (!lock_type) + lock_type = "none"; + + if (!(vg->lock_type = dm_pool_strdup(vg->vgmem, lock_type))) { + log_error("vg_set_lock_type %s no mem", lock_type); + return 0; + } + + return 1; +} + char *vg_attr_dup(struct dm_pool *mem, const struct volume_group *vg) { char *repstr; @@ -651,7 +675,14 @@ char *vg_attr_dup(struct dm_pool *mem, const struct volume_group *vg) repstr[2] = (vg_is_exported(vg)) ? 'x' : '-'; repstr[3] = (vg_missing_pv_count(vg)) ? 'p' : '-'; repstr[4] = alloc_policy_char(vg->alloc); - repstr[5] = (vg_is_clustered(vg)) ? 
'c' : '-'; + + if (vg_is_clustered(vg)) + repstr[5] = 'c'; + else if (is_lockd_type(vg->lock_type)) + repstr[5] = 's'; + else + repstr[5] = '-'; + return repstr; } @@ -706,7 +737,7 @@ int vgreduce_single(struct cmd_context *cmd, struct volume_group *vg, vg->extent_count -= pv_pe_count(pv); orphan_vg = vg_read_for_update(cmd, vg->fid->fmt->orphan_vg_name, - NULL, 0); + NULL, 0, 0); if (vg_read_error(orphan_vg)) goto bad; diff --git a/lib/metadata/vg.h b/lib/metadata/vg.h index 2da565151..a21af8b06 100644 --- a/lib/metadata/vg.h +++ b/lib/metadata/vg.h @@ -49,6 +49,7 @@ struct volume_group { struct dm_list *cmd_vgs;/* List of wanted/locked and opened VGs */ uint32_t cmd_missing_vgs;/* Flag marks missing VG */ uint32_t seqno; /* Metadata sequence number */ + unsigned skip_validate_lock_args : 1; /* * The parsed on-disk copy of this VG; is NULL if this is the on-disk @@ -71,6 +72,7 @@ struct volume_group { const char *system_id; char *lvm1_system_id; const char *lock_type; + const char *lock_args; uint32_t extent_size; uint32_t extent_count; @@ -151,6 +153,7 @@ struct volume_group { struct dm_hash_table *hostnames; /* map of creation hostnames */ struct logical_volume *pool_metadata_spare_lv; /* one per VG */ + struct logical_volume *sanlock_lv; /* one per VG */ }; struct volume_group *alloc_vg(const char *pool_name, struct cmd_context *cmd, @@ -166,11 +169,14 @@ void free_orphan_vg(struct volume_group *vg); char *vg_fmt_dup(const struct volume_group *vg); char *vg_name_dup(const struct volume_group *vg); char *vg_system_id_dup(const struct volume_group *vg); +char *vg_lock_type_dup(const struct volume_group *vg); +char *vg_lock_args_dup(const struct volume_group *vg); uint32_t vg_seqno(const struct volume_group *vg); uint64_t vg_status(const struct volume_group *vg); int vg_set_alloc_policy(struct volume_group *vg, alloc_policy_t alloc); int vg_set_clustered(struct volume_group *vg, int clustered); int vg_set_system_id(struct volume_group *vg, const char *system_id); 
+int vg_set_lock_type(struct volume_group *vg, const char *lock_type); uint64_t vg_size(const struct volume_group *vg); uint64_t vg_free(const struct volume_group *vg); uint64_t vg_extent_size(const struct volume_group *vg); diff --git a/lib/report/columns.h b/lib/report/columns.h index 06282c5b4..22141d885 100644 --- a/lib/report/columns.h +++ b/lib/report/columns.h @@ -84,6 +84,7 @@ FIELD(LVS, lv, STR, "Meta", lvid, 4, metadatalv, metadata_lv, "For thin and cach FIELD(LVS, lv, STR, "Pool", lvid, 4, poollv, pool_lv, "For thin volumes, the thin pool LV for this volume.", 0) FIELD(LVS, lv, STR_LIST, "LV Tags", tags, 7, tags, lv_tags, "Tags, if any.", 0) FIELD(LVS, lv, STR, "LProfile", lvid, 8, lvprofile, lv_profile, "Configuration profile attached to this LV.", 0) +FIELD(LVS, lv, STR, "Lock Args", lvid, 9, lvlockargs, lv_lockargs, "Lock args of the LV used by lvmlockd.", 0) FIELD(LVS, lv, STR, "Time", lvid, 26, lvtime, lv_time, "Creation time of the LV, if known", 0) FIELD(LVS, lv, STR, "Host", lvid, 10, lvhost, lv_host, "Creation host of the LV, if known.", 0) FIELD(LVS, lv, STR_LIST, "Modules", lvid, 7, modules, lv_modules, "Kernel device-mapper modules required for this LV.", 0) @@ -143,6 +144,8 @@ FIELD(VGS, vg, SIZ, "VSize", cmd, 5, vgsize, vg_size, "Total size of VG in curre FIELD(VGS, vg, SIZ, "VFree", cmd, 5, vgfree, vg_free, "Total amount of free space in current units.", 0) FIELD(VGS, vg, STR, "SYS ID", cmd, 6, vgsystemid, vg_sysid, "System ID of the VG indicating which host owns it.", 0) FIELD(VGS, vg, STR, "System ID", cmd, 9, vgsystemid, vg_systemid, "System ID of the VG indicating which host owns it.", 0) +FIELD(VGS, vg, STR, "Lock Type", cmd, 9, vglocktype, vg_locktype, "Lock type of the VG used by lvmlockd.", 0) +FIELD(VGS, vg, STR, "Lock Args", cmd, 9, vglockargs, vg_lockargs, "Lock args of the VG used by lvmlockd.", 0) FIELD(VGS, vg, SIZ, "Ext", extent_size, 3, size32, vg_extent_size, "Size of Physical Extents in current units.", 0) FIELD(VGS, vg, 
NUM, "#Ext", extent_count, 4, uint32, vg_extent_count, "Total number of Physical Extents.", 0) FIELD(VGS, vg, NUM, "Free", free_count, 4, uint32, vg_free_count, "Total number of unallocated Physical Extents.", 0) diff --git a/lib/report/properties.c b/lib/report/properties.c index b0a91a7da..108836f2e 100644 --- a/lib/report/properties.c +++ b/lib/report/properties.c @@ -350,6 +350,8 @@ GET_LV_STR_PROPERTY_FN(lv_active, lv_active_dup(lv->vg->vgmem, lv)) #define _lv_active_set prop_not_implemented_set GET_LV_STR_PROPERTY_FN(lv_profile, lv_profile_dup(lv->vg->vgmem, lv)) #define _lv_profile_set prop_not_implemented_set +GET_LV_STR_PROPERTY_FN(lv_lockargs, lv_lock_args_dup(lv->vg->vgmem, lv)) +#define _lv_lockargs_set prop_not_implemented_set /* VG */ GET_VG_STR_PROPERTY_FN(vg_fmt, vg_fmt_dup(vg)) @@ -368,6 +370,10 @@ GET_VG_STR_PROPERTY_FN(vg_sysid, vg_system_id_dup(vg)) #define _vg_sysid_set prop_not_implemented_set GET_VG_STR_PROPERTY_FN(vg_systemid, vg_system_id_dup(vg)) #define _vg_systemid_set prop_not_implemented_set +GET_VG_STR_PROPERTY_FN(vg_locktype, vg_lock_type_dup(vg)) +#define _vg_locktype_set prop_not_implemented_set +GET_VG_STR_PROPERTY_FN(vg_lockargs, vg_lock_args_dup(vg)) +#define _vg_lockargs_set prop_not_implemented_set GET_VG_NUM_PROPERTY_FN(vg_extent_size, (SECTOR_SIZE * vg->extent_size)) #define _vg_extent_size_set prop_not_implemented_set GET_VG_NUM_PROPERTY_FN(vg_extent_count, vg->extent_count) diff --git a/lib/report/report.c b/lib/report/report.c index a1076690a..5c6f64762 100644 --- a/lib/report/report.c +++ b/lib/report/report.c @@ -369,6 +369,16 @@ static int _lvprofile_disp(struct dm_report *rh, struct dm_pool *mem, return _field_set_value(field, "", NULL); } +static int _lvlockargs_disp(struct dm_report *rh, struct dm_pool *mem, + struct dm_report_field *field, + const void *data, void *private) +{ + const struct logical_volume *lv = (const struct logical_volume *) data; + const char *repstr = lv->lock_args ? 
lv->lock_args : ""; + + return _string_disp(rh, mem, field, &repstr, private); +} + static int _vgfmt_disp(struct dm_report *rh, struct dm_pool *mem, struct dm_report_field *field, const void *data, void *private) @@ -1100,6 +1110,26 @@ static int _vgsystemid_disp(struct dm_report *rh, struct dm_pool *mem, return _string_disp(rh, mem, field, &repstr, private); } +static int _vglocktype_disp(struct dm_report *rh, struct dm_pool *mem, + struct dm_report_field *field, + const void *data, void *private) +{ + const struct volume_group *vg = (const struct volume_group *) data; + const char *repstr = vg->lock_type ? vg->lock_type : ""; + + return _string_disp(rh, mem, field, &repstr, private); +} + +static int _vglockargs_disp(struct dm_report *rh, struct dm_pool *mem, + struct dm_report_field *field, + const void *data, void *private) +{ + const struct volume_group *vg = (const struct volume_group *) data; + const char *repstr = vg->lock_args ? vg->lock_args : ""; + + return _string_disp(rh, mem, field, &repstr, private); +} + static int _uuid_disp(struct dm_report *rh __attribute__((unused)), struct dm_pool *mem, struct dm_report_field *field, const void *data, void *private __attribute__((unused))) diff --git a/liblvm/lvm_vg.c b/liblvm/lvm_vg.c index 76c5c6356..a2d42d23f 100644 --- a/liblvm/lvm_vg.c +++ b/liblvm/lvm_vg.c @@ -218,7 +218,7 @@ static vg_t _lvm_vg_open(lvm_t libh, const char *vgname, const char *mode, return NULL; } - vg = vg_read((struct cmd_context *)libh, vgname, NULL, internal_flags); + vg = vg_read((struct cmd_context *)libh, vgname, NULL, internal_flags, 0); if (vg_read_error(vg)) { /* FIXME: use log_errno either here in inside vg_read */ release_vg(vg); diff --git a/man/Makefile.in b/man/Makefile.in index d75d9168f..0eca98712 100644 --- a/man/Makefile.in +++ b/man/Makefile.in @@ -46,6 +46,12 @@ else LVMPOLLD = endif +ifeq ("@BUILD_LVMLOCKD@", "yes") +LVMLOCKD = lvmlockd.8 +else +LVMLOCKD = +endif + MAN5=lvm.conf.5 MAN7=lvmsystemid.7 
MAN8=lvm-config.8 lvm-dumpconfig.8 lvm-lvpoll.8 \ @@ -56,7 +62,8 @@ MAN8=lvm-config.8 lvm-dumpconfig.8 lvm-lvpoll.8 \ pvresize.8 pvs.8 pvscan.8 vgcfgbackup.8 vgcfgrestore.8 vgchange.8 \ vgck.8 vgcreate.8 vgconvert.8 vgdisplay.8 vgexport.8 vgextend.8 \ vgimport.8 vgimportclone.8 vgmerge.8 vgmknodes.8 vgreduce.8 vgremove.8 \ - vgrename.8 vgs.8 vgscan.8 vgsplit.8 $(FSADMMAN) $(LVMETAD) $(LVMPOLLD) + vgrename.8 vgs.8 vgscan.8 vgsplit.8 $(FSADMMAN) $(LVMETAD) $(LVMPOLLD) \ + $(LVMLOCKD) ifneq ("@CLVMD@", "none") MAN8CLUSTER=clvmd.8 diff --git a/man/lvmlockd.8.in b/man/lvmlockd.8.in new file mode 100644 index 000000000..79a3218fe --- /dev/null +++ b/man/lvmlockd.8.in @@ -0,0 +1,755 @@ +.TH "LVMLOCKD" "8" "LVM TOOLS #VERSION#" "Red Hat, Inc" "\"" + +.SH NAME +lvmlockd \(em LVM locking daemon + +.SH DESCRIPTION +LVM commands use lvmlockd to coordinate access to shared storage. +.br +When LVM is used on devices shared by multiple hosts, locks will: + +.IP \[bu] 2 +coordinate reading and writing of LVM metadata +.IP \[bu] 2 +validate caching of LVM metadata +.IP \[bu] 2 +prevent concurrent activation of logical volumes + +.P + +lvmlockd uses an external lock manager to perform basic locking. +.br +Lock manager (lock type) options are: + +.IP \[bu] 2 +sanlock: places locks on disk within LVM storage. +.IP \[bu] 2 +dlm: uses network communication and a cluster manager. + +.P + +.SH OPTIONS + +lvmlockd [options] + +For default settings, see lvmlockd -h. + +.B --help | -h + Show this help information. + +.B --version | -V + Show version of lvmlockd. + +.B --test | -T + Test mode, do not call lock manager. + +.B --foreground | -f + Don't fork. + +.B --daemon-debug | -D + Don't fork and print debugging to stdout. + +.B --pid-file | -p +.I path + Set path to the pid file. + +.B --socket-path | -s +.I path + Set path to the socket to listen on. + +.B --syslog-priority | -S err|warning|debug + Write log messages from this level up to syslog. 
+ +.B --gl-type | -g +.I str + Set global lock type to be sanlock|dlm. + +.B --host-id | -i +.I num + Set the local sanlock host id. + +.B --host-id-file | -F +.I path + A file containing the local sanlock host_id. + +.B --adopt | -A 0|1 + Adopt locks from a previous instance of lvmlockd. + + +.SH USAGE + +.SS Initial set up + +Using LVM with lvmlockd for the first time includes some one-time set up +steps: + +.SS 1. choose a lock manager + +.I dlm +.br +If dlm (or corosync) are already being used by other cluster +software, then select dlm. dlm uses corosync which requires additional +configuration beyond the scope of this document. See corosync and dlm +documentation for instructions on configuration, setup and usage. + +.I sanlock +.br +Choose sanlock if dlm/corosync are not otherwise required. +sanlock does not depend on any clustering software or configuration. + +.SS 2. configure hosts to use lvmlockd + +On all hosts running lvmlockd, configure lvm.conf: +.nf +locking_type = 1 +use_lvmlockd = 1 +use_lvmetad = 1 +.fi + +.I sanlock +.br +Assign each host a unique host_id in the range 1-2000 by setting +.br +/etc/lvm/lvmlocal.conf local/host_id = <num> + +.SS 3. start lvmlockd + +Use a service/init file if available, or just run "lvmlockd". + +.SS 4. start lock manager + +.I sanlock +.br +systemctl start wdmd sanlock + +.I dlm +.br +Follow external clustering documentation when applicable, otherwise: +.br +systemctl start corosync dlm + +.SS 5. create VGs on shared devices + +vgcreate --shared <vg_name> <devices> + +The vgcreate --shared option sets the VG lock type to sanlock or dlm +depending on which lock manager is running. LVM commands will perform +locking for the VG using lvmlockd. + +.SS 6. start VGs on all hosts + +vgchange --lock-start + +lvmlockd requires shared VGs to be "started" before they are used. This +is a lock manager operation to start/join the VG lockspace, and it may +take some time. 
Until the start completes, locks for the VG are not +available. LVM commands are allowed to read the VG while start is in +progress. (A service/init file can be used to start VGs.) + +.SS 7. create and activate LVs + +Standard lvcreate and lvchange commands are used to create and activate +LVs in a lockd VG. + +An LV activated exclusively on one host cannot be activated on another. +When multiple hosts need to use the same LV concurrently, the LV can be +activated with a shared lock (see lvchange options -aey vs -asy.) +(Shared locks are disallowed for certain LV types that cannot be used from +multiple hosts.) + + +.SS Normal start up and shut down + +After initial set up, start up and shut down include the following general +steps. They can be performed manually or using the system init/service +manager. + +.IP \[bu] 2 +start lvmetad +.IP \[bu] 2 +start lvmlockd +.IP \[bu] 2 +start lock manager +.IP \[bu] 2 +vgchange --lock-start +.IP \[bu] 2 +activate LVs in shared VGs + +.P + +The shut down sequence is the reverse: + +.IP \[bu] 2 +deactivate LVs in shared VGs +.IP \[bu] 2 +vgchange --lock-stop +.IP \[bu] 2 +stop lock manager +.IP \[bu] 2 +stop lvmlockd +.IP \[bu] 2 +stop lvmetad + +.P + +.SH TOPICS + +.SS locking terms + +The following terms are used to distinguish VGs that require locking from +those that do not. + +.I "lockd VG" + +A "lockd VG" is a shared VG that has a "lock type" of dlm or sanlock. +Using it requires lvmlockd. These VGs exist on shared storage that is +visible to multiple hosts. LVM commands use lvmlockd to perform locking +for these VGs when they are used. + +If the lock manager for a lock type is not available (e.g. not started or +failed), lvmlockd is not able to acquire locks from it, and LVM commands +are unable to fully use VGs with the given lock type. Commands generally +allow reading VGs in this condition, but changes and activation are not +allowed. 
Maintaining a properly running lock manager can require +background not covered here. + +.I "local VG" + +A "local VG" is meant to be used by a single host. It has no lock type or +lock type "none". LVM commands and lvmlockd do not perform locking for +these VGs. A local VG typically exists on local (non-shared) devices and +cannot be used concurrently from different hosts. + +If a local VG does exist on shared devices, it should be owned by a single +host by having its system ID set, see +.BR lvmsystemid (7). +Only the host with a matching system ID can use the local VG. A VG +with no lock type and no system ID should be excluded from all but one +host using lvm.conf filters. Without any of these protections, a local VG +on shared devices can be easily damaged or destroyed. + +.I "clvm VG" + +A "clvm VG" is a VG on shared storage (like a lockd VG) that requires +clvmd for clustering. See below for converting a clvm VG to a lockd VG. + + +.SS lockd VGs from hosts not using lvmlockd + +Only hosts that will use lockd VGs should be configured to run lvmlockd. +However, devices with lockd VGs may be visible from hosts not using +lvmlockd. From a host not using lvmlockd, visible lockd VGs are ignored +in the same way as foreign VGs, i.e. those with a foreign system ID, see +.BR lvmsystemid (7). + + +.SS vgcreate differences + +Forms of the vgcreate command: + +.B vgcreate <vg_name> <devices> + +.IP \[bu] 2 +Creates a local VG with the local system ID when neither lvmlockd nor clvm are configured. +.IP \[bu] 2 +Creates a local VG with the local system ID when lvmlockd is configured. +.IP \[bu] 2 +Creates a clvm VG when clvm is configured. + +.P + +.B vgcreate --shared <vg_name> <devices> +.IP \[bu] 2 +Requires lvmlockd to be configured (use_lvmlockd=1). +.IP \[bu] 2 +Creates a lockd VG with lock type sanlock|dlm depending on which is running. +.IP \[bu] 2 +LVM commands request locks from lvmlockd to use the VG. 
+.IP \[bu] 2 +lvmlockd obtains locks from the selected lock manager. + +.P + +.B vgcreate -c|--clustered y <vg_name> <devices> +.IP \[bu] 2 +Requires clvm to be configured (locking_type=3). +.IP \[bu] 2 +Creates a clvm VG with the "clustered" flag. +.IP \[bu] 2 +LVM commands request locks from clvmd to use the VG. + +.P + +.SS using lockd VGs + +When use_lvmlockd is first enabled, and before the first lockd VG is +created, no global lock will exist, and LVM commands will try and fail to +acquire it. LVM commands will report a warning until the first lockd VG +is created which will create the global lock. Before the global lock +exists, VGs can still be read, but commands that require the global lock +exclusively will fail. + +When a new lockd VG is created, its lockspace is automatically started on +the host that creates the VG. Other hosts will need to run 'vgchange +--lock-start' to start the new VG before they can use it. + +From the 'vgs' reporting command, lockd VGs are indicated by "s" (for +shared) in the sixth attr field. The specific lock type and lock args +for a lockd VG can be displayed with 'vgs -o+locktype,lockargs'. + + +.SS starting and stopping VGs + +Starting a lockd VG (vgchange --lock-start) causes the lock manager to +start or join the lockspace for the VG. This makes locks for the VG +accessible to the host. Stopping the VG leaves the lockspace and makes +locks for the VG inaccessible to the host. + +Lockspaces should be started as early as possible because starting +(joining) a lockspace can take a long time (potentially minutes after a +host failure when using sanlock.) A VG can be started after all the +following are true: + +.nf +- lvmlockd is running +- lock manager is running +- VG is visible to the system +.fi + +All lockd VGs can be started/stopped using: +.br +vgchange --lock-start +.br +vgchange --lock-stop + + +Individual VGs can be started/stopped using: +.br +vgchange --lock-start <vg_name> ... 
+.br +vgchange --lock-stop <vg_name> ... + +To make vgchange not wait for start to complete: +.br +vgchange --lock-start --lock-opt nowait +.br +vgchange --lock-start --lock-opt nowait <vg_name> + +To stop all lockspaces and wait for all to complete: +.br +lvmlockctl --stop-lockspaces --wait + +To start only selected lockd VGs, use the lvm.conf +activation/lock_start_list. When defined, only VG names in this list are +started by vgchange. If the list is not defined (the default), all +visible lockd VGs are started. To start only "vg1", use the following +lvm.conf configuration: + +.nf +activation { + lock_start_list = [ "vg1" ] + ... +} +.fi + + +.SS automatic starting and automatic activation + +Scripts or programs on a host that automatically start VGs will use the +"auto" option to indicate that the command is being run automatically by +the system: + +vgchange --lock-start --lock-opt auto [vg_name ...] + +Without any additional configuration, including the "auto" option has no +effect; all VGs are started unless restricted by lock_start_list. + +However, when the lvm.conf activation/auto_lock_start_list is defined, the +auto start command performs an additional filtering phase to all VGs being +started, testing each VG name against the auto_lock_start_list. The +auto_lock_start_list defines lockd VGs that will be started by the auto +start command. Visible lockd VGs not included in the list are ignored by +the auto start command. If the list is undefined, all VG names pass this +filter. (The lock_start_list is also still used to filter all VGs.) + +The auto_lock_start_list allows a user to select certain lockd VGs that +should be automatically started by the system (or indirectly, those that +should not). + +To use auto activation of lockd LVs (see auto_activation_volume_list), +auto starting of the corresponding lockd VGs is necessary. 
+ + +.SS locking activity + +To optimize the use of LVM with lvmlockd, consider the three kinds of +locks in lvmlockd and when they are used: + +.I GL lock + +The global lock (GL lock) is associated with global information, which is +information not isolated to a single VG. This includes: + +- The global VG namespace. +.br +- The set of orphan PVs and unused devices. +.br +- The properties of orphan PVs, e.g. PV size. + +The global lock is used in shared mode by commands that read this +information, or in exclusive mode by commands that change it. + +The command 'vgs' acquires the global lock in shared mode because it +reports the list of all VG names. + +The vgcreate command acquires the global lock in exclusive mode because it +creates a new VG name, and it takes a PV from the list of unused PVs. + +When an LVM command is given a tag argument, or uses select, it must read +all VGs to match the tag or selection, which causes the global lock to be +acquired. To avoid use of the global lock, avoid using tags and select, +and specify VG name arguments. + +When use_lvmlockd is enabled, LVM commands attempt to acquire the global +lock even if no lockd VGs exist. For this reason, lvmlockd should not be +enabled unless lockd VGs will be used. + +.I VG lock + +A VG lock is associated with each VG. The VG lock is acquired in shared +mode to read the VG and in exclusive mode to change the VG (modify the VG +metadata). This lock serializes modifications to a VG with all other LVM +commands on other hosts. + +The command 'vgs' will not only acquire the GL lock to read the list of +all VG names, but will acquire the VG lock for each VG prior to reading +it. + +The command 'vgs <vg_name>' does not acquire the GL lock (it does not need +the list of all VG names), but will acquire the VG lock on each VG name +argument. + +.I LV lock + +An LV lock is acquired before the LV is activated, and is released after +the LV is deactivated. 
If the LV lock cannot be acquired, the LV is not +activated. LV locks are persistent and remain in place after the +activation command is done. GL and VG locks are transient, and are held +only while an LVM command is running. + +.I retries + +If a request for a GL or VG lock fails due to a lock conflict with another +host, lvmlockd automatically retries for a short time before returning a +failure to the LVM command. The LVM command will then retry the entire +lock request a number of times specified by global/lock_retries before +failing. If a request for an LV lock fails due to a lock conflict, the +command fails immediately. + + +.SS sanlock global lock + +There are some special cases related to the global lock in sanlock VGs. + +The global lock exists in one of the sanlock VGs. The first sanlock VG +created will contain the global lock. Subsequent sanlock VGs will each +contain disabled global locks that can be enabled later if necessary. + +The VG containing the global lock must be visible to all hosts using +sanlock VGs. This can be a reason to create a small sanlock VG, visible +to all hosts, and dedicated to just holding the global lock. While not +required, this strategy can help to avoid extra work in the future if VGs +are moved or removed. + +The vgcreate command typically acquires the global lock, but in the case +of the first sanlock VG, there will be no global lock to acquire until the +initial vgcreate is complete. So, creating the first sanlock VG is a +special case that skips the global lock. + +vgcreate for a sanlock VG determines it is the first one to exist if no +other sanlock VGs are visible. It is possible that other sanlock VGs do +exist but are not visible or started on the host running vgcreate. This +raises the possibility of more than one global lock existing. If this +happens, commands will warn of the condition, and it should be manually +corrected. 
+ +If the situation arises where more than one sanlock VG contains a global +lock, the global lock should be manually disabled in all but one of them +with the command: + +lvmlockctl --gl-disable <vg_name> + +(The one VG with the global lock enabled must be visible to all hosts.) + +An opposite problem can occur if the VG holding the global lock is +removed. In this case, no global lock will exist following the vgremove, +and subsequent LVM commands will fail to acquire it. In this case, the +global lock needs to be manually enabled in one of the remaining sanlock +VGs with the command: + +lvmlockctl --gl-enable <vg_name> + +A small sanlock VG dedicated to holding the global lock can avoid the case +where the GL lock must be manually enabled after a vgremove. + + +.SS changing lock type + +To change a local VG to a lockd VG: + +vgchange --lock-type sanlock|dlm <vg_name> + +All LVs must be inactive to change the lock type. + +To change a clvm VG to a lockd VG: + +vgchange --lock-type sanlock|dlm <vg_name> + +Changing a lockd VG to a local VG is not yet generally allowed. +(It can be done partially in certain recovery cases.) + + +.SS vgremove of a sanlock VG + +vgremove of a sanlock VG will fail if other hosts have the VG started. +Run vgchange --lock-stop <vg_name> on all other hosts before vgremove. + +(It may take several seconds before vgremove recognizes that all hosts +have stopped.) + + +.SS shared LVs + +When an LV is used concurrently from multiple hosts (e.g. by a +multi-host/cluster application or file system), the LV can be activated on +multiple hosts concurrently using a shared lock. + +To activate the LV with a shared lock: lvchange -asy vg/lv. + +With lvmlockd, an unspecified activation mode is always exclusive, i.e. +-ay defaults to -aey. + +If the LV type does not allow the LV to be used concurrently from multiple +hosts, then a shared activation lock is not allowed and the lvchange +command will report an error. 
LV types that cannot be used concurrently +from multiple hosts include thin, cache, raid, mirror, and snapshot. + +lvextend on LV with shared locks is not yet allowed. The LV must be +deactivated, or activated exclusively to run lvextend. + + +.SS recover from lost PV holding sanlock locks + +A number of special manual steps must be performed to restore sanlock +locks if the PV holding the locks is lost. Contact the LVM group for +help with this process. + + +.\" This is not clean or safe enough to suggest using without help. +.\" +.\" .SS recover from lost PV holding sanlock locks +.\" +.\" In a sanlock VG, the locks are stored on a PV within the VG. If this PV +.\" is lost, the locks need to be reconstructed as follows: +.\" +.\" 1. Enable the unsafe lock modes option in lvm.conf so that default locking requirements can be overriden. +.\" +.\" .nf +.\" allow_override_lock_modes = 1 +.\" .fi +.\" +.\" 2. Remove missing PVs and partial LVs from the VG. +.\" +.\" Warning: this is a dangerous operation. Read the man page +.\" for vgreduce first, and try running with the test option. +.\" Verify that the only missing PV is the PV holding the sanlock locks. +.\" +.\" .nf +.\" vgreduce --removemissing --force --lock-gl na --lock-vg na <vg> +.\" .fi +.\" +.\" 3. If step 2 does not remove the internal/hidden "lvmlock" lv, it should be removed. +.\" +.\" .nf +.\" lvremove --lock-vg na --lock-lv na <vg>/lvmlock +.\" .fi +.\" +.\" 4. Change the lock type to none. +.\" +.\" .nf +.\" vgchange --lock-type none --force --lock-gl na --lock-vg na <vg> +.\" .fi +.\" +.\" 5. VG space is needed to recreate the locks. If there is not enough space, vgextend the vg. +.\" +.\" 6. Change the lock type back to sanlock. This creates a new internal +.\" lvmlock lv, and recreates locks. 
+.\" +.\" .nf +.\" vgchange --lock-type sanlock <vg> +.\" .fi + +.SS locking system failures + +.B lvmlockd failure + +If lvmlockd fails or is killed while holding locks, the locks are orphaned +in the lock manager. lvmlockd can be restarted, and it will adopt the +locks from the lock manager that had been held by the previous instance. + +.B dlm/corosync failure + +If dlm or corosync fail, the clustering system will fence the host using a +method configured within the dlm/corosync clustering environment. + +LVM commands on other hosts will be blocked from acquiring any locks until +the dlm/corosync recovery process is complete. + +.B sanlock lock storage failure + +If access to the device containing the VG's locks is lost, sanlock cannot +renew its leases for locked LVs. This means that the host could soon lose +the lease to another host which could activate the LV exclusively. +sanlock is designed to never reach the point where two hosts hold the +same lease exclusively at once, so the same LV should never be active on +two hosts at once when activated exclusively. + +The current method of handling this involves no action from lvmlockd, +while allowing sanlock to protect the leases itself. This produces a safe +but potentially inconvenient result. Doing nothing from lvmlockd leads to +the host's LV locks not being released, which leads to sanlock using the +local watchdog to reset the host before another host can acquire any locks +held by the local host. + +LVM commands on other hosts will be blocked from acquiring locks held by +the failed/reset host until the sanlock recovery time expires (2-4 +minutes). This includes activation of any LVs that were locked by the +failed host. It also includes GL/VG locks held by any LVM commands that +happened to be running on the failed host at the time of the failure. + +(In the future, lvmlockd may have the option to suspend locked LVs in +response to the sanlock leases expiring. 
This would avoid the need for +sanlock to reset the host.) + +.B sanlock daemon failure + +If the sanlock daemon fails or exits while a lockspace is started, the +local watchdog will reset the host. See previous section for the impact +on other hosts. + + +.SS changing dlm cluster name + +When a dlm VG is created, the cluster name is saved in the VG metadata for +the new VG. To use the VG, a host must be in the named cluster. If the +cluster name is changed, or the VG is moved to a different cluster, the +cluster name for the dlm VG must be changed. To do this: + +1. Ensure the VG is not being used by any hosts. + +2. The new cluster must be active on the node making the change. +.br + The current dlm cluster name can be seen by: +.br + cat /sys/kernel/config/dlm/cluster/cluster_name + +3. Change the VG lock type to none: +.br + vgchange --lock-type none --force <vg_name> + +4. Change the VG lock type back to dlm which sets the new cluster name: +.br + vgchange --lock-type dlm <vg_name> + + +.SS limitations of lvmlockd and lockd VGs + +lvmlockd currently requires using lvmetad and lvmpolld. + +If a lockd VG becomes visible after the initial system startup, it is not +automatically started through the system service/init manager, and LVs in +it are not autoactivated. + +Things that do not yet work in lockd VGs: +.br +- old style mirror LVs (only raid1) +.br +- creating a new thin pool and a new thin LV in a single command +.br +- using lvcreate to create cache pools or cache LVs (use lvconvert) +.br +- splitting raid1 mirror LVs +.br +- vgsplit +.br +- vgmerge +.br +- resizing an LV that is active in the shared mode on multiple hosts + + +.SS clvmd to lvmlockd transition + +(See above for converting an existing clvm VG to a lockd VG.) + +While lvmlockd and clvmd are entirely different systems, LVM usage remains +largely the same. Differences are more notable when using lvmlockd's +sanlock option. 
+ +Visible usage differences between lockd VGs with lvmlockd and clvm VGs +with clvmd: + +.IP \[bu] 2 +lvm.conf must be configured to use either lvmlockd (use_lvmlockd=1) or +clvmd (locking_type=3), but not both. + +.IP \[bu] 2 +vgcreate --shared creates a lockd VG, and vgcreate --clustered y creates a +clvm VG. + +.IP \[bu] 2 +lvmlockd adds the option of using sanlock for locking, avoiding the +need for network clustering. + +.IP \[bu] 2 +lvmlockd does not require all hosts to see all the same shared devices. + +.IP \[bu] 2 +lvmlockd defaults to the exclusive activation mode whenever the activation +mode is unspecified, i.e. -ay means -aey, not -asy. + +.IP \[bu] 2 +lvmlockd commands always apply to the local host, and never have an effect +on a remote host. (The activation option 'l' is not used.) + +.IP \[bu] 2 +lvmlockd works with thin and cache pools and LVs. + +.IP \[bu] 2 +lvmlockd saves the cluster name for a lockd VG using dlm. Only hosts in +the matching cluster can use the VG. + +.IP \[bu] 2 +lvmlockd requires starting/stopping lockd VGs with vgchange --lock-start +and --lock-stop. + +.IP \[bu] 2 +vgremove of a sanlock VG may fail indicating that all hosts have not +stopped the lockspace for the VG. Stop the VG lockspace on all hosts using +vgchange --lock-stop. + +.IP \[bu] 2 +Long lasting lock contention among hosts may result in a command giving up +and failing. The number of lock retries can be adjusted with +global/lock_retries. + +.IP \[bu] 2 +The reporting options locktype and lockargs can be used to view lockd VG +and LV lock_type and lock_args fields, e.g. vgs -o+locktype,lockargs. +In the sixth VG attr field, "s" for "shared" is displayed for lockd VGs. + +.IP \[bu] 2 +If lvmlockd fails or is killed while in use, locks it held remain but are +orphaned in the lock manager. lvmlockd can be restarted with an option to +adopt the orphan locks from the previous instance of lvmlockd. 
+ +.P diff --git a/nix/default.nix b/nix/default.nix index cd2fd335d..c7462a09e 100644 --- a/nix/default.nix +++ b/nix/default.nix @@ -367,6 +367,7 @@ let centos66 = centos65; centos70 = [ "dlm-devel" "dlm" "corosynclib-devel" "perl-Digest-MD5" "systemd-devel" "socat" # used by test suite lvmpolld + "sanlock" # used by test suite lvmlockd "procps-ng" ]; fedora17_18 = [ "dlm-devel" "corosynclib-devel" "libblkid" "libblkid-devel" diff --git a/scripts/Makefile.in b/scripts/Makefile.in index e9cce3aa3..2ae532583 100644 --- a/scripts/Makefile.in +++ b/scripts/Makefile.in @@ -121,6 +121,10 @@ ifeq ("@BUILD_LVMPOLLD@", "yes") $(INSTALL_DATA) lvm2_lvmpolld_systemd_red_hat.socket $(systemd_unit_dir)/lvm2-lvmpolld.socket $(INSTALL_DATA) lvm2_lvmpolld_systemd_red_hat.service $(systemd_unit_dir)/lvm2-lvmpolld.service endif +ifeq ("@BUILD_LVMLOCKD@", "yes") + $(INSTALL_DATA) lvm2_lvmlockd_systemd_red_hat.service $(systemd_unit_dir)/lvm2-lvmlockd.service + $(INSTALL_DATA) lvm2_lvmlocking_systemd_red_hat.service $(systemd_unit_dir)/lvm2-lvmlocking.service +endif ifneq ("@CLVMD@", "none") $(INSTALL_DATA) lvm2_clvmd_systemd_red_hat.service $(systemd_unit_dir)/lvm2-clvmd.service $(INSTALL_DATA) lvm2_cluster_activation_systemd_red_hat.service $(systemd_unit_dir)/lvm2-cluster-activation.service @@ -151,6 +155,8 @@ DISTCLEAN_TARGETS += \ lvm2_lvmetad_systemd_red_hat.socket \ lvm2_lvmpolld_systemd_red_hat.service \ lvm2_lvmpolld_systemd_red_hat.socket \ + lvm2_lvmlockd_systemd_red_hat.service \ + lvm2_lvmlocking_systemd_red_hat.service \ lvm2_monitoring_init_red_hat \ lvm2_monitoring_systemd_red_hat.service \ lvm2_pvscan_systemd_red_hat@.service \ diff --git a/scripts/lvm2_lvmlockd_systemd_red_hat.service.in b/scripts/lvm2_lvmlockd_systemd_red_hat.service.in new file mode 100644 index 000000000..17c7dbf91 --- /dev/null +++ b/scripts/lvm2_lvmlockd_systemd_red_hat.service.in @@ -0,0 +1,16 @@ +[Unit] +Description=LVM2 lock daemon +Documentation=man:lvmlockd(8) +After=lvm2-lvmetad.service + 
+[Service] +Type=simple +NonBlocking=true +ExecStart=@sbindir@/lvmlockd -f +Environment=SD_ACTIVATION=1 +PIDFile=@LVMLOCKD_PIDFILE@ +SendSIGKILL=no + +[Install] +WantedBy=multi-user.target + diff --git a/scripts/lvm2_lvmlocking_systemd_red_hat.service.in b/scripts/lvm2_lvmlocking_systemd_red_hat.service.in new file mode 100644 index 000000000..bfac578a7 --- /dev/null +++ b/scripts/lvm2_lvmlocking_systemd_red_hat.service.in @@ -0,0 +1,24 @@ +[Unit] +Description=Availability of lockspaces in lvmlockd +Documentation=man:lvmlockd(8) +After=lvm2-lvmlockd.service sanlock.service dlm.service + +[Service] +Type=oneshot +RemainAfterExit=yes + +# start lockspaces and wait for them to finish starting +ExecStart=@sbindir@/vgchange --lock-start --lock-opt autowait + +# auto activate LVs in the newly started lockd VGs +ExecStart=@sbindir@/vgchange -aay -S 'locktype=sanlock || locktype=dlm' + +# deactivate LVs in lockd VGs +ExecStop=@sbindir@/vgchange -an -S 'locktype=sanlock || locktype=dlm' + +# stop lockspaces and wait for them to finish stopping +ExecStop=@sbindir@/lvmlockctl --stop-lockspaces --wait 1 + +[Install] +WantedBy=multi-user.target + diff --git a/spec/build.inc b/spec/build.inc index 3979073d3..db9bcc2f6 100644 --- a/spec/build.inc +++ b/spec/build.inc @@ -6,6 +6,8 @@ %enableif %{enable_lvmetad} lvmetad %global enable_lvmpolld %(if echo %{services} | grep -q lvmpolld; then echo 1; else echo 0; fi) %enableif %{enable_lvmpolld} lvmpolld +%global enable_lvmlockd %(if echo %{services} | grep -q lvmlockd; then echo 1; else echo 0; fi) +%enableif %{enable_lvmlockd} lvmlockd %build %configure \ diff --git a/spec/packages.inc b/spec/packages.inc index b9caa6552..7cfaa2f55 100644 --- a/spec/packages.inc +++ b/spec/packages.inc @@ -86,6 +86,10 @@ fi %if %{have_service lvmpolld} %{_sbindir}/lvmpolld %endif +%if %{have_service lvmlockd} + %{_sbindir}/lvmlockd + %{_sbindir}/lvmlockctl +%endif %if %{have_with cache} %{_mandir}/man7/lvmcache.7.gz %endif @@ -156,6 +160,9 @@ fi 
%{_mandir}/man8/lvmpolld.8.gz %{_mandir}/man8/lvm-lvpoll.8.gz %endif +%if %{have_service lvmlockd} + %{_mandir}/man8/lvmlockd.8.gz +%endif %dir %{_sysconfdir}/lvm %ghost %{_sysconfdir}/lvm/cache/.cache %config(noreplace) %verify(not md5 mtime size) %{_sysconfdir}/lvm/lvm.conf @@ -182,6 +189,10 @@ fi %{_unitdir}/lvm2-lvmpolld.service %{_unitdir}/lvm2-lvmpolld.socket %endif + #%if %{have_service lvmlockd} + # %{_unitdir}/lvm2-lvmlockd.service + # %{_unitdir}/lvm2-lvmlockd.socket + #%endif %else %{_sysconfdir}/rc.d/init.d/lvm2-monitor %{_sysconfdir}/rc.d/init.d/blk-availability @@ -191,6 +202,9 @@ fi %if %{have_service lvmpolld} %{_sysconfdir}/rc.d/init.d/lvm2-lvmpolld %endif + #%if %{have_service lvmlockd} + # %{_sysconfdir}/rc.d/init.d/lvm2-lvmlockd + #%endif %endif ############################################################################## diff --git a/spec/source.inc b/spec/source.inc index a11f4b784..00d52821c 100644 --- a/spec/source.inc +++ b/spec/source.inc @@ -27,6 +27,8 @@ %service lvmpolld 1 +%service lvmlockd 1 + ############################################################## %if %{fedora} == 16 || %{rhel} == 6 diff --git a/test/Makefile.in b/test/Makefile.in index cd2c42532..bba33dba4 100644 --- a/test/Makefile.in +++ b/test/Makefile.in @@ -70,6 +70,8 @@ help: @echo " check_cluster Run tests with cluster daemon." @echo " check_lvmetad Run tests with lvmetad daemon." @echo " check_lvmpolld Run tests with lvmpolld daemon." + @echo " check_lvmlockd_sanlock Run tests with lvmlockd and sanlock." + @echo " check_lvmlockd_dlm Run tests with lvmlockd and dlm." @echo " clean Clean dir." @echo " help Display callable targets." @echo -e "\nSupported variables:" @@ -138,6 +140,32 @@ check_lvmpolld: .tests-stamp --flavours ndev-lvmpolld,ndev-cluster-lvmpolld,ndev-lvmetad-lvmpolld --only $(T) --skip $(S) endif +ifeq ("@BUILD_LVMLOCKD@", "yes") +check_lvmlockd_sanlock: .tests-stamp + VERBOSE=$(VERBOSE) ./lib/runner \ + --testdir . 
--outdir results \ + --flavours udev-lvmlockd-sanlock --only shell/sanlock-prepare.sh + VERBOSE=$(VERBOSE) ./lib/runner \ + --testdir . --outdir results \ + --flavours udev-lvmlockd-sanlock --only $(T) --skip $(S) + VERBOSE=$(VERBOSE) ./lib/runner \ + --testdir . --outdir results \ + --flavours udev-lvmlockd-sanlock --only shell/sanlock-remove.sh +endif + +ifeq ("@BUILD_LVMLOCKD@", "yes") +check_lvmlockd_dlm: .tests-stamp + VERBOSE=$(VERBOSE) ./lib/runner \ + --testdir . --outdir results \ + --flavours udev-lvmlockd-dlm --only shell/dlm-prepare.sh + VERBOSE=$(VERBOSE) ./lib/runner \ + --testdir . --outdir results \ + --flavours udev-lvmlockd-dlm --only $(T) --skip $(S) + VERBOSE=$(VERBOSE) ./lib/runner \ + --testdir . --outdir results \ + --flavours udev-lvmlockd-dlm --only shell/dlm-remove.sh +endif + DATADIR = $(datadir)/lvm2-testsuite EXECDIR = $(libexecdir)/lvm2-testsuite @@ -153,6 +181,8 @@ LIB_FLAVOURS = \ lib/flavour-udev-lvmetad-lvmpolld\ lib/flavour-udev-lvmetad\ lib/flavour-udev-lvmpolld\ + lib/flavour-udev-lvmlockd-sanlock\ + lib/flavour-udev-lvmlockd-dlm\ lib/flavour-udev-vanilla LIB_LOCAL = lib/paths lib/runner diff --git a/test/lib/aux.sh b/test/lib/aux.sh index 53ebd8acd..55508439f 100644 --- a/test/lib/aux.sh +++ b/test/lib/aux.sh @@ -817,6 +817,9 @@ generate_config() { LVM_TEST_LOCKING=${LVM_TEST_LOCKING:-1} LVM_TEST_LVMETAD=${LVM_TEST_LVMETAD:-0} LVM_TEST_LVMPOLLD=${LVM_TEST_LVMPOLLD:-0} + LVM_TEST_LVMLOCKD=${LVM_TEST_LVMLOCKD:-0} + LVM_TEST_LOCK_TYPE_SANLOCK=${LVM_TEST_LOCK_TYPE_SANLOCK:-0} + LVM_TEST_LOCK_TYPE_DLM=${LVM_TEST_LOCK_TYPE_DLM:-0} if test "$DM_DEV_DIR" = "/dev"; then LVM_VERIFY_UDEV=${LVM_VERIFY_UDEV:-0} else @@ -859,6 +862,7 @@ global/thin_dump_executable = "$LVM_TEST_THIN_DUMP_CMD" global/thin_repair_executable = "$LVM_TEST_THIN_REPAIR_CMD" global/use_lvmetad = $LVM_TEST_LVMETAD global/use_lvmpolld = $LVM_TEST_LVMPOLLD +global/use_lvmlockd = $LVM_TEST_LVMLOCKD log/activation = 1 log/file = "$TESTDIR/debug.log" log/indent = 1 diff 
--git a/test/lib/flavour-udev-lvmlockd-dlm.sh b/test/lib/flavour-udev-lvmlockd-dlm.sh new file mode 100644 index 000000000..5bd274911 --- /dev/null +++ b/test/lib/flavour-udev-lvmlockd-dlm.sh @@ -0,0 +1,6 @@ +export LVM_TEST_LOCKING=1 +export LVM_TEST_LVMETAD=1 +export LVM_TEST_LVMPOLLD=1 +export LVM_TEST_LVMLOCKD=1 +export LVM_TEST_LOCK_TYPE_DLM=1 +export LVM_TEST_DEVDIR=/dev diff --git a/test/lib/flavour-udev-lvmlockd-sanlock.sh b/test/lib/flavour-udev-lvmlockd-sanlock.sh new file mode 100644 index 000000000..859ee2e66 --- /dev/null +++ b/test/lib/flavour-udev-lvmlockd-sanlock.sh @@ -0,0 +1,6 @@ +export LVM_TEST_LOCKING=1 +export LVM_TEST_LVMETAD=1 +export LVM_TEST_LVMPOLLD=1 +export LVM_TEST_LVMLOCKD=1 +export LVM_TEST_LOCK_TYPE_SANLOCK=1 +export LVM_TEST_DEVDIR=/dev diff --git a/test/lib/inittest.sh b/test/lib/inittest.sh index 0b898f39b..e509ae425 100644 --- a/test/lib/inittest.sh +++ b/test/lib/inittest.sh @@ -106,6 +106,13 @@ test -n "$LVM_TEST_LVMPOLLD" && { aux prepare_lvmpolld } +if test -n "$LVM_TEST_LVMLOCKD" ; then + if test -n "$LVM_TEST_LOCK_TYPE_SANLOCK" ; then + aux lvmconf 'local/host_id = 1' + fi +#alias vgcreate='vgcreate --shared' +fi + echo "<======== Processing test: \"$TESTNAME\" ========>" set -vx diff --git a/test/lib/test-corosync-conf b/test/lib/test-corosync-conf new file mode 100644 index 000000000..ccc958f1d --- /dev/null +++ b/test/lib/test-corosync-conf @@ -0,0 +1,19 @@ +# created by lvm test suite +totem { + version: 2 + secauth: off + cluster_name: test +} +nodelist { + node { + ring0_addr: @LOCAL_NODE@ + nodeid: 1 + } +} +quorum { + provider: corosync_votequorum +} +logging { + to_syslog: yes +} + diff --git a/test/lib/test-dlm-conf b/test/lib/test-dlm-conf new file mode 100644 index 000000000..a93c93fca --- /dev/null +++ b/test/lib/test-dlm-conf @@ -0,0 +1,4 @@ +# created by lvm test suite +log_debug=1 +enable_fencing=0 + diff --git a/test/lib/test-sanlock-conf b/test/lib/test-sanlock-conf new file mode 100644 index 
000000000..d1df598b0 --- /dev/null +++ b/test/lib/test-sanlock-conf @@ -0,0 +1,2 @@ +# created by lvm test suite +SANLOCKOPTS="-U sanlock -G sanlock -w 0" diff --git a/test/lib/utils.sh b/test/lib/utils.sh index fe7ccd3bc..24c9076d4 100644 --- a/test/lib/utils.sh +++ b/test/lib/utils.sh @@ -57,6 +57,8 @@ mkdtemp() { destdir=$1 template=$2 + test -d "$destdir" || die "DIR ('$destdir') does not exist." + case "$template" in *XXXX) ;; *) die "Invalid template: $template (must have a suffix of at least 4 X's)";; diff --git a/test/shell/dlm-prepare.sh b/test/shell/dlm-prepare.sh new file mode 100644 index 000000000..c4f02a480 --- /dev/null +++ b/test/shell/dlm-prepare.sh @@ -0,0 +1,90 @@ +#!/bin/sh +# Copyright (C) 2008-2012 Red Hat, Inc. All rights reserved. +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions +# of the GNU General Public License v.2. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +test_description='Set up things to run tests with dlm' + +. lib/utils +. lib/inittest + +[ -z "$LVM_TEST_LOCK_TYPE_DLM" ] && skip; + +COROSYNC_CONF="/etc/corosync/corosync.conf" +COROSYNC_NODE="$(hostname)" +create_corosync_conf() { + if test -a $COROSYNC_CONF; then + if ! grep "created by lvm test suite" $COROSYNC_CONF; then + rm $COROSYNC_CONF + else + mv $COROSYNC_CONF $COROSYNC_CONF.prelvmtest + fi + fi + + sed -e "s/@LOCAL_NODE@/$COROSYNC_NODE/" lib/test-corosync-conf > $COROSYNC_CONF + echo "created new $COROSYNC_CONF" +} + +DLM_CONF="/etc/dlm/dlm.conf" +create_dlm_conf() { + if test -a $DLM_CONF; then + if ! 
grep "created by lvm test suite" $DLM_CONF; then + rm $DLM_CONF + else + mv $DLM_CONF $DLM_CONF.prelvmtest + fi + fi + + cp lib/test-dlm-conf $DLM_CONF + echo "created new $DLM_CONF" +} + +prepare_lvmlockd_dlm() { + if pgrep lvmlockd ; then + echo "Cannot run while existing lvmlockd process exists" + exit 1 + fi + + if pgrep dlm_controld ; then + echo "Cannot run while existing dlm_controld process exists" + exit 1 + fi + + if pgrep corosync; then + echo "Cannot run while existing corosync process exists" + exit 1 + fi + + create_corosync_conf + create_dlm_conf + + systemctl start corosync + sleep 1 + if ! pgrep corosync; then + echo "Failed to start corosync" + exit 1 + fi + + systemctl start dlm + sleep 1 + if ! pgrep dlm_controld; then + echo "Failed to start dlm" + exit 1 + fi + + lvmlockd + sleep 1 + if ! pgrep lvmlockd ; then + echo "Failed to start lvmlockd" + exit 1 + fi +} + +prepare_lvmlockd_dlm + diff --git a/test/shell/dlm-remove.sh b/test/shell/dlm-remove.sh new file mode 100644 index 000000000..d7af46f67 --- /dev/null +++ b/test/shell/dlm-remove.sh @@ -0,0 +1,20 @@ +#!/bin/sh +# Copyright (C) 2008-2012 Red Hat, Inc. All rights reserved. +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions +# of the GNU General Public License v.2. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +test_description='Remove the dlm test setup' + +. lib/inittest + +[ -z "$LVM_TEST_LOCK_TYPE_DLM" ] && skip; + +systemctl stop dlm +systemctl stop corosync +killall lvmlockd diff --git a/test/shell/sanlock-hello-world.sh b/test/shell/sanlock-hello-world.sh new file mode 100644 index 000000000..f9578ec0c --- /dev/null +++ b/test/shell/sanlock-hello-world.sh @@ -0,0 +1,27 @@ +#!/bin/sh +# Copyright (C) 2008-2012 Red Hat, Inc. 
All rights reserved. +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions +# of the GNU General Public License v.2. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +test_description='Hello world for vgcreate with sanlock' + +. lib/inittest + +[ -z "$LVM_TEST_LOCK_TYPE_SANLOCK" ] && skip; + +aux prepare_pvs 1 1024 + +vgcreate --shared $vg "$dev1" + +vgs -o+locktype,lockargs $vg + +check vg_field $vg vg_locktype sanlock + +vgremove $vg + diff --git a/test/shell/sanlock-prepare.sh b/test/shell/sanlock-prepare.sh new file mode 100644 index 000000000..7cd91ea4e --- /dev/null +++ b/test/shell/sanlock-prepare.sh @@ -0,0 +1,85 @@ +#!/bin/sh +# Copyright (C) 2008-2012 Red Hat, Inc. All rights reserved. +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions +# of the GNU General Public License v.2. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +test_description='Set up things to run tests with sanlock' + +. lib/utils +. lib/inittest + +[ -z "$LVM_TEST_LOCK_TYPE_SANLOCK" ] && skip; + +SANLOCK_CONF="/etc/sysconfig/sanlock" +create_sanlock_conf() { + if test -a $SANLOCK_CONF; then + if ! 
grep "created by lvm test suite" $SANLOCK_CONF; then + rm $SANLOCK_CONF + else + mv $SANLOCK_CONF $SANLOCK_CONF.prelvmtest + fi + fi + + cp lib/test-sanlock-conf $SANLOCK_CONF + echo "created new $SANLOCK_CONF" +} + +prepare_lvmlockd_sanlock() { + if pgrep lvmlockd ; then + echo "Cannot run while existing lvmlockd process exists" + exit 1 + fi + + if pgrep sanlock ; then + echo "Cannot run while existing sanlock process exists" + exit 1 + fi + + create_sanlock_conf + + # FIXME: use 'systemctl start sanlock' once sysconfig options work + sanlock daemon -U sanlock -G sanlock -w 0 + sleep 1 + if ! pgrep sanlock; then + echo "Failed to start sanlock" + exit 1 + fi + + lvmlockd + sleep 1 + if ! pgrep lvmlockd; then + echo "Failed to start lvmlockd" + exit 1 + fi +} + +# Create a device and a VG that are both outside the scope of +# the standard lvm test suite so that they will not be removed +# and will remain in place while all the tests are run. +# +# Use this VG to hold the sanlock global lock which will be used +# by lvmlockd during other tests. +# +# This script will be run before any standard tests are run. +# After all the tests are run, another script will be run +# to remove this VG and device. 
+ +GL_DEV="/dev/mapper/GL_DEV" +GL_FILE="$PWD/gl_file.img" +rm -f "$GL_FILE" +dd if=/dev/zero of="$GL_FILE" bs=$((1024*1024)) count=1024 2> /dev/null +GL_LOOP=$(losetup -f "$GL_FILE" --show) +echo "0 `blockdev --getsize $GL_LOOP` linear $GL_LOOP 0" | dmsetup create GL_DEV + +prepare_lvmlockd_sanlock + +vgcreate --config 'devices { global_filter=["a|GL_DEV|", "r|.*|"] filter=["a|GL_DEV|", "r|.*|"]}' --lock-type sanlock --lock-gl enable --lock-opt wait glvg $GL_DEV + +vgs --config 'devices { global_filter=["a|GL_DEV|", "r|.*|"] filter=["a|GL_DEV|", "r|.*|"]}' -o+locktype,lockargs glvg + diff --git a/test/shell/sanlock-remove.sh b/test/shell/sanlock-remove.sh new file mode 100644 index 000000000..a8ea9a720 --- /dev/null +++ b/test/shell/sanlock-remove.sh @@ -0,0 +1,28 @@ +#!/bin/sh +# Copyright (C) 2008-2012 Red Hat, Inc. All rights reserved. +# +# This copyrighted material is made available to anyone wishing to use, +# modify, copy, or redistribute it subject to the terms and conditions +# of the GNU General Public License v.2. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +test_description='Remove the sanlock test setup' + +. lib/inittest + +[ -z "$LVM_TEST_LOCK_TYPE_SANLOCK" ] && skip; + +# Removes the VG with the global lock that was created by +# the corresponding create script. 
+ +vgremove --config 'devices { global_filter=["a|GL_DEV|", "r|.*|"] filter=["a|GL_DEV|", "r|.*|"]}' glvg + + +killall lvmlockd +killall sanlock + +dmsetup remove GL_DEV +dmsetup remove glvg-lvmlock diff --git a/tools/args.h b/tools/args.h index 7f7667da2..80c2a24d3 100644 --- a/tools/args.h +++ b/tools/args.h @@ -50,6 +50,13 @@ arg(ignoremonitoring_ARG, '\0', "ignoremonitoring", NULL, 0) arg(ignoreskippedcluster_ARG, '\0', "ignoreskippedcluster", NULL, 0) arg(ignoreunsupported_ARG, '\0', "ignoreunsupported", NULL, 0) arg(labelsector_ARG, '\0', "labelsector", int_arg, 0) +arg(lockgl_ARG, '\0', "lockgl", string_arg, 0) +arg(lockvg_ARG, '\0', "lockvg", string_arg, 0) +arg(locklv_ARG, '\0', "locklv", string_arg, 0) +arg(lockopt_ARG, '\0', "lockopt", string_arg, 0) +arg(lockstart_ARG, '\0', "lockstart", NULL, 0) +arg(lockstop_ARG, '\0', "lockstop", NULL, 0) +arg(locktype_ARG, '\0', "locktype", locktype_arg, 0) arg(maxrecoveryrate_ARG, '\0', "maxrecoveryrate", size_kb_arg, 0) arg(merge_ARG, '\0', "merge", NULL, 0) arg(mergedconfig_ARG, '\0', "mergedconfig", NULL, 0) @@ -96,6 +103,7 @@ arg(resync_ARG, '\0', "resync", NULL, 0) arg(rows_ARG, '\0', "rows", NULL, 0) arg(segments_ARG, '\0', "segments", NULL, 0) arg(separator_ARG, '\0', "separator", string_arg, 0) +arg(shared_ARG, '\0', "shared", NULL, 0) arg(split_ARG, '\0', "split", NULL, 0) arg(splitcache_ARG, '\0', "splitcache", NULL, 0) arg(splitmirrors_ARG, '\0', "splitmirrors", int_arg, 0) diff --git a/tools/commands.h b/tools/commands.h index d89b21462..d3ec5e014 100644 --- a/tools/commands.h +++ b/tools/commands.h @@ -394,7 +394,7 @@ xx(lvcreate, xx(lvdisplay, "Display information about a logical volume", - PERMITTED_READ_ONLY | ALL_VGS_IS_DEFAULT | ENABLE_FOREIGN_VGS, + PERMITTED_READ_ONLY | ALL_VGS_IS_DEFAULT | ENABLE_FOREIGN_VGS | LOCKD_VG_SH, "lvdisplay\n" "\t[-a|--all]\n" "\t[-c|--colon]\n" @@ -646,7 +646,7 @@ xx(lvresize, xx(lvs, "Display information about logical volumes", - PERMITTED_READ_ONLY | 
ALL_VGS_IS_DEFAULT | ENABLE_FOREIGN_VGS, + PERMITTED_READ_ONLY | ALL_VGS_IS_DEFAULT | ENABLE_FOREIGN_VGS | LOCKD_VG_SH, "lvs\n" "\t[-a|--all]\n" "\t[--aligned]\n" @@ -684,7 +684,7 @@ xx(lvs, xx(lvscan, "List all logical volumes in all volume groups", - PERMITTED_READ_ONLY | ALL_VGS_IS_DEFAULT, + PERMITTED_READ_ONLY | ALL_VGS_IS_DEFAULT | LOCKD_VG_SH, "lvscan\n" "\t[-a|--all]\n" "\t[-b|--blockdevice]\n" @@ -744,7 +744,7 @@ xx(pvresize, xx(pvck, "Check the consistency of physical volume(s)", - 0, + LOCKD_VG_SH, "pvck " "\t[--commandprofile ProfileName]\n" "\t[-d|--debug]\n" @@ -810,7 +810,7 @@ xx(pvdata, xx(pvdisplay, "Display various attributes of physical volume(s)", - CACHE_VGMETADATA | PERMITTED_READ_ONLY | ENABLE_ALL_DEVS | ENABLE_FOREIGN_VGS, + CACHE_VGMETADATA | PERMITTED_READ_ONLY | ENABLE_ALL_DEVS | ENABLE_FOREIGN_VGS | LOCKD_VG_SH, "pvdisplay\n" "\t[-c|--colon]\n" "\t[--commandprofile ProfileName]\n" @@ -919,7 +919,7 @@ xx(pvremove, xx(pvs, "Display information about physical volumes", - CACHE_VGMETADATA | PERMITTED_READ_ONLY | ALL_VGS_IS_DEFAULT | ENABLE_ALL_DEVS | ENABLE_FOREIGN_VGS, + CACHE_VGMETADATA | PERMITTED_READ_ONLY | ALL_VGS_IS_DEFAULT | ENABLE_ALL_DEVS | ENABLE_FOREIGN_VGS | LOCKD_VG_SH, "pvs\n" "\t[-a|--all]\n" "\t[--aligned]\n" @@ -957,7 +957,7 @@ xx(pvs, xx(pvscan, "List all physical volumes", - PERMITTED_READ_ONLY | ENABLE_FOREIGN_VGS, + PERMITTED_READ_ONLY | ENABLE_FOREIGN_VGS | LOCKD_VG_SH, "pvscan\n" "\t[-b|--background]\n" "\t[--cache [-a|--activate ay] [ DevicePath | -j|--major major --minor minor]...]\n" @@ -994,7 +994,7 @@ xx(tags, xx(vgcfgbackup, "Backup volume group configuration(s)", - PERMITTED_READ_ONLY | ALL_VGS_IS_DEFAULT | ENABLE_FOREIGN_VGS, + PERMITTED_READ_ONLY | ALL_VGS_IS_DEFAULT | ENABLE_FOREIGN_VGS | LOCKD_VG_SH, "vgcfgbackup\n" "\t[--commandprofile ProfileName]\n" "\t[-d|--debug]\n" @@ -1074,11 +1074,12 @@ xx(vgchange, metadataprofile_ARG, monitor_ARG, noudevsync_ARG, metadatacopies_ARG, vgmetadatacopies_ARG, 
partial_ARG, physicalextentsize_ARG, poll_ARG, refresh_ARG, resizeable_ARG, resizable_ARG, select_ARG, sysinit_ARG, - systemid_ARG, test_ARG, uuid_ARG) + systemid_ARG, test_ARG, uuid_ARG, lockstart_ARG, lockstop_ARG, locktype_ARG, lockopt_ARG, + force_ARG) xx(vgck, "Check the consistency of volume group(s)", - ALL_VGS_IS_DEFAULT, + ALL_VGS_IS_DEFAULT | LOCKD_VG_SH, "vgck " "\t[--commandprofile ProfileName]\n" "\t[-d|--debug]\n" @@ -1138,11 +1139,11 @@ xx(vgcreate, physicalextentsize_ARG, test_ARG, force_ARG, zero_ARG, labelsector_ARG, metadatasize_ARG, pvmetadatacopies_ARG, metadatacopies_ARG, vgmetadatacopies_ARG, dataalignment_ARG, dataalignmentoffset_ARG, - systemid_ARG) + shared_ARG, systemid_ARG, locktype_ARG, lockopt_ARG) xx(vgdisplay, "Display volume group information", - PERMITTED_READ_ONLY | ALL_VGS_IS_DEFAULT | ENABLE_FOREIGN_VGS, + PERMITTED_READ_ONLY | ALL_VGS_IS_DEFAULT | ENABLE_FOREIGN_VGS | LOCKD_VG_SH, "vgdisplay\n" "\t[-A|--activevolumegroups]\n" "\t[-c|--colon | -s|--short | -v|--verbose]\n" @@ -1190,7 +1191,7 @@ xx(vgdisplay, xx(vgexport, "Unregister volume group(s) from the system", - ALL_VGS_IS_DEFAULT, + ALL_VGS_IS_DEFAULT | LOCKD_VG_SH, "vgexport\n" "\t[-a|--all]\n" "\t[--commandprofile ProfileName]\n" @@ -1330,7 +1331,7 @@ xx(vgrename, xx(vgs, "Display information about volume groups", - PERMITTED_READ_ONLY | ALL_VGS_IS_DEFAULT | ENABLE_FOREIGN_VGS, + PERMITTED_READ_ONLY | ALL_VGS_IS_DEFAULT | ENABLE_FOREIGN_VGS | LOCKD_VG_SH, "vgs\n" "\t[--aligned]\n" "\t[--binary]\n" @@ -1367,7 +1368,7 @@ xx(vgs, xx(vgscan, "Search for all volume groups", - PERMITTED_READ_ONLY | ALL_VGS_IS_DEFAULT | ENABLE_FOREIGN_VGS, + PERMITTED_READ_ONLY | ALL_VGS_IS_DEFAULT | ENABLE_FOREIGN_VGS | LOCKD_VG_SH, "vgscan " "\t[--cache]\n" "\t[--commandprofile ProfileName]\n" diff --git a/tools/lvchange.c b/tools/lvchange.c index e790ea06b..994336934 100644 --- a/tools/lvchange.c +++ b/tools/lvchange.c @@ -601,6 +601,9 @@ static int _lvchange_persistent(struct cmd_context 
*cmd, { enum activation_change activate = CHANGE_AN; + /* The LV lock in lvmlockd should remain as it is. */ + cmd->lockd_lv_disable = 1; + if (!get_and_validate_major_minor(cmd, lv->vg->fid->fmt, &lv->major, &lv->minor)) return_0; @@ -984,6 +987,22 @@ static int _lvchange_single(struct cmd_context *cmd, struct logical_volume *lv, return ECMD_FAILED; } + if (!arg_count(cmd, activate_ARG) && !arg_count(cmd, refresh_ARG)) { + /* + * If a persistent lv lock already exists from activation + * (with the needed mode or higher), this will be a no-op. + * Otherwise, the lv lock will be taken as non-persistent + * and released when this command exits. + * + * FIXME: use "sh" if the options imply that the lvchange + * operation does not modify the LV. + */ + if (!lockd_lv(cmd, lv, "ex", 0)) { + stack; + return ECMD_FAILED; + } + } + /* * FIXME: DEFAULT_BACKGROUND_POLLING should be "unspecified". * If --poll is explicitly provided use it; otherwise polling @@ -1257,6 +1276,14 @@ int lvchange(struct cmd_context *cmd, int argc, char **argv) if (arg_is_set(cmd, activate_ARG)) cmd->include_active_foreign_vgs = 1; + /* + * The default vg lock mode for lvchange is ex, but these options + * are cases where lvchange does not modify the vg, so they can use + * the sh lock mode. + */ + if (arg_count(cmd, activate_ARG) || arg_count(cmd, refresh_ARG)) + cmd->lockd_vg_default_sh = 1; + return process_each_lv(cmd, argc, argv, update ? 
READ_FOR_UPDATE : 0, NULL, &_lvchange_single); diff --git a/tools/lvconvert.c b/tools/lvconvert.c index fe8b76144..5b84cd868 100644 --- a/tools/lvconvert.c +++ b/tools/lvconvert.c @@ -16,6 +16,7 @@ #include "polldaemon.h" #include "lv_alloc.h" #include "lvconvert_poll.h" +#include "lvmpolld-client.h" struct lvconvert_params { int cache; @@ -2524,6 +2525,12 @@ static int _lvconvert_thin(struct cmd_context *cmd, return 0; } + if (is_lockd_type(lv->vg->lock_type)) { + log_error("Can't use lock_type %s LV as external origin.", + lv->vg->lock_type); + return 0; + } + dm_list_init(&lvc.tags); if (!pool_supports_external_origin(first_seg(pool_lv), lv)) @@ -2641,6 +2648,12 @@ static int _lvconvert_pool(struct cmd_context *cmd, struct logical_volume *data_lv; struct logical_volume *metadata_lv = NULL; struct logical_volume *pool_metadata_lv; + char *lockd_data_args = NULL; + char *lockd_meta_args = NULL; + char *lockd_data_name = NULL; + char *lockd_meta_name = NULL; + struct id lockd_data_id; + struct id lockd_meta_id; char metadata_name[NAME_LEN], data_name[NAME_LEN]; int activate_pool; @@ -2657,6 +2670,13 @@ static int _lvconvert_pool(struct cmd_context *cmd, } } + /* An existing LV needs to have its lock freed once it becomes a data LV. */ + if (is_lockd_type(vg->lock_type) && !lv_is_pool(pool_lv) && pool_lv->lock_args) { + lockd_data_args = dm_pool_strdup(cmd->mem, pool_lv->lock_args); + lockd_data_name = dm_pool_strdup(cmd->mem, pool_lv->name); + memcpy(&lockd_data_id, &pool_lv->lvid.id[1], sizeof(struct id)); + } + if (!lv_is_visible(pool_lv)) { log_error("Can't convert internal LV %s.", display_lvname(pool_lv)); return 0; @@ -2712,6 +2732,13 @@ static int _lvconvert_pool(struct cmd_context *cmd, lp->pool_metadata_extents = lp->pool_metadata_lv->le_count; metadata_lv = lp->pool_metadata_lv; + /* An existing LV needs to have its lock freed once it becomes a meta LV. 
*/ + if (is_lockd_type(vg->lock_type) && metadata_lv->lock_args) { + lockd_meta_args = dm_pool_strdup(cmd->mem, metadata_lv->lock_args); + lockd_meta_name = dm_pool_strdup(cmd->mem, metadata_lv->name); + memcpy(&lockd_meta_id, &metadata_lv->lvid.id[1], sizeof(struct id)); + } + if (metadata_lv == pool_lv) { log_error("Can't use same LV for pool data and metadata LV %s.", display_lvname(metadata_lv)); @@ -2974,6 +3001,27 @@ static int _lvconvert_pool(struct cmd_context *cmd, if (!attach_pool_data_lv(seg, data_lv)) return_0; + /* + * Create a new lock for a thin pool LV. A cache pool LV has no lock. + * Locks are removed from existing LVs that are being converted to + * data and meta LVs (they are unlocked and deleted below.) + */ + if (is_lockd_type(vg->lock_type)) { + if (segtype_is_cache_pool(lp->segtype)) { + data_lv->lock_args = NULL; + metadata_lv->lock_args = NULL; + } else { + data_lv->lock_args = NULL; + metadata_lv->lock_args = NULL; + + if (!strcmp(vg->lock_type, "sanlock")) + pool_lv->lock_args = "pending"; + else if (!strcmp(vg->lock_type, "dlm")) + pool_lv->lock_args = "dlm"; + /* The lock_args will be set in vg_write(). */ + } + } + /* FIXME: revert renamed LVs in fail path? */ /* FIXME: any common code with metadata/thin_manip.c extend_pool() ? */ @@ -3007,6 +3055,11 @@ mda_write: log_warn("WARNING: Pool zeroing and large %s chunk size slows down " "provisioning.", display_size(cmd, seg->chunk_size)); + if (activate_pool && !lockd_lv(cmd, pool_lv, "ex", LDLV_PERSISTENT)) { + log_error("Failed to lock pool LV %s/%s", vg->name, pool_lv->name); + goto out; + } + if (activate_pool && !activate_lv_excl(cmd, pool_lv)) { log_error("Failed to activate pool logical volume %s.", @@ -3031,6 +3084,22 @@ out: (segtype_is_cache_pool(lp->segtype)) ? "cache" : "thin"); + /* + * Unlock and free the locks from existing LVs that became pool data + * and meta LVs. 
+ */ + if (lockd_data_name) { + if (!lockd_lv_name(cmd, vg, lockd_data_name, &lockd_data_id, lockd_data_args, "un", LDLV_PERSISTENT)) + log_error("Failed to unlock pool data LV %s/%s", vg->name, lockd_data_name); + lockd_free_lv(cmd, vg, lockd_data_name, &lockd_data_id, lockd_data_args); + } + + if (lockd_meta_name) { + if (!lockd_lv_name(cmd, vg, lockd_meta_name, &lockd_meta_id, lockd_meta_args, "un", LDLV_PERSISTENT)) + log_error("Failed to unlock pool metadata LV %s/%s", vg->name, lockd_meta_name); + lockd_free_lv(cmd, vg, lockd_meta_name, &lockd_meta_id, lockd_meta_args); + } + return r; #if 0 revert_new_lv: @@ -3250,13 +3319,21 @@ static int lvconvert_single(struct cmd_context *cmd, struct lvconvert_params *lp struct volume_group *vg; int ret = ECMD_FAILED; int saved_ignore_suspended_devices = ignore_suspended_devices(); + uint32_t lockd_state; if (arg_count(cmd, repair_ARG)) { init_ignore_suspended_devices(1); cmd->handles_missing_pvs = 1; } - vg = vg_read(cmd, lp->vg_name, NULL, READ_FOR_UPDATE); + /* + * The VG lock will be released when the command exits. + * Commands that poll the LV will reacquire the VG lock. + */ + if (!lockd_vg(cmd, lp->vg_name, "ex", 0, &lockd_state)) + goto_out; + + vg = vg_read(cmd, lp->vg_name, NULL, READ_FOR_UPDATE, lockd_state); if (vg_read_error(vg)) { release_vg(vg); goto_out; @@ -3269,6 +3346,17 @@ static int lvconvert_single(struct cmd_context *cmd, struct lvconvert_params *lp } /* + * If the lv is inactive before and after the command, the + * use of PERSISTENT here means the lv will remain locked as + * an effect of running the lvconvert. + * To unlock it, it would need to be activated+deactivated. + * Or, we could identify the commands for which the lv remains + * inactive, and not use PERSISTENT here for those cases. 
+ */ + if (!lockd_lv(cmd, lv, "ex", LDLV_PERSISTENT)) + goto_bad; + + /* * lp->pvh holds the list of PVs available for allocation or removal */ if (lp->pv_count) { @@ -3288,6 +3376,12 @@ static int lvconvert_single(struct cmd_context *cmd, struct lvconvert_params *lp bad: unlock_vg(cmd, lp->vg_name); + /* + * The command may sit and monitor progress for some time, + * and we do not need or want the VG lock held during that. + */ + lockd_vg(cmd, vg->name, "un", 0, &lockd_state); + if (ret == ECMD_PROCESSED && lp->need_polling) ret = _poll_logical_volume(cmd, lp->lv_to_poll, lp->wait_completion); @@ -3306,6 +3400,7 @@ static int _lvconvert_merge_single(struct cmd_context *cmd, struct logical_volum struct volume_group *vg_fresh; struct logical_volume *lv_fresh; int ret = ECMD_FAILED; + uint32_t lockd_state = 0; /* dummy placeholder, lvmlockd doesn't use this path */ /* * FIXME can't trust lv's VG to be current given that caller @@ -3317,7 +3412,7 @@ static int _lvconvert_merge_single(struct cmd_context *cmd, struct logical_volum vg_name = lv->vg->name; unlock_vg(cmd, vg_name); - vg_fresh = vg_read(cmd, vg_name, NULL, READ_FOR_UPDATE); + vg_fresh = vg_read(cmd, vg_name, NULL, READ_FOR_UPDATE, lockd_state); if (vg_read_error(vg_fresh)) { log_error("ABORTING: Can't reread VG %s", vg_name); goto out; @@ -3356,6 +3451,32 @@ out: return ret; } +/* + * process_each_lv locks the VG, reads the VG, calls this which starts the + * conversion, then unlocks the VG. The lvpoll command will come along later + * and lock the VG, read the VG, check the progress, unlock the VG, sleep and + * repeat until done. 
+ */ + +static int _lvconvert_lvmpolld_merge_single(struct cmd_context *cmd, struct logical_volume *lv, + struct processing_handle *handle) +{ + struct lvconvert_params *lp = (struct lvconvert_params *) handle->custom_handle; + int ret; + + lp->lv_to_poll = lv; + if ((ret = _lvconvert_single(cmd, lv, lp)) != ECMD_PROCESSED) + stack; + + if (ret == ECMD_PROCESSED && lp->need_polling) { + if ((ret = _poll_logical_volume(cmd, lp->lv_to_poll, + lp->wait_completion)) != ECMD_PROCESSED) + stack; + } + + return ret; +} + int lvconvert(struct cmd_context * cmd, int argc, char **argv) { int ret; @@ -3379,7 +3500,8 @@ int lvconvert(struct cmd_context * cmd, int argc, char **argv) if (lp.merge) ret = process_each_lv(cmd, argc, argv, READ_FOR_UPDATE, handle, - &_lvconvert_merge_single); + lvmpolld_use() ? &_lvconvert_lvmpolld_merge_single : + &_lvconvert_merge_single); else ret = lvconvert_single(cmd, &lp); out: diff --git a/tools/lvcreate.c b/tools/lvcreate.c index e41f76ccb..f3167673d 100644 --- a/tools/lvcreate.c +++ b/tools/lvcreate.c @@ -1453,6 +1453,7 @@ int lvcreate(struct cmd_context *cmd, int argc, char **argv) }; struct lvcreate_cmdline_params lcp = { 0 }; struct volume_group *vg; + uint32_t lockd_state; if (!_lvcreate_params(cmd, argc, argv, &lp, &lcp)) { stack; @@ -1464,8 +1465,11 @@ int lvcreate(struct cmd_context *cmd, int argc, char **argv) return EINVALID_CMD_LINE; } + if (!lockd_vg(cmd, lp.vg_name, "ex", 0, &lockd_state)) + return_ECMD_FAILED; + log_verbose("Finding volume group \"%s\"", lp.vg_name); - vg = vg_read_for_update(cmd, lp.vg_name, NULL, 0); + vg = vg_read_for_update(cmd, lp.vg_name, NULL, 0, lockd_state); if (vg_read_error(vg)) { release_vg(vg); return_ECMD_FAILED; @@ -1510,6 +1514,13 @@ int lvcreate(struct cmd_context *cmd, int argc, char **argv) lp.pool_name ? 
: "with generated name", lp.vg_name, lp.segtype->name); } + if (vg->lock_type && !strcmp(vg->lock_type, "sanlock")) { + if (!handle_sanlock_lv(cmd, vg)) { + log_error("No space for sanlock lock, extend the internal lvmlock LV."); + goto_out; + } + } + if (seg_is_thin_volume(&lp)) log_verbose("Making thin LV %s in pool %s in VG %s%s%s using segtype %s", lp.lv_name ? : "with generated name", @@ -1517,6 +1528,9 @@ int lvcreate(struct cmd_context *cmd, int argc, char **argv) lp.snapshot ? " as snapshot of " : "", lp.snapshot ? lp.origin_name : "", lp.segtype->name); + if (is_lockd_type(vg->lock_type)) + lp.needs_lockd_init = 1; + if (!lv_create_single(vg, &lp)) goto_out; diff --git a/tools/lvmcmdline.c b/tools/lvmcmdline.c index 023a3d6c8..80b52a7c8 100644 --- a/tools/lvmcmdline.c +++ b/tools/lvmcmdline.c @@ -625,6 +625,19 @@ int alloc_arg(struct cmd_context *cmd __attribute__((unused)), struct arg_values return 1; } +int locktype_arg(struct cmd_context *cmd __attribute__((unused)), struct arg_values *av) +{ + lock_type_t lock_type; + + av->sign = SIGN_NONE; + + lock_type = get_lock_type_from_string(av->value); + if (lock_type == LOCK_TYPE_INVALID) + return 0; + + return 1; +} + int segtype_arg(struct cmd_context *cmd, struct arg_values *av) { struct segment_type *segtype; @@ -757,6 +770,7 @@ void lvm_register_commands(void) yes_ARG, \ quiet_ARG, config_ARG, \ commandprofile_ARG, \ + lockgl_ARG, lockvg_ARG, locklv_ARG, \ profile_ARG, -1); #include "commands.h" #undef xx @@ -1045,6 +1059,15 @@ static int _get_settings(struct cmd_context *cmd) cmd->current_settings.backup = 0; } + if (arg_is_set(cmd, lockgl_ARG)) + cmd->lock_gl_mode = arg_str_value(cmd, lockgl_ARG, NULL); + if (arg_is_set(cmd, lockvg_ARG)) + cmd->lock_vg_mode = arg_str_value(cmd, lockvg_ARG, NULL); + if (cmd->command->flags & LOCKD_VG_SH) + cmd->lockd_vg_default_sh = 1; + if (arg_is_set(cmd, locklv_ARG)) + cmd->lock_lv_mode = arg_str_value(cmd, locklv_ARG, NULL); + cmd->partial_activation = 0; 
cmd->degraded_activation = 0; activation_mode = find_config_tree_str(cmd, activation_mode_CFG, NULL); @@ -1085,9 +1108,13 @@ static int _get_settings(struct cmd_context *cmd) cmd->include_foreign_vgs = arg_is_set(cmd, foreign_ARG) ? 1 : 0; cmd->include_active_foreign_vgs = cmd->command->flags & ENABLE_FOREIGN_VGS ? 1 : 0; - if (!arg_count(cmd, sysinit_ARG)) + if (!arg_count(cmd, sysinit_ARG)) { lvmetad_connect_or_warn(); + if (lvmlockd_use()) + lvmlockd_connect(); + } + if (arg_count(cmd, nosuffix_ARG)) cmd->current_settings.suffix = 0; diff --git a/tools/lvrename.c b/tools/lvrename.c index eeff76da2..6bbf31bf7 100644 --- a/tools/lvrename.c +++ b/tools/lvrename.c @@ -27,6 +27,7 @@ int lvrename(struct cmd_context *cmd, int argc, char **argv) char *st; struct volume_group *vg; struct lv_list *lvl; + uint32_t lockd_state; int r = ECMD_FAILED; if (argc == 3) { @@ -98,8 +99,11 @@ int lvrename(struct cmd_context *cmd, int argc, char **argv) return EINVALID_CMD_LINE; } + if (!lockd_vg(cmd, vg_name, "ex", 0, &lockd_state)) + return_ECMD_FAILED; + log_verbose("Checking for existing volume group \"%s\"", vg_name); - vg = vg_read_for_update(cmd, vg_name, NULL, 0); + vg = vg_read_for_update(cmd, vg_name, NULL, 0, lockd_state); if (vg_read_error(vg)) { release_vg(vg); return_ECMD_FAILED; diff --git a/tools/lvresize.c b/tools/lvresize.c index 08248bbec..30ac4f987 100644 --- a/tools/lvresize.c +++ b/tools/lvresize.c @@ -169,13 +169,17 @@ int lvresize(struct cmd_context *cmd, int argc, char **argv) struct volume_group *vg; struct dm_list *pvh = NULL; struct logical_volume *lv; + uint32_t lockd_state; int r = ECMD_FAILED; if (!_lvresize_params(cmd, argc, argv, &lp)) return EINVALID_CMD_LINE; + if (!lockd_vg(cmd, lp.vg_name, "ex", 0, &lockd_state)) + return_ECMD_FAILED; + log_verbose("Finding volume group %s", lp.vg_name); - vg = vg_read_for_update(cmd, lp.vg_name, NULL, 0); + vg = vg_read_for_update(cmd, lp.vg_name, NULL, 0, lockd_state); if (vg_read_error(vg)) { release_vg(vg); 
return_ECMD_FAILED; diff --git a/tools/polldaemon.c b/tools/polldaemon.c index 2e86cb1c2..0544542b6 100644 --- a/tools/polldaemon.c +++ b/tools/polldaemon.c @@ -138,14 +138,20 @@ int wait_for_single_lv(struct cmd_context *cmd, struct poll_operation_id *id, struct volume_group *vg; struct logical_volume *lv; int finished = 0; + uint32_t lockd_state; /* Poll for completion */ while (!finished) { if (parms->wait_before_testing) _sleep_and_rescan_devices(parms); + if (!lockd_vg(cmd, id->vg_name, "sh", 0, &lockd_state)) { + log_error("ABORTING: Can't lock VG for %s.", id->display_name); + return 0; + } + /* Locks the (possibly renamed) VG again */ - vg = vg_read(cmd, id->vg_name, NULL, READ_FOR_UPDATE); + vg = vg_read(cmd, id->vg_name, NULL, READ_FOR_UPDATE, lockd_state); if (vg_read_error(vg)) { release_vg(vg); log_error("ABORTING: Can't reread VG for %s.", id->display_name); @@ -189,6 +195,8 @@ int wait_for_single_lv(struct cmd_context *cmd, struct poll_operation_id *id, unlock_and_release_vg(cmd, vg, vg->name); + lockd_vg(cmd, vg->name, "un", 0, &lockd_state); + /* * FIXME Sleeping after testing, while preferred, also works around * unreliable "finished" state checking in _percent_run. If the @@ -360,12 +368,32 @@ static int report_progress(struct cmd_context *cmd, struct poll_operation_id *id { struct volume_group *vg; struct logical_volume *lv; + uint32_t lockd_state; + int ret; + + /* + * FIXME: we don't really need to take the vg lock here, + * because we only report the progress on the same host + * where the pvmove/lvconvert is happening. This means + * that the local pvmove/lvconvert/lvpoll commands are + * updating the local lvmetad with the latest info they + * have, and we just need to read the latest info that + * they have put into lvmetad about their progress. 
+ * No VG lock is needed to protect anything here + * (we're just reading the VG), and no VG lock is + * needed to force a VG read from disk to get changes + * from other hosts, because the only change to the VG + * we're interested in is the change done locally. + */ + if (!lockd_vg(cmd, id->vg_name, "sh", 0, &lockd_state)) + return 0; - vg = vg_read(cmd, id->vg_name, NULL, 0); + vg = vg_read(cmd, id->vg_name, NULL, 0, lockd_state); if (vg_read_error(vg)) { release_vg(vg); log_error("Can't reread VG for %s", id->display_name); - return 0; + ret = 0; + goto out_ret; } lv = find_lv(vg, id->lv_name); @@ -382,23 +410,29 @@ static int report_progress(struct cmd_context *cmd, struct poll_operation_id *id else log_verbose("Can't find LV in %s for %s. Already finished or removed.", vg->name, id->display_name); + ret = 1; goto out; } if (!lv_is_active_locally(lv)) { log_verbose("%s: Interrupted: No longer active.", id->display_name); + ret = 1; goto out; } if (parms->poll_fns->poll_progress(cmd, lv, id->display_name, parms) == PROGRESS_CHECK_FAILED) { unlock_and_release_vg(cmd, vg, vg->name); - return_0; + ret = 0; + goto out; } + ret = 1; + out: unlock_and_release_vg(cmd, vg, vg->name); - - return 1; +out_ret: + lockd_vg(cmd, vg->name, "un", 0, &lockd_state); + return ret; } static int _lvmpolld_init_poll_vg(struct cmd_context *cmd, const char *vgname, diff --git a/tools/pvchange.c b/tools/pvchange.c index 3e0894f61..91e93c1e7 100644 --- a/tools/pvchange.c +++ b/tools/pvchange.c @@ -82,6 +82,14 @@ static int _pvchange_single(struct cmd_context *cmd, struct volume_group *vg, } } + /* + * Needed to change a property on an orphan PV. + * i.e. the global lock is only needed for orphans. + * Convert sh to ex. 
+ */ + if (is_orphan(pv) && !lockd_gl(cmd, "ex", 0)) + return_ECMD_FAILED; + if (tagargs) { /* tag or deltag */ if (arg_count(cmd, addtag_ARG) && !change_tag(cmd, NULL, NULL, pv, addtag_ARG)) diff --git a/tools/pvcreate.c b/tools/pvcreate.c index 139819883..1f45ad91d 100644 --- a/tools/pvcreate.c +++ b/tools/pvcreate.c @@ -96,6 +96,10 @@ int pvcreate(struct cmd_context *cmd, int argc, char **argv) int ret = ECMD_PROCESSED; struct pvcreate_params pp; + /* Needed to change the set of orphan PVs. */ + if (!lockd_gl(cmd, "ex", 0)) + return_ECMD_FAILED; + pvcreate_params_set_defaults(&pp); if (!pvcreate_restore_params_validate(cmd, argc, argv, &pp)) { diff --git a/tools/pvmove.c b/tools/pvmove.c index f4b9d6c39..8efa6b099 100644 --- a/tools/pvmove.c +++ b/tools/pvmove.c @@ -17,6 +17,7 @@ #include "polldaemon.h" #include "display.h" #include "pvmove_poll.h" +#include "lvmpolld-client.h" #define PVMOVE_FIRST_TIME 0x00000001 /* Called for first time */ @@ -598,6 +599,7 @@ static int _set_up_pvmove(struct cmd_context *cmd, const char *pv_name, struct dm_list *lvs_changed; struct physical_volume *pv; struct logical_volume *lv_mirr; + uint32_t lockd_state; unsigned flags = PVMOVE_FIRST_TIME; unsigned exclusive; int r = ECMD_FAILED; @@ -631,10 +633,13 @@ static int _set_up_pvmove(struct cmd_context *cmd, const char *pv_name, /* Read VG */ log_verbose("Finding volume group \"%s\"", vg_name); - vg = vg_read(cmd, vg_name, NULL, READ_FOR_UPDATE); + if (!lockd_vg(cmd, vg_name, "ex", 0, &lockd_state)) + return_ECMD_FAILED; + + vg = vg_read(cmd, vg_name, NULL, READ_FOR_UPDATE, lockd_state); if (vg_read_error(vg)) { release_vg(vg); - return_ECMD_FAILED; + goto out_ret; } exclusive = _pvmove_is_exclusive(cmd, vg); @@ -700,6 +705,14 @@ static int _set_up_pvmove(struct cmd_context *cmd, const char *pv_name, out: free_pv_fid(pv); unlock_and_release_vg(cmd, vg, vg_name); +out_ret: + /* + * Release explicitly because the command may continue running + * for some time monitoring the 
progress, and we don not want + * or need the lockd lock held over that. + */ + lockd_vg(cmd, vg_name, "un", 0, &lockd_state); + return r; } @@ -712,6 +725,7 @@ static int _read_poll_id_from_pvname(struct cmd_context *cmd, const char *pv_nam struct logical_volume *lv; struct physical_volume *pv; struct volume_group *vg; + uint32_t lockd_state; if (!pv_name) { log_error(INTERNAL_ERROR "Invalid PV name parameter."); @@ -723,13 +737,16 @@ static int _read_poll_id_from_pvname(struct cmd_context *cmd, const char *pv_nam vg_name = pv_vg_name(pv); + if (!lockd_vg(cmd, vg_name, "sh", 0, &lockd_state)) + return_0; + /* need read-only access */ - vg = vg_read(cmd, vg_name, NULL, 0); + vg = vg_read(cmd, vg_name, NULL, 0, lockd_state); if (vg_read_error(vg)) { log_error("ABORTING: Can't read VG for %s.", pv_name); release_vg(vg); - free_pv_fid(pv); - return 0; + ret = 0; + goto out; } if (!(lv = find_pvmove_lv(vg, pv_dev(pv), PVMOVE))) { @@ -743,6 +760,8 @@ static int _read_poll_id_from_pvname(struct cmd_context *cmd, const char *pv_nam } unlock_and_release_vg(cmd, vg, vg_name); +out: + lockd_vg(cmd, vg_name, "un", 0, &lockd_state); free_pv_fid(pv); return ret; } @@ -828,6 +847,24 @@ int pvmove(struct cmd_context *cmd, int argc, char **argv) return ECMD_FAILED; } + if (lvmlockd_use() && !lvmpolld_use()) { + /* + * Don't want to spend the time making lvmlockd + * work without lvmpolld. + */ + log_error("Enable lvmpolld when using lvmlockd."); + return ECMD_FAILED; + } + + if (lvmlockd_use() && !argc) { + /* + * FIXME: move process_each_vg from polldaemon up to here, + * then we can remove this limitation. 
+ */ + log_error("Specify pvmove args when using lvmlockd."); + return ECMD_FAILED; + } + if (argc) { if (!(lvid = dm_pool_alloc(cmd->mem, sizeof(*lvid)))) { log_error("Failed to allocate lvid."); @@ -845,6 +882,15 @@ int pvmove(struct cmd_context *cmd, int argc, char **argv) if (colon) *colon = '\0'; + /* + * To do a reverse mapping from PV name to VG name, we need the + * correct global mapping of PVs to VGs. + */ + if (!lockd_gl(cmd, "sh", 0)) { + stack; + return ECMD_FAILED; + } + if (!arg_count(cmd, abort_ARG)) { if ((ret = _set_up_pvmove(cmd, pv_name, argc, argv, lvid, &vg_name, &lv_name)) != ECMD_PROCESSED) { stack; @@ -857,6 +903,13 @@ int pvmove(struct cmd_context *cmd, int argc, char **argv) if (!in_progress) return ECMD_PROCESSED; } + + /* + * The command may sit and report progress for some time, + * and we do not want or need the lockd locks held during + * that time. + */ + lockd_gl(cmd, "un", 0); } return pvmove_poll(cmd, pv_name, lvid ? lvid->s : NULL, vg_name, lv_name, diff --git a/tools/pvremove.c b/tools/pvremove.c index b40ff794a..e6ae86641 100644 --- a/tools/pvremove.c +++ b/tools/pvremove.c @@ -32,6 +32,10 @@ int pvremove(struct cmd_context *cmd, int argc, char **argv) dm_list_init(&pv_names); + /* Needed to change the set of orphan PVs. */ + if (!lockd_gl(cmd, "ex", 0)) + return_ECMD_FAILED; + for (i = 0; i < argc; i++) { dm_unescape_colons_and_at_signs(argv[i], NULL, NULL); if (!str_list_add(cmd->mem, &pv_names, argv[i])) diff --git a/tools/pvresize.c b/tools/pvresize.c index 3057a7fb9..0b055e6ef 100644 --- a/tools/pvresize.c +++ b/tools/pvresize.c @@ -36,6 +36,14 @@ static int _pvresize_single(struct cmd_context *cmd, } params->total++; + /* + * Needed to change a property on an orphan PV. + * i.e. the global lock is only needed for orphans. + * Convert sh to ex. 
+ */ + if (is_orphan(pv) && !lockd_gl(cmd, "ex", 0)) + return_ECMD_FAILED; + if (!pv_resize_single(cmd, vg, pv, params->new_size)) return_ECMD_FAILED; diff --git a/tools/pvscan.c b/tools/pvscan.c index 2c997b7f4..144023ce6 100644 --- a/tools/pvscan.c +++ b/tools/pvscan.c @@ -106,7 +106,7 @@ static int _auto_activation_handler(struct cmd_context *cmd, return_0; /* NB. This is safe because we know lvmetad is running and we won't hit disk. */ - vg = vg_read(cmd, vgname, (const char *)&vgid_raw, 0); + vg = vg_read(cmd, vgname, (const char *)&vgid_raw, 0, 0); if (vg_read_error(vg)) { log_error("Failed to read Volume Group \"%s\" (%s) during autoactivation.", vgname, vgid); release_vg(vg); @@ -321,7 +321,6 @@ static int _pvscan_lvmetad(struct cmd_context *cmd, int argc, char **argv) out: sync_local_dev_names(cmd); unlock_vg(cmd, VG_GLOBAL); - return ret; } @@ -371,6 +370,10 @@ int pvscan(struct cmd_context *cmd, int argc, char **argv) return ECMD_FAILED; } + /* Needed for a current listing of the global VG namespace. 
*/ + if (!lockd_gl(cmd, "sh", 0)) + return_ECMD_FAILED; + if (cmd->full_filter->wipe) cmd->full_filter->wipe(cmd->full_filter); lvmcache_destroy(cmd, 1, 0); diff --git a/tools/toollib.c b/tools/toollib.c index bfd378985..ff7574ed4 100644 --- a/tools/toollib.c +++ b/tools/toollib.c @@ -220,6 +220,12 @@ static int _ignore_vg(struct volume_group *vg, const char *vg_name, return 1; } + if (read_error & FAILED_LOCK_TYPE) { + read_error &= ~FAILED_LOCK_TYPE; /* Check for other errors */ + log_verbose("Skipping volume group %s", vg_name); + *skip = 1; + } + if (read_error != SUCCESS) { *skip = 0; log_error("Cannot process volume group %s", vg_name); @@ -718,6 +724,11 @@ int vgcreate_params_set_from_args(struct cmd_context *cmd, struct vgcreate_params *vp_def) { const char *system_id_arg_str; + const char *lock_type = NULL; + int locking_type; + int use_lvmlockd; + int use_clvmd; + lock_type_t lock_type_num; vp_new->vg_name = skip_dev_dir(cmd, vp_def->vg_name, NULL); vp_new->max_lv = arg_uint_value(cmd, maxlogicalvolumes_ARG, @@ -730,12 +741,6 @@ int vgcreate_params_set_from_args(struct cmd_context *cmd, vp_new->extent_size = arg_uint_value(cmd, physicalextentsize_ARG, vp_def->extent_size); - if (arg_count(cmd, clustered_ARG)) - vp_new->clustered = arg_int_value(cmd, clustered_ARG, vp_def->clustered); - else - /* Default depends on current locking type */ - vp_new->clustered = locking_is_clustered(); - if (arg_sign_value(cmd, physicalextentsize_ARG, SIGN_NONE) == SIGN_MINUS) { log_error(_pe_size_may_not_be_negative_msg); return 0; @@ -766,16 +771,9 @@ int vgcreate_params_set_from_args(struct cmd_context *cmd, else vp_new->vgmetadatacopies = find_config_tree_int(cmd, metadata_vgmetadatacopies_CFG, NULL); - /* A clustered VG has no system ID. 
*/ - if (vp_new->clustered) { - if (arg_is_set(cmd, systemid_ARG)) { - log_error("system ID cannot be set on clustered Volume Groups."); - return 0; - } - vp_new->system_id = NULL; - } else if (!(system_id_arg_str = arg_str_value(cmd, systemid_ARG, NULL))) + if (!(system_id_arg_str = arg_str_value(cmd, systemid_ARG, NULL))) { vp_new->system_id = vp_def->system_id; - else { + } else { if (!(vp_new->system_id = system_id_from_string(cmd, system_id_arg_str))) return_0; @@ -790,6 +788,186 @@ int vgcreate_params_set_from_args(struct cmd_context *cmd, } } + if ((system_id_arg_str = arg_str_value(cmd, systemid_ARG, NULL))) { + vp_new->system_id = system_id_from_string(cmd, system_id_arg_str); + } else { + vp_new->system_id = vp_def->system_id; + } + + if (system_id_arg_str) { + if (!vp_new->system_id || !vp_new->system_id[0]) + log_warn("WARNING: A VG without a system ID allows unsafe access from other hosts."); + + if (vp_new->system_id && cmd->system_id && + strcmp(vp_new->system_id, cmd->system_id)) { + log_warn("VG with system ID %s might become inaccessible as local system ID is %s", + vp_new->system_id, cmd->system_id); + } + } + + /* + * Locking: what kind of locking should be used for the + * new VG, and is it compatible with current lvm.conf settings. + * + * The end result is to set vp_new->lock_type to: + * none | clvm | dlm | sanlock. + * + * If 'vgcreate --lock-type <arg>' is set, the answer is given + * directly by <arg> which is one of none|clvm|dlm|sanlock. + * + * 'vgcreate --clustered y' is the way to create clvm VGs. + * + * 'vgcreate --shared' is the way to create lockd VGs. + * lock_type of sanlock or dlm is selected based on + * which lock manager is running. + * + * + * 1. Using neither clvmd nor lvmlockd. 
+ * ------------------------------------------------ + * lvm.conf: + * global/use_lvmlockd = 0 + * global/locking_type = 1 + * + * - no locking is enabled + * - clvmd is not used + * - lvmlockd is not used + * - VGs with CLUSTERED set are ignored (requires clvmd) + * - VGs with lockd type are ignored (requires lvmlockd) + * - vgcreate can create new VGs with lock_type none + * - 'vgcreate --clustered y' fails + * - 'vgcreate --shared' fails + * - 'vgcreate' (neither option) creates a local VG + * + * 2. Using clvmd. + * ------------------------------------------------ + * lvm.conf: + * global/use_lvmlockd = 0 + * global/locking_type = 3 + * + * - locking through clvmd is enabled (traditional clvm config) + * - clvmd is used + * - lvmlockd is not used + * - VGs with CLUSTERED set can be used + * - VGs with lockd type are ignored (requires lvmlockd) + * - vgcreate can create new VGs with CLUSTERED status flag + * - 'vgcreate --clustered y' works + * - 'vgcreate --shared' fails + * - 'vgcreate' (neither option) creates a clvm VG + * + * 3. Using lvmlockd. 
+ * ------------------------------------------------ + * lvm.conf: + * global/use_lvmlockd = 1 + * global/locking_type = 1 + * + * - locking through lvmlockd is enabled + * - clvmd is not used + * - lvmlockd is used + * - VGs with CLUSTERED set are ignored (requires clvmd) + * - VGs with lockd type can be used + * - vgcreate can create new VGs with lock_type sanlock or dlm + * - 'vgcreate --clustered y' fails + * - 'vgcreate --shared' works + * - 'vgcreate' (neither option) creates a local VG + */ + + locking_type = find_config_tree_int(cmd, global_locking_type_CFG, NULL); + use_lvmlockd = find_config_tree_bool(cmd, global_use_lvmlockd_CFG, NULL); + use_clvmd = (locking_type == 3); + + if (arg_is_set(cmd, locktype_ARG)) { + if (arg_is_set(cmd, clustered_ARG) || arg_is_set(cmd, shared_ARG)) { + log_error("A lock type cannot be specified with --shared or --clustered."); + return 0; + } + lock_type = arg_str_value(cmd, locktype_ARG, ""); + + } else if (arg_is_set(cmd, clustered_ARG)) { + const char *arg_str = arg_str_value(cmd, clustered_ARG, ""); + int clustery = strcmp(arg_str, "y") ? 0 : 1; + + if (use_clvmd) { + lock_type = clustery ? "clvm" : "none"; + + } else if (use_lvmlockd) { + log_error("lvmlockd is configured, use --shared with lvmlockd, and --clustered with clvmd."); + return 0; + + } else { + if (clustery) { + log_error("The --clustered option requires clvmd (locking_type=3)."); + return 0; + } else { + lock_type = "none"; + } + } + + } else if (arg_is_set(cmd, shared_ARG)) { + if (use_lvmlockd) { + if (!(lock_type = lockd_running_lock_type(cmd))) { + log_error("Failed to detect a running lock manager to select lock_type."); + return 0; + } + + } else if (use_clvmd) { + log_error("Use --shared with lvmlockd, and --clustered with clvmd."); + return 0; + + } else { + log_error("The --shared option requires lvmlockd (use_lvmlockd=1)."); + return 0; + } + + } else { + if (use_clvmd) + lock_type = locking_is_clustered() ? 
"clvm" : "none"; + else + lock_type = "none"; + } + + /* + * Check that the lock_type is recognized, and is being + * used with the correct lvm.conf settings. + */ + lock_type_num = get_lock_type_from_string(lock_type); + + switch (lock_type_num) { + case LOCK_TYPE_INVALID: + log_error("lock_type %s is invalid", lock_type); + return 0; + + case LOCK_TYPE_SANLOCK: + case LOCK_TYPE_DLM: + if (!use_lvmlockd) { + log_error("lock_type %s requires use_lvmlockd configuration setting", lock_type); + return 0; + } + break; + case LOCK_TYPE_CLVM: + if (!use_clvmd) { + log_error("lock_type clvm requires locking_type 3 configuration setting"); + return 0; + } + break; + case LOCK_TYPE_NONE: + break; + }; + + /* + * The vg is not owned by one host/system_id. + * Locking coordinates access from multiple hosts. + */ + if (lock_type_num == LOCK_TYPE_DLM || lock_type_num == LOCK_TYPE_SANLOCK || lock_type_num == LOCK_TYPE_CLVM) + vp_new->system_id = NULL; + + vp_new->lock_type = lock_type; + + if (lock_type_num == LOCK_TYPE_CLVM) + vp_new->clustered = 1; + else + vp_new->clustered = 0; + + log_debug("Setting lock_type to %s", vp_new->lock_type); return 1; } @@ -1697,6 +1875,7 @@ static int _process_vgnameid_list(struct cmd_context *cmd, uint32_t flags, struct vgnameid_list *vgnl; const char *vg_name; const char *vg_uuid; + uint32_t lockd_state; int selected; int whole_selected = 0; int ret_max = ECMD_PROCESSED; @@ -1721,17 +1900,17 @@ static int _process_vgnameid_list(struct cmd_context *cmd, uint32_t flags, vg_uuid = vgnl->vgid; skip = 0; - vg = vg_read(cmd, vg_name, vg_uuid, flags); + if (!lockd_vg(cmd, vg_name, NULL, 0, &lockd_state)) + continue; + + vg = vg_read(cmd, vg_name, vg_uuid, flags, lockd_state); if (_ignore_vg(vg, vg_name, arg_vgnames, flags & READ_ALLOW_INCONSISTENT, &skip)) { stack; ret_max = ECMD_FAILED; - release_vg(vg); - continue; - } - if (skip) { - release_vg(vg); - continue; + goto endvg; } + if (skip) + goto endvg; /* Process this VG? 
*/ if ((process_all || @@ -1746,10 +1925,11 @@ static int _process_vgnameid_list(struct cmd_context *cmd, uint32_t flags, ret_max = ret; } - if (vg_read_error(vg)) - release_vg(vg); - else - unlock_and_release_vg(cmd, vg, vg_name); + if (!vg_read_error(vg)) + unlock_vg(cmd, vg_name); +endvg: + release_vg(vg); + lockd_vg(cmd, vg_name, "un", 0, &lockd_state); } /* the VG is selected if at least one LV is selected */ @@ -1821,9 +2001,14 @@ int process_each_vg(struct cmd_context *cmd, int argc, char **argv, * any tags were supplied and need resolving; or * no VG names were given and the command defaults to processing all VGs. */ - if (((dm_list_empty(&arg_vgnames) && enable_all_vgs) || !dm_list_empty(&arg_tags)) && - !get_vgnameids(cmd, &vgnameids_on_system, NULL, 0)) - goto_out; + if ((dm_list_empty(&arg_vgnames) && enable_all_vgs) || !dm_list_empty(&arg_tags)) { + /* Needed for a current listing of the global VG namespace. */ + if (!lockd_gl(cmd, "sh", 0)) + goto_out; + + if (!get_vgnameids(cmd, &vgnameids_on_system, NULL, 0)) + goto_out; + } if (dm_list_empty(&arg_vgnames) && dm_list_empty(&vgnameids_on_system)) { /* FIXME Should be log_print, but suppressed for reporting cmds */ @@ -2137,6 +2322,7 @@ static int _process_lv_vgnameid_list(struct cmd_context *cmd, uint32_t flags, struct dm_str_list *sl; struct dm_list *tags_arg; struct dm_list lvnames; + uint32_t lockd_state; const char *vg_name; const char *vg_uuid; const char *vgn; @@ -2183,18 +2369,18 @@ static int _process_lv_vgnameid_list(struct cmd_context *cmd, uint32_t flags, } } - vg = vg_read(cmd, vg_name, vg_uuid, flags); + if (!lockd_vg(cmd, vg_name, NULL, 0, &lockd_state)) + continue; + + vg = vg_read(cmd, vg_name, vg_uuid, flags, lockd_state); if (_ignore_vg(vg, vg_name, arg_vgnames, flags & READ_ALLOW_INCONSISTENT, &skip)) { stack; ret_max = ECMD_FAILED; - release_vg(vg); - continue; + goto endvg; } - if (skip) { - release_vg(vg); - continue; - } + if (skip) + goto endvg; ret = 
process_each_lv_in_vg(cmd, vg, &lvnames, tags_arg, 0, handle, process_single_lv); @@ -2203,7 +2389,10 @@ static int _process_lv_vgnameid_list(struct cmd_context *cmd, uint32_t flags, if (ret > ret_max) ret_max = ret; - unlock_and_release_vg(cmd, vg, vg_name); + unlock_vg(cmd, vg_name); +endvg: + release_vg(vg); + lockd_vg(cmd, vg_name, "un", 0, &lockd_state); } return ret_max; @@ -2260,8 +2449,14 @@ int process_each_lv(struct cmd_context *cmd, int argc, char **argv, uint32_t fla else if (dm_list_empty(&arg_vgnames) && handle->internal_report_for_select) need_vgnameids = 1; - if (need_vgnameids && !get_vgnameids(cmd, &vgnameids_on_system, NULL, 0)) - goto_out; + if (need_vgnameids) { + /* Needed for a current listing of the global VG namespace. */ + if (!lockd_gl(cmd, "sh", 0)) + goto_out; + + if (!get_vgnameids(cmd, &vgnameids_on_system, NULL, 0)) + goto_out; + } if (dm_list_empty(&arg_vgnames) && dm_list_empty(&vgnameids_on_system)) { /* FIXME Should be log_print, but suppressed for reporting cmds */ @@ -2654,6 +2849,7 @@ static int _process_pvs_in_vgs(struct cmd_context *cmd, uint32_t flags, struct vgnameid_list *vgnl; const char *vg_name; const char *vg_uuid; + uint32_t lockd_state; int ret_max = ECMD_PROCESSED; int ret; int skip; @@ -2666,14 +2862,15 @@ static int _process_pvs_in_vgs(struct cmd_context *cmd, uint32_t flags, vg_uuid = vgnl->vgid; skip = 0; - vg = vg_read(cmd, vg_name, vg_uuid, flags | READ_WARN_INCONSISTENT); + if (!lockd_vg(cmd, vg_name, NULL, 0, &lockd_state)) + continue; + + vg = vg_read(cmd, vg_name, vg_uuid, flags | READ_WARN_INCONSISTENT, lockd_state); if (_ignore_vg(vg, vg_name, NULL, flags & READ_ALLOW_INCONSISTENT, &skip)) { stack; ret_max = ECMD_FAILED; - if (!skip) { - release_vg(vg); - continue; - } + if (!skip) + goto endvg; /* Drop through to eliminate a clustered VG's PVs from the devices list */ } @@ -2690,10 +2887,11 @@ static int _process_pvs_in_vgs(struct cmd_context *cmd, uint32_t flags, if (ret > ret_max) ret_max = ret; - if 
(skip) - release_vg(vg); - else - unlock_and_release_vg(cmd, vg, vg->name); + if (!skip) + unlock_vg(cmd, vg->name); +endvg: + release_vg(vg); + lockd_vg(cmd, vg_name, "un", 0, &lockd_state); /* Quit early when possible. */ if (!process_all_pvs && dm_list_empty(arg_tags) && dm_list_empty(arg_devices)) @@ -2747,6 +2945,10 @@ int process_each_pv(struct cmd_context *cmd, process_all_devices = process_all_pvs && (cmd->command->flags & ENABLE_ALL_DEVS) && arg_count(cmd, all_ARG); + /* Needed for a current listing of the global VG namespace. */ + if (!only_this_vgname && !lockd_gl(cmd, "sh", 0)) + return_ECMD_FAILED; + /* * Need pvid's set on all PVs before processing so that pvid's * can be compared to find duplicates while processing. diff --git a/tools/tools.h b/tools/tools.h index e959d8007..640aa1544 100644 --- a/tools/tools.h +++ b/tools/tools.h @@ -28,6 +28,7 @@ #include "archiver.h" #include "lvmcache.h" #include "lvmetad.h" +#include "lvmlockd.h" #include "lvm-version.h" #include "config.h" #include "defaults.h" @@ -110,6 +111,8 @@ struct arg_value_group_list { #define ONE_VGNAME_ARG 0x00000010 /* Command is allowed to read foreign VGs. */ #define ENABLE_FOREIGN_VGS 0x00000020 +/* Command needs a shared lock on a VG; it only reads the VG. 
*/ +#define LOCKD_VG_SH 0x00000040 /* a register of the lvm commands */ struct command { @@ -146,6 +149,7 @@ int metadatatype_arg(struct cmd_context *cmd, struct arg_values *av); int units_arg(struct cmd_context *cmd, struct arg_values *av); int segtype_arg(struct cmd_context *cmd, struct arg_values *av); int alloc_arg(struct cmd_context *cmd, struct arg_values *av); +int locktype_arg(struct cmd_context *cmd, struct arg_values *av); int readahead_arg(struct cmd_context *cmd, struct arg_values *av); int metadatacopies_arg(struct cmd_context *cmd __attribute__((unused)), struct arg_values *av); diff --git a/tools/vgchange.c b/tools/vgchange.c index 1665d3e5d..1868b7d9c 100644 --- a/tools/vgchange.c +++ b/tools/vgchange.c @@ -308,9 +308,18 @@ static int _vgchange_clustered(struct cmd_context *cmd, struct volume_group *vg) { int clustered = arg_int_value(cmd, clustered_ARG, 0); + const char *lock_type = arg_str_value(cmd, locktype_ARG, NULL); struct lv_list *lvl; struct lv_segment *mirror_seg; + if (find_config_tree_bool(cmd, global_use_lvmlockd_CFG, NULL)) { + log_error("lvmlockd requires using the vgchange --lock-type option."); + return 0; + } + + if (lock_type && !strcmp(lock_type, "clvm")) + clustered = 1; + if (clustered && vg_is_clustered(vg)) { if (vg->system_id && *vg->system_id) log_warn("WARNING: Clearing invalid system ID %s from volume group %s.", @@ -506,6 +515,216 @@ static int _vgchange_profile(struct cmd_context *cmd, return 1; } +static int _vgchange_locktype(struct cmd_context *cmd, + struct volume_group *vg) +{ + const char *lock_type = arg_str_value(cmd, locktype_ARG, NULL); + struct lv_list *lvl; + struct logical_volume *lv; + + /* + * This is a special/forced exception to change the lock type to none. + * It's needed for recovery cases and skips the normal steps of undoing + * the current lock type. It's a way to forcibly get access to a VG + * when the normal locking mechanisms are not working. 
+ * + * It ignores: the current lvm locking config, lvmlockd, the state of + * the vg on other hosts, etc. It is meant to just remove any locking + * related metadata from the VG (cluster/lock_type flags, lock_type, + * lock_args). + * + * This can be necessary when manually recovering from certain failures. + * e.g. when a pv is lost containing the lvmlock lv (holding sanlock + * leases), the vg lock_type needs to be changed to none, and then + * back to sanlock, which recreates the lvmlock lv and leases. + */ + if (!strcmp(lock_type, "none") && arg_is_set(cmd, force_ARG)) { + if (yes_no_prompt("Forcibly change VG %s lock type to none? [y/n]: ", vg->name) == 'n') { + log_error("VG lock type not changed."); + return 0; + } + + vg->status &= ~CLUSTERED; + vg->lock_type = "none"; + vg->lock_args = NULL; + + dm_list_iterate_items(lvl, &vg->lvs) + lvl->lv->lock_args = NULL; + + return 1; + } + + if (!vg->lock_type) { + if (vg_is_clustered(vg)) + vg->lock_type = "clvm"; + else + vg->lock_type = "none"; + } + + if (!strcmp(vg->lock_type, lock_type)) { + log_warn("New lock_type %s matches the current lock_type %s.", + lock_type, vg->lock_type); + return 1; + } + + /* + * When lvm is currently using clvm, this function is just an alternative + * to vgchange -c{y,n}, and can: + * - change none to clvm + * - change clvm to none + * - it CANNOT change to or from a lockd type + */ + if (locking_is_clustered()) { + if (is_lockd_type(lock_type)) { + log_error("Changing to lock type %s requires lvmlockd.", lock_type); + return 0; + } + + return _vgchange_clustered(cmd, vg); + } + + /* + * When lvm is currently using lvmlockd, this function can: + * - change none to lockd type + * - change none to clvm (with warning about not being able to use it) + * - change lockd type to none + * - change lockd type to clvm (with warning about not being able to use it) + * - change clvm to none + * - change clvm to lockd type + */ + + if (lvs_in_vg_activated(vg)) { + log_error("Changing VG %s 
lock type not allowed with active LVs", + vg->name); + return 0; + } + + /* + * Check if there are any LV types in the VG that cannot be handled + * with the new lock type. Remove this once all LV types can be + * handled. + */ + if (is_lockd_type(lock_type)) { + dm_list_iterate_items(lvl, &vg->lvs) { + lv = lvl->lv; + + if ((lv->status & SNAPSHOT) || lv_is_cow(lv)) { + log_error("Changing to lock type %s is not allowed with cow snapshot LV %s/%s", + lock_type, vg->name, lv->name); + return 0; + } + } + } + + /* none to clvm */ + if (!strcmp(vg->lock_type, "none") && !strcmp(lock_type, "clvm")) { + log_warn("New clvm lock type will not be usable with lvmlockd."); + vg->status |= CLUSTERED; + vg->lock_type = "clvm"; /* this is optional */ + return 1; + } + + /* clvm to none */ + if (!strcmp(vg->lock_type, "clvm") && !strcmp(lock_type, "none")) { + vg->status &= ~CLUSTERED; + vg->lock_type = "none"; + return 1; + } + + /* clvm to ..., first undo clvm */ + if (!strcmp(vg->lock_type, "clvm")) { + vg->status &= ~CLUSTERED; + } + + /* + * lockd type to ..., first undo lockd type + * + * To allow this, we need to do: + * lockd_stop_vg(); + * lockd_free_vg_before(); + * lockd_free_vg_after(); + */ + if (is_lockd_type(vg->lock_type)) { + /* FIXME: implement full undoing of the lock_type */ + log_error("Changing VG %s from lock type %s not yet allowed.", + vg->name, vg->lock_type); + return 0; + } + + /* ... to clvm */ + if (!strcmp(lock_type, "clvm")) { + log_warn("New clvm lock type will not be usable with lvmlockd."); + vg->status |= CLUSTERED; + vg->lock_type = "clvm"; /* this is optional */ + vg->system_id = NULL; + return 1; + } + + /* ... to lockd type */ + if (is_lockd_type(lock_type)) { + /* + * For lock_type dlm, lockd_init_vg() will do a single + * vg_write() that sets lock_type, sets lock_args, clears + * system_id, and sets all LV lock_args to dlm. 
+ */ + if (!strcmp(lock_type, "dlm")) { + dm_list_iterate_items(lvl, &vg->lvs) { + lv = lvl->lv; + if (lockd_lv_uses_lock(lv)) + lv->lock_args = "dlm"; + } + } + + /* + * See below. We cannot set valid LV lock_args until stage 1 + * of the change is done, so we need to skip the validation of + * the lock_args during stage 1. + */ + if (!strcmp(lock_type, "sanlock")) + vg->skip_validate_lock_args = 1; + + vg->system_id = NULL; + + if (!lockd_init_vg(cmd, vg, lock_type)) { + log_error("Failed to initialize lock args for lock type %s", lock_type); + return 0; + } + + /* + * For lock_type sanlock, there must be multiple steps + * because the VG needs an active lvmlock LV before + * LV lock areas can be allocated, which must be done + * before LV lock_args are written. So, the LV lock_args + * remain unset during the first stage of the conversion. + * + * Stage 1: + * lockd_init_vg() creates and activates the lvmlock LV, + * then sets lock_type, sets lock_args, and clears system_id. + * + * Stage 2: + * We get here, and can now set LV lock_args. This uses + * the standard code path for allocating LV locks in + * vg_write() by setting LV lock_args to "pending", + * which tells vg_write() to call lockd_init_lv() + * and sets the lv->lock_args value before writing the VG. + */ + if (!strcmp(lock_type, "sanlock")) { + dm_list_iterate_items(lvl, &vg->lvs) { + lv = lvl->lv; + if (lockd_lv_uses_lock(lv)) + lv->lock_args = "pending"; + } + + vg->skip_validate_lock_args = 0; + } + + return 1; + } + + log_error("Unknown lock type"); + return 0; +} + /* * This function will not be called unless the local host is allowed to use the * VG. 
Either the VG has no system_id, or the VG and host have matching @@ -577,9 +796,83 @@ static int _vgchange_system_id(struct cmd_context *cmd, struct volume_group *vg) if (vg->lvm1_system_id) *vg->lvm1_system_id = '\0'; + /* update system_id in lvmlockd's record for this vg */ + if (!lockd_start_vg(cmd, vg)) + log_debug("Failed to update lvmlockd."); + return 1; } +static int _passes_lock_start_filter(struct cmd_context *cmd, + struct volume_group *vg, + const int cfg_id) +{ + const struct dm_config_node *cn; + const struct dm_config_value *cv; + const char *str; + + /* undefined list means no restrictions, all vg names pass */ + + cn = find_config_tree_node(cmd, cfg_id, NULL); + if (!cn) + return 1; + + /* with a defined list, the vg name must be included to pass */ + + for (cv = cn->v; cv; cv = cv->next) { + if (cv->type == DM_CFG_EMPTY_ARRAY) + break; + if (cv->type != DM_CFG_STRING) { + log_error("Ignoring invalid string in lock_start list"); + continue; + } + str = cv->v.str; + if (!*str) { + log_error("Ignoring empty string in config file"); + continue; + } + + /* ignoring tags for now */ + + if (!strcmp(str, vg->name)) + return 1; + } + + return 0; +} + +static int _vgchange_lock_start(struct cmd_context *cmd, struct volume_group *vg) +{ + const char *start_opt = arg_str_value(cmd, lockopt_ARG, NULL); + int auto_opt = 0; + + if (!start_opt || arg_is_set(cmd, force_ARG)) + goto do_start; + + if (!strcmp(start_opt, "auto") || !strcmp(start_opt, "autowait")) + auto_opt = 1; + + if (!_passes_lock_start_filter(cmd, vg, activation_lock_start_list_CFG)) { + log_verbose("Not starting %s since it does not pass lock_start_list", vg->name); + return 1; + } + + if (auto_opt && !_passes_lock_start_filter(cmd, vg, activation_auto_lock_start_list_CFG)) { + log_verbose("Not starting %s since it does not pass auto_lock_start_list", vg->name); + return 1; + } + +do_start: + return lockd_start_vg(cmd, vg); +} + +static int _vgchange_lock_stop(struct cmd_context *cmd, struct 
volume_group *vg) +{ + /* Disable the unlock in toollib because it's pointless after the stop. */ + cmd->lockd_vg_disable = 1; + return lockd_stop_vg(cmd, vg); +} + static int vgchange_single(struct cmd_context *cmd, const char *vg_name, struct volume_group *vg, struct processing_handle *handle __attribute__((unused))) @@ -605,6 +898,7 @@ static int vgchange_single(struct cmd_context *cmd, const char *vg_name, { metadataprofile_ARG, &_vgchange_profile }, { profile_ARG, &_vgchange_profile }, { detachprofile_ARG, &_vgchange_profile }, + { locktype_ARG, &_vgchange_locktype }, { systemid_ARG, &_vgchange_system_id }, }; @@ -694,13 +988,94 @@ static int vgchange_single(struct cmd_context *cmd, const char *vg_name, if (!_vgchange_background_polling(cmd, vg)) return_ECMD_FAILED; + if (arg_is_set(cmd, lockstart_ARG)) { + if (!_vgchange_lock_start(cmd, vg)) + return_ECMD_FAILED; + } else if (arg_is_set(cmd, lockstop_ARG)) { + if (!_vgchange_lock_stop(cmd, vg)) + return_ECMD_FAILED; + } + return ret; } +/* + * vgchange can do different things that require different + * locking, so look at each of those things here. + * + * Set up overrides for the default VG locking for various special cases. + * The VG lock will be acquired in process_each_vg. + * + * Acquire the gl lock according to which kind of vgchange command this is. + */ + +static int lockd_vgchange(struct cmd_context *cmd, int argc, char **argv) +{ + /* The default vg lock mode is ex, but these options only need sh. */ + + if (arg_is_set(cmd, activate_ARG) || arg_is_set(cmd, refresh_ARG)) + cmd->lockd_vg_default_sh = 1; + + /* Starting a vg lockspace means there are no locks available yet. */ + + if (arg_is_set(cmd, lockstart_ARG)) + cmd->lockd_vg_disable = 1; + + /* + * In most cases, lockd_vg does not apply when changing lock type. + * (We don't generally allow changing *from* lockd type yet.) + * lockd_vg could be called within _vgchange_locktype as needed. 
+ */ + + if (arg_is_set(cmd, locktype_ARG)) + cmd->lockd_vg_disable = 1; + + /* + * Changing system_id or lock_type must only be done on explicitly + * named vgs. + */ + + if (arg_is_set(cmd, systemid_ARG) || arg_is_set(cmd, locktype_ARG)) + cmd->command->flags &= ~ALL_VGS_IS_DEFAULT; + + if (arg_is_set(cmd, lockstart_ARG)) { + /* + * The lockstart condition takes the global lock to serialize + * with any other host that tries to remove the VG while this + * tries to start it. (Zero argc means all VGs, in which case + * process_each_vg will acquire the global lock.) + */ + if (argc && !lockd_gl(cmd, "sh", 0)) + return_ECMD_FAILED; + + } else if (arg_is_set(cmd, systemid_ARG) || + arg_is_set(cmd, uuid_ARG) || + arg_is_set(cmd, locktype_ARG)) { + /* + * This is a special case where taking the global lock is + * helpful to detect changes to local VGs from other hosts. VG + * names, uuids and system_ids are the three things that other + * hosts cache related to local VGs, so we use the VG namespace + * change detection of the global lock to indicate that one of + * these global VG properties has changed so other hosts will + * update these cached values in VGs that they otherwise ignore + * (because they have foreign system_ids). 
+ */ + if (!lockd_gl(cmd, "ex", LDGL_UPDATE_NAMES)) + return_ECMD_FAILED; + } + + return 1; +} + int vgchange(struct cmd_context *cmd, int argc, char **argv) { + int ret; + int noupdate = arg_count(cmd, activate_ARG) || + arg_count(cmd, lockstart_ARG) || + arg_count(cmd, lockstop_ARG) || arg_count(cmd, monitor_ARG) || arg_count(cmd, poll_ARG) || arg_count(cmd, refresh_ARG); @@ -721,6 +1096,7 @@ int vgchange(struct cmd_context *cmd, int argc, char **argv) arg_count(cmd, clustered_ARG) || arg_count(cmd, alloc_ARG) || arg_count(cmd, vgmetadatacopies_ARG) || + arg_count(cmd, locktype_ARG) || arg_count(cmd, systemid_ARG); int update = update_partial_safe || update_partial_unsafe; @@ -819,6 +1195,27 @@ int vgchange(struct cmd_context *cmd, int argc, char **argv) if (arg_is_set(cmd, activate_ARG)) cmd->include_active_foreign_vgs = 1; - return process_each_vg(cmd, argc, argv, update ? READ_FOR_UPDATE : 0, - NULL, &vgchange_single); + if (!lockd_vgchange(cmd, argc, argv)) + return_ECMD_FAILED; + + ret = process_each_vg(cmd, argc, argv, update ? READ_FOR_UPDATE : 0, + NULL, &vgchange_single); + + /* Wait for lock-start ops that were initiated in vgchange_lockstart. */ + + if (arg_is_set(cmd, lockstart_ARG)) { + const char *start_opt = arg_str_value(cmd, lockopt_ARG, NULL); + + lockd_gl(cmd, "un", 0); + + if (!start_opt || !strcmp(start_opt, "wait") || !strcmp(start_opt, "autowait")) { + log_print_unless_silent("Starting locking. Waiting until locks are ready..."); + lockd_start_wait(cmd); + + } else if (!strcmp(start_opt, "nowait")) { + log_print_unless_silent("Starting locking. 
VG is read-only until locks are ready."); + } + } + + return ret; } diff --git a/tools/vgcreate.c b/tools/vgcreate.c index 0a6ad6f32..20ba4aa31 100644 --- a/tools/vgcreate.c +++ b/tools/vgcreate.c @@ -50,6 +50,13 @@ int vgcreate(struct cmd_context *cmd, int argc, char **argv) if (!vgcreate_params_validate(cmd, &vp_new)) return EINVALID_CMD_LINE; + /* + * Needed to change the global VG namespace, + * and to change the set of orphan PVs. + */ + if (!lockd_gl_create(cmd, "ex", vp_new.lock_type)) + return ECMD_FAILED; + lvmcache_seed_infos_from_lvmetad(cmd); /* Create the new VG */ @@ -119,6 +126,19 @@ int vgcreate(struct cmd_context *cmd, int argc, char **argv) if (!vg_write(vg) || !vg_commit(vg)) goto_bad; + /* + * The VG is initially written without lock_type set, i.e. it starts as + * a local VG. lockd_init_vg() then writes the VG a second time with + * both lock_type and lock_args set. + */ + if (!lockd_init_vg(cmd, vg, vp_new.lock_type)) { + log_error("Failed to initialize lock args for lock type %s", + vp_new.lock_type); + vg_remove_pvs(vg); + vg_remove_direct(vg); + goto_bad; + } + unlock_vg(cmd, VG_ORPHANS); unlock_vg(cmd, vp_new.vg_name); @@ -128,6 +148,33 @@ int vgcreate(struct cmd_context *cmd, int argc, char **argv) clustered_message, *clustered_message ? 'v' : 'V', vg->name, vg->system_id ? " with system ID " : "", vg->system_id ? : ""); + /* + * Start the VG lockspace because it will likely be used right away. + * Optionally wait for the start to complete so the VG can be fully + * used after this command completes (otherwise, the VG can only be + * read without locks until the lockspace is done starting.) + */ + if (is_lockd_type(vg->lock_type)) { + const char *start_opt = arg_str_value(cmd, lockopt_ARG, NULL); + + if (!lockd_start_vg(cmd, vg)) { + log_error("Failed to start locking"); + goto out; + } + + lockd_gl(cmd, "un", 0); + + if (!start_opt || !strcmp(start_opt, "wait")) { + /* It is OK if the user does Ctrl-C to cancel the wait. 
*/ + log_print_unless_silent("Starting locking. Waiting until locks are ready..."); + lockd_start_wait(cmd); + + } else if (!strcmp(start_opt, "nowait")) { + log_print_unless_silent("Starting locking. VG is read-only until locks are ready."); + } + + } +out: release_vg(vg); return ECMD_PROCESSED; diff --git a/tools/vgextend.c b/tools/vgextend.c index de6d862e8..581c21127 100644 --- a/tools/vgextend.c +++ b/tools/vgextend.c @@ -165,6 +165,10 @@ int vgextend(struct cmd_context *cmd, int argc, char **argv) */ cmd->handles_missing_pvs = 1; + /* Needed to change the set of orphan PVs. */ + if (!lockd_gl(cmd, "ex", 0)) + return_ECMD_FAILED; + ret = process_each_vg(cmd, argc, argv, READ_FOR_UPDATE | ONE_VGNAME_ARG, handle, restoremissing ? &_vgextend_restoremissing : &_vgextend_single); diff --git a/tools/vgmerge.c b/tools/vgmerge.c index a17a636c5..c5ac33299 100644 --- a/tools/vgmerge.c +++ b/tools/vgmerge.c @@ -20,11 +20,18 @@ static struct volume_group *_vgmerge_vg_read(struct cmd_context *cmd, { struct volume_group *vg; log_verbose("Checking for volume group \"%s\"", vg_name); - vg = vg_read_for_update(cmd, vg_name, NULL, 0); + vg = vg_read_for_update(cmd, vg_name, NULL, 0, 0); if (vg_read_error(vg)) { release_vg(vg); return NULL; } + + if (is_lockd_type(vg->lock_type)) { + log_error("vgmerge not allowed for lock_type %s", vg->lock_type); + unlock_and_release_vg(cmd, vg, vg_name); + return NULL; + } + return vg; } @@ -194,6 +201,10 @@ int vgmerge(struct cmd_context *cmd, int argc, char **argv) return EINVALID_CMD_LINE; } + /* Needed change the global VG namespace. 
*/ + if (!lockd_gl(cmd, "ex", LDGL_UPDATE_NAMES)) + return ECMD_FAILED; + vg_name_to = skip_dev_dir(cmd, argv[0], NULL); argc--; argv++; diff --git a/tools/vgreduce.c b/tools/vgreduce.c index 0adf1bb85..693f538ae 100644 --- a/tools/vgreduce.c +++ b/tools/vgreduce.c @@ -141,6 +141,7 @@ int vgreduce(struct cmd_context *cmd, int argc, char **argv) { struct volume_group *vg; const char *vg_name; + uint32_t lockd_state; int ret = ECMD_FAILED; int fixed = 1; int repairing = arg_count(cmd, removemissing_ARG); @@ -195,7 +196,14 @@ int vgreduce(struct cmd_context *cmd, int argc, char **argv) init_ignore_suspended_devices(1); cmd->handles_missing_pvs = 1; - vg = vg_read_for_update(cmd, vg_name, NULL, READ_ALLOW_EXPORTED); + /* Needed to change the set of orphan PVs. */ + if (!lockd_gl(cmd, "ex", 0)) + return_ECMD_FAILED; + + if (!lockd_vg(cmd, vg_name, "ex", 0, &lockd_state)) + return_ECMD_FAILED; + + vg = vg_read_for_update(cmd, vg_name, NULL, READ_ALLOW_EXPORTED, lockd_state); if (vg_read_error(vg) == FAILED_ALLOCATION || vg_read_error(vg) == FAILED_NOTFOUND) goto_out; @@ -218,7 +226,7 @@ int vgreduce(struct cmd_context *cmd, int argc, char **argv) log_verbose("Trying to open VG %s for recovery...", vg_name); vg = vg_read_for_update(cmd, vg_name, NULL, - READ_ALLOW_INCONSISTENT | READ_ALLOW_EXPORTED); + READ_ALLOW_INCONSISTENT | READ_ALLOW_EXPORTED, lockd_state); locked |= !vg_read_error(vg); diff --git a/tools/vgremove.c b/tools/vgremove.c index fd9735604..7c1f0ee4e 100644 --- a/tools/vgremove.c +++ b/tools/vgremove.c @@ -68,6 +68,9 @@ static int vgremove_single(struct cmd_context *cmd, const char *vg_name, } } + if (!lockd_free_vg_before(cmd, vg)) + return_ECMD_FAILED; + if (!force && !vg_remove_check(vg)) return_ECMD_FAILED; @@ -76,6 +79,8 @@ static int vgremove_single(struct cmd_context *cmd, const char *vg_name, if (!vg_remove(vg)) return_ECMD_FAILED; + lockd_free_vg_final(cmd, vg); + return ECMD_PROCESSED; } @@ -89,6 +94,13 @@ int vgremove(struct cmd_context *cmd, 
int argc, char **argv) return EINVALID_CMD_LINE; } + /* + * Needed to change the global VG namespace, + * and to change the set of orphan PVs. + */ + if (!lockd_gl(cmd, "ex", LDGL_UPDATE_NAMES)) + return ECMD_FAILED; + cmd->handles_missing_pvs = 1; ret = process_each_vg(cmd, argc, argv, READ_FOR_UPDATE, diff --git a/tools/vgrename.c b/tools/vgrename.c index 860ccf196..188061be6 100644 --- a/tools/vgrename.c +++ b/tools/vgrename.c @@ -17,13 +17,14 @@ static struct volume_group *_get_old_vg_for_rename(struct cmd_context *cmd, const char *vg_name_old, - const char *vgid) + const char *vgid, + uint32_t lockd_state) { struct volume_group *vg; /* FIXME we used to print an error about EXPORTED, but proceeded nevertheless. */ - vg = vg_read_for_update(cmd, vg_name_old, vgid, READ_ALLOW_EXPORTED); + vg = vg_read_for_update(cmd, vg_name_old, vgid, READ_ALLOW_EXPORTED, lockd_state); if (vg_read_error(vg)) { release_vg(vg); return_NULL; @@ -67,6 +68,7 @@ static int vg_rename_path(struct cmd_context *cmd, const char *old_vg_path, const char *vgid = NULL, *vg_name, *vg_name_old; char old_path[NAME_LEN], new_path[NAME_LEN]; struct volume_group *vg = NULL; + uint32_t lockd_state; int lock_vg_old_first = 1; vg_name_old = skip_dev_dir(cmd, old_vg_path, NULL); @@ -114,11 +116,14 @@ static int vg_rename_path(struct cmd_context *cmd, const char *old_vg_path, } else vgid = NULL; + if (!lockd_vg(cmd, vg_name_old, "ex", 0, &lockd_state)) + return_0; + if (strcmp(vg_name_new, vg_name_old) < 0) lock_vg_old_first = 0; if (lock_vg_old_first) { - vg = _get_old_vg_for_rename(cmd, vg_name_old, vgid); + vg = _get_old_vg_for_rename(cmd, vg_name_old, vgid, lockd_state); if (!vg) return_0; @@ -130,7 +135,7 @@ static int vg_rename_path(struct cmd_context *cmd, const char *old_vg_path, if (!_lock_new_vg_for_rename(cmd, vg_name_new)) return_0; - vg = _get_old_vg_for_rename(cmd, vg_name_old, vgid); + vg = _get_old_vg_for_rename(cmd, vg_name_old, vgid, lockd_state); if (!vg) { unlock_vg(cmd, 
vg_name_new); return_0; @@ -144,6 +149,9 @@ static int vg_rename_path(struct cmd_context *cmd, const char *old_vg_path, if (!drop_cached_metadata(vg)) stack; + if (!lockd_rename_vg_before(cmd, vg)) + return_0; + /* Change the volume group name */ vg_rename(cmd, vg, vg_name_new); @@ -171,6 +179,8 @@ static int vg_rename_path(struct cmd_context *cmd, const char *old_vg_path, } } + lockd_rename_vg_final(cmd, vg, 1); + if (!backup(vg)) stack; if (!backup_remove(cmd, vg_name_old)) @@ -190,6 +200,8 @@ static int vg_rename_path(struct cmd_context *cmd, const char *old_vg_path, return 1; error: + lockd_rename_vg_final(cmd, vg, 0); + if (lock_vg_old_first) { unlock_vg(cmd, vg_name_new); unlock_and_release_vg(cmd, vg, vg_name_old); @@ -207,6 +219,10 @@ int vgrename(struct cmd_context *cmd, int argc, char **argv) { return EINVALID_CMD_LINE; } + /* Needed to change the global VG namespace. */ + if (!lockd_gl(cmd, "ex", LDGL_UPDATE_NAMES)) + return_ECMD_FAILED; + if (!vg_rename_path(cmd, argv[0], argv[1])) return_ECMD_FAILED; diff --git a/tools/vgsplit.c b/tools/vgsplit.c index 53f3975c3..7605bc4b0 100644 --- a/tools/vgsplit.c +++ b/tools/vgsplit.c @@ -422,7 +422,7 @@ static struct volume_group *_vgsplit_to(struct cmd_context *cmd, if (vg_read_error(vg_to) == FAILED_EXIST) { *existing_vg = 1; release_vg(vg_to); - vg_to = vg_read_for_update(cmd, vg_name_to, NULL, 0); + vg_to = vg_read_for_update(cmd, vg_name_to, NULL, 0, 0); if (vg_read_error(vg_to)) { release_vg(vg_to); @@ -448,11 +448,18 @@ static struct volume_group *_vgsplit_from(struct cmd_context *cmd, log_verbose("Checking for volume group \"%s\"", vg_name_from); - vg_from = vg_read_for_update(cmd, vg_name_from, NULL, 0); + vg_from = vg_read_for_update(cmd, vg_name_from, NULL, 0, 0); if (vg_read_error(vg_from)) { release_vg(vg_from); return NULL; } + + if (is_lockd_type(vg_from->lock_type)) { + log_error("vgsplit not allowed for lock_type %s", vg_from->lock_type); + unlock_and_release_vg(cmd, vg_from, vg_name_from); + return 
NULL; + } + return vg_from; } @@ -492,6 +499,10 @@ int vgsplit(struct cmd_context *cmd, int argc, char **argv) return ECMD_FAILED; } + /* Needed change the global VG namespace. */ + if (!lockd_gl(cmd, "ex", LDGL_UPDATE_NAMES)) + return_ECMD_FAILED; + if (arg_count(cmd, name_ARG)) lv_name = arg_value(cmd, name_ARG); else @@ -662,7 +673,7 @@ int vgsplit(struct cmd_context *cmd, int argc, char **argv) if (!test_mode()) { release_vg(vg_to); vg_to = vg_read_for_update(cmd, vg_name_to, NULL, - READ_ALLOW_EXPORTED); + READ_ALLOW_EXPORTED, 0); if (vg_read_error(vg_to)) { log_error("Volume group \"%s\" became inconsistent: " "please fix manually", vg_name_to); |