diff options
Diffstat (limited to 'daemons/lvmlockd/lvmlockd-sanlock.c')
-rw-r--r-- | daemons/lvmlockd/lvmlockd-sanlock.c | 1716 |
1 files changed, 1716 insertions, 0 deletions
diff --git a/daemons/lvmlockd/lvmlockd-sanlock.c b/daemons/lvmlockd/lvmlockd-sanlock.c new file mode 100644 index 000000000..8f3ecffb5 --- /dev/null +++ b/daemons/lvmlockd/lvmlockd-sanlock.c @@ -0,0 +1,1716 @@ +/* + * Copyright (C) 2014 Red Hat, Inc. + * + * This file is part of LVM2. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU Lesser General Public License v.2.1. + */ + +#define _XOPEN_SOURCE 500 /* pthread */ +#define _ISOC99_SOURCE +#define _GNU_SOURCE + +#include <assert.h> +#include <pthread.h> +#include <stdint.h> +#include <stddef.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdio.h> +#include <poll.h> +#include <errno.h> +#include <string.h> +#include <syslog.h> +#include <sys/types.h> +#include <sys/socket.h> + +#include "configure.h" +#include "daemon-server.h" +#include "daemon-log.h" +#include "xlate.h" + +#include "lvmlockd-internal.h" +#include "lvmlockd-client.h" + +#include "sanlock.h" +#include "sanlock_rv.h" +#include "sanlock_admin.h" +#include "sanlock_resource.h" + +/* + * If access to the pv containing the vg's leases is lost, sanlock cannot renew + * the leases we have acquired for locked LVs. This means that we could soon + * loose the lease to another host which could activate our LV exclusively. We + * do not want to get to the point of two hosts having the same LV active + * exclusively (it obviously violates the purpose of LV locks.) + * + * The default method of preventing this problem is for lvmlockd to do nothing, + * which produces a safe but potentially inconvenient result. Doing nothing + * leads to our LV leases not being released, which leads to sanlock using the + * local watchdog to reset us before another host can acquire our lock. It + * would often be preferrable to avoid the abrupt hard reset from the watchdog. + * + * There are other options to avoid being reset by our watchdog. If we can + * quickly stop using the LVs in question and release the locks for them, then + * we could avoid a reset (there's a certain grace period of about 40 seconds + * in which we can attempt this.) To do this, we can tell sanlock to run a + * specific program when it has lost access to our leases. We could use this + * program to: + * + * 1. Deactivate all lvs in the effected vg. If all the leases are + * deactivated, then our LV locks would be released and sanlock would no longer + * use the watchdog to reset us. If file systems are mounted on the active + * lvs, then deactivating them would fail, so this option would be of limited + * usefulness. + * + * 2. Option 1 could be extended to kill pids using the fs on the lv, unmount + * the fs, and deactivate the lv. This is probably out of scope for lvm + * directly, and would likely need the help of another system service. + * + * 3. Use dmsetup suspend to block access to lvs in the effected vg. If this + * was successful, the local host could no longer write to the lvs, we could + * safely release the LV locks, and sanlock would no longer reset us. At this + * point, with suspended lvs, the host would be in a fairly hobbled state, and + * would almost certainly need a manual, forcible reset. + * + * 4. Option 3 could be extended to monitor the lost storage, and if it is + * reconnected, the leases could be reacquired, and the suspended lvs resumed + * (reacquiring leases will fail if another host has acquired them since they + * were released.) This complexity of this option, combined with the fact that + * the error conditions are often not as simple as storage being lost and then + * later connecting, will result in this option being too unreliable. + * + * Add a config option that we could use to select a different behavior than + * the default. Then implement one of the simpler options as a proof of + * concept, which could be extended if needed. + */ + +/* + * Each lockspace thread has its own sanlock daemon connection. + * If they shared one, sanlock acquire/release calls would be + * serialized. Some aspects of sanlock expect a single connection + * from each pid: signals due to a sanlock_request, and + * acquire/release/convert/inquire. The later can probably be + * addressed with a flag to indicate that the pid field should be + * interpretted as 'ci' (which the caller would need to figure + * out somehow.) + */ + +struct lm_sanlock { + struct sanlk_lockspace ss; + int align_size; + int sock; /* sanlock daemon connection */ +}; + +struct rd_sanlock { + union { + struct sanlk_resource rs; + char buf[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; + }; + struct val_blk *vb; +}; + +struct sanlk_resourced { + union { + struct sanlk_resource rs; + char buf[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; + }; +}; + +int lm_data_size_sanlock(void) +{ + return sizeof(struct rd_sanlock); +} + +/* + * lock_args format + * + * vg_lock_args format for sanlock is + * vg_version_string:undefined:lock_lv_name + * + * lv_lock_args format for sanlock is + * lv_version_string:undefined:offset + * + * version_string is MAJOR.MINOR.PATCH + * undefined may contain ":" + * + * If a new version of the lock_args string cannot be + * handled by an old version of lvmlockd, then the + * new lock_args string should contain a larger major number. + */ + +#define VG_LOCK_ARGS_MAJOR 1 +#define VG_LOCK_ARGS_MINOR 0 +#define VG_LOCK_ARGS_PATCH 0 + +#define LV_LOCK_ARGS_MAJOR 1 +#define LV_LOCK_ARGS_MINOR 0 +#define LV_LOCK_ARGS_PATCH 0 + +/* + * offset 0 is lockspace + * offset align_size * 1 is unused + * offset align_size * 2 is unused + * ... + * offset align_size * 64 is unused + * offset align_size * 65 is gl lock + * offset align_size * 66 is vg lock + * offset align_size * 67 is first lv lock + * offset align_size * 68 is second lv lock + * ... + */ + +#define LS_BEGIN 0 +#define GL_LOCK_BEGIN 65 +#define VG_LOCK_BEGIN 66 +#define LV_LOCK_BEGIN 67 + +static int lock_lv_name_from_args(char *vg_args, char *lock_lv_name) +{ + return last_string_from_args(vg_args, lock_lv_name); +} + +static int lock_lv_offset_from_args(char *lv_args, uint64_t *lock_lv_offset) +{ + char offset_str[MAX_ARGS]; + int rv; + + memset(offset_str, 0, sizeof(offset_str)); + + rv = last_string_from_args(lv_args, offset_str); + if (rv < 0) + return rv; + + *lock_lv_offset = strtoull(offset_str, NULL, 10); + return 0; +} + +static int check_args_version(char *args, unsigned int our_major) +{ + unsigned int major = 0; + int rv; + + rv = version_from_args(args, &major, NULL, NULL); + if (rv < 0) { + log_error("check_args_version %s error %d", args, rv); + return rv; + } + + if (major > our_major) { + log_error("check_args_version %s major %u %u", args, major, our_major); + return -1; + } + + return 0; +} + +#define MAX_LINE 64 + +static int read_host_id_file(void) +{ + FILE *file; + char line[MAX_LINE]; + char key_str[MAX_LINE]; + char val_str[MAX_LINE]; + char *key, *val, *sep; + int host_id = 0; + + file = fopen(daemon_host_id_file, "r"); + if (!file) + goto out; + + while (fgets(line, MAX_LINE, file)) { + if (line[0] == '#' || line[0] == '\n') + continue; + + key = line; + sep = strstr(line, "="); + val = sep + 1; + + if (!sep || !val) + continue; + + *sep = '\0'; + memset(key_str, 0, sizeof(key_str)); + memset(val_str, 0, sizeof(val_str)); + sscanf(key, "%s", key_str); + sscanf(val, "%s", val_str); + + if (!strcmp(key_str, "host_id")) { + host_id = atoi(val_str); + break; + } + } + fclose(file); +out: + log_debug("host_id %d from %s", host_id, daemon_host_id_file); + return host_id; +} + +/* + * vgcreate + * + * For init_vg, vgcreate passes the internal lv name as vg_args. + * This constructs the full/proper vg_args format, containing the + * version and lv name, and returns the real lock_args in vg_args. + */ + +int lm_init_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_args) +{ + struct sanlk_lockspace ss; + struct sanlk_resourced rd; + struct sanlk_disk disk; + char lock_lv_name[MAX_ARGS]; + char lock_args_version[MAX_ARGS]; + const char *gl_name = NULL; + uint32_t daemon_version; + uint32_t daemon_proto; + uint64_t offset; + int align_size; + int i, rv; + + memset(&ss, 0, sizeof(ss)); + memset(&rd, 0, sizeof(rd)); + memset(&disk, 0, sizeof(disk)); + memset(lock_lv_name, 0, sizeof(lock_lv_name)); + memset(lock_args_version, 0, sizeof(lock_args_version)); + + if (!vg_args || !vg_args[0] || !strcmp(vg_args, "none")) { + log_error("S %s init_vg_san vg_args missing", ls_name); + return -EARGS; + } + + snprintf(lock_args_version, MAX_ARGS, "%u.%u.%u", + VG_LOCK_ARGS_MAJOR, VG_LOCK_ARGS_MINOR, VG_LOCK_ARGS_PATCH); + + /* see comment above about input vg_args being only lock_lv_name */ + snprintf(lock_lv_name, MAX_ARGS, "%s", vg_args); + + if (strlen(lock_lv_name) + strlen(lock_args_version) + 2 > MAX_ARGS) + return -EARGS; + + snprintf(disk.path, SANLK_PATH_LEN, "/dev/mapper/%s-%s", vg_name, lock_lv_name); + + log_debug("S %s init_vg_san path %s", ls_name, disk.path); + + if (daemon_test) { + if (!gl_lsname_sanlock[0]) + strncpy(gl_lsname_sanlock, ls_name, MAX_NAME); + return 0; + } + + rv = sanlock_version(0, &daemon_version, &daemon_proto); + if (rv < 0) { + log_error("S %s init_vg_san failed to connect to sanlock daemon", ls_name); + return -EMANAGER; + } + + log_debug("sanlock daemon version %08x proto %08x", + daemon_version, daemon_proto); + + align_size = sanlock_align(&disk); + if (align_size <= 0) { + log_error("S %s init_vg_san bad disk align size %d %s", + ls_name, align_size, disk.path); + return -EARGS; + } + + strncpy(ss.name, ls_name, SANLK_NAME_LEN); + memcpy(ss.host_id_disk.path, disk.path, SANLK_PATH_LEN); + ss.host_id_disk.offset = LS_BEGIN * align_size; + + rv = sanlock_write_lockspace(&ss, 0, 0, sanlock_io_timeout); + if (rv < 0) { + log_error("S %s init_vg_san write_lockspace error %d %s", + ls_name, rv, ss.host_id_disk.path); + return rv; + } + + /* + * We want to create the global lock in the first sanlock vg. + * If other sanlock vgs exist, then one of them must contain + * the gl. If gl_lsname_sanlock is not set, then perhaps + * the sanlock vg with the gl has been removed or has not yet + * been seen. (Would vgcreate get this far in that case?) + * If dlm vgs exist, then we choose to use the dlm gl and + * not a sanlock gl. + */ + + if (flags & LD_AF_ENABLE) + gl_name = R_NAME_GL; + else if (flags & LD_AF_DISABLE) + gl_name = R_NAME_GL_DISABLED; + else if (!gl_use_sanlock || gl_lsname_sanlock[0] || !lockspaces_empty()) + gl_name = R_NAME_GL_DISABLED; + else + gl_name = R_NAME_GL; + + memcpy(rd.rs.lockspace_name, ss.name, SANLK_NAME_LEN); + strncpy(rd.rs.name, gl_name, SANLK_NAME_LEN); + memcpy(rd.rs.disks[0].path, disk.path, SANLK_PATH_LEN); + rd.rs.disks[0].offset = align_size * GL_LOCK_BEGIN; + rd.rs.num_disks = 1; + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv < 0) { + log_error("S %s init_vg_san write_resource gl error %d %s", + ls_name, rv, rd.rs.disks[0].path); + return rv; + } + + memcpy(rd.rs.lockspace_name, ss.name, SANLK_NAME_LEN); + strncpy(rd.rs.name, R_NAME_VG, SANLK_NAME_LEN); + memcpy(rd.rs.disks[0].path, disk.path, SANLK_PATH_LEN); + rd.rs.disks[0].offset = align_size * VG_LOCK_BEGIN; + rd.rs.num_disks = 1; + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv < 0) { + log_error("S %s init_vg_san write_resource vg error %d %s", + ls_name, rv, rd.rs.disks[0].path); + return rv; + } + + if (!strcmp(gl_name, R_NAME_GL)) + strncpy(gl_lsname_sanlock, ls_name, MAX_NAME); + + snprintf(vg_args, MAX_ARGS, "%s:%s", lock_args_version, lock_lv_name); + + log_debug("S %s init_vg_san done vg_args %s", ls_name, vg_args); + + /* + * Go through all lv resource slots and initialize them with the + * correct lockspace name but a special resource name that indicates + * it is unused. + */ + + memset(&rd, 0, sizeof(rd)); + rd.rs.num_disks = 1; + memcpy(rd.rs.disks[0].path, disk.path, SANLK_PATH_LEN); + strncpy(rd.rs.lockspace_name, ls_name, SANLK_NAME_LEN); + strcpy(rd.rs.name, "#unused"); + + offset = align_size * LV_LOCK_BEGIN; + + log_debug("S %s init_vg_san clearing lv lease areas", ls_name); + + for (i = 0; ; i++) { + rd.rs.disks[0].offset = offset; + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv == -EMSGSIZE || rv == -ENOSPC) { + /* This indicates the end of the device is reached. */ + rv = -EMSGSIZE; + break; + } + + if (rv) { + log_error("clear lv resource area %llu error %d", + (unsigned long long)offset, rv); + break; + } + offset += align_size; + } + + return 0; +} + +/* + * lvcreate + * + * The offset at which the lv lease is written is passed + * all the way back to the lvcreate command so that it + * can be saved in the lv's lock_args in the vg metadata. + */ + +int lm_init_lv_sanlock(char *ls_name, char *vg_name, char *lv_name, + char *vg_args, char *lv_args, uint64_t free_offset) +{ + struct sanlk_resourced rd; + char lock_lv_name[MAX_ARGS]; + char lock_args_version[MAX_ARGS]; + uint64_t offset; + int align_size; + int rv; + + memset(&rd, 0, sizeof(rd)); + memset(lock_lv_name, 0, sizeof(lock_lv_name)); + memset(lock_args_version, 0, sizeof(lock_args_version)); + + rv = lock_lv_name_from_args(vg_args, lock_lv_name); + if (rv < 0) { + log_error("S %s init_lv_san lock_lv_name_from_args error %d %s", + ls_name, rv, vg_args); + return rv; + } + + snprintf(lock_args_version, MAX_ARGS, "%u.%u.%u", + LV_LOCK_ARGS_MAJOR, LV_LOCK_ARGS_MINOR, LV_LOCK_ARGS_PATCH); + + strncpy(rd.rs.lockspace_name, ls_name, SANLK_NAME_LEN); + rd.rs.num_disks = 1; + snprintf(rd.rs.disks[0].path, SANLK_PATH_LEN, "/dev/mapper/%s-%s", vg_name, lock_lv_name); + + align_size = sanlock_align(&rd.rs.disks[0]); + if (align_size <= 0) { + log_error("S %s init_lv_san align error %d", ls_name, align_size); + return -EINVAL; + } + + if (free_offset) + offset = free_offset; + else + offset = align_size * LV_LOCK_BEGIN; + rd.rs.disks[0].offset = offset; + + if (daemon_test) { + snprintf(lv_args, MAX_ARGS, "%s:%llu", + lock_args_version, (unsigned long long)1111); + return 0; + } + + while (1) { + rd.rs.disks[0].offset = offset; + + memset(rd.rs.name, 0, SANLK_NAME_LEN); + + rv = sanlock_read_resource(&rd.rs, 0); + if (rv == -EMSGSIZE || rv == -ENOSPC) { + /* This indicates the end of the device is reached. */ + log_debug("S %s init_lv_san read limit offset %llu", + ls_name, (unsigned long long)offset); + rv = -EMSGSIZE; + return rv; + } + + if (rv && rv != SANLK_LEADER_MAGIC) { + log_error("S %s init_lv_san read error %d offset %llu", + ls_name, rv, (unsigned long long)offset); + break; + } + + if (!strncmp(rd.rs.name, lv_name, SANLK_NAME_LEN)) { + log_error("S %s init_lv_san resource name %s already exists at %llu", + ls_name, lv_name, (unsigned long long)offset); + return -EEXIST; + } + + /* + * If we read newly extended space, it will not be initialized + * with an "#unused" resource, but will return SANLK_LEADER_MAGIC + * indicating an uninitialized paxos structure on disk. + */ + if ((rv == SANLK_LEADER_MAGIC) || !strcmp(rd.rs.name, "#unused")) { + log_debug("S %s init_lv_san %s found unused area at %llu", + ls_name, lv_name, (unsigned long long)offset); + + strncpy(rd.rs.name, lv_name, SANLK_NAME_LEN); + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (!rv) { + snprintf(lv_args, MAX_ARGS, "%s:%llu", + lock_args_version, (unsigned long long)offset); + } else { + log_error("S %s init_lv_san write error %d offset %llu", + ls_name, rv, (unsigned long long)rv); + } + break; + } + + offset += align_size; + } + + return rv; +} + +/* + * Read the lockspace and each resource, replace the lockspace name, + * and write it back. + */ + +int lm_rename_vg_sanlock(char *ls_name, char *vg_name, uint32_t flags, char *vg_args) +{ + struct sanlk_lockspace ss; + struct sanlk_resourced rd; + struct sanlk_disk disk; + char lock_lv_name[MAX_ARGS]; + uint64_t offset; + uint32_t io_timeout; + int align_size; + int i, rv; + + memset(&disk, 0, sizeof(disk)); + memset(lock_lv_name, 0, sizeof(lock_lv_name)); + + if (!vg_args || !vg_args[0] || !strcmp(vg_args, "none")) { + log_error("S %s rename_vg_san vg_args missing", ls_name); + return -EINVAL; + } + + rv = lock_lv_name_from_args(vg_args, lock_lv_name); + if (rv < 0) { + log_error("S %s init_lv_san lock_lv_name_from_args error %d %s", + ls_name, rv, vg_args); + return rv; + } + + snprintf(disk.path, SANLK_PATH_LEN, "/dev/mapper/%s-%s", vg_name, lock_lv_name); + + log_debug("S %s rename_vg_san path %s", ls_name, disk.path); + + if (daemon_test) + return 0; + + /* FIXME: device is not always ready for us here */ + sleep(1); + + align_size = sanlock_align(&disk); + if (align_size <= 0) { + log_error("S %s rename_vg_san bad align size %d %s", + ls_name, align_size, disk.path); + return -EINVAL; + } + + /* + * Lockspace + */ + + memset(&ss, 0, sizeof(ss)); + memcpy(ss.host_id_disk.path, disk.path, SANLK_PATH_LEN); + ss.host_id_disk.offset = LS_BEGIN * align_size; + + rv = sanlock_read_lockspace(&ss, 0, &io_timeout); + if (rv < 0) { + log_error("S %s rename_vg_san read_lockspace error %d %s", + ls_name, rv, ss.host_id_disk.path); + return rv; + } + + strncpy(ss.name, ls_name, SANLK_NAME_LEN); + + rv = sanlock_write_lockspace(&ss, 0, 0, sanlock_io_timeout); + if (rv < 0) { + log_error("S %s rename_vg_san write_lockspace error %d %s", + ls_name, rv, ss.host_id_disk.path); + return rv; + } + + /* + * GL resource + */ + + memset(&rd, 0, sizeof(rd)); + memcpy(rd.rs.disks[0].path, disk.path, SANLK_PATH_LEN); + rd.rs.disks[0].offset = align_size * GL_LOCK_BEGIN; + rd.rs.num_disks = 1; + + rv = sanlock_read_resource(&rd.rs, 0); + if (rv < 0) { + log_error("S %s rename_vg_san read_resource gl error %d %s", + ls_name, rv, rd.rs.disks[0].path); + return rv; + } + + strncpy(rd.rs.lockspace_name, ss.name, SANLK_NAME_LEN); + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv < 0) { + log_error("S %s rename_vg_san write_resource gl error %d %s", + ls_name, rv, rd.rs.disks[0].path); + return rv; + } + + /* + * VG resource + */ + + memset(&rd, 0, sizeof(rd)); + memcpy(rd.rs.disks[0].path, disk.path, SANLK_PATH_LEN); + rd.rs.disks[0].offset = align_size * VG_LOCK_BEGIN; + rd.rs.num_disks = 1; + + rv = sanlock_read_resource(&rd.rs, 0); + if (rv < 0) { + log_error("S %s rename_vg_san write_resource vg error %d %s", + ls_name, rv, rd.rs.disks[0].path); + return rv; + } + + strncpy(rd.rs.lockspace_name, ss.name, SANLK_NAME_LEN); + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv < 0) { + log_error("S %s rename_vg_san write_resource vg error %d %s", + ls_name, rv, rd.rs.disks[0].path); + return rv; + } + + /* + * LV resources + */ + + offset = align_size * LV_LOCK_BEGIN; + + for (i = 0; ; i++) { + memset(&rd, 0, sizeof(rd)); + memcpy(rd.rs.disks[0].path, disk.path, SANLK_PATH_LEN); + rd.rs.disks[0].offset = offset; + rd.rs.num_disks = 1; + + rv = sanlock_read_resource(&rd.rs, 0); + if (rv == -EMSGSIZE || rv == -ENOSPC) { + /* This indicates the end of the device is reached. */ + rv = -EMSGSIZE; + break; + } + + if (rv < 0) { + log_error("S %s rename_vg_san read_resource resource area %llu error %d", + ls_name, (unsigned long long)offset, rv); + break; + } + + strncpy(rd.rs.lockspace_name, ss.name, SANLK_NAME_LEN); + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv) { + log_error("S %s rename_vg_san write_resource resource area %llu error %d", + ls_name, (unsigned long long)offset, rv); + break; + } + offset += align_size; + } + + return 0; +} + +/* lvremove */ +int lm_free_lv_sanlock(struct lockspace *ls, struct resource *r) +{ + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + struct sanlk_resource *rs = &rds->rs; + int rv; + + log_debug("S %s R %s free_lv_san", ls->name, r->name); + + if (daemon_test) + return 0; + + strcpy(rs->name, "#unused"); + + rv = sanlock_write_resource(rs, 0, 0, 0); + if (rv < 0) { + log_error("S %s R %s free_lv_san write error %d", + ls->name, r->name, rv); + } + + return rv; +} + +int lm_ex_disable_gl_sanlock(struct lockspace *ls) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct sanlk_resourced rd1; + struct sanlk_resourced rd2; + struct sanlk_resource *rs1; + struct sanlk_resource *rs2; + struct sanlk_resource **rs_args; + int rv; + + rs_args = malloc(2 * sizeof(struct sanlk_resource *)); + if (!rs_args) + return -ENOMEM; + + rs1 = &rd1.rs; + rs2 = &rd2.rs; + + memset(&rd1, 0, sizeof(rd1)); + memset(&rd2, 0, sizeof(rd2)); + + strncpy(rd1.rs.lockspace_name, ls->name, SANLK_NAME_LEN); + strncpy(rd1.rs.name, R_NAME_GL, SANLK_NAME_LEN); + + strncpy(rd2.rs.lockspace_name, ls->name, SANLK_NAME_LEN); + strncpy(rd2.rs.name, R_NAME_GL_DISABLED, SANLK_NAME_LEN); + + rd1.rs.num_disks = 1; + strncpy(rd1.rs.disks[0].path, lms->ss.host_id_disk.path, SANLK_PATH_LEN); + rd1.rs.disks[0].offset = lms->align_size * GL_LOCK_BEGIN; + + rv = sanlock_acquire(lms->sock, -1, 0, 1, &rs1, NULL); + if (rv < 0) { + log_error("S %s ex_disable_gl_san acquire error %d", + ls->name, rv); + goto out; + } + + rs_args[0] = rs1; + rs_args[1] = rs2; + + rv = sanlock_release(lms->sock, -1, SANLK_REL_RENAME, 2, rs_args); + if (rv < 0) { + log_error("S %s ex_disable_gl_san release_rename error %d", + ls->name, rv); + } + +out: + free(rs_args); + return rv; +} + +/* + * enable/disable exist because each vg contains a global lock, + * but we only want to use the gl from one of them. The first + * sanlock vg created, has its gl enabled, and subsequent + * sanlock vgs have their gl disabled. If the vg containing the + * gl is removed, the gl from another sanlock vg needs to be + * enabled. Or, if gl in multiple vgs are somehow enabled, we + * want to be able to disable one of them. + * + * Disable works by naming/renaming the gl resource to have a + * name that is different from the predefined name. + * When a host attempts to acquire the gl with its standard + * predefined name, it will fail because the resource's name + * on disk doesn't match. + */ + +int lm_able_gl_sanlock(struct lockspace *ls, int enable) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct sanlk_resourced rd; + const char *gl_name; + int rv; + + if (enable) + gl_name = R_NAME_GL; + else + gl_name = R_NAME_GL_DISABLED; + + memset(&rd, 0, sizeof(rd)); + + strncpy(rd.rs.lockspace_name, ls->name, SANLK_NAME_LEN); + strncpy(rd.rs.name, gl_name, SANLK_NAME_LEN); + + rd.rs.num_disks = 1; + strncpy(rd.rs.disks[0].path, lms->ss.host_id_disk.path, SANLK_PATH_LEN); + rd.rs.disks[0].offset = lms->align_size * GL_LOCK_BEGIN; + + rv = sanlock_write_resource(&rd.rs, 0, 0, 0); + if (rv < 0) { + log_error("S %s able_gl %d write_resource gl error %d %s", + ls->name, enable, rv, rd.rs.disks[0].path); + return rv; + } + + log_debug("S %s able_gl %s", ls->name, gl_name); + + ls->sanlock_gl_enabled = enable; + if (ls->sanlock_gl_dup && !enable) + ls->sanlock_gl_dup = 0; + + if (enable) + strncpy(gl_lsname_sanlock, ls->name, MAX_NAME); + + if (!enable && !strcmp(gl_lsname_sanlock, ls->name)) + memset(gl_lsname_sanlock, 0, sizeof(gl_lsname_sanlock)); + + return 0; +} + +static int gl_is_enabled(struct lockspace *ls, struct lm_sanlock *lms) +{ + char strname[SANLK_NAME_LEN + 1]; + struct sanlk_resourced rd; + uint64_t offset; + int rv; + + memset(&rd, 0, sizeof(rd)); + + strncpy(rd.rs.lockspace_name, ls->name, SANLK_NAME_LEN); + + /* leave rs.name empty, it is what we're checking */ + + rd.rs.num_disks = 1; + strncpy(rd.rs.disks[0].path, lms->ss.host_id_disk.path, SANLK_PATH_LEN); + + offset = lms->align_size * GL_LOCK_BEGIN; + rd.rs.disks[0].offset = offset; + + rv = sanlock_read_resource(&rd.rs, 0); + if (rv < 0) { + log_error("gl_is_enabled read_resource error %d", rv); + return rv; + } + + memset(strname, 0, sizeof(strname)); + memcpy(strname, rd.rs.name, SANLK_NAME_LEN); + + if (!strcmp(strname, R_NAME_GL_DISABLED)) { + return 0; + } + + if (!strcmp(strname, R_NAME_GL)) { + return 1; + } + + log_error("gl_is_enabled invalid gl name %s", strname); + return -1; +} + +int lm_gl_is_enabled(struct lockspace *ls) +{ + int rv; + rv = gl_is_enabled(ls, ls->lm_data); + ls->sanlock_gl_enabled = rv; + return rv; +} + +/* + * This is called at the beginning of lvcreate to + * ensure there is free space for a new LV lock. + * If not, lvcreate will extend the lvmlock lv + * before continuing with creating the new LV. + * This way, lm_init_lv_san() should find a free + * lock (unless the autoextend of lvmlock lv has + * been disabled.) + */ + +int lm_find_free_lock_sanlock(struct lockspace *ls, uint64_t *free_offset) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct sanlk_resourced rd; + uint64_t offset; + int rv; + + if (daemon_test) + return 0; + + memset(&rd, 0, sizeof(rd)); + + strncpy(rd.rs.lockspace_name, ls->name, SANLK_NAME_LEN); + rd.rs.num_disks = 1; + strncpy(rd.rs.disks[0].path, lms->ss.host_id_disk.path, SANLK_PATH_LEN); + + offset = lms->align_size * LV_LOCK_BEGIN; + + while (1) { + rd.rs.disks[0].offset = offset; + + memset(rd.rs.name, 0, SANLK_NAME_LEN); + + rv = sanlock_read_resource(&rd.rs, 0); + if (rv == -EMSGSIZE || rv == -ENOSPC) { + /* This indicates the end of the device is reached. */ + log_debug("S %s find_free_lock_san read limit offset %llu", + ls->name, (unsigned long long)offset); + return -EMSGSIZE; + } + + /* + * If we read newly extended space, it will not be initialized + * with an "#unused" resource, but will return an error about + * an invalid paxos structure on disk. + */ + if (rv == SANLK_LEADER_MAGIC) { + log_debug("S %s find_free_lock_san found empty area at %llu", + ls->name, (unsigned long long)offset); + *free_offset = offset; + return 0; + } + + if (rv) { + log_error("S %s find_free_lock_san read error %d offset %llu", + ls->name, rv, (unsigned long long)offset); + break; + } + + if (!strcmp(rd.rs.name, "#unused")) { + log_debug("S %s find_free_lock_san found unused area at %llu", + ls->name, (unsigned long long)offset); + *free_offset = offset; + return 0; + } + + offset += lms->align_size; + } + + return rv; +} + +/* + * host A: start_vg/add_lockspace + * host B: vgremove + * + * The global lock cannot always be held around start_vg + * on host A because the gl is in a vg that may not be + * started yet, or may be in the vg we are starting. + * + * If B removes the vg, destroying the delta leases, + * while A is a lockspace member, it will cause A's + * sanlock delta lease renewal to fail, and lockspace + * recovery. + * + * I expect this overlap would usually cause a failure + * in the add_lockspace() on host A when it sees that + * the lockspace structures have been clobbered by B. + * Having add_lockspace() fail should be a fine result. + * + * If add_lockspace was somehow able to finish, the + * subsequent renewal would probably fail instead. + * This should also not create any major problems. + */ + +int lm_prepare_lockspace_sanlock(struct lockspace *ls) +{ + struct stat st; + struct lm_sanlock *lms = NULL; + char lock_lv_name[MAX_ARGS]; + char lsname[SANLK_NAME_LEN + 1]; + char disk_path[SANLK_PATH_LEN]; + int gl_found; + int ret, rv; + + memset(disk_path, 0, sizeof(disk_path)); + memset(lock_lv_name, 0, sizeof(lock_lv_name)); + + rv = check_args_version(ls->vg_args, VG_LOCK_ARGS_MAJOR); + if (rv < 0) { + ret = -EARGS; + goto fail; + } + + rv = lock_lv_name_from_args(ls->vg_args, lock_lv_name); + if (rv < 0) { + log_error("S %s prepare_lockspace_san lock_lv_name_from_args error %d %s", + ls->name, rv, ls->vg_args); + ret = -EARGS; + goto fail; + } + + snprintf(disk_path, SANLK_PATH_LEN, "/dev/mapper/%s-%s", + ls->vg_name, lock_lv_name); + + /* + * When a vg is started, the internal sanlock lv should be + * activated before lvmlockd is asked to add the lockspace. + * (sanlock needs to use the lv.) + * + * In the future we might be able to ask something on the system + * to activate the sanlock lv from here, and with that we might be + * able to start sanlock VGs without requiring a + * vgchange --lock-start command. + */ + + /* FIXME: device is not always ready for us here */ + sleep(1); + + rv = stat(disk_path, &st); + if (rv < 0) { + log_error("S %s prepare_lockspace_san stat error %d disk_path %s", + ls->name, errno, disk_path); + ret = -EARGS; + goto fail; + } + + if (!ls->host_id) { + if (daemon_host_id) + ls->host_id = daemon_host_id; + else if (daemon_host_id_file) + ls->host_id = read_host_id_file(); + } + + if (!ls->host_id || ls->host_id > 2000) { + log_error("S %s prepare_lockspace_san invalid host_id %llu", + ls->name, (unsigned long long)ls->host_id); + ret = -EHOSTID; + goto fail; + } + + lms = malloc(sizeof(struct lm_sanlock)); + if (!lms) { + ret = -ENOMEM; + goto fail; + } + + memset(lsname, 0, sizeof(lsname)); + strncpy(lsname, ls->name, SANLK_NAME_LEN); + + memset(lms, 0, sizeof(struct lm_sanlock)); + memcpy(lms->ss.name, lsname, SANLK_NAME_LEN); + lms->ss.host_id_disk.offset = 0; + lms->ss.host_id = ls->host_id; + strncpy(lms->ss.host_id_disk.path, disk_path, SANLK_PATH_LEN); + + if (daemon_test) { + if (!gl_lsname_sanlock[0]) { + strncpy(gl_lsname_sanlock, lsname, MAX_NAME); + log_debug("S %s prepare_lockspace_san use global lock", lsname); + } + goto out; + } + + lms->sock = sanlock_register(); + if (lms->sock < 0) { + log_error("S %s prepare_lockspace_san register error %d", lsname, lms->sock); + lms->sock = 0; + ret = -EMANAGER; + goto fail; + } + + rv = sanlock_restrict(lms->sock, SANLK_RESTRICT_SIGKILL); + if (rv < 0) { + log_error("S %s restrict error %d", lsname, rv); + ret = -EMANAGER; + goto fail; + } + + lms->align_size = sanlock_align(&lms->ss.host_id_disk); + if (lms->align_size <= 0) { + log_error("S %s prepare_lockspace_san align error %d", lsname, lms->align_size); + ret = -EMANAGER; + goto fail; + } + + gl_found = gl_is_enabled(ls, lms); + if (gl_found < 0) { + log_error("S %s prepare_lockspace_san gl_enabled error %d", lsname, gl_found); + ret = -EARGS; + goto fail; + } + + ls->sanlock_gl_enabled = gl_found; + + if (gl_found) { + if (gl_use_dlm) { + log_error("S %s prepare_lockspace_san gl_use_dlm is set", lsname); + } else if (gl_lsname_sanlock[0] && strcmp(gl_lsname_sanlock, lsname)) { + log_error("S %s prepare_lockspace_san multiple sanlock global locks current %s", + lsname, gl_lsname_sanlock); + } else { + strncpy(gl_lsname_sanlock, lsname, MAX_NAME); + log_debug("S %s prepare_lockspace_san use global lock %s", + lsname, gl_lsname_sanlock); + } + } + +out: + ls->lm_data = lms; + log_debug("S %s prepare_lockspace_san done", lsname); + return 0; + +fail: + if (lms && lms->sock) + close(lms->sock); + if (lms) + free(lms); + return ret; +} + +int lm_add_lockspace_sanlock(struct lockspace *ls, int adopt) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + int rv; + + rv = sanlock_add_lockspace_timeout(&lms->ss, 0, sanlock_io_timeout); + if (rv == -EEXIST && adopt) { + /* We could alternatively just skip the sanlock call for adopt. */ + log_debug("S %s add_lockspace_san adopt found ls", ls->name); + goto out; + } + if (rv < 0) { + /* retry for some errors? */ + log_error("S %s add_lockspace_san add_lockspace error %d", ls->name, rv); + goto fail; + } + + /* + * Don't let the lockspace be cleanly released if orphan locks + * exist, because the orphan locks are still protecting resources + * that are being used on the host, e.g. active lvs. If the + * lockspace is cleanly released, another host could acquire the + * orphan leases. + */ + + rv = sanlock_set_config(ls->name, 0, SANLK_CONFIG_USED_BY_ORPHANS, NULL); + if (rv < 0) { + log_error("S %s add_lockspace_san set_config error %d", ls->name, rv); + sanlock_rem_lockspace(&lms->ss, 0); + goto fail; + } + +out: + log_debug("S %s add_lockspace_san done", ls->name); + return 0; + +fail: + close(lms->sock); + free(lms); + ls->lm_data = NULL; + return rv; +} + +int lm_rem_lockspace_sanlock(struct lockspace *ls, int free_vg) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + int rv; + + if (daemon_test) + goto out; + + rv = sanlock_rem_lockspace(&lms->ss, 0); + if (rv < 0) { + log_error("S %s rem_lockspace_san error %d", ls->name, rv); + return rv; + } + + if (free_vg) { + /* + * Destroy sanlock lockspace (delta leases). Forces failure for any + * other host that is still using or attempts to use this lockspace. + * This shouldn't be generally necessary, but there may some races + * between nodes starting and removing a vg which this could help. + */ + strncpy(lms->ss.name, "#unused", SANLK_NAME_LEN); + + rv = sanlock_write_lockspace(&lms->ss, 0, 0, sanlock_io_timeout); + if (rv < 0) { + log_error("S %s rem_lockspace free_vg write_lockspace error %d %s", + ls->name, rv, lms->ss.host_id_disk.path); + } + } +out: + close(lms->sock); + + free(lms); + ls->lm_data = NULL; + + /* FIXME: should we only clear gl_lsname when doing free_vg? */ + + if (!strcmp(ls->name, gl_lsname_sanlock)) + memset(gl_lsname_sanlock, 0, sizeof(gl_lsname_sanlock)); + + return 0; +} + +static int lm_add_resource_sanlock(struct lockspace *ls, struct resource *r) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + + strncpy(rds->rs.lockspace_name, ls->name, SANLK_NAME_LEN); + strncpy(rds->rs.name, r->name, SANLK_NAME_LEN); + rds->rs.num_disks = 1; + memcpy(rds->rs.disks[0].path, lms->ss.host_id_disk.path, SANLK_PATH_LEN); + + if (r->type == LD_RT_GL) + rds->rs.disks[0].offset = GL_LOCK_BEGIN * lms->align_size; + else if (r->type == LD_RT_VG) + rds->rs.disks[0].offset = VG_LOCK_BEGIN * lms->align_size; + + /* LD_RT_LV offset is set in each lm_lock call from lv_args. */ + + if (r->type == LD_RT_GL || r->type == LD_RT_VG) { + rds->vb = malloc(sizeof(struct val_blk)); + if (!rds->vb) + return -ENOMEM; + memset(rds->vb, 0, sizeof(struct val_blk)); + } + + return 0; +} + +int lm_rem_resource_sanlock(struct lockspace *ls, struct resource *r) +{ + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + + /* FIXME: assert r->mode == UN or unlock if it's not? */ + + if (rds->vb) + free(rds->vb); + + memset(rds, 0, sizeof(struct rd_sanlock)); + r->lm_init = 0; + return 0; +} + +int lm_lock_sanlock(struct lockspace *ls, struct resource *r, int ld_mode, + uint32_t *r_version, int *retry, int adopt) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + struct sanlk_resource *rs; + uint64_t lock_lv_offset; + uint32_t flags = 0; + struct val_blk vb; + uint16_t vb_version; + int added = 0; + int rv; + + if (!r->lm_init) { + rv = lm_add_resource_sanlock(ls, r); + if (rv < 0) + return rv; + r->lm_init = 1; + added = 1; + } + + rs = &rds->rs; + + if (r->type == LD_RT_LV) { + /* + * The lv may have been removed and recreated with a new lease + * offset, so we need to get the offset from lv_args each time + * instead of reusing the value that we last set in rds->rs. + * act->lv_args is copied to r->lv_args before every lm_lock(). + */ + + rv = check_args_version(r->lv_args, LV_LOCK_ARGS_MAJOR); + if (rv < 0) { + log_error("S %s R %s lock_san wrong lv_args version %s", + ls->name, r->name, r->lv_args); + return rv; + } + + rv = lock_lv_offset_from_args(r->lv_args, &lock_lv_offset); + if (rv < 0) { + log_error("S %s R %s lock_san lv_offset_from_args error %d %s", + ls->name, r->name, rv, r->lv_args); + return rv; + } + + if (!added && (rds->rs.disks[0].offset != lock_lv_offset)) { + log_debug("S %s R %s lock_san offset old %llu new %llu", + ls->name, r->name, + (unsigned long long)rds->rs.disks[0].offset, + (unsigned long long)lock_lv_offset); + } + + rds->rs.disks[0].offset = lock_lv_offset; + } + + if (ld_mode == LD_LK_SH) { + rs->flags |= SANLK_RES_SHARED; + } else if (ld_mode == LD_LK_EX) { + rs->flags &= ~SANLK_RES_SHARED; + } else { + log_error("lock_san invalid mode %d", ld_mode); + return -EINVAL; + } + + /* + * Use PERSISTENT because if lvmlockd exits while holding + * a lock, it's not safe to simply clear/drop the lock while + * a command or lv is using it. + */ + + rs->flags |= SANLK_RES_PERSISTENT; + + log_debug("S %s R %s lock_san acquire %s:%llu", + ls->name, r->name, rs->disks[0].path, + (unsigned long long)rs->disks[0].offset); + + if (daemon_test) { + *r_version = 0; + return 0; + } + + if (rds->vb) + flags |= SANLK_ACQUIRE_LVB; + if (adopt) + flags |= SANLK_ACQUIRE_ORPHAN_ONLY; + + rv = sanlock_acquire(lms->sock, -1, flags, 1, &rs, NULL); + + if (rv == -EAGAIN) { + /* + * It appears that sanlock_acquire returns EAGAIN when we request + * a shared lock but the lock is held ex by another host. + * There's no point in retrying this case, just return an error. + */ + log_debug("S %s R %s lock_san acquire mode %d rv EAGAIN", ls->name, r->name, ld_mode); + *retry = 0; + return -EAGAIN; + } + + if ((rv == -EMSGSIZE) && (r->type == LD_RT_LV)) { + /* + * sanlock tried to read beyond the end of the device, + * so the offset of the lv lease is beyond the end of the + * device, which means that the lease lv was extended, and + * the lease for this lv was allocated in the new space. + * The lvm command will see this error, refresh the lvmlock + * lv, and try again. + */ + log_debug("S %s R %s lock_san acquire offset %llu rv EMSGSIZE", + ls->name, r->name, (unsigned long long)rs->disks[0].offset); + *retry = 0; + return -EMSGSIZE; + } + + if (adopt && (rv == -EUCLEAN)) { + /* + * The orphan lock exists but in a different mode than we asked + * for, so the caller should try again with the other mode. + */ + log_debug("S %s R %s lock_san adopt mode %d try other mode", + ls->name, r->name, ld_mode); + *retry = 0; + return -EUCLEAN; + } + + if (adopt && (rv == -ENOENT)) { + /* + * No orphan lock exists. + */ + log_debug("S %s R %s lock_san adopt mode %d no orphan found", + ls->name, r->name, ld_mode); + *retry = 0; + return -ENOENT; + } + + if (rv == SANLK_ACQUIRE_IDLIVE || rv == SANLK_ACQUIRE_OWNED || rv == SANLK_ACQUIRE_OTHER) { + /* + * The lock is held by another host. These failures can + * happen while multiple hosts are concurrently acquiring + * shared locks. We want to retry a couple times in this + * case because we'll probably get the sh lock. + * + * I believe these are also the errors when requesting an + * ex lock that another host holds ex. We want to report + * something like: "lock is held by another host" in this case. + * Retry is pointless here. + * + * We can't distinguish between the two cases above, + * so if requesting a sh lock, retry a couple times, + * otherwise don't. + */ + log_debug("S %s R %s lock_san acquire mode %d rv %d", ls->name, r->name, ld_mode, rv); + *retry = (ld_mode == LD_LK_SH) ? 1 : 0; + return -EAGAIN; + } + + if (rv < 0) { + log_error("S %s R %s lock_san acquire error %d", + ls->name, r->name, rv); + + if (added) { + lm_rem_resource_sanlock(ls, r); + return rv; + } + + /* if the gl has been disabled, remove and free the gl resource */ + if ((rv == SANLK_LEADER_RESOURCE) && (r->type == LD_RT_GL)) { + if (!lm_gl_is_enabled(ls)) { + log_error("S %s R %s lock_san gl has been disabled", + ls->name, r->name); + if (!strcmp(gl_lsname_sanlock, ls->name)) + memset(gl_lsname_sanlock, 0, sizeof(gl_lsname_sanlock)); + return -EUNATCH; + } + } + + return rv; + } + + if (rds->vb) { + rv = sanlock_get_lvb(0, rs, (char *)&vb, sizeof(vb)); + if (rv < 0) { + log_error("S %s R %s lock_san get_lvb error %d", ls->name, r->name, rv); + *r_version = 0; + goto out; + } + + vb_version = le16_to_cpu(vb.version); + + if (vb_version && ((vb_version & 0xFF00) > (VAL_BLK_VERSION & 0xFF00))) { + log_error("S %s R %s lock_san ignore vb_version %x", + ls->name, r->name, vb_version); + *r_version = 0; + free(rds->vb); + rds->vb = NULL; + goto out; + } + + *r_version = le32_to_cpu(vb.r_version); + memcpy(rds->vb, &vb, sizeof(vb)); /* rds->vb saved as le */ + + log_debug("S %s R %s lock_san get r_version %u", + ls->name, r->name, *r_version); + } +out: + return rv; +} + +int lm_convert_sanlock(struct lockspace *ls, struct resource *r, + int ld_mode, uint32_t r_version) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + struct sanlk_resource *rs = &rds->rs; + struct val_blk vb; + uint32_t flags = 0; + int rv; + + log_debug("S %s R %s convert_san", ls->name, r->name); + + if (daemon_test) + goto rs_flag; + + if (rds->vb && r_version && (r->mode == LD_LK_EX)) { + if (!rds->vb->version) { + /* first time vb has been written */ + rds->vb->version = cpu_to_le16(VAL_BLK_VERSION); + } + if (r_version) + rds->vb->r_version = cpu_to_le32(r_version); + memcpy(&vb, rds->vb, sizeof(vb)); + + log_debug("S %s R %s convert_san set r_version %u", + ls->name, r->name, r_version); + + rv = sanlock_set_lvb(0, rs, (char *)&vb, sizeof(vb)); + if (rv < 0) { + log_error("S %s R %s convert_san set_lvb error %d", + ls->name, r->name, rv); + } + } + + rs_flag: + if (ld_mode == LD_LK_SH) + rs->flags |= SANLK_RES_SHARED; + else + rs->flags &= ~SANLK_RES_SHARED; + + if (daemon_test) + return 0; + + rv = sanlock_convert(lms->sock, -1, flags, rs); + if (rv == -EAGAIN) { + /* FIXME: When could this happen? Should something different be done? */ + log_error("S %s R %s convert_san EAGAIN", ls->name, r->name); + return -EAGAIN; + } + if (rv < 0) { + log_error("S %s R %s convert_san convert error %d", ls->name, r->name, rv); + } + + return rv; +} + +static int release_rename(struct lockspace *ls, struct resource *r) +{ + struct rd_sanlock rd1; + struct rd_sanlock rd2; + struct sanlk_resource *res1; + struct sanlk_resource *res2; + struct sanlk_resource **res_args; + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + int rv; + + log_debug("S %s R %s release rename", ls->name, r->name); + + res_args = malloc(2 * sizeof(struct sanlk_resource *)); + if (!res_args) + return -ENOMEM; + + memcpy(&rd1, rds, sizeof(struct rd_sanlock)); + memcpy(&rd2, rds, sizeof(struct rd_sanlock)); + + res1 = (struct sanlk_resource *)&rd1; + res2 = (struct sanlk_resource *)&rd2; + + strcpy(res2->name, "invalid_removed"); + + res_args[0] = res1; + res_args[1] = res2; + + rv = sanlock_release(lms->sock, -1, SANLK_REL_RENAME, 2, res_args); + if (rv < 0) { + log_error("S %s R %s unlock_san release rename error %d", ls->name, r->name, rv); + } + + free(res_args); + + return rv; +} + +/* + * rds->vb is stored in le + * + * r_version is r->version + * + * for GL locks lvmlockd just increments this value + * each time the global lock is released from ex. + * + * for VG locks it is the seqno from the vg metadata. + */ + +int lm_unlock_sanlock(struct lockspace *ls, struct resource *r, + uint32_t r_version, uint32_t lmu_flags) +{ + struct lm_sanlock *lms = (struct lm_sanlock *)ls->lm_data; + struct rd_sanlock *rds = (struct rd_sanlock *)r->lm_data; + struct sanlk_resource *rs = &rds->rs; + struct val_blk vb; + int rv; + + log_debug("S %s R %s unlock_san r_version %u flags %x", + ls->name, r->name, r_version, lmu_flags); + + if (daemon_test) + return 0; + + if (rds->vb && r_version && (r->mode == LD_LK_EX)) { + if (!rds->vb->version) { + /* first time vb has been written */ + rds->vb->version = cpu_to_le16(VAL_BLK_VERSION); + } + if (r_version) + rds->vb->r_version = cpu_to_le32(r_version); + memcpy(&vb, rds->vb, sizeof(vb)); + + log_debug("S %s R %s unlock_san set r_version %u", + ls->name, r->name, r_version); + + rv = sanlock_set_lvb(0, rs, (char *)&vb, sizeof(vb)); + if (rv < 0) { + log_error("S %s R %s unlock_san set_lvb error %d", + ls->name, r->name, rv); + } + } + + /* + * For vgremove (FREE_VG) we unlock-rename the vg and gl locks + * so they cannot be reacquired. + */ + if ((lmu_flags & LMUF_FREE_VG) && + (r->type == LD_RT_GL || r->type == LD_RT_VG)) { + return release_rename(ls, r); + } + + rv = sanlock_release(lms->sock, -1, 0, 1, &rs); + if (rv < 0) { + log_error("S %s R %s unlock_san release error %d", ls->name, r->name, rv); + } + + return rv; +} + +int lm_hosts_sanlock(struct lockspace *ls, int notify) +{ + struct sanlk_host *hss = NULL; + struct sanlk_host *hs; + uint32_t state; + int hss_count = 0; + int found_self = 0; + int found_others = 0; + int i, rv; + + rv = sanlock_get_hosts(ls->name, 0, &hss, &hss_count, 0); + if (rv < 0) { + log_error("S %s hosts_san get_hosts error %d", ls->name, rv); + return 0; + } + + if (!hss || !hss_count) { + log_error("S %s hosts_san zero hosts", ls->name); + return 0; + } + + hs = hss; + + for (i = 0; i < hss_count; i++) { + log_debug("S %s hosts_san host_id %llu gen %llu flags %x", + ls->name, + (unsigned long long)hs->host_id, + (unsigned long long)hs->generation, + hs->flags); + + if (hs->host_id == ls->host_id) { + found_self = 1; + hs++; + continue; + } + + state = hs->flags & SANLK_HOST_MASK; + if (state == SANLK_HOST_LIVE) + found_others++; + hs++; + } + free(hss); + + if (found_others && notify) { + /* + * We could use the sanlock event mechanism to notify lvmlockd + * on other hosts to stop this VG. lvmlockd would need to + * register for and listen for sanlock events in the main loop. + * The events are slow to propagate. We'd need to retry for a + * while before all the hosts see the event and stop the VG. + * sanlock_set_event(ls->name, &he, SANLK_SETEV_ALL_HOSTS); + * + * Wait to try this until there appears to be real value/interest + * in doing it. + */ + } + + if (!found_self) { + log_error("S %s hosts_san self not found others %d", ls->name, found_others); + return 0; + } + + return found_others; +} + +int lm_get_lockspaces_sanlock(struct list_head *ls_rejoin) +{ + struct sanlk_lockspace *ss_all = NULL; + struct sanlk_lockspace *ss; + struct lockspace *ls; + int ss_count = 0; + int i, rv; + + rv = sanlock_get_lockspaces(&ss_all, &ss_count, 0); + if (rv < 0) + return rv; + + if (!ss_all || !ss_count) + return 0; + + ss = ss_all; + + for (i = 0; i < ss_count; i++) { + + if (strncmp(ss->name, LVM_LS_PREFIX, strlen(LVM_LS_PREFIX))) + continue; + + if (!(ls = alloc_lockspace())) + return -ENOMEM; + + ls->lm_type = LD_LM_SANLOCK; + ls->host_id = ss->host_id; + strncpy(ls->name, ss->name, MAX_NAME); + strncpy(ls->vg_name, ss->name + strlen(LVM_LS_PREFIX), MAX_NAME); + list_add_tail(&ls->list, ls_rejoin); + + ss++; + } + + free(ss_all); + return 0; +} + +int lm_is_running_sanlock(void) +{ + uint32_t daemon_version; + uint32_t daemon_proto; + int rv; + + rv = sanlock_version(0, &daemon_version, &daemon_proto); + if (rv < 0) + return 0; + return 1; +} + |