diff options
Diffstat (limited to 'lib/metadata/metadata.c')
-rw-r--r-- | lib/metadata/metadata.c | 2160 |
1 files changed, 861 insertions, 1299 deletions
diff --git a/lib/metadata/metadata.c b/lib/metadata/metadata.c index 9efc35592..f31b4b979 100644 --- a/lib/metadata/metadata.c +++ b/lib/metadata/metadata.c @@ -28,11 +28,14 @@ #include "lib/display/display.h" #include "lib/locking/locking.h" #include "lib/format_text/archiver.h" +#include "lib/format_text/format-text.h" +#include "lib/format_text/layout.h" +#include "lib/format_text/import-export.h" #include "lib/config/defaults.h" #include "lib/locking/lvmlockd.h" -#include "time.h" #include "lib/notify/lvmnotify.h" +#include <time.h> #include <math.h> static struct physical_volume *_pv_read(struct cmd_context *cmd, @@ -222,6 +225,75 @@ out: (unsigned long long)pv->pe_align_offset, dev_name(pv->dev)); } +/* + * FIXME: we only want to print the warnings when this is called from + * vg_read, not from import_vg_from_metadata, so do the warnings elsewhere + * or avoid calling this from import_vg_from. + */ +static void _set_pv_device(struct format_instance *fid, + struct volume_group *vg, + struct physical_volume *pv) +{ + char buffer[64] __attribute__((aligned(8))); + uint64_t size; + + if (!(pv->dev = lvmcache_device_from_pvid(fid->fmt->cmd, &pv->id, &pv->label_sector))) { + if (!id_write_format(&pv->id, buffer, sizeof(buffer))) + buffer[0] = '\0'; + + if (fid->fmt->cmd && !fid->fmt->cmd->pvscan_cache_single) + log_error_once("Couldn't find device with uuid %s.", buffer); + else + log_debug_metadata("Couldn't find device with uuid %s.", buffer); + } + + /* + * A previous command wrote the VG while this dev was missing, so + * the MISSING flag was included in the PV. + */ + if ((pv->status & MISSING_PV) && pv->dev) + log_warn("WARNING: VG %s was previously updated while PV %s was missing.", vg->name, dev_name(pv->dev)); + + /* + * If this command writes the VG, we want the MISSING flag to be + * written for this PV with no device. + */ + if (!pv->dev) + pv->status |= MISSING_PV; + + /* is this correct? 
*/ + if ((pv->status & MISSING_PV) && pv->dev && (pv_mda_used_count(pv) == 0)) { + pv->status &= ~MISSING_PV; + log_info("Found a previously MISSING PV %s with no MDAs.", pv_dev_name(pv)); + } + + /* Fix up pv size if missing or impossibly large */ + if ((!pv->size || pv->size > (1ULL << 62)) && pv->dev) { + if (!dev_get_size(pv->dev, &pv->size)) { + log_error("%s: Couldn't get size.", pv_dev_name(pv)); + return; + } + log_verbose("Fixing up missing size (%s) for PV %s", display_size(fid->fmt->cmd, pv->size), + pv_dev_name(pv)); + size = pv->pe_count * (uint64_t) vg->extent_size + pv->pe_start; + if (size > pv->size) + log_warn("WARNING: Physical Volume %s is too large " + "for underlying device", pv_dev_name(pv)); + } +} + +/* + * Finds the 'struct device' that correponds to each PV in the metadata, + * and may make some adjustments to vg fields based on the dev properties. + */ +void set_pv_devices(struct format_instance *fid, struct volume_group *vg) +{ + struct pv_list *pvl; + + dm_list_iterate_items(pvl, &vg->pvs) + _set_pv_device(fid, vg, pvl->pv); +} + void add_pvl_to_vgs(struct volume_group *vg, struct pv_list *pvl) { dm_list_add(&vg->pvs, &pvl->list); @@ -370,48 +442,6 @@ int add_pv_to_vg(struct volume_group *vg, const char *pv_name, return 1; } -static int _copy_pv(struct dm_pool *pvmem, - struct physical_volume *pv_to, - struct physical_volume *pv_from) -{ - memcpy(pv_to, pv_from, sizeof(*pv_to)); - - /* We must use pv_set_fid here to update the reference counter! 
*/ - pv_to->fid = NULL; - pv_set_fid(pv_to, pv_from->fid); - - if (!(pv_to->vg_name = dm_pool_strdup(pvmem, pv_from->vg_name))) - return_0; - - if (!str_list_dup(pvmem, &pv_to->tags, &pv_from->tags)) - return_0; - - if (!peg_dup(pvmem, &pv_to->segments, &pv_from->segments)) - return_0; - - return 1; -} - -static struct pv_list *_copy_pvl(struct dm_pool *pvmem, struct pv_list *pvl_from) -{ - struct pv_list *pvl_to = NULL; - - if (!(pvl_to = dm_pool_zalloc(pvmem, sizeof(*pvl_to)))) - return_NULL; - - if (!(pvl_to->pv = dm_pool_alloc(pvmem, sizeof(*pvl_to->pv)))) - goto_bad; - - if (!_copy_pv(pvmem, pvl_to->pv, pvl_from->pv)) - goto_bad; - - return pvl_to; - -bad: - dm_pool_free(pvmem, pvl_to); - return NULL; -} - static int _move_pv(struct volume_group *vg_from, struct volume_group *vg_to, const char *pv_name, int enforce_pv_from_source) { @@ -584,7 +614,7 @@ int vg_remove_check(struct volume_group *vg) { unsigned lv_count; - if (vg_read_error(vg) || vg_missing_pv_count(vg)) { + if (vg_missing_pv_count(vg)) { log_error("Volume group \"%s\" not found, is inconsistent " "or has PVs missing.", vg ? vg->name : ""); log_error("Consider vgreduce --removemissing if metadata " @@ -963,36 +993,6 @@ static int _vg_update_embedded_copy(struct volume_group *vg, struct volume_group return 1; } -/* - * Create a (struct volume_group) volume group handle from a struct volume_group pointer and a - * possible failure code or zero for success. 
- */ -static struct volume_group *_vg_make_handle(struct cmd_context *cmd, - struct volume_group *vg, - uint32_t failure) -{ - /* Never return a cached VG structure for a failure */ - if (vg && vg->vginfo && failure != SUCCESS) { - release_vg(vg); - vg = NULL; - } - - if (!vg && !(vg = alloc_vg("vg_make_handle", cmd, NULL))) - return_NULL; - - vg->read_status = failure; - - /* - * If we hold a write lock and might be changing the VG contents, embed a pristine - * copy of the VG metadata for the activation code to use later - */ - if (vg->fid && !dm_pool_locked(vg->vgmem) && !vg->vg_committed && !is_orphan_vg(vg->name)) - if (vg_write_lock_held() && !_vg_update_embedded_copy(vg, &vg->vg_committed)) - vg->read_status |= FAILED_ALLOCATION; - - return vg; -} - int lv_has_unknown_segments(const struct logical_volume *lv) { struct lv_segment *seg; @@ -1014,24 +1014,24 @@ int vg_has_unknown_segments(const struct volume_group *vg) return 0; } -struct volume_group *vg_lock_and_create(struct cmd_context *cmd, const char *vg_name) +struct volume_group *vg_lock_and_create(struct cmd_context *cmd, const char *vg_name, int *exists) { uint32_t rc; struct volume_group *vg; if (!validate_name(vg_name)) { log_error("Invalid vg name %s", vg_name); - /* FIXME: use _vg_make_handle() w/proper error code */ return NULL; } rc = vg_lock_newname(cmd, vg_name); + if (rc == FAILED_EXIST) + *exists = 1; if (rc != SUCCESS) - /* NOTE: let caller decide - this may be check for existence */ - return _vg_make_handle(cmd, NULL, rc); + return NULL; vg = vg_create(cmd, vg_name); - if (!vg || vg_read_error(vg)) + if (!vg) unlock_vg(cmd, NULL, vg_name); return vg; @@ -1039,12 +1039,8 @@ struct volume_group *vg_lock_and_create(struct cmd_context *cmd, const char *vg_ /* * Create a VG with default parameters. 
- * Returns: - * - struct volume_group* with SUCCESS code: VG structure created - * - NULL or struct volume_group* with FAILED_* code: error creating VG structure - * Use vg_read_error() to determine success or failure. - * FIXME: cleanup usage of _vg_make_handle() */ + struct volume_group *vg_create(struct cmd_context *cmd, const char *vg_name) { struct volume_group *vg; @@ -1084,11 +1080,10 @@ struct volume_group *vg_create(struct cmd_context *cmd, const char *vg_name) vg_name); goto bad; } - return _vg_make_handle(cmd, vg, SUCCESS); + return vg; bad: unlock_and_release_vg(cmd, vg, vg_name); - /* FIXME: use _vg_make_handle() w/proper error code */ return NULL; } @@ -2807,57 +2802,6 @@ static int _pv_in_pv_list(struct physical_volume *pv, struct dm_list *head) return 0; } -/* - * Check if any of the PVs in VG still contain old PV headers - * and if yes, schedule them for PV header update. - */ -static int _vg_update_old_pv_ext_if_needed(struct volume_group *vg) -{ - struct pv_list *pvl, *new_pvl; - int pv_needs_rewrite; - - if (!(vg->fid->fmt->features & FMT_PV_FLAGS)) - return 1; - - dm_list_iterate_items(pvl, &vg->pvs) { - if (is_missing_pv(pvl->pv) || - !pvl->pv->fmt->ops->pv_needs_rewrite) - continue; - - if (_pv_in_pv_list(pvl->pv, &vg->pv_write_list)) - continue; - - if (!pvl->pv->fmt->ops->pv_needs_rewrite(pvl->pv->fmt, pvl->pv, - &pv_needs_rewrite)) - return_0; - - if (pv_needs_rewrite) { - /* - * Schedule PV for writing only once! 
- */ - if (_pv_in_pv_list(pvl->pv, &vg->pv_write_list)) - continue; - - if (!(new_pvl = dm_pool_zalloc(vg->vgmem, sizeof(*new_pvl)))) { - log_error("pv_to_write allocation for '%s' failed", pv_dev_name(pvl->pv)); - return 0; - } - new_pvl->pv = pvl->pv; - dm_list_add(&vg->pv_write_list, &new_pvl->list); - log_debug("PV %s has old extension header, updating to newest version.", - pv_dev_name(pvl->pv)); - } - } - - if (!dm_list_empty(&vg->pv_write_list) && - (!vg_write(vg) || !vg_commit(vg))) { - log_error("Failed to update old PV extension headers in VG %s.", vg->name); - return 0; - } - - return 1; -} - static int _check_historical_lv_is_valid(struct historical_logical_volume *hlv) { struct glv_list *glvl; @@ -2922,6 +2866,69 @@ static int _handle_historical_lvs(struct volume_group *vg) return 1; } +static void _wipe_outdated_pvs(struct cmd_context *cmd, struct volume_group *vg) +{ + struct dm_list devs; + struct dm_list *mdas = NULL; + struct device_list *devl; + struct device *dev; + struct metadata_area *mda; + struct label *label; + struct lvmcache_info *info; + uint32_t ext_flags; + + dm_list_init(&devs); + + /* + * When vg_read selected a good copy of the metadata, it used it to + * update the lvmcache representation of the VG (lvmcache_update_vg). + * At that point outdated PVs were recognized and moved into the + * vginfo->outdated_infos list. Here we clear the PVs on that list. 
+ */ + + lvmcache_get_outdated_devs(cmd, vg->name, (const char *)&vg->id, &devs); + + dm_list_iterate_items(devl, &devs) { + dev = devl->dev; + + lvmcache_get_outdated_mdas(cmd, vg->name, (const char *)&vg->id, dev, &mdas); + + if (mdas) { + dm_list_iterate_items(mda, mdas) { + log_warn("WARNING: wiping mda on outdated PV %s", dev_name(dev)); + + if (!text_wipe_outdated_pv_mda(cmd, dev, mda)) + log_warn("WARNING: failed to wipe mda on outdated PV %s", dev_name(dev)); + } + } + + if (!(label = lvmcache_get_dev_label(dev))) { + log_error("_wipe_outdated_pvs no label for %s", dev_name(dev)); + continue; + } + + info = label->info; + ext_flags = lvmcache_ext_flags(info); + ext_flags &= ~PV_EXT_USED; + lvmcache_set_ext_version(info, PV_HEADER_EXTENSION_VSN); + lvmcache_set_ext_flags(info, ext_flags); + + log_warn("WARNING: wiping header on outdated PV %s", dev_name(dev)); + + if (!label_write(dev, label)) + log_warn("WARNING: failed to wipe header on outdated PV %s", dev_name(dev)); + + lvmcache_del(info); + } + + /* + * A vgremove will involve many vg_write() calls (one for each lv + * removed) but we only need to wipe pvs once, so clear the outdated + * list so it won't be wiped again. 
+ */ + lvmcache_del_outdated_devs(cmd, vg->name, (const char *)&vg->id); +} + /* * After vg_write() returns success, * caller MUST call either vg_commit() or vg_revert() @@ -2929,9 +2936,10 @@ static int _handle_historical_lvs(struct volume_group *vg) int vg_write(struct volume_group *vg) { struct dm_list *mdah; - struct pv_list *pvl, *pvl_safe; + struct pv_list *pvl, *pvl_safe, *new_pvl; struct metadata_area *mda; struct lv_list *lvl; + struct device *mda_dev; int revert = 0, wrote = 0; if (vg_is_shared(vg)) { @@ -2986,6 +2994,9 @@ int vg_write(struct volume_group *vg) return 0; } + if (vg->cmd->wipe_outdated_pvs) + _wipe_outdated_pvs(vg->cmd, vg); + if (critical_section()) log_error(INTERNAL_ERROR "Writing metadata in critical section."); @@ -2994,6 +3005,26 @@ int vg_write(struct volume_group *vg) memlock_unlock(vg->cmd); vg->seqno++; + dm_list_iterate_items(pvl, &vg->pvs) { + int update_pv_header = 0; + + if (_pv_in_pv_list(pvl->pv, &vg->pv_write_list)) + continue; + + if (!pvl->pv->fmt->ops->pv_needs_rewrite(pvl->pv->fmt, pvl->pv, &update_pv_header)) + continue; + + if (!update_pv_header) + continue; + + if (!(new_pvl = dm_pool_zalloc(vg->vgmem, sizeof(*new_pvl)))) + continue; + + new_pvl->pv = pvl->pv; + dm_list_add(&vg->pv_write_list, &new_pvl->list); + log_warn("WARNING: updating PV header on %s for VG %s.", pv_dev_name(pvl->pv), vg->name); + } + dm_list_iterate_items_safe(pvl, pvl_safe, &vg->pv_write_list) { if (!pv_write(vg->cmd, pvl->pv, 1)) return_0; @@ -3002,8 +3033,27 @@ int vg_write(struct volume_group *vg) /* Write to each copy of the metadata area */ dm_list_iterate_items(mda, &vg->fid->metadata_areas_in_use) { + mda_dev = mda_get_device(mda); + if (mda->status & MDA_FAILED) continue; + + /* + * When the scan and vg_read find old metadata in an mda, they + * leave the info struct in lvmcache, and leave the mda in + * info->mdas. That means we use the mda here to write new + * metadata into. 
This means that a command writing a VG will + * automatically update old metadata to the latest. + * + * This can also happen if the metadata was ignored on this + * dev, and then it's later changed to not ignored, and + * we see the old metadata. + */ + if (lvmcache_has_old_metadata(vg->cmd, vg->name, (const char *)&vg->id, mda_dev)) { + log_warn("WARNING: updating old metadata to %u on %s for VG %s.", + vg->seqno, dev_name(mda_dev), vg->name); + } + if (!mda->ops->vg_write) { log_error("Format does not support writing volume" "group metadata areas"); @@ -3072,6 +3122,7 @@ static int _vg_commit_mdas(struct volume_group *vg) struct metadata_area *mda, *tmda; struct dm_list ignored; int failed = 0; + int good = 0; int cache_updated = 0; /* Rearrange the metadata_areas_in_use so ignored mdas come first. */ @@ -3092,27 +3143,31 @@ static int _vg_commit_mdas(struct volume_group *vg) !mda->ops->vg_commit(vg->fid, vg, mda)) { stack; failed = 1; - } + } else + good++; + /* Update cache first time we succeed */ if (!failed && !cache_updated) { - lvmcache_update_vg(vg, 0); + lvmcache_update_vg_from_write(vg); cache_updated = 1; } } - return cache_updated; + if (good) + return 1; + return 0; } /* Commit pending changes */ int vg_commit(struct volume_group *vg) { - int cache_updated = 0; struct pv_list *pvl; + int ret; - cache_updated = _vg_commit_mdas(vg); + ret = _vg_commit_mdas(vg); set_vg_notify(vg->cmd); - if (cache_updated) { + if (ret) { /* * We need to clear old_name after a successful commit. * The volume_group structure could be reused later. 
@@ -3126,7 +3181,7 @@ int vg_commit(struct volume_group *vg) } /* If at least one mda commit succeeded, it was committed */ - return cache_updated; + return ret; } /* Don't commit any pending changes */ @@ -3152,14 +3207,6 @@ void vg_revert(struct volume_group *vg) } } -static int _check_mda_in_use(struct metadata_area *mda, void *_in_use) -{ - int *in_use = _in_use; - if (!mda_is_ignored(mda)) - *in_use = 1; - return 1; -} - struct _vg_read_orphan_baton { struct cmd_context *cmd; struct volume_group *vg; @@ -3197,6 +3244,14 @@ struct _vg_read_orphan_baton { */ #if 0 +static int _check_mda_in_use(struct metadata_area *mda, void *_in_use) +{ + int *in_use = _in_use; + if (!mda_is_ignored(mda)) + *in_use = 1; + return 1; +} + static int _check_or_repair_orphan_pv_ext(struct physical_volume *pv, struct lvmcache_info *info, struct _vg_read_orphan_baton *b) @@ -3331,9 +3386,7 @@ static int _vg_read_orphan_pv(struct lvmcache_info *info, void *baton) } /* Make orphan PVs look like a VG. */ -struct volume_group *vg_read_orphans(struct cmd_context *cmd, - uint32_t warn_flags, - const char *orphan_vgname) +struct volume_group *vg_read_orphans(struct cmd_context *cmd, const char *orphan_vgname) { const struct format_type *fmt; struct lvmcache_vginfo *vginfo; @@ -3394,40 +3447,6 @@ struct volume_group *vg_read_orphans(struct cmd_context *cmd, return vg; } -static int _update_pv_list(struct dm_pool *pvmem, struct dm_list *all_pvs, struct volume_group *vg) -{ - struct pv_list *pvl, *pvl2; - - dm_list_iterate_items(pvl, &vg->pvs) { - dm_list_iterate_items(pvl2, all_pvs) { - if (pvl->pv->dev == pvl2->pv->dev) - goto next_pv; - } - - /* - * PV is not on list so add it. 
- */ - if (!(pvl2 = _copy_pvl(pvmem, pvl))) { - log_error("pv_list allocation for '%s' failed", - pv_dev_name(pvl->pv)); - return 0; - } - dm_list_add(all_pvs, &pvl2->list); - next_pv: - ; - } - - return 1; -} - -static void _free_pv_list(struct dm_list *all_pvs) -{ - struct pv_list *pvl; - - dm_list_iterate_items(pvl, all_pvs) - pvl->pv->fid->fmt->ops->destroy_instance(pvl->pv->fid); -} - static void _destroy_fid(struct format_instance **fid) { if (*fid) { @@ -3447,812 +3466,49 @@ int vg_missing_pv_count(const struct volume_group *vg) return ret; } -static int _check_reappeared_pv(struct volume_group *correct_vg, - struct physical_volume *pv, int act) -{ - struct pv_list *pvl; - int rv = 0; - - /* - * Skip these checks in case the tool is going to deal with missing - * PVs, especially since the resulting messages can be pretty - * confusing. - */ - if (correct_vg->cmd->handles_missing_pvs) - return rv; - - /* - * Skip this if there is no underlying device present for this PV. - */ - if (!pv->dev) - return rv; - - dm_list_iterate_items(pvl, &correct_vg->pvs) - if (pv->dev == pvl->pv->dev && is_missing_pv(pvl->pv)) { - if (act) - log_warn("WARNING: Missing device %s reappeared, updating " - "metadata for VG %s to version %u.", - pv_dev_name(pvl->pv), pv_vg_name(pvl->pv), - correct_vg->seqno); - if (pvl->pv->pe_alloc_count == 0) { - if (act) { - pv->status &= ~MISSING_PV; - pvl->pv->status &= ~MISSING_PV; - } - ++ rv; - } else if (act) - log_warn("WARNING: Device %s still marked missing because of allocated data " - "on it, remove volumes and consider vgreduce --removemissing.", - pv_dev_name(pvl->pv)); - } - - return rv; -} - static int _is_foreign_vg(struct volume_group *vg) { return vg->cmd->system_id && strcmp(vg->system_id, vg->cmd->system_id); } -static int _repair_inconsistent_vg(struct volume_group *vg, uint32_t lockd_state) -{ - unsigned saved_handles_missing_pvs = vg->cmd->handles_missing_pvs; - - if (lvmcache_found_duplicate_pvs()) { - 
log_debug_metadata("Skip metadata repair with duplicates."); - return 0; - } - - /* Cannot write foreign VGs, the owner will repair it. */ - if (_is_foreign_vg(vg)) { - log_verbose("Skip metadata repair for foreign VG."); - return 0; - } - - if (vg_is_shared(vg) && !(lockd_state & LDST_EX)) { - log_verbose("Skip metadata repair for shared VG without exclusive lock."); - return 0; - } - - log_warn("WARNING: Inconsistent metadata found for VG %s - updating to use version %u", vg->name, vg->seqno); - - vg->cmd->handles_missing_pvs = 1; - if (!vg_write(vg)) { - log_error("Automatic metadata correction failed"); - vg->cmd->handles_missing_pvs = saved_handles_missing_pvs; - return 0; - } - - vg->cmd->handles_missing_pvs = saved_handles_missing_pvs; - - if (!vg_commit(vg)) { - log_error("Automatic metadata correction commit failed"); - return 0; - } - - return 1; -} - -static int _wipe_outdated_pvs(struct cmd_context *cmd, struct volume_group *vg, struct dm_list *to_check, uint32_t lockd_state) -{ - struct pv_list *pvl, *pvl2; - char uuid[64] __attribute__((aligned(8))); - - if (lvmcache_found_duplicate_pvs()) { - log_debug_metadata("Skip wiping outdated PVs with duplicates."); - return 0; - } - - /* - * Cannot write foreign VGs, the owner will repair it. - * Also, if another host is updating its VG, we may read - * the PVs while some are written but not others, making - * some PVs look outdated to us just because we're reading - * the VG while it's only partially written out. 
- */ - if (_is_foreign_vg(vg)) { - log_debug_metadata("Skip wiping outdated PVs for foreign VG."); - return 0; - } - - if (vg_is_shared(vg) && !(lockd_state & LDST_EX)) { - log_verbose("Skip wiping outdated PVs for shared VG without exclusive lock."); - return 0; - } - - dm_list_iterate_items(pvl, to_check) { - dm_list_iterate_items(pvl2, &vg->pvs) { - if (pvl->pv->dev == pvl2->pv->dev) - goto next_pv; - } - - - if (!id_write_format(&pvl->pv->id, uuid, sizeof(uuid))) - return_0; - log_warn("WARNING: Removing PV %s (%s) that no longer belongs to VG %s", - pv_dev_name(pvl->pv), uuid, vg->name); - if (!pv_write_orphan(cmd, pvl->pv)) - return_0; -next_pv: - ; - } - return 1; -} - -static int _check_or_repair_pv_ext(struct cmd_context *cmd, - struct volume_group *vg, - uint32_t lockd_state, - int repair, int *inconsistent_pvs) +static int _check_pv_ext(struct cmd_context *cmd, struct volume_group *vg) { - char uuid[64] __attribute__((aligned(8))); struct lvmcache_info *info; uint32_t ext_version, ext_flags; struct pv_list *pvl; - unsigned pvs_fixed = 0; - int r = 0; - *inconsistent_pvs = 0; + if (_is_foreign_vg(vg)) + return 1; + + if (vg_is_shared(vg)) + return 1; dm_list_iterate_items(pvl, &vg->pvs) { - /* Missing PV - nothing to do. 
*/ if (is_missing_pv(pvl->pv)) continue; - if (!pvl->pv->dev) { - /* is_missing_pv doesn't catch NULL dev */ - memset(&uuid, 0, sizeof(uuid)); - if (!id_write_format(&pvl->pv->id, uuid, sizeof(uuid))) - goto_out; - log_warn("WARNING: Not repairing PV %s with missing device.", uuid); + /* is_missing_pv doesn't catch NULL dev */ + if (!pvl->pv->dev) continue; - } - if (!(info = lvmcache_info_from_pvid(pvl->pv->dev->pvid, pvl->pv->dev, 0))) { - log_error("Failed to find cached info for PV %s.", pv_dev_name(pvl->pv)); - goto out; - } + if (!(info = lvmcache_info_from_pvid(pvl->pv->dev->pvid, pvl->pv->dev, 0))) + continue; ext_version = lvmcache_ext_version(info); - if (ext_version < 2) + if (ext_version < PV_HEADER_EXTENSION_VSN) { + log_warn("WARNING: PV %s in VG %s is using an old PV header, modify the VG to update.", + dev_name(pvl->pv->dev), vg->name); continue; + } ext_flags = lvmcache_ext_flags(info); if (!(ext_flags & PV_EXT_USED)) { - if (!repair) { - *inconsistent_pvs = 1; - /* we're not repairing now, so no need to - * check further PVs - inconsistent_pvs is already - * set and that will trigger the repair next time */ - return 1; - } - - if (_is_foreign_vg(vg)) { - log_verbose("Skip repair of PV %s that is in foreign " - "VG %s but not marked as used.", - pv_dev_name(pvl->pv), vg->name); - *inconsistent_pvs = 1; - } else if (vg_is_shared(vg) && !(lockd_state & LDST_EX)) { - log_warn("Skip repair of PV %s that is in shared " - "VG %s but not marked as used.", - pv_dev_name(pvl->pv), vg->name); - *inconsistent_pvs = 1; - } else { - log_warn("WARNING: Repairing Physical Volume %s that is " - "in Volume Group %s but not marked as used.", - pv_dev_name(pvl->pv), vg->name); - - /* pv write will set correct ext_flags */ - if (!pv_write(cmd, pvl->pv, 1)) { - *inconsistent_pvs = 1; - log_error("Failed to repair physical volume \"%s\".", - pv_dev_name(pvl->pv)); - goto out; - } - pvs_fixed++; - } - } - } - - r = 1; -out: - if ((pvs_fixed > 0) && 
!_repair_inconsistent_vg(vg, lockd_state)) - return_0; - - return r; -} - -/* Caller sets consistent to 1 if it's safe for vg_read_internal to correct - * inconsistent metadata on disk (i.e. the VG write lock is held). - * This guarantees only consistent metadata is returned. - * If consistent is 0, caller must check whether consistent == 1 on return - * and take appropriate action if it isn't (e.g. abort; get write lock - * and call vg_read_internal again). - * - * If precommitted is set, use precommitted metadata if present. - * - * Either of vgname or vgid may be NULL. - * - * Note: vginfo structs must not be held or used as parameters - * across the call to this function. - */ -static struct volume_group *_vg_read(struct cmd_context *cmd, - const char *vgname, - const char *vgid, - uint32_t lockd_state, - uint32_t warn_flags, - int enable_repair, - int *mdas_consistent, - unsigned precommitted) -{ - struct format_instance *fid = NULL; - struct format_instance_ctx fic; - const struct format_type *fmt; - struct volume_group *vg, *correct_vg = NULL; - struct metadata_area *mda; - struct lvmcache_info *info; - int inconsistent = 0; - int inconsistent_vgid = 0; - int inconsistent_pvs = 0; - int inconsistent_mdas = 0; - int inconsistent_mda_count = 0; - int strip_historical_lvs = enable_repair; - int update_old_pv_ext = enable_repair; - unsigned use_precommitted = precommitted; - struct dm_list *pvids; - struct pv_list *pvl; - struct dm_list all_pvs; - char uuid[64] __attribute__((aligned(8))); - int skipped_rescan = 0; - struct cached_vg_fmtdata *vg_fmtdata = NULL; /* Additional format-specific data about the vg */ - unsigned use_previous_vg; - - *mdas_consistent = 1; - - if (is_orphan_vg(vgname)) { - log_very_verbose("Reading VG %s", vgname); - - if (use_precommitted) { - log_error(INTERNAL_ERROR "vg_read_internal requires vgname " - "with pre-commit."); - return NULL; - } - return vg_read_orphans(cmd, warn_flags, vgname); - } - - uuid[0] = '\0'; - if (vgid && 
!id_write_format((const struct id*)vgid, uuid, sizeof(uuid))) - stack; - - log_very_verbose("Reading VG %s %s", vgname ?: "<no name>", vgid ? uuid : "<no vgid>"); - - /* - * Rescan the devices that are associated with this vg in lvmcache. - * This repeats what was done by the command's initial label scan, - * but only the devices associated with this VG. - * - * The lvmcache info about these devs is from the initial label scan - * performed by the command before the vg lock was held. Now the VG - * lock is held, so we rescan all the info from the devs in case - * something changed between the initial scan and now that the lock - * is held. - * - * Some commands (e.g. reporting) are fine reporting data read by - * the label scan. It doesn't matter if the devs changed between - * the label scan and here, we can report what was seen in the - * scan, even though it is the old state, since we will not be - * making any modifications. If the VG was being modified during - * the scan, and caused us to see inconsistent metadata on the - * different PVs in the VG, then we do want to rescan the devs - * here to get a consistent view of the VG. Note that we don't - * know if the scan found all the PVs in the VG at this point. - * We don't know that until vg_read looks at the list of PVs in - * the metadata and compares that to the devices found by the scan. - * - * It's possible that a change made to the VG during scan was - * adding or removing a PV from the VG. In this case, the list - * of devices associated with the VG in lvmcache would change - * due to the rescan. - * - * The devs in the VG may be persistently inconsistent due to some - * previous problem. In this case, rescanning the labels here will - * find the same inconsistency. The VG repair (mistakenly done by - * vg_read below) is supposed to fix that. 
- * - * FIXME: sort out the usage of the global lock (which is mixed up - * with the orphan lock), and when we can tell that the global - * lock is taken prior to the label scan, and still held here, - * we can also skip the rescan in that case. - */ - if (!cmd->can_use_one_scan || lvmcache_scan_mismatch(cmd, vgname, vgid)) { - /* the skip rescan special case is for clvmd vg_read_by_vgid */ - /* FIXME: this is not a warn flag, pass this differently */ - if (warn_flags & SKIP_RESCAN) - goto find_vg; - skipped_rescan = 0; - log_debug_metadata("Rescanning devices for %s", vgname); - lvmcache_label_rescan_vg(cmd, vgname, vgid); - } else { - log_debug_metadata("Skipped rescanning devices for %s", vgname); - skipped_rescan = 1; - } - - find_vg: - - if (!(fmt = lvmcache_fmt_from_vgname(cmd, vgname, vgid, 0))) { - log_debug_metadata("Cache did not find fmt for vgname %s", vgname); - return_NULL; - } - - /* Now determine the correct vgname if none was supplied */ - if (!vgname && !(vgname = lvmcache_vgname_from_vgid(cmd->mem, vgid))) { - log_debug_metadata("Cache did not find VG name from vgid %s", uuid); - return_NULL; - } - - /* Determine the correct vgid if none was supplied */ - if (!vgid && !(vgid = lvmcache_vgid_from_vgname(cmd, vgname))) { - log_debug_metadata("Cache did not find VG vgid from name %s", vgname); - return_NULL; - } - - if (use_precommitted && !(fmt->features & FMT_PRECOMMIT)) - use_precommitted = 0; - - /* - * A "format instance" is an abstraction for a VG location, - * i.e. where a VG's metadata exists on disk. - * - * An fic (format_instance_ctx) is a temporary struct used - * to create an fid (format_instance). The fid hangs around - * and is used to create a 'vg' to which it connected (vg->fid). - * - * The 'fic' describes a VG in terms of fmt/name/id. - * - * The 'fid' describes a VG in more detail than the fic, - * holding information about where to find the VG metadata. 
- * - * The 'vg' describes the VG in the most detail representing - * all the VG metadata. - * - * The fic and fid are set up by create_instance() to describe - * the VG location. This happens before the VG metadata is - * assembled into the more familiar struct volume_group "vg". - * - * The fid has one main purpose: to keep track of the metadata - * locations for a given VG. It does this by putting 'mda' - * structs on fid->metadata_areas_in_use, which specify where - * metadata is located on disk. It gets this information - * (metadata locations for a specific VG) from the command's - * initial label scan. The info is passed indirectly via - * lvmcache info/vginfo structs, which are created by the - * label scan and then copied into fid by create_instance(). - */ - - /* create format instance with appropriate metadata area */ - fic.type = FMT_INSTANCE_MDAS | FMT_INSTANCE_AUX_MDAS; - fic.context.vg_ref.vg_name = vgname; - fic.context.vg_ref.vg_id = vgid; - if (!(fid = fmt->ops->create_instance(fmt, &fic))) { - log_error("Failed to create format instance"); - return NULL; - } - - /* Store pvids for later so we can check if any are missing */ - if (!(pvids = lvmcache_get_pvids(cmd, vgname, vgid))) { - _destroy_fid(&fid); - return_NULL; - } - - /* - * We use the fid globally here so prevent the release_vg - * call to destroy the fid - we may want to reuse it! 
- */ - fid->ref_count++; - /* Ensure contents of all metadata areas match - else do recovery */ - inconsistent_mda_count=0; - dm_list_iterate_items(mda, &fid->metadata_areas_in_use) { - struct device *mda_dev = mda_get_device(mda); - - use_previous_vg = 0; - - log_debug_metadata("Reading VG %s from %s", vgname, dev_name(mda_dev)); - - if ((use_precommitted && - !(vg = mda->ops->vg_read_precommit(fid, vgname, mda, &vg_fmtdata, &use_previous_vg)) && !use_previous_vg) || - (!use_precommitted && - !(vg = mda->ops->vg_read(fid, vgname, mda, &vg_fmtdata, &use_previous_vg)) && !use_previous_vg)) { - inconsistent = 1; - vg_fmtdata = NULL; - continue; - } - - /* Use previous VG because checksum matches */ - if (!vg) { - vg = correct_vg; - continue; - } - - if (!correct_vg) { - correct_vg = vg; - continue; - } - - /* FIXME Also ensure contents same - checksum compare? */ - if (correct_vg->seqno != vg->seqno) { - if (cmd->metadata_read_only || skipped_rescan) - log_warn("Not repairing metadata for VG %s.", vgname); - else - inconsistent = 1; - - if (vg->seqno > correct_vg->seqno) { - release_vg(correct_vg); - correct_vg = vg; - } else { - mda->status |= MDA_INCONSISTENT; - ++inconsistent_mda_count; - } - } - - if (vg != correct_vg) { - release_vg(vg); - vg_fmtdata = NULL; - } - } - fid->ref_count--; - - /* Ensure every PV in the VG was in the cache */ - if (correct_vg) { - /* - * Update the seqno from the cache, for the benefit of - * retro-style metadata formats like LVM1. - */ - // correct_vg->seqno = seqno > correct_vg->seqno ? seqno : correct_vg->seqno; - - /* - * If the VG has PVs without mdas, or ignored mdas, they may - * still be orphans in the cache: update the cache state here, - * and update the metadata lists in the vg. 
- */ - if (!inconsistent && - dm_list_size(&correct_vg->pvs) > dm_list_size(pvids)) { - dm_list_iterate_items(pvl, &correct_vg->pvs) { - if (!pvl->pv->dev) { - inconsistent_pvs = 1; - break; - } - - if (str_list_match_item(pvids, pvl->pv->dev->pvid)) - continue; - - /* - * PV not marked as belonging to this VG in cache. - * Check it's an orphan without metadata area - * not ignored. - */ - if (!(info = lvmcache_info_from_pvid(pvl->pv->dev->pvid, pvl->pv->dev, 1)) || - !lvmcache_is_orphan(info)) { - inconsistent_pvs = 1; - break; - } - - if (lvmcache_mda_count(info)) { - if (!lvmcache_fid_add_mdas_pv(info, fid)) { - release_vg(correct_vg); - return_NULL; - } - - log_debug_metadata("Empty mda found for VG %s on %s.", - vgname, dev_name(pvl->pv->dev)); - -#if 0 - /* - * If we are going to do any repair we have to be using - * the latest metadata on disk, so we have to rescan devs - * if we skipped that at the start of the vg_read. We'll - * likely come back through here, but without having - * skipped_rescan. - * - * FIXME: in some cases we don't want to do this. - */ - if (skipped_rescan && cmd->can_use_one_scan) { - log_debug_metadata("Restarting read to rescan devs."); - cmd->can_use_one_scan = 0; - release_vg(correct_vg); - correct_vg = NULL; - lvmcache_del(info); - label_read(pvl->pv->dev); - goto restart_scan; - } -#endif - - if (inconsistent_mdas) - continue; - - /* - * If any newly-added mdas are in-use then their - * metadata needs updating. 
- */ - lvmcache_foreach_mda(info, _check_mda_in_use, - &inconsistent_mdas); - } - } - - /* If the check passed, let's update VG and recalculate pvids */ - if (!inconsistent_pvs) { - log_debug_metadata("Updating cache for PVs without mdas " - "in VG %s.", vgname); - /* - * If there is no precommitted metadata, committed metadata - * is read and stored in the cache even if use_precommitted is set - */ - lvmcache_update_vg(correct_vg, correct_vg->status & PRECOMMITTED); - - if (!(pvids = lvmcache_get_pvids(cmd, vgname, vgid))) { - release_vg(correct_vg); - return_NULL; - } - } - } - - fid->ref_count++; - if (dm_list_size(&correct_vg->pvs) != - dm_list_size(pvids) + vg_missing_pv_count(correct_vg)) { - log_debug_metadata("Cached VG %s had incorrect PV list", - vgname); - - if (prioritized_section()) - inconsistent = 1; - else { - release_vg(correct_vg); - correct_vg = NULL; - } - } else dm_list_iterate_items(pvl, &correct_vg->pvs) { - if (is_missing_pv(pvl->pv)) - continue; - if (!str_list_match_item(pvids, pvl->pv->dev->pvid)) { - log_debug_metadata("Cached VG %s had incorrect PV list", - vgname); - release_vg(correct_vg); - correct_vg = NULL; - break; - } - } - - if (correct_vg && inconsistent_mdas) { - release_vg(correct_vg); - correct_vg = NULL; - } - fid->ref_count--; - } - - dm_list_init(&all_pvs); - - /* Failed to find VG where we expected it - full scan and retry */ - if (!correct_vg) { - /* - * Free outstanding format instance that remained unassigned - * from previous step where we tried to get the "correct_vg", - * but we failed to do so (so there's a dangling fid now). 
- */ - _destroy_fid(&fid); - vg_fmtdata = NULL; - - inconsistent = 0; - - if (!(fmt = lvmcache_fmt_from_vgname(cmd, vgname, vgid, 0))) - return_NULL; - - if (precommitted && !(fmt->features & FMT_PRECOMMIT)) - use_precommitted = 0; - - /* create format instance with appropriate metadata area */ - fic.type = FMT_INSTANCE_MDAS | FMT_INSTANCE_AUX_MDAS; - fic.context.vg_ref.vg_name = vgname; - fic.context.vg_ref.vg_id = vgid; - if (!(fid = fmt->ops->create_instance(fmt, &fic))) { - log_error("Failed to create format instance"); - return NULL; - } - - /* - * We use the fid globally here so prevent the release_vg - * call to destroy the fid - we may want to reuse it! - */ - fid->ref_count++; - /* Ensure contents of all metadata areas match - else recover */ - inconsistent_mda_count=0; - dm_list_iterate_items(mda, &fid->metadata_areas_in_use) { - use_previous_vg = 0; - - if ((use_precommitted && - !(vg = mda->ops->vg_read_precommit(fid, vgname, mda, &vg_fmtdata, &use_previous_vg)) && !use_previous_vg) || - (!use_precommitted && - !(vg = mda->ops->vg_read(fid, vgname, mda, &vg_fmtdata, &use_previous_vg)) && !use_previous_vg)) { - inconsistent = 1; - vg_fmtdata = NULL; - continue; - } - - /* Use previous VG because checksum matches */ - if (!vg) { - vg = correct_vg; - continue; - } - - if (!correct_vg) { - correct_vg = vg; - if (!_update_pv_list(cmd->mem, &all_pvs, correct_vg)) { - _free_pv_list(&all_pvs); - fid->ref_count--; - release_vg(vg); - return_NULL; - } - continue; - } - - if (!id_equal(&vg->id, &correct_vg->id)) { - inconsistent = 1; - inconsistent_vgid = 1; - } - - /* FIXME Also ensure contents same - checksums same? 
*/ - if (correct_vg->seqno != vg->seqno) { - /* Ignore inconsistent seqno if told to skip repair logic */ - if (cmd->metadata_read_only || skipped_rescan) - log_warn("Not repairing metadata for VG %s.", vgname); - else - inconsistent = 1; - - if (!_update_pv_list(cmd->mem, &all_pvs, vg)) { - _free_pv_list(&all_pvs); - fid->ref_count--; - release_vg(vg); - release_vg(correct_vg); - return_NULL; - } - if (vg->seqno > correct_vg->seqno) { - release_vg(correct_vg); - correct_vg = vg; - } else { - mda->status |= MDA_INCONSISTENT; - ++inconsistent_mda_count; - } - } - - if (vg != correct_vg) { - release_vg(vg); - vg_fmtdata = NULL; - } - } - fid->ref_count--; - - /* Give up looking */ - if (!correct_vg) { - _free_pv_list(&all_pvs); - _destroy_fid(&fid); - return_NULL; - } - } - - /* - * If there is no precommitted metadata, committed metadata - * is read and stored in the cache even if use_precommitted is set - */ - lvmcache_update_vg(correct_vg, (correct_vg->status & PRECOMMITTED)); - - if (inconsistent) { - /* FIXME Test should be if we're *using* precommitted metadata not if we were searching for it */ - if (use_precommitted) { - log_error("Inconsistent pre-commit metadata copies " - "for volume group %s", vgname); - - /* - * Check whether all of the inconsistent MDAs were on - * MISSING PVs -- in that case, we should be safe. 
- */ - dm_list_iterate_items(mda, &fid->metadata_areas_in_use) { - if (mda->status & MDA_INCONSISTENT) { - log_debug_metadata("Checking inconsistent MDA: %s", dev_name(mda_get_device(mda))); - dm_list_iterate_items(pvl, &correct_vg->pvs) { - if (mda_get_device(mda) == pvl->pv->dev && - (pvl->pv->status & MISSING_PV)) - --inconsistent_mda_count; - } - } - } - - if (inconsistent_mda_count < 0) - log_error(INTERNAL_ERROR "Too many inconsistent MDAs."); - - if (!inconsistent_mda_count) { - _free_pv_list(&all_pvs); - return correct_vg; - } - _free_pv_list(&all_pvs); - release_vg(correct_vg); - return NULL; - } - - if (!enable_repair) { - _free_pv_list(&all_pvs); - *mdas_consistent = 0; - return correct_vg; - } - - if (skipped_rescan) { - log_warn("Not repairing metadata for VG %s.", vgname); - _free_pv_list(&all_pvs); - release_vg(correct_vg); - return_NULL; - } - - /* Don't touch if vgids didn't match */ - if (inconsistent_vgid) { - log_warn("WARNING: Inconsistent metadata UUIDs found for volume group %s.", vgname); - _free_pv_list(&all_pvs); - *mdas_consistent = 0; - return correct_vg; - } - - /* - * If PV is marked missing but we found it, - * update metadata and remove MISSING flag - */ - dm_list_iterate_items(pvl, &all_pvs) - _check_reappeared_pv(correct_vg, pvl->pv, 1); - - if (!_repair_inconsistent_vg(correct_vg, lockd_state)) { - _free_pv_list(&all_pvs); - release_vg(correct_vg); - return NULL; - } - - if (!_wipe_outdated_pvs(cmd, correct_vg, &all_pvs, lockd_state)) { - _free_pv_list(&all_pvs); - release_vg(correct_vg); - return_NULL; - } - } - - _free_pv_list(&all_pvs); - - if (vg_missing_pv_count(correct_vg)) { - log_verbose("There are %d physical volumes missing.", - vg_missing_pv_count(correct_vg)); - vg_mark_partial_lvs(correct_vg, 1); - } - - if ((correct_vg->status & PVMOVE) && !pvmove_mode()) { - log_error("Interrupted pvmove detected in volume group %s.", - correct_vg->name); - log_print("Please restore the metadata by running vgcfgrestore."); - 
release_vg(correct_vg); - return NULL; - } - - /* We have the VG now finally, check if PV ext info is in sync with VG metadata. */ - if (!_check_or_repair_pv_ext(cmd, correct_vg, lockd_state, skipped_rescan ? 0 : enable_repair, - &inconsistent_pvs)) { - release_vg(correct_vg); - return_NULL; - } - - if (correct_vg && enable_repair && !skipped_rescan) { - if (update_old_pv_ext && !_vg_update_old_pv_ext_if_needed(correct_vg)) { - release_vg(correct_vg); - return_NULL; - } - - if (strip_historical_lvs && !vg_strip_outdated_historical_lvs(correct_vg)) { - release_vg(correct_vg); - return_NULL; + log_warn("WARNING: PV %s in VG %s is missing the used flag in PV header.", + dev_name(pvl->pv->dev), vg->name); } } - if (inconsistent_pvs) - *mdas_consistent = 0; - - return correct_vg; + return 1; } #define DEV_LIST_DELIM ", " @@ -4333,7 +3589,7 @@ static int _check_devs_used_correspond_with_lv(struct dm_pool *mem, struct dm_li return 1; } -static int _check_devs_used_correspond_with_vg(struct volume_group *vg) +static void _check_devs_used_correspond_with_vg(struct volume_group *vg) { struct dm_pool *mem; char vgid[ID_LEN + 1]; @@ -4343,9 +3599,6 @@ static int _check_devs_used_correspond_with_vg(struct volume_group *vg) struct device_list *dl; int found_inconsistent = 0; - if (is_orphan_vg(vg->name)) - return 1; - strncpy(vgid, (const char *) vg->id.uuid, sizeof(vgid)); vgid[ID_LEN] = '\0'; @@ -4366,7 +3619,7 @@ static int _check_devs_used_correspond_with_vg(struct volume_group *vg) } if (!(list = dev_cache_get_dev_list_for_vgid(vgid))) - return 1; + return; dm_list_iterate_items(dl, list) { if (!(dl->dev->flags & DEV_OPEN_FAILURE) && @@ -4378,79 +3631,19 @@ static int _check_devs_used_correspond_with_vg(struct volume_group *vg) if (found_inconsistent) { if (!(mem = dm_pool_create("vg_devs_check", 1024))) - return_0; + return; dm_list_iterate_items(lvl, &vg->lvs) { if (!_check_devs_used_correspond_with_lv(mem, list, lvl->lv)) { dm_pool_destroy(mem); - return_0; + return; } } 
dm_pool_destroy(mem); } - return 1; -} - -struct volume_group *vg_read_internal(struct cmd_context *cmd, - const char *vgname, const char *vgid, - uint32_t lockd_state, uint32_t warn_flags, - int enable_repair, - int *mdas_consistent) -{ - struct volume_group *vg; - struct lv_list *lvl; - - if (!(vg = _vg_read(cmd, vgname, vgid, lockd_state, - warn_flags, enable_repair, mdas_consistent, 0))) - goto_out; - - if (!check_pv_dev_sizes(vg)) - log_warn("One or more devices used as PVs in VG %s " - "have changed sizes.", vg->name); - - if (!check_pv_segments(vg)) { - log_error(INTERNAL_ERROR "PV segments corrupted in %s.", - vg->name); - release_vg(vg); - vg = NULL; - goto out; - } - - dm_list_iterate_items(lvl, &vg->lvs) { - if (!check_lv_segments(lvl->lv, 0)) { - log_error(INTERNAL_ERROR "LV segments corrupted in %s.", - lvl->lv->name); - release_vg(vg); - vg = NULL; - goto out; - } - } - - dm_list_iterate_items(lvl, &vg->lvs) { - /* - * Checks that cross-reference other LVs. - */ - if (!check_lv_segments(lvl->lv, 1)) { - log_error(INTERNAL_ERROR "LV segments corrupted in %s.", - lvl->lv->name); - release_vg(vg); - vg = NULL; - goto out; - } - } - - (void) _check_devs_used_correspond_with_vg(vg); -out: - if (!*mdas_consistent && (warn_flags & WARN_INCONSISTENT)) { - if (is_orphan_vg(vgname)) - log_warn("WARNING: Found inconsistent standalone Physical Volumes."); - else - log_warn("WARNING: Volume Group %s is not consistent.", vgname); - } - - return vg; + return; } void free_pv_fid(struct physical_volume *pv) @@ -4699,10 +3892,6 @@ uint32_t vg_bad_status_bits(const struct volume_group *vg, uint64_t status) { uint32_t failure = 0; - if ((status & CLUSTERED) && !_access_vg_clustered(vg->cmd, vg)) - /* Return because other flags are considered undefined. 
*/ - return FAILED_CLUSTERED; - if ((status & EXPORTED_VG) && vg_is_exported(vg)) { log_error("Volume group %s is exported", vg->name); @@ -4734,48 +3923,6 @@ int vg_check_status(const struct volume_group *vg, uint64_t status) return !vg_bad_status_bits(vg, status); } -/* - * VG is left unlocked on failure - */ -static struct volume_group *_recover_vg(struct cmd_context *cmd, - const char *vg_name, const char *vgid, - int is_shared, uint32_t lockd_state) -{ - int mdas_consistent = 0; - struct volume_group *vg; - uint32_t state = 0; - - unlock_vg(cmd, NULL, vg_name); - - if (!lock_vol(cmd, vg_name, LCK_VG_WRITE, NULL)) - return_NULL; - - /* - * Convert vg lock in lvmlockd from sh to ex. - */ - if (is_shared && !(lockd_state & LDST_FAIL) && !(lockd_state & LDST_EX)) { - log_debug("Upgrade lvmlockd lock to repair vg %s.", vg_name); - if (!lockd_vg(cmd, vg_name, "ex", 0, &state)) { - log_warn("Skip repair for shared VG without exclusive lock."); - return NULL; - } - lockd_state |= LDST_EX; - } - - if (!(vg = vg_read_internal(cmd, vg_name, vgid, lockd_state, 0, 1, &mdas_consistent))) { - unlock_vg(cmd, NULL, vg_name); - return_NULL; - } - - if (!mdas_consistent) { - release_vg(vg); - unlock_vg(cmd, NULL, vg_name); - return_NULL; - } - - return (struct volume_group *)vg; -} - static int _allow_extra_system_id(struct cmd_context *cmd, const char *system_id) { const struct dm_config_node *cn; @@ -4805,9 +3952,6 @@ static int _allow_extra_system_id(struct cmd_context *cmd, const char *system_id static int _access_vg_lock_type(struct cmd_context *cmd, struct volume_group *vg, uint32_t lockd_state, uint32_t *failure) { - if (!is_real_vg(vg->name)) - return 1; - if (cmd->lockd_vg_disable) return 1; @@ -4954,225 +4098,15 @@ static int _access_vg_systemid(struct cmd_context *cmd, struct volume_group *vg) } /* - * FIXME: move vg_bad_status_bits() checks in here. 
- */ -static int _vg_access_permitted(struct cmd_context *cmd, struct volume_group *vg, - uint32_t lockd_state, uint32_t *failure) -{ - if (!is_real_vg(vg->name)) { - return 1; - } - - if (!_access_vg_clustered(cmd, vg)) { - *failure |= FAILED_CLUSTERED; - return 0; - } - - if (!_access_vg_lock_type(cmd, vg, lockd_state, failure)) { - /* Either FAILED_LOCK_TYPE or FAILED_LOCK_MODE were set. */ - return 0; - } - - if (!_access_vg_systemid(cmd, vg)) { - *failure |= FAILED_SYSTEMID; - return 0; - } - - return 1; -} - -/* - * Consolidated locking, reading, and status flag checking. - * - * If the metadata is inconsistent, setting READ_ALLOW_INCONSISTENT in - * read_flags will return it with FAILED_INCONSISTENT set instead of - * giving you nothing. - * - * Use vg_read_error(vg) to determine the result. Nonzero means there were - * problems reading the volume group. - * Zero value means that the VG is open and appropriate locks are held. - */ -static struct volume_group *_vg_lock_and_read(struct cmd_context *cmd, const char *vg_name, - const char *vgid, - uint32_t lock_flags, - uint64_t status_flags, - uint32_t read_flags, - uint32_t lockd_state) -{ - struct volume_group *vg = NULL; - uint32_t failure = 0; - uint32_t warn_flags = 0; - int mdas_consistent = 1; - int enable_repair = 1; - int is_shared = 0; - int skip_lock = is_orphan_vg(vg_name) && (read_flags & PROCESS_SKIP_ORPHAN_LOCK); - - if ((read_flags & READ_ALLOW_INCONSISTENT) || (lock_flags != LCK_VG_WRITE)) { - enable_repair = 0; - warn_flags |= WARN_INCONSISTENT; - } - - if (!validate_name(vg_name) && !is_orphan_vg(vg_name)) { - log_error("Volume group name \"%s\" has invalid characters.", - vg_name); - return NULL; - } - - if (!skip_lock && - !lock_vol(cmd, vg_name, lock_flags, NULL)) { - log_error("Can't get lock for %s", vg_name); - return _vg_make_handle(cmd, vg, FAILED_LOCKING); - } - - if (skip_lock) - log_very_verbose("Locking %s already done", vg_name); - - if (is_orphan_vg(vg_name)) - status_flags &= 
~LVM_WRITE; - - if (!(vg = vg_read_internal(cmd, vg_name, vgid, lockd_state, warn_flags, enable_repair, &mdas_consistent))) { - if (!(read_flags & READ_OK_NOTFOUND)) - log_error("Volume group \"%s\" not found", vg_name); - failure |= FAILED_NOTFOUND; - goto bad; - } - - if (!_vg_access_permitted(cmd, vg, lockd_state, &failure)) - goto bad; - - /* - * If we called vg_read_internal above without repair enabled, - * and the read found inconsistent mdas, then then get a write/ex - * lock and call it again with repair enabled so it will fix - * the inconsistent mdas. - * - * FIXME: factor vg repair out of vg_read. The vg_read caller - * should get an error about the vg have problems and then call - * a repair-specific function if it wants to. (NB there are - * other kinds of repairs hidden in _vg_read that should be - * pulled out in addition to _recover_vg). - */ - if (!mdas_consistent && !enable_repair) { - is_shared = vg_is_shared(vg); - release_vg(vg); - - if (!(vg = _recover_vg(cmd, vg_name, vgid, is_shared, lockd_state))) { - if (is_orphan_vg(vg_name)) - log_error("Recovery of standalone physical volumes failed."); - else - log_error("Recovery of volume group \"%s\" failed.", vg_name); - failure |= FAILED_RECOVERY; - goto bad_no_unlock; - } - } - - /* - * Check that the tool can handle tricky cases -- missing PVs and - * unknown segment types. - */ - - if (!cmd->handles_missing_pvs && vg_missing_pv_count(vg) && - lock_flags == LCK_VG_WRITE) { - log_error("Cannot change VG %s while PVs are missing.", vg->name); - log_error("Consider vgreduce --removemissing."); - failure |= FAILED_INCONSISTENT; /* FIXME new failure code here? */ - goto bad; - } - - if (!cmd->handles_unknown_segments && vg_has_unknown_segments(vg) && - lock_flags == LCK_VG_WRITE) { - log_error("Cannot change VG %s with unknown segments in it!", - vg->name); - failure |= FAILED_INCONSISTENT; /* FIXME new failure code here? 
*/ - goto bad; - } - - failure |= vg_bad_status_bits(vg, status_flags); - if (failure) - goto_bad; - - if (!(vg = _vg_make_handle(cmd, vg, failure)) || vg_read_error(vg)) - if (!skip_lock) - unlock_vg(cmd, vg, vg_name); - - return vg; - -bad: - if (!skip_lock) - unlock_vg(cmd, vg, vg_name); - -bad_no_unlock: - return _vg_make_handle(cmd, vg, failure); -} - -/* - * vg_read: High-level volume group metadata read function. - * - * vg_read_error() must be used on any handle returned to check for errors. - * - * - metadata inconsistent and automatic correction failed: FAILED_INCONSISTENT - * - VG is read-only: FAILED_READ_ONLY - * - VG is EXPORTED, unless flags has READ_ALLOW_EXPORTED: FAILED_EXPORTED - * - VG is not RESIZEABLE: FAILED_RESIZEABLE - * - locking failed: FAILED_LOCKING - * - * On failures, all locks are released, unless one of the following applies: - * - vgname_is_locked(lock_name) is true - * FIXME: remove the above 2 conditions if possible and make an error always - * release the lock. - * - * Volume groups are opened read-only unless flags contains READ_FOR_UPDATE. - * - * Checking for VG existence: - * - * FIXME: We want vg_read to attempt automatic recovery after acquiring a - * temporary write lock: if that fails, we bail out as usual, with failed & - * FAILED_INCONSISTENT. If it works, we are good to go. Code that's been in - * toollib just set lock_flags to LCK_VG_WRITE and called vg_read_internal with - * *consistent = 1. 
- */ -struct volume_group *vg_read(struct cmd_context *cmd, const char *vg_name, - const char *vgid, uint32_t read_flags, uint32_t lockd_state) -{ - uint64_t status_flags = UINT64_C(0); - uint32_t lock_flags = LCK_VG_READ; - - if (read_flags & READ_FOR_UPDATE) { - status_flags |= EXPORTED_VG | LVM_WRITE; - lock_flags = LCK_VG_WRITE; - } - - if (read_flags & READ_ALLOW_EXPORTED) - status_flags &= ~EXPORTED_VG; - - return _vg_lock_and_read(cmd, vg_name, vgid, lock_flags, status_flags, read_flags, lockd_state); -} - -/* - * A high-level volume group metadata reading function. Open a volume group for - * later update (this means the user code can change the metadata and later - * request the new metadata to be written and committed). - */ -struct volume_group *vg_read_for_update(struct cmd_context *cmd, const char *vg_name, - const char *vgid, uint32_t read_flags, uint32_t lockd_state) -{ - struct volume_group *vg = vg_read(cmd, vg_name, vgid, read_flags | READ_FOR_UPDATE, lockd_state); - - if (!vg || vg_read_error(vg)) - stack; - - return vg; -} - -/* * Test the validity of a VG handle returned by vg_read() or vg_read_for_update(). 
+ * FIXME: drop this function */ uint32_t vg_read_error(struct volume_group *vg_handle) { if (!vg_handle) return FAILED_ALLOCATION; - return vg_handle->read_status; + return SUCCESS; } /* @@ -5668,3 +4602,631 @@ int lv_on_pmem(struct logical_volume *lv) return 0; } +static struct volume_group *_vg_read(struct cmd_context *cmd, + const char *vgname, + const char *vgid, + unsigned precommitted) +{ + const struct format_type *fmt = cmd->fmt; + struct format_instance *fid = NULL; + struct format_instance_ctx fic; + struct volume_group *vg, *vg_ret = NULL; + struct metadata_area *mda, *mda2; + unsigned use_precommitted = precommitted; + struct device *mda_dev, *dev_ret; + struct cached_vg_fmtdata *vg_fmtdata = NULL; /* Additional format-specific data about the vg */ + int found_old_metadata = 0; + unsigned use_previous_vg; + + log_debug_metadata("Reading VG %s %s", vgname ?: "<no name>", vgid ?: "<no vgid>"); + + /* + * Rescan the devices that are associated with this vg in lvmcache. + * This repeats what was done by the command's initial label scan, + * but only the devices associated with this VG. + * + * The lvmcache info about these devs is from the initial label scan + * performed by the command before the vg lock was held. Now the VG + * lock is held, so we rescan all the info from the devs in case + * something changed between the initial scan and now that the lock + * is held. + * + * Some commands (e.g. reporting) are fine reporting data read by + * the label scan. It doesn't matter if the devs changed between + * the label scan and here, we can report what was seen in the + * scan, even though it is the old state, since we will not be + * making any modifications. If the VG was being modified during + * the scan, and caused us to see inconsistent metadata on the + * different PVs in the VG, then we do want to rescan the devs + * here to get a consistent view of the VG. Note that we don't + * know if the scan found all the PVs in the VG at this point. 
+ * We don't know that until vg_read looks at the list of PVs in + * the metadata and compares that to the devices found by the scan. + * + * It's possible that a change made to the VG during scan was + * adding or removing a PV from the VG. In this case, the list + * of devices associated with the VG in lvmcache would change + * due to the rescan. + * + * The devs in the VG may be persistently inconsistent due to some + * previous problem. In this case, rescanning the labels here will + * find the same inconsistency. The VG repair (mistakenly done by + * vg_read below) is supposed to fix that. + * + * FIXME: sort out the usage of the global lock (which is mixed up + * with the orphan lock), and when we can tell that the global + * lock is taken prior to the label scan, and still held here, + * we can also skip the rescan in that case. + */ + if (!cmd->can_use_one_scan || lvmcache_scan_mismatch(cmd, vgname, vgid)) { + log_debug_metadata("Rescanning devices for %s", vgname); + lvmcache_label_rescan_vg(cmd, vgname, vgid); + } else { + log_debug_metadata("Skipped rescanning devices for %s", vgname); + } + + /* Now determine the correct vgname if none was supplied */ + if (!vgname && !(vgname = lvmcache_vgname_from_vgid(cmd->mem, vgid))) { + log_debug_metadata("Cache did not find VG name from vgid %s", vgid); + return NULL; + } + + /* Determine the correct vgid if none was supplied */ + if (!vgid && !(vgid = lvmcache_vgid_from_vgname(cmd, vgname))) { + log_debug_metadata("Cache did not find VG vgid from name %s", vgname); + return NULL; + } + + /* + * A "format instance" is an abstraction for a VG location, + * i.e. where a VG's metadata exists on disk. + * + * An fic (format_instance_ctx) is a temporary struct used + * to create an fid (format_instance). The fid hangs around + * and is used to create a 'vg' to which it connected (vg->fid). + * + * The 'fic' describes a VG in terms of fmt/name/id. 
+ * + * The 'fid' describes a VG in more detail than the fic, + * holding information about where to find the VG metadata. + * + * The 'vg' describes the VG in the most detail representing + * all the VG metadata. + * + * The fic and fid are set up by create_instance() to describe + * the VG location. This happens before the VG metadata is + * assembled into the more familiar struct volume_group "vg". + * + * The fid has one main purpose: to keep track of the metadata + * locations for a given VG. It does this by putting 'mda' + * structs on fid->metadata_areas_in_use, which specify where + * metadata is located on disk. It gets this information + * (metadata locations for a specific VG) from the command's + * initial label scan. The info is passed indirectly via + * lvmcache info/vginfo structs, which are created by the + * label scan and then copied into fid by create_instance(). + * + * FIXME: just use the vginfo/info->mdas lists directly instead + * of copying them into the fid list. + */ + + fic.type = FMT_INSTANCE_MDAS | FMT_INSTANCE_AUX_MDAS; + fic.context.vg_ref.vg_name = vgname; + fic.context.vg_ref.vg_id = vgid; + + /* + * Sets up the metadata areas that we need to read below. + * For each info in vginfo->infos, for each mda in info->mdas, + * (found during label_scan), copy the mda to fid->metadata_areas_in_use + */ + if (!(fid = fmt->ops->create_instance(fmt, &fic))) { + log_error("Failed to create format instance"); + return NULL; + } + + /* + * We use the fid globally here so prevent the release_vg + * call to destroy the fid - we may want to reuse it! + */ + fid->ref_count++; + + + /* + * label_scan found PVs for this VG and set up lvmcache to describe the + * VG/PVs that we use here to read the VG. It created 'vginfo' for the + * VG, and created an 'info' attached to vginfo for each PV. It also + * added a metadata_area struct to info->mdas for each metadata area it + * found on the PV. 
The info->mdas structs are copied to + * fid->metadata_areas_in_use by create_instance above, and here we + * read VG metadata from each of those mdas. + */ + dm_list_iterate_items(mda, &fid->metadata_areas_in_use) { + mda_dev = mda_get_device(mda); + + /* I don't think this can happen */ + if (!mda_dev) { + log_warn("Ignoring metadata for VG %s from missing dev.", vgname); + continue; + } + + use_previous_vg = 0; + + if (use_precommitted) { + log_debug_metadata("Reading VG %s precommit metadata from %s %llu", + vgname, dev_name(mda_dev), (unsigned long long)mda->header_start); + + vg = mda->ops->vg_read_precommit(fid, vgname, mda, &vg_fmtdata, &use_previous_vg); + + if (!vg && !use_previous_vg) { + log_warn("WARNING: Reading VG %s precommit on %s failed.", vgname, dev_name(mda_dev)); + vg_fmtdata = NULL; + continue; + } + } else { + log_debug_metadata("Reading VG %s metadata from %s %llu", + vgname, dev_name(mda_dev), (unsigned long long)mda->header_start); + + vg = mda->ops->vg_read(fid, vgname, mda, &vg_fmtdata, &use_previous_vg); + + if (!vg && !use_previous_vg) { + log_warn("WARNING: Reading VG %s on %s failed.", vgname, dev_name(mda_dev)); + vg_fmtdata = NULL; + continue; + } + } + + if (!vg) + continue; + + if (vg && !vg_ret) { + vg_ret = vg; + dev_ret = mda_dev; + continue; + } + + /* + * Use the newest copy of the metadata found on any mdas. + * Above, We could check if the scan found an old metadata + * seqno in this mda and just skip reading it again; then these + * seqno checks would just be sanity checks. 
+ */ + + if (vg->seqno == vg_ret->seqno) { + release_vg(vg); + continue; + } + + if (vg->seqno > vg_ret->seqno) { + log_warn("WARNING: ignoring old metadata seqno %u on %s vs new metadata seqno %u on %s for VG %s.", + vg_ret->seqno, dev_name(dev_ret), + vg->seqno, dev_name(mda_dev), vg->name); + found_old_metadata = 1; + release_vg(vg_ret); + vg_ret = vg; + dev_ret = mda_dev; + vg_fmtdata = NULL; + continue; + } + + if (vg_ret->seqno > vg->seqno) { + log_warn("WARNING: ignoring old metadata seqno %u on %s vs new metadata seqno %u on %s for VG %s.", + vg->seqno, dev_name(mda_dev), + vg_ret->seqno, dev_name(dev_ret), vg->name); + found_old_metadata = 1; + release_vg(vg); + vg_fmtdata = NULL; + continue; + } + } + + if (found_old_metadata) + log_warn("WARNING: Inconsistent metadata found for VG %s", vgname); + + vg = NULL; + + if (vg_ret) + set_pv_devices(fid, vg_ret); + + fid->ref_count--; + + if (!vg_ret) { + _destroy_fid(&fid); + goto_out; + } + + /* + * Correct the lvmcache representation of the VG using the metadata + * that we have chosen above (vg_ret). + * + * The vginfo/info representation created by label_scan was not + * entirely correct since it did not use the full or final metadata. + * + * In lvmcache, PVs with no mdas were not attached to the vginfo during + * label_scan because label_scan didn't know where they should go. Now + * that we have the VG metadata we can tell, so use that to attach those + * info's to the vginfo. + * + * Also, outdated PVs that have been removed from the VG were incorrectly + * attached to the vginfo during label_scan, and now need to be detached. + */ + lvmcache_update_vg_from_read(vg_ret, vg_ret->status & PRECOMMITTED); + + /* + * lvmcache_update_vg identified outdated mdas that we read above that + * are not actually part of the VG. Remove those outdated mdas from + * the fid's list of mdas. 
+ */ + dm_list_iterate_items_safe(mda, mda2, &fid->metadata_areas_in_use) { + mda_dev = mda_get_device(mda); + if (lvmcache_is_outdated_dev(cmd, vg_ret->name, (const char *)&vg_ret->id, mda_dev)) { + log_debug_metadata("vg_read %s ignore mda for outdated dev %s", + vg_ret->name, dev_name(mda_dev)); + /* FIXME: use _del_mda */ + dm_list_del(&mda->list); + } + } + +out: + return vg_ret; +} + +struct volume_group *vg_read(struct cmd_context *cmd, const char *vg_name, const char *vgid, + uint32_t read_flags, uint32_t lockd_state, + uint32_t *error_flags, struct volume_group **error_vg) +{ + struct volume_group *vg = NULL; + struct lv_list *lvl; + struct pv_list *pvl; + int missing_pv_dev = 0; + int missing_pv_flag = 0; + uint32_t failure = 0; + int writing = (read_flags & READ_FOR_UPDATE); + + /* + * FIXME: is this function still used to read orphans? + * If so, replace any callers with vg_read_orphans. + */ + if (is_orphan_vg(vg_name)) { + int skip_lock = read_flags & PROCESS_SKIP_ORPHAN_LOCK; + log_very_verbose("Reading orphan VG %s", vg_name); + + if (!skip_lock && !lock_vol(cmd, vg_name, LCK_VG_READ, NULL)) + return_NULL; + + vg = vg_read_orphans(cmd, vg_name); + + if (!skip_lock) + unlock_vg(cmd, vg, vg_name); + + *error_flags = 0; + *error_vg = NULL; + return vg; + } + + if (!validate_name(vg_name)) { + log_error("Volume group name \"%s\" has invalid characters.", vg_name); + return NULL; + } + + if (!lock_vol(cmd, vg_name, writing ? LCK_VG_WRITE : LCK_VG_READ, NULL)) { + log_error("Can't get lock for %s", vg_name); + failure |= FAILED_LOCKING; + goto_bad; + } + + if (!(vg = _vg_read(cmd, vg_name, vgid, 0))) { + /* Some callers don't care if the VG doesn't exist and don't want an error message. */ + if (!(read_flags & READ_OK_NOTFOUND)) + log_error("Volume group \"%s\" not found", vg_name); + failure |= FAILED_NOTFOUND; + goto_bad; + } + + /* + * Check and warn if PV ext info is not in sync with VG metadata + * (vg_write fixes.) 
+ */ + _check_pv_ext(cmd, vg); + + if (!vg_strip_outdated_historical_lvs(vg)) + log_warn("WARNING: failed to strip outdated historical lvs."); + + /* + * Check for missing devices in the VG. In most cases a VG cannot be + * changed while it's missing devices. This restriction is implemented + * here in vg_read. Below we return an error from vg_read if the + * vg_read flag indicates that the command is going to modify the VG. + * (We should probably implement this restriction elsewhere instead of + * returning an error from vg_read.) + * + * The PV's device may be present while the PV for the device has the + * MISSING_PV flag set in the metadata. This happened because the VG + * was written while this dev was missing, so the MISSING flag was + * written in the metadata for PV. Now the device has reappeared. + * However, the VG has changed since the device was last present, and + * if the device has outdated data it may not be safe to just start + * using it again. + * + * If there were no PE's used on the PV, we can just clear the MISSING + * flag, but if there were PE's used we need to continue to treat the + * PV as if the device is missing, limiting operations like the VG has + * a missing device, and requiring the user to remove the reappeared + * device from the VG, like a missing device, with vgreduce + * --removemissing. + */ + dm_list_iterate_items(pvl, &vg->pvs) { + if (!pvl->pv->dev) { + /* The obvious and common case of a missing device. */ + + log_warn("WARNING: VG %s is missing PVID %s.", vg_name, (const char *)&pvl->pv->id); + missing_pv_dev++; + + } else if (pvl->pv->status & MISSING_PV) { + /* A device that was missing but has reappeared. 
*/ + + if (pvl->pv->pe_alloc_count == 0) { + log_warn("WARNING: VG %s has unused reappeared PV %s.", vg_name, dev_name(pvl->pv->dev)); + pvl->pv->status &= ~MISSING_PV; + /* tell vgextend restoremissing that MISSING flag was cleared here */ + pvl->pv->unused_missing_cleared = 1; + } else { + log_warn("WARNING: VG %s was missing PV %s.", vg_name, dev_name(pvl->pv->dev)); + missing_pv_flag++; + } + } + } + + if (missing_pv_dev || missing_pv_flag) + vg_mark_partial_lvs(vg, 1); + + if (!check_pv_segments(vg)) { + log_error(INTERNAL_ERROR "PV segments corrupted in %s.", vg->name); + failure |= FAILED_INTERNAL_ERROR; + goto_bad; + } + + dm_list_iterate_items(lvl, &vg->lvs) { + if (!check_lv_segments(lvl->lv, 0)) { + log_error(INTERNAL_ERROR "LV segments corrupted in %s.", lvl->lv->name); + failure |= FAILED_INTERNAL_ERROR; + goto_bad; + } + } + + dm_list_iterate_items(lvl, &vg->lvs) { + /* Checks that cross-reference other LVs. */ + if (!check_lv_segments(lvl->lv, 1)) { + log_error(INTERNAL_ERROR "LV segments corrupted in %s.", lvl->lv->name); + failure |= FAILED_INTERNAL_ERROR; + goto_bad; + } + } + + if (!check_pv_dev_sizes(vg)) + log_warn("WARNING: One or more devices used as PVs in VG %s have changed sizes.", vg->name); + + _check_devs_used_correspond_with_vg(vg); + + if (!_access_vg_lock_type(cmd, vg, lockd_state, &failure)) { + /* Either FAILED_LOCK_TYPE or FAILED_LOCK_MODE were set. 
*/ + goto_bad; + } + + if (!_access_vg_systemid(cmd, vg)) { + failure |= FAILED_SYSTEMID; + goto_bad; + } + + if (!_access_vg_clustered(cmd, vg)) { + failure |= FAILED_CLUSTERED; + goto_bad; + } + + if (writing && !(read_flags & READ_ALLOW_EXPORTED) && vg_is_exported(vg)) { + log_error("Volume group %s is exported", vg->name); + failure |= FAILED_EXPORTED; + goto_bad; + } + + if (writing && !(vg->status & LVM_WRITE)) { + log_error("Volume group %s is read-only", vg->name); + failure |= FAILED_READ_ONLY; + goto_bad; + } + + if (!cmd->handles_missing_pvs && (missing_pv_dev || missing_pv_flag) && writing) { + log_error("Cannot change VG %s while PVs are missing.", vg->name); + log_error("See vgreduce --removemissing and vgextend --restoremissing."); + failure |= FAILED_NOT_ENABLED; + goto_bad; + } + + if (!cmd->handles_unknown_segments && vg_has_unknown_segments(vg) && writing) { + log_error("Cannot change VG %s with unknown segments in it!", vg->name); + failure |= FAILED_NOT_ENABLED; /* FIXME new failure code here? */ + goto_bad; + } + + /* + * When we are reading the VG with the intention of writing it, + * we save a second copy of the VG in vg->vg_committed. This + * copy remains unmodified by the command operation, and is used + * later if there is an error and we want to reactivate LVs. + * FIXME: be specific about exactly when this works correctly. + */ + if (writing) { + struct dm_config_tree *cft; + + if (dm_pool_locked(vg->vgmem)) { + /* FIXME: can this happen? */ + log_warn("WARNING: vg_read no vg copy: pool locked"); + goto out; + } + + if (vg->vg_committed) { + /* FIXME: can this happen? */ + log_warn("WARNING: vg_read no vg copy: copy exists"); + release_vg(vg->vg_committed); + vg->vg_committed = NULL; + } + + if (vg->vg_precommitted) { + /* FIXME: can this happen? 
*/ + log_warn("WARNING: vg_read no vg copy: pre copy exists"); + release_vg(vg->vg_precommitted); + vg->vg_precommitted = NULL; + } + + if (!(cft = export_vg_to_config_tree(vg))) { + log_warn("WARNING: vg_read no vg copy: copy export failed"); + goto out; + } + + if (!(vg->vg_committed = import_vg_from_config_tree(cft, vg->fid))) + log_warn("WARNING: vg_read no vg copy: copy import failed"); + + dm_config_destroy(cft); + } else { + if (vg->vg_precommitted) + log_error(INTERNAL_ERROR "vg_read vg %p vg_precommitted %p", vg, vg->vg_precommitted); + if (vg->vg_committed) + log_error(INTERNAL_ERROR "vg_read vg %p vg_committed %p", vg, vg->vg_committed); + } +out: + /* We return with the VG lock held when read is successful. */ + *error_flags = SUCCESS; + if (error_vg) + *error_vg = NULL; + return vg; + +bad: + *error_flags = failure; + + /* + * FIXME: get rid of this case so we don't have to return the vg when + * there's an error. It is here for process_each_pv() which wants to + * eliminate the VG's devs from the list of devs it is processing, even + * when it can't access the VG because of wrong system id or similar. + * This could be done by looking at lvmcache info structs intead of 'vg'. + * It's also used by process_each_vg/process_each_lv which want to + * include error_vg values (like system_id) in error messages. + * These values could also be found from lvmcache vginfo. 
+	 */
+	if (error_vg && vg) {
+		if (vg->vg_precommitted)
+			log_error(INTERNAL_ERROR "vg_read vg %p vg_precommitted %p", vg, vg->vg_precommitted);
+		if (vg->vg_committed)
+			log_error(INTERNAL_ERROR "vg_read vg %p vg_committed %p", vg, vg->vg_committed);
+
+		/* caller must unlock_vg and release_vg */
+		*error_vg = vg;
+		return_NULL;
+	}
+
+	if (vg) {
+		unlock_vg(cmd, vg, vg_name);
+		release_vg(vg);
+	}
+	if (error_vg)
+		*error_vg = NULL;
+	return_NULL;
+}
+
+/*
+ * Simply a version of vg_read() that automatically sets the READ_FOR_UPDATE
+ * flag, which means the caller intends to write the VG after reading it,
+ * so vg_read should acquire an exclusive file lock on the vg.
+ *
+ * cmd, vg_name, vgid and lockd_state are passed through to vg_read()
+ * unchanged; read_flags is passed through with READ_FOR_UPDATE OR-ed in.
+ *
+ * Returns whatever vg_read() returns: the VG on success, NULL on failure.
+ * NULL is passed as the error_vg argument of vg_read(), so on its failure
+ * path vg_read() itself calls unlock_vg() and release_vg() on any VG it
+ * had, and there is nothing left for the caller to clean up.  The failure
+ * flags that vg_read() stores through its error_flags argument land in a
+ * local variable here and are not reported to the caller.
+ */
+struct volume_group *vg_read_for_update(struct cmd_context *cmd, const char *vg_name,
+			 const char *vgid, uint32_t read_flags, uint32_t lockd_state)
+{
+	struct volume_group *vg;
+	uint32_t error_flags = 0;	/* written by vg_read(), never read afterwards */
+
+	vg = vg_read(cmd, vg_name, vgid, read_flags | READ_FOR_UPDATE, lockd_state, &error_flags, NULL);
+
+	return vg;
+}
+/*
+ * Try to repair the bad metadata areas of a VG by rewriting them with the
+ * metadata held in "vg".
+ *
+ * cmd: command context, only passed on to lvmcache_get_bad_mdas().
+ * vg:  the VG whose metadata is written; vg->name and vg->id select the
+ *      bad areas, vg->fid and vg are handed to the per-area ops.
+ *
+ * The list of areas comes from lvmcache_get_bad_mdas().  For each entry:
+ *  - it is skipped with a warning if bad_fields is 0 or has any of the
+ *    READ, INTERNAL, MAGIC or START bits set (reasons are in the comment
+ *    inside the loop);
+ *  - otherwise a known-bad CHECKSUM and/or VERSION bit is copied into
+ *    ignore_bad_fields, and the vg_write, vg_precommit and vg_commit ops
+ *    of the area are called in that order.
+ *
+ * This is best effort: if any of the three ops fails, a warning is logged
+ * and the loop moves on to the next area.  Nothing is returned, so the
+ * caller cannot tell whether any area was actually repaired.
+ *
+ * NOTE(review): the list entries are iterated directly as
+ * struct metadata_area and the list is not emptied or freed here; who
+ * owns the entries is not visible from this function - confirm against
+ * lvmcache_get_bad_mdas().
+ */
+void vg_write_commit_bad_mdas(struct cmd_context *cmd, struct volume_group *vg)
+{
+	struct dm_list bad_mdas;
+	struct metadata_area *mda;
+	struct device *dev;
+
+	dm_list_init(&bad_mdas);
+
+	lvmcache_get_bad_mdas(cmd, vg->name, (const char *)&vg->id, &bad_mdas);
+
+	dm_list_iterate_items(mda, &bad_mdas) {
+		dev = mda_get_device(mda);	/* used only for dev_name() in the messages below */
+
+		/*
+		 * bad_fields:
+		 *
+		 * 0: shouldn't happen
+		 *
+		 * READ|INTERNAL: there's probably nothing wrong on disk
+		 *
+		 * MAGIC|START: there's a good chance that we were
+		 * reading the mda_header from the wrong location; maybe
+		 * the pv_header location was wrong. We don't want to
+		 * write new metadata to the wrong location. To handle
+		 * this we would want to do some further verification that
+		 * we have the mda location correct.
+		 *
+		 * VERSION|CHECKSUM: when the others are correct these
+		 * look safe to repair.
+		 *
+		 * HEADER: general error related to header, covered by fields
+		 * above.
+		 *
+		 * TEXT: general error related to text metadata, we can repair.
+		 *
+		 * The test below therefore skips the area when no bit is set
+		 * or when any one of READ, INTERNAL, MAGIC, START is set, even
+		 * if a repairable bit is set as well.
+		 */
+		if (!mda->bad_fields ||
+		    (mda->bad_fields & BAD_MDA_READ) ||
+		    (mda->bad_fields & BAD_MDA_INTERNAL) ||
+		    (mda->bad_fields & BAD_MDA_MAGIC) ||
+		    (mda->bad_fields & BAD_MDA_START)) {
+			log_warn("WARNING: not repairing bad metadata (0x%x) for mda%d on %s",
+				 mda->bad_fields, mda->mda_num, dev_name(dev));
+			continue;
+		}
+
+		/*
+		 * vg_write/vg_commit reread the mda_header which checks the
+		 * mda header fields and fails if any are bad, which stops
+		 * vg_write/vg_commit from continuing. Suppress these header
+		 * field checks when we know the field is bad and we are going
+		 * to replace it. FIXME: do vg_write/vg_commit really need to
+		 * reread and recheck the mda_header again (probably not)?
+		 *
+		 * Only the CHECKSUM and VERSION bits are carried over into
+		 * ignore_bad_fields; the bits are OR-ed in and are not
+		 * cleared again in this function.
+		 */
+
+		if (mda->bad_fields & BAD_MDA_CHECKSUM)
+			mda->ignore_bad_fields |= BAD_MDA_CHECKSUM;
+		if (mda->bad_fields & BAD_MDA_VERSION)
+			mda->ignore_bad_fields |= BAD_MDA_VERSION;
+
+		log_warn("WARNING: repairing bad metadata (0x%x) in mda%d at %llu on %s.",
+			 mda->bad_fields, mda->mda_num, (unsigned long long)mda->header_start, dev_name(dev));
+
+		if (!mda->ops->vg_write(vg->fid, vg, mda)) {
+			log_warn("WARNING: failed to write VG %s metadata to bad mda%d at %llu on %s.",
+				 vg->name, mda->mda_num, (unsigned long long)mda->header_start, dev_name(dev));
+			continue;	/* precommit and commit are not attempted for this area */
+		}
+
+		if (!mda->ops->vg_precommit(vg->fid, vg, mda)) {
+			log_warn("WARNING: failed to precommit VG %s metadata to bad mda%d at %llu on %s.",
+				 vg->name, mda->mda_num, (unsigned long long)mda->header_start, dev_name(dev));
+			continue;	/* commit is not attempted for this area */
+		}
+
+		if (!mda->ops->vg_commit(vg->fid, vg, mda)) {
+			log_warn("WARNING: failed to commit VG %s metadata to bad mda%d at %llu on %s.",
+				 vg->name, mda->mda_num, (unsigned long long)mda->header_start, dev_name(dev));
+			continue;
+		}
+	}
+}
+
|