/*-
 * Copyright (c) 2014-present MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_direct_io_size_check --
 *     Look up a size in the configuration stack and hand it back through allocsizep, failing with
 *     EINVAL when direct I/O is configured and the size does not fit the buffer alignment. On any
 *     failure *allocsizep is left at zero.
 */
int
__wt_direct_io_size_check(
  WT_SESSION_IMPL *session, const char **cfg, const char *config_name, uint32_t *allocsizep)
{
    WT_CONFIG_ITEM size_item;
    WT_CONNECTION_IMPL *conn;
    int64_t alignment;

    conn = S2C(session);

    /* Give the caller a defined value on every early-return path. */
    *allocsizep = 0;

    WT_RET(__wt_config_gets(session, cfg, config_name, &size_item));

    /*
     * When direct I/O is configured for checkpoints or data, sizes must be no smaller than the
     * buffer alignment and an exact multiple of it: Linux gets unhappy if you configure direct I/O
     * and then don't do I/O in alignments and units of its happy place. An alignment of zero
     * leaves the size unchecked.
     */
    alignment = 0;
    if (FLD_ISSET(conn->direct_io, WT_DIRECT_IO_CHECKPOINT | WT_DIRECT_IO_DATA))
        alignment = (int64_t)conn->buffer_alignment;

    if (alignment != 0) {
        if (size_item.val < alignment || size_item.val % alignment != 0) {
            WT_RET_MSG(session, EINVAL,
              "when direct I/O is configured, the %s size must be at least as large as the buffer "
              "alignment as well as a multiple of the buffer alignment",
              config_name);
        }
    }

    *allocsizep = (uint32_t)size_item.val;
    return (0);
}

/*
 * __check_imported_ts --
 *     Check the aggregated timestamps for each checkpoint in a file that we've imported. We're not
 *     allowed to import files with timestamps ahead of our oldest timestamp since a subsequent
 *     rollback to stable could result in data loss and historical reads could yield unexpected
 *     values. Therefore, this function should return non-zero to callers to signify that this is
 *     the case.
*/
static int
__check_imported_ts(WT_SESSION_IMPL *session, const char *uri, const char *config)
{
    WT_CKPT *ckpt, *ckptlist;
    WT_DECL_RET;
    WT_TXN_GLOBAL *global;

    global = &S2C(session)->txn_global;
    ckptlist = NULL;

    /*
     * WT_NOTFOUND from the checkpoint list lookup is let through here so it can be reported as a
     * descriptive EINVAL just below.
     */
    WT_ERR_NOTFOUND_OK(__wt_meta_ckptlist_get_from_config(session, false, &ckptlist, config), true);
    if (ret == WT_NOTFOUND) {
        WT_ERR_MSG(session, EINVAL,
          "%s: import could not find any checkpoint information in supplied metadata", uri);
    }

    /* Compare every checkpoint's aggregated durable timestamps against the oldest timestamp. */
    WT_CKPT_FOREACH (ckptlist, ckpt) {
        if (ckpt->ta.newest_start_durable_ts > global->oldest_timestamp) {
            WT_ERR_MSG(session, EINVAL,
              "%s: import found aggregated newest start durable timestamp newer than the current "
              "oldest timestamp, newest_start_durable_ts=%" PRIu64 ", oldest_ts=%" PRIu64,
              uri, ckpt->ta.newest_start_durable_ts, global->oldest_timestamp);
        }

        /*
         * "Newest stop" is deliberately not checked: "newest stop durable" covers it. When a file
         * has at least one record without a stop timestamp, "newest stop" is set to max, whereas
         * "newest stop durable" is the newest non-max timestamp, which is the useful value to
         * compare with oldest.
         */
        if (ckpt->ta.newest_stop_durable_ts > global->oldest_timestamp) {
            WT_ASSERT(session, ckpt->ta.newest_stop_durable_ts != WT_TS_MAX);
            WT_ERR_MSG(session, EINVAL,
              "%s: import found aggregated newest stop durable timestamp newer than the current "
              "oldest timestamp, newest_stop_durable_ts=%" PRIu64 ", oldest_ts=%" PRIu64,
              uri, ckpt->ta.newest_stop_durable_ts, global->oldest_timestamp);
        }
    }

err:
    /* The list is only allocated when the lookup succeeded. */
    if (ckptlist != NULL)
        __wt_meta_ckptlist_free(session, &ckptlist);
    return (ret);
}

/*
 * __create_file --
 *     Create a new 'file:' object.
*/
static int
__create_file(
  WT_SESSION_IMPL *session, const char *uri, bool exclusive, bool import, const char *config)
{
    WT_CONFIG_ITEM item;
    WT_DECL_ITEM(idbuf);
    WT_DECL_RET;
    const char *filecfg[] = {WT_CONFIG_BASE(session, file_meta), config, NULL, NULL, NULL};
    const char **slot, *filename;
    char *fileconf, *filemeta;
    uint32_t allocsize;
    bool exists, have_metadata, import_repair, is_metadata;

    fileconf = NULL;
    filemeta = NULL;
    exists = false;
    import_repair = false;
    is_metadata = strcmp(uri, WT_METAFILE_URI) == 0;

    /* Point filename past the "file:" prefix of the URI. */
    filename = uri;
    WT_PREFIX_SKIP_REQUIRED(session, filename, "file:");

    /*
     * Look the URI up in the metadata (the metadata file itself is never looked up). Anything
     * other than "not found" ends the call here: an existing entry is an error when the caller
     * asked for an exclusive create or an import (importing over an existing URI must never be
     * silently ignored), and otherwise the search result is returned as is.
     */
    if (!is_metadata) {
        ret = __wt_metadata_search(session, uri, &fileconf);
        if (ret != WT_NOTFOUND) {
            if (exclusive || import)
                WT_TRET(EEXIST);
            goto err;
        }
    }

    /*
     * The URI is not in the metadata at this point. If a database directory was copied without a
     * checkpoint and is then recovered, the history store file can be present on disk without
     * being part of the metadata. The history store file is recreated on every restart, so remove
     * any stale copy found in the directory. Failures here are deliberately ignored.
     */
    if (strcmp(uri, WT_HS_URI) == 0) {
        WT_IGNORE_RET(__wt_fs_exist(session, filename, &exists));
        if (exists)
            WT_IGNORE_RET(__wt_fs_remove(session, filename, true));
    }

    /* Sanity check the allocation size. */
    WT_ERR(__wt_direct_io_size_check(session, filecfg, "allocation_size", &allocsize));

    if (!import) {
        /* Ordinary create: build the file on disk. */
        WT_ERR(__wt_block_manager_create(session, filename, allocsize));

        /*
         * Track the creation so it can be rolled back if something later fails. This is
         * specifically NOT done for imports: a failed import must not wipe the data file.
         */
        if (WT_META_TRACKING(session))
            WT_ERR(__wt_meta_track_fileop(session, NULL, uri));
    } else {
        /*
         * Importing an existing object. Either (1) the source database's file configuration is
         * supplied in the input config string, or (2) import.repair is set and the configuration
         * metadata is reconstructed from the file.
         */

        /* The data to import has to be on disk already. */
        WT_IGNORE_RET(__wt_fs_exist(session, filename, &exists));
        if (!exists)
            WT_ERR_MSG(session, ENOENT, "%s", uri);

        import_repair =
          __wt_config_getones(session, config, "import.repair", &item) == 0 && item.val != 0;
        if (!import_repair) {
            have_metadata =
              __wt_config_getones(session, config, "import.file_metadata", &item) == 0 &&
              item.len != 0;

            /*
             * Without file metadata the user should have asked for a repair. Fail rather than
             * infer one, to guard against API misuse.
             */
            if (!have_metadata) {
                WT_ERR_MSG(session, EINVAL,
                  "%s: import requires that 'file_metadata' is specified or the 'repair' option is "
                  "provided",
                  uri);
            }

            /*
             * The string may be wrapped in delimiters (braces, quotes, parentheses) so that
             * configuration characters inside it don't act as separators; drop the first and last
             * characters in that case.
             */
            if (item.type == WT_CONFIG_ITEM_STRUCT) {
                ++item.str;
                item.len -= 2;
            }
            WT_ERR(__wt_strndup(session, item.str, item.len, &filemeta));
            filecfg[2] = filemeta;
        }
    }

    /*
     * For an ordinary file, produce the final configuration and insert it into the metadata:
     * either rebuilt from the imported file, or the passed-in configuration with the file ID and
     * current version numbers appended.
     */
    if (!is_metadata) {
        if (import_repair) {
            /* Try to recreate the associated metadata from the imported data source. */
            WT_ERR(__wt_import_repair(session, uri, &fileconf));
        } else {
            WT_ERR(__wt_scr_alloc(session, 0, &idbuf));
            WT_ERR(__wt_buf_fmt(session, idbuf, "id=%" PRIu32 ",version=(major=%d,minor=%d)",
              ++S2C(session)->next_file_id, WT_BTREE_MAJOR_VERSION_MAX,
              WT_BTREE_MINOR_VERSION_MAX));

            /* Append the ID/version string in the first unused configuration slot. */
            slot = filecfg;
            while (*slot != NULL)
                ++slot;
            *slot = idbuf->data;

            WT_ERR(__wt_config_collapse(session, filecfg, &fileconf));
        }
        WT_ERR(__wt_metadata_insert(session, uri, fileconf));

        /*
         * Ensure that the timestamps in the imported data file are not in the future relative to
         * our oldest timestamp.
         */
        if (import)
            WT_ERR(__check_imported_ts(session, uri, fileconf));
    }

    /*
     * Open the file to check that it was set up correctly. No configuration is passed: the
     * collapsed configuration was just written to the metadata and the underlying functions read
     * it from there.
     *
     * Keep the handle exclusive until it is released at the end of the call, otherwise we could
     * race with a drop.
     */
    WT_ERR(__wt_session_get_dhandle(session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
    if (WT_META_TRACKING(session)) {
        WT_ERR(__wt_meta_track_handle_lock(session, true));
    } else {
        WT_ERR(__wt_session_release_dhandle(session));
    }

err:
    __wt_scr_free(session, &idbuf);
    __wt_free(session, fileconf);
    __wt_free(session, filemeta);
    return (ret);
}

/*
 * __wt_schema_colgroup_source --
 *     Get the URI of the data source for a column group.
*/ int __wt_schema_colgroup_source( WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf) { WT_CONFIG_ITEM cval; WT_DECL_RET; size_t len; const char *prefix, *suffix, *tablename; tablename = table->iface.name + strlen("table:"); if ((ret = __wt_config_getones(session, config, "type", &cval)) == 0 && !WT_STRING_MATCH("file", cval.str, cval.len)) { prefix = cval.str; len = cval.len; suffix = ""; } else { prefix = "file"; len = strlen(prefix); suffix = ".wt"; } WT_RET_NOTFOUND_OK(ret); if (cgname == NULL) WT_RET(__wt_buf_fmt(session, buf, "%.*s:%s%s", (int)len, prefix, tablename, suffix)); else WT_RET( __wt_buf_fmt(session, buf, "%.*s:%s_%s%s", (int)len, prefix, tablename, cgname, suffix)); return (0); } /* * __create_colgroup -- * Create a column group. */ static int __create_colgroup(WT_SESSION_IMPL *session, const char *name, bool exclusive, const char *config) { WT_CONFIG_ITEM cval; WT_DECL_RET; WT_ITEM confbuf, fmt, namebuf; WT_TABLE *table; size_t tlen; char *cgconf, *origconf; const char **cfgp, *cfg[4] = {WT_CONFIG_BASE(session, colgroup_meta), config, NULL, NULL}; const char *cgname, *source, *sourceconf, *tablename; const char *sourcecfg[] = {config, NULL, NULL}; bool exists, tracked; sourceconf = NULL; cgconf = origconf = NULL; WT_CLEAR(fmt); WT_CLEAR(confbuf); WT_CLEAR(namebuf); exists = tracked = false; tablename = name; WT_PREFIX_SKIP_REQUIRED(session, tablename, "colgroup:"); cgname = strchr(tablename, ':'); if (cgname != NULL) { tlen = (size_t)(cgname - tablename); ++cgname; } else tlen = strlen(tablename); if ((ret = __wt_schema_get_table( session, tablename, tlen, true, WT_DHANDLE_EXCLUSIVE, &table)) != 0) WT_RET_MSG(session, (ret == WT_NOTFOUND) ? 
ENOENT : ret, "Can't create '%s' for non-existent table '%.*s'", name, (int)tlen, tablename); if (WT_META_TRACKING(session)) { WT_WITH_DHANDLE(session, &table->iface, ret = __wt_meta_track_handle_lock(session, false)); WT_ERR(ret); tracked = true; } /* Make sure the column group is referenced from the table. */ if (cgname != NULL && (ret = __wt_config_subgets(session, &table->cgconf, cgname, &cval)) != 0) WT_ERR_MSG(session, EINVAL, "Column group '%s' not found in table '%.*s'", cgname, (int)tlen, tablename); /* Check if the column group already exists. */ if ((ret = __wt_metadata_search(session, name, &origconf)) == 0) { if (exclusive) WT_ERR(EEXIST); exists = true; } WT_ERR_NOTFOUND_OK(ret, false); /* Find the first NULL entry in the cfg stack. */ for (cfgp = &cfg[1]; *cfgp; cfgp++) ; /* Add the source to the colgroup config before collapsing. */ if (__wt_config_getones(session, config, "source", &cval) == 0 && cval.len != 0) { WT_ERR(__wt_buf_fmt(session, &namebuf, "%.*s", (int)cval.len, cval.str)); source = namebuf.data; } else { WT_ERR(__wt_schema_colgroup_source(session, table, cgname, config, &namebuf)); source = namebuf.data; WT_ERR(__wt_buf_fmt(session, &confbuf, "source=\"%s\"", source)); *cfgp++ = confbuf.data; } /* Calculate the key/value formats: these go into the source config. 
*/ WT_ERR(__wt_buf_fmt(session, &fmt, "key_format=%s", table->key_format)); if (cgname == NULL) WT_ERR(__wt_buf_catfmt(session, &fmt, ",value_format=%s", table->value_format)); else { if (__wt_config_getones(session, config, "columns", &cval) != 0) WT_ERR_MSG(session, EINVAL, "No 'columns' configuration for '%s'", name); WT_ERR(__wt_buf_catfmt(session, &fmt, ",value_format=")); WT_ERR(__wt_struct_reformat(session, table, cval.str, cval.len, NULL, true, &fmt)); } sourcecfg[1] = fmt.data; WT_ERR(__wt_config_merge(session, sourcecfg, NULL, &sourceconf)); WT_ERR(__wt_schema_create(session, source, sourceconf)); WT_ERR(__wt_config_collapse(session, cfg, &cgconf)); if (!exists) { WT_ERR(__wt_metadata_insert(session, name, cgconf)); WT_ERR(__wt_schema_open_colgroups(session, table)); } err: __wt_free(session, cgconf); __wt_free(session, sourceconf); __wt_free(session, origconf); __wt_buf_free(session, &confbuf); __wt_buf_free(session, &fmt); __wt_buf_free(session, &namebuf); if (!tracked) WT_TRET(__wt_schema_release_table(session, &table)); return (ret); } /* * __wt_schema_index_source -- * Get the URI of the data source for an index. */ int __wt_schema_index_source( WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, const char *config, WT_ITEM *buf) { WT_CONFIG_ITEM cval; WT_DECL_RET; size_t len; const char *prefix, *suffix, *tablename; tablename = table->iface.name + strlen("table:"); if ((ret = __wt_config_getones(session, config, "type", &cval)) == 0 && !WT_STRING_MATCH("file", cval.str, cval.len)) { prefix = cval.str; len = cval.len; suffix = "_idx"; } else { prefix = "file"; len = strlen(prefix); suffix = ".wti"; } WT_RET_NOTFOUND_OK(ret); WT_RET( __wt_buf_fmt(session, buf, "%.*s:%s_%s%s", (int)len, prefix, tablename, idxname, suffix)); return (0); } /* * __fill_index -- * Fill the index from the current contents of the table. 
*/ static int __fill_index(WT_SESSION_IMPL *session, WT_TABLE *table, WT_INDEX *idx) { WT_CURSOR *tcur, *icur; WT_DECL_RET; WT_SESSION *wt_session; wt_session = &session->iface; tcur = NULL; icur = NULL; WT_RET(__wt_schema_open_colgroups(session, table)); /* * If the column groups have not been completely created, there cannot be data inserted yet, and * we're done. */ if (!table->cg_complete) return (0); WT_ERR(wt_session->open_cursor(wt_session, idx->source, NULL, "bulk=unordered", &icur)); WT_ERR(wt_session->open_cursor(wt_session, table->iface.name, NULL, "readonly", &tcur)); while ((ret = tcur->next(tcur)) == 0) WT_ERR(__wt_apply_single_idx(session, idx, icur, (WT_CURSOR_TABLE *)tcur, icur->insert)); WT_ERR_NOTFOUND_OK(ret, false); err: if (icur) WT_TRET(icur->close(icur)); if (tcur) WT_TRET(tcur->close(tcur)); return (ret); } /* * __create_index -- * Create an index. */ static int __create_index(WT_SESSION_IMPL *session, const char *name, bool exclusive, const char *config) { WT_CONFIG kcols, pkcols; WT_CONFIG_ITEM ckey, cval, icols, kval; WT_DECL_PACK_VALUE(pv); WT_DECL_RET; WT_INDEX *idx; WT_ITEM confbuf, extra_cols, fmt, namebuf; WT_PACK pack; WT_TABLE *table; size_t tlen; u_int i, npublic_cols; char *idxconf, *origconf; const char *cfg[4] = {WT_CONFIG_BASE(session, index_meta), NULL, NULL, NULL}; const char *source, *sourceconf, *idxname, *tablename; const char *sourcecfg[] = {config, NULL, NULL}; bool exists, have_extractor; sourceconf = NULL; idxconf = origconf = NULL; WT_CLEAR(confbuf); WT_CLEAR(fmt); WT_CLEAR(extra_cols); WT_CLEAR(namebuf); exists = have_extractor = false; tablename = name; WT_PREFIX_SKIP_REQUIRED(session, tablename, "index:"); idxname = strchr(tablename, ':'); if (idxname == NULL) WT_RET_MSG( session, EINVAL, "Invalid index name, should be