/*-
 * Copyright (c) 2014-2020 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static int __desc_read(WT_SESSION_IMPL *, uint32_t allocsize, WT_BLOCK *);

/*
 * __wt_block_manager_drop --
 *     Drop a file.
 *
 * Forwards the file name and the durable flag to __wt_remove_if_exists and returns that call's
 * result unchanged.
 */
int
__wt_block_manager_drop(WT_SESSION_IMPL *session, const char *filename, bool durable)
{
    return (__wt_remove_if_exists(session, filename, durable));
}

/*
 * __wt_block_manager_create --
 *     Create a file.
 *
 * Opens the named file with the create, durable and exclusive open flags. If the open reports
 * EEXIST, the existing file is renamed to the first unused "<filename>.<N>" name (N counting up
 * from 1), a message is logged, and the open is retried. Once a handle is open, the descriptor
 * block is written with the given allocation size, the file is synced and the handle closed; if
 * any of those steps left a non-zero return value, the file is removed again.
 *
 * Returns the value left in ret: zero on success, otherwise an error from one of the calls made
 * here.
 */
int
__wt_block_manager_create(WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize)
{
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    WT_FH *fh;
    int suffix;
    bool exists;

    /*
     * Create the underlying file and open a handle.
     *
     * Since WiredTiger schema operations are (currently) non-transactional, it's possible to see a
     * partially-created file left from a previous create. Further, there's nothing to prevent users
     * from creating files in our space. Move any existing files out of the way and complain.
     */
    for (;;) {
        if ((ret = __wt_open(session, filename, WT_FS_OPEN_FILE_TYPE_DATA,
               WT_FS_OPEN_CREATE | WT_FS_OPEN_DURABLE | WT_FS_OPEN_EXCLUSIVE, &fh)) == 0)
            break;
        /*
         * Only EEXIST is handled by the rename-and-retry logic below.
         *
         * NOTE(review): WT_ERR_TEST presumably jumps to the err label with ret when the condition
         * holds -- confirm against the macro definition.
         */
        WT_ERR_TEST(ret != EEXIST, ret, false);

        /* The scratch buffer for candidate names is allocated once and reused across retries. */
        if (tmp == NULL)
            WT_ERR(__wt_scr_alloc(session, 0, &tmp));
        /* Probe "<filename>.1", "<filename>.2", ... until a name that doesn't exist is found. */
        for (suffix = 1;; ++suffix) {
            WT_ERR(__wt_buf_fmt(session, tmp, "%s.%d", filename, suffix));
            WT_ERR(__wt_fs_exist(session, tmp->data, &exists));
            if (!exists) {
                WT_ERR(__wt_fs_rename(session, filename, tmp->data, false));
                WT_ERR(__wt_msg(session, "unexpected file %s found, renamed to %s", filename,
                  (const char *)tmp->data));
                break;
            }
        }
    }

    /* Write out the file's meta-data. */
    ret = __wt_desc_write(session, fh, allocsize);

    /*
     * Ensure the truncated file has made it to disk, then the upper-level is never surprised.
     */
    WT_TRET(__wt_fsync(session, fh, true));

    /* Close the file handle. */
    WT_TRET(__wt_close(session, &fh));

    /* Undo any create on error. */
    if (ret != 0)
        WT_TRET(__wt_fs_remove(session, filename, false));

    /*
     * The err label is reached by falling through from above and, presumably, from the WT_ERR
     * calls inside the loop, all of which run before a file handle has been successfully opened.
     * The scratch buffer is released in both cases.
     */
err:
    __wt_scr_free(session, &tmp);

    return (ret);
}

/*
 * __block_destroy --
 *     Destroy a block handle.
 *
 * Removes the block from the connection's hash bucket (selected from the stored name hash), frees
 * the name, closes the file handle if one is open, destroys the live lock, then overwrites and
 * frees the structure itself. Both callers in this file invoke it while holding the connection's
 * block lock.
 *
 * Returns the value left in ret after the optional close of the file handle.
 */
static int
__block_destroy(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    uint64_t bucket;

    conn = S2C(session);
    /* Must match the bucket computation used when the block was inserted in __wt_block_open. */
    bucket = block->name_hash % WT_HASH_ARRAY_SIZE;
    WT_CONN_BLOCK_REMOVE(conn, block, bucket);

    __wt_free(session, block->name);

    /* The handle is NULL if the open failed before the underlying file was opened. */
    if (block->fh != NULL)
        WT_TRET(__wt_close(session, &block->fh));

    /*
     * NOTE(review): this runs even when __wt_block_open failed before __wt_spin_init was called
     * on the (zero-allocated) structure -- confirm __wt_spin_destroy tolerates that.
     */
    __wt_spin_destroy(session, &block->live_lock);

    __wt_overwrite_and_free(session, block);

    return (ret);
}

/*
 * __wt_block_configure_first_fit --
 *     Configure first-fit allocation.
 *
 * Adjusts the block's allocfirst field by one: up when "on" is true, down otherwise.
 */
void
__wt_block_configure_first_fit(WT_BLOCK *block, bool on)
{
    /*
     * Switch to first-fit allocation so we rewrite blocks at the start of the file; use atomic
     * instructions because checkpoints also configure first-fit allocation, and this way we stay on
     * first-fit allocation as long as any operation wants it.
     */
    if (on)
        (void)__wt_atomic_add32(&block->allocfirst, 1);
    else
        (void)__wt_atomic_sub32(&block->allocfirst, 1);
}

/*
 * __wt_block_open --
 *     Open a block handle.
 *
 * Looks the file name up in the connection's block hash table. If a handle with the same name is
 * already there, its reference count is incremented and it is returned. Otherwise a new handle is
 * allocated, inserted into the table, configured from cfg, and the underlying file is opened.
 *
 * Parameters:
 *     filename        name of the file; also the hash-table key.
 *     cfg             configuration stack searched for "block_allocation", "os_cache_max",
 *                     "os_cache_dirty_max" and "access_pattern_hint".
 *     forced_salvage  when true, the descriptor block is not read or verified.
 *     readonly        selects which of the connection's direct I/O settings applies.
 *     allocsize       allocation size stored in the handle and used to read the descriptor.
 *     blockp          set to NULL on entry, and to the handle on success.
 *
 * The connection's block lock is taken before the lookup and released on every return path.
 *
 * Returns zero on success; on failure, any partially built handle is destroyed and the error is
 * returned.
 */
int
__wt_block_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[],
  bool forced_salvage, bool readonly, uint32_t allocsize, WT_BLOCK **blockp)
{
    WT_BLOCK *block;
    WT_CONFIG_ITEM cval;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    uint64_t bucket, hash;
    uint32_t flags;

    *blockp = block = NULL;

    __wt_verbose(session, WT_VERB_BLOCK, "open: %s", filename);

    conn = S2C(session);
    hash = __wt_hash_city64(filename, strlen(filename));
    bucket = hash % WT_HASH_ARRAY_SIZE;
    __wt_spin_lock(session, &conn->block_lock);
    /* Share an existing handle for this file, if there is one. */
    TAILQ_FOREACH (block, &conn->blockhash[bucket], hashq) {
        if (strcmp(filename, block->name) == 0) {
            ++block->ref;
            *blockp = block;
            __wt_spin_unlock(session, &conn->block_lock);
            return (0);
        }
    }

    /*
     * Basic structure allocation, initialization.
     *
     * Note: set the block's name-hash value before any work that can fail because cleanup calls the
     * block destroy code which uses that hash value to remove the block from the underlying linked
     * lists.
     *
     * NOTE(review): the err path's NULL test relies on the list traversal above leaving block NULL
     * when no match is found, and on __wt_calloc_one not setting it on failure -- confirm.
     */
    WT_ERR(__wt_calloc_one(session, &block));
    block->ref = 1;
    block->name_hash = hash;
    block->allocsize = allocsize;
    WT_CONN_BLOCK_INSERT(conn, block, bucket);

    WT_ERR(__wt_strdup(session, filename, &block->name));

    /* Configuration: start in first-fit allocation if "block_allocation" is "first". */
    WT_ERR(__wt_config_gets(session, cfg, "block_allocation", &cval));
    block->allocfirst = WT_STRING_MATCH("first", cval.str, cval.len);

    /* Configuration: optional OS buffer cache maximum size. */
    WT_ERR(__wt_config_gets(session, cfg, "os_cache_max", &cval));
    block->os_cache_max = (size_t)cval.val;

    /* Configuration: optional immediate write scheduling flag. */
    WT_ERR(__wt_config_gets(session, cfg, "os_cache_dirty_max", &cval));
    block->os_cache_dirty_max = (size_t)cval.val;

    /* Set the file extension information. */
    block->extend_len = conn->data_extend_len;

    /*
     * Open the underlying file handle.
     *
     * "direct_io=checkpoint" configures direct I/O for readonly data files.
     *
     * NOTE(review): LF_SET appears to accumulate bits into the local "flags" variable, which is
     * then passed to __wt_open -- confirm against the macro definition before renaming it.
     */
    flags = 0;
    WT_ERR(__wt_config_gets(session, cfg, "access_pattern_hint", &cval));
    if (WT_STRING_MATCH("random", cval.str, cval.len))
        LF_SET(WT_FS_OPEN_ACCESS_RAND);
    else if (WT_STRING_MATCH("sequential", cval.str, cval.len))
        LF_SET(WT_FS_OPEN_ACCESS_SEQ);

    if (readonly && FLD_ISSET(conn->direct_io, WT_DIRECT_IO_CHECKPOINT))
        LF_SET(WT_FS_OPEN_DIRECTIO);
    if (!readonly && FLD_ISSET(conn->direct_io, WT_DIRECT_IO_DATA))
        LF_SET(WT_FS_OPEN_DIRECTIO);
    WT_ERR(__wt_open(session, filename, WT_FS_OPEN_FILE_TYPE_DATA, flags, &block->fh));

    /* Set the file's size. */
    WT_ERR(__wt_filesize(session, block->fh, &block->size));

    /* Initialize the live checkpoint's lock. */
    WT_ERR(__wt_spin_init(session, &block->live_lock, "block manager"));

    /*
     * Read the description information from the first block.
     *
     * Salvage is a special case: if we're forcing the salvage, we don't look at anything, including
     * the description information.
     */
    if (!forced_salvage)
        WT_ERR(__desc_read(session, allocsize, block));

    *blockp = block;
    __wt_spin_unlock(session, &conn->block_lock);
    return (0);

    /* Failure: tear down the partially built handle while still holding the block lock. */
err:
    if (block != NULL)
        WT_TRET(__block_destroy(session, block));
    __wt_spin_unlock(session, &conn->block_lock);
    return (ret);
}

/*
 * __wt_block_close --
 *     Close a block handle.
 *
 * A NULL block is accepted and ignored. Otherwise, under the connection's block lock, the
 * reference count is decremented and the handle is destroyed once the count is zero (a count that
 * is already zero on entry also destroys the handle, without being decremented).
 *
 * Returns zero, or the result of destroying the handle.
 */
int
__wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;

    if (block == NULL) /* Safety check */
        return (0);

    conn = S2C(session);

    __wt_verbose(session, WT_VERB_BLOCK, "close: %s", block->name == NULL ? "" : block->name);

    __wt_spin_lock(session, &conn->block_lock);

    /* Reference count is initialized to 1. */
    if (block->ref == 0 || --block->ref == 0)
        ret = __block_destroy(session, block);

    __wt_spin_unlock(session, &conn->block_lock);

    return (ret);
}

/*
 * __wt_desc_write --
 *     Write a file's initial descriptor structure.
 *
 * Builds an allocation-sized, zero-filled buffer whose leading bytes hold the descriptor (magic
 * number, major and minor version, checksum) and writes it at offset zero of the file handle.
 * Nothing is written for in-memory connections.
 *
 * Returns zero on success (or when in-memory), otherwise the error from allocating the scratch
 * buffer or from the write.
 */
int
__wt_desc_write(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize)
{
    WT_BLOCK_DESC *desc;
    WT_DECL_ITEM(buf);
    WT_DECL_RET;

    /* If in-memory, we don't read or write the descriptor structure. */
    if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
        return (0);

    /* Use a scratch buffer to get correct alignment for direct I/O. */
    WT_RET(__wt_scr_alloc(session, allocsize, &buf));
    memset(buf->mem, 0, allocsize);

    /*
     * Checksum a little-endian version of the header, and write everything in little-endian format.
     * The checksum is (potentially) returned in a big-endian format, swap it into place in a
     * separate step.
     *
     * The checksum field is zero while the checksum is computed over the whole allocation-sized
     * buffer; __desc_read zeroes the field the same way before verifying.
     */
    desc = buf->mem;
    desc->magic = WT_BLOCK_MAGIC;
    desc->majorv = WT_BLOCK_MAJOR_VERSION;
    desc->minorv = WT_BLOCK_MINOR_VERSION;
    desc->checksum = 0;
    __wt_block_desc_byteswap(desc);
    desc->checksum = __wt_checksum(desc, allocsize);
#ifdef WORDS_BIGENDIAN
    desc->checksum = __wt_bswap32(desc->checksum);
#endif
    ret = __wt_write(session, fh, (wt_off_t)0, (size_t)allocsize, desc);

    __wt_scr_free(session, &buf);
    return (ret);
}

/*
 * __desc_read --
 *     Read and verify the file's metadata.
 *
 * Reads the first allocation-sized block of the block's file and checks the descriptor's checksum,
 * magic number and version. Nothing is read for in-memory connections.
 *
 * Returns:
 *     0               the descriptor verified (or the connection is in-memory).
 *     ENOENT          the file is smaller than allocsize and the session has the rollback to
 *                     stable flag set.
 *     WT_ERROR        the file is smaller than allocsize (the connection is also flagged as having
 *                     data corruption), the magic number or checksum is wrong, or the file's
 *                     version is newer than this build supports.
 *     WT_TRY_SALVAGE  the magic number or checksum is wrong and the file is the metadata file or
 *                     the history store file.
 *     Any error from allocating the scratch buffer or reading the file is also passed back.
 */
static int
__desc_read(WT_SESSION_IMPL *session, uint32_t allocsize, WT_BLOCK *block)
{
    WT_BLOCK_DESC *desc;
    WT_DECL_ITEM(buf);
    WT_DECL_RET;
    uint32_t checksum_saved, checksum_tmp;
    bool checksum_matched;

    /* If in-memory, we don't read or write the descriptor structure. */
    if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
        return (0);

    /*
     * If a data file is smaller than the allocation size, we're not going to be able to read the
     * descriptor block.
     *
     * If we're performing rollback to stable as part of recovery, we should treat this as if the
     * file has been deleted; that is, to log an error but continue on.
     *
     * In the general case, we should return a generic error and signal that we've detected data
     * corruption.
     *
     * FIXME-WT-5832: MongoDB relies heavily on the error codes reported when opening cursors (which
     * hits this logic if the relevant data handle isn't already open). However this code gets run
     * in rollback to stable as part of recovery where we want to skip any corrupted data files
     * temporarily to allow MongoDB to initiate salvage. This is why we've been forced into this
     * situation. We should address this as part of WT-5832 and clarify what error codes we expect
     * to be returning across the API boundary.
     */
    if (block->size < allocsize) {
        if (F_ISSET(session, WT_SESSION_ROLLBACK_TO_STABLE))
            ret = ENOENT;
        else {
            ret = WT_ERROR;
            F_SET(S2C(session), WT_CONN_DATA_CORRUPTION);
        }
        WT_RET_MSG(session, ret,
          "File %s is smaller than allocation size; file size=%" PRId64 ", alloc size=%" PRIu32,
          block->name, block->size, allocsize);
    }

    /* Use a scratch buffer to get correct alignment for direct I/O. */
    WT_RET(__wt_scr_alloc(session, allocsize, &buf));

    /* Read the first allocation-sized block and verify the file format. */
    WT_ERR(__wt_read(session, block->fh, (wt_off_t)0, (size_t)allocsize, buf->mem));

    /*
     * Handle little- and big-endian objects. Objects are written in little-endian format: save the
     * header checksum, and calculate the checksum for the header in its little-endian form. Then,
     * restore the header's checksum, and byte-swap the whole thing as necessary, leaving us with a
     * calculated checksum that should match the checksum in the header.
     */
    desc = buf->mem;
    checksum_saved = checksum_tmp = desc->checksum;
#ifdef WORDS_BIGENDIAN
    checksum_tmp = __wt_bswap32(checksum_tmp);
#endif
    desc->checksum = 0;
    checksum_matched = __wt_checksum_match(desc, allocsize, checksum_tmp);
    desc->checksum = checksum_saved;
    __wt_block_desc_byteswap(desc);

    /*
     * We fail the open if the checksum fails, or the magic number is wrong or the major/minor
     * numbers are unsupported for this version. This test is done even if the caller is verifying
     * or salvaging the file: it makes sense for verify, and for salvage we don't overwrite files
     * without some reason to believe they are WiredTiger files. The user may have entered the wrong
     * file name, and is now frantically pounding their interrupt key.
     *
     * NOTE(review): WT_ERR_MSG is expected to leave via the err label, so only one of the two
     * messages below is reported for a given file -- confirm against the macro definition.
     */
    if (desc->magic != WT_BLOCK_MAGIC || !checksum_matched) {
        if (strcmp(block->name, WT_METAFILE) == 0 || strcmp(block->name, WT_HS_FILE) == 0)
            WT_ERR_MSG(session, WT_TRY_SALVAGE, "%s is corrupted", block->name);
        WT_ERR_MSG(session, WT_ERROR, "%s does not appear to be a WiredTiger file", block->name);
    }

    /* Files written with a newer major version, or the same major and a newer minor, are refused. */
    if (desc->majorv > WT_BLOCK_MAJOR_VERSION ||
      (desc->majorv == WT_BLOCK_MAJOR_VERSION && desc->minorv > WT_BLOCK_MINOR_VERSION))
        WT_ERR_MSG(session, WT_ERROR,
          "unsupported WiredTiger file version: this build only "
          "supports major/minor versions up to %d/%d, and the file "
          "is version %" PRIu16 "/%" PRIu16,
          WT_BLOCK_MAJOR_VERSION, WT_BLOCK_MINOR_VERSION, desc->majorv, desc->minorv);

    __wt_verbose(session, WT_VERB_BLOCK, "%s: magic %" PRIu32 ", major/minor: %" PRIu32 "/%" PRIu32,
      block->name, desc->magic, desc->majorv, desc->minorv);

    /* The scratch buffer is released on both the success and the failure path. */
err:
    __wt_scr_free(session, &buf);
    return (ret);
}

/*
 * __wt_block_stat --
 *     Set the statistics for a live block handle.
 *
 * Writes the block's allocation size, live checkpoint size, reusable bytes and file size into the
 * statistics structure, together with the magic number and major/minor version constants this
 * build was compiled with.
 */
void
__wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats)
{
    /*
     * Reading from the live system's structure normally requires locking, but it's an 8B statistics
     * read, there's no need.
     */
    WT_STAT_WRITE(session, stats, allocation_size, block->allocsize);
    WT_STAT_WRITE(session, stats, block_checkpoint_size, (int64_t)block->live.ckpt_size);
    WT_STAT_WRITE(session, stats, block_magic, WT_BLOCK_MAGIC);
    WT_STAT_WRITE(session, stats, block_major, WT_BLOCK_MAJOR_VERSION);
    WT_STAT_WRITE(session, stats, block_minor, WT_BLOCK_MINOR_VERSION);
    WT_STAT_WRITE(session, stats, block_reuse_bytes, (int64_t)block->live.avail.bytes);
    WT_STAT_WRITE(session, stats, block_size, block->size);
}

/*
 * __wt_block_manager_size --
 *     Return the size of a live block handle.
 *
 * Copies the size recorded in the block manager's block handle into *sizep; the session argument
 * is unused. Always returns zero.
 */
int
__wt_block_manager_size(WT_BM *bm, WT_SESSION_IMPL *session, wt_off_t *sizep)
{
    WT_UNUSED(session);

    *sizep = bm->block->size;
    return (0);
}

/*
 * __wt_block_manager_named_size --
 *     Return the size of a named file.
 *
 * Forwards to __wt_fs_size and returns that call's result unchanged.
 */
int
__wt_block_manager_named_size(WT_SESSION_IMPL *session, const char *name, wt_off_t *sizep)
{
    return (__wt_fs_size(session, name, sizep));
}